From 1fe923446e6c30267f2632267bf903b866ff8dd5 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 11:38:32 +1200 Subject: [PATCH 1/6] Ship 8b-2c: Yuva420p family u16 RGBA SIMD across all 5 backends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds native-depth u16 RGBA SIMD across NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128 for the high-bit YUVA 4:2:0 family — Yuva420p9 / Yuva420p10 (BITS-generic) and Yuva420p16 (16-bit). Wires the 3 u16 RGBA dispatchers in src/row/mod.rs that landed as scalar-only stubs in PR #35 (Ship 8b-2a), completing the Yuva420p source-side family across u8 RGBA (8b-2b, PR #36) and u16 RGBA (this PR). Note: 8-bit Yuva420p has no u16 RGBA path — its u8 alpha source doesn't widen meaningfully into a u16 alpha output, and the public API doesn't expose it. ## Changes - **5 SIMD backends** — each gain a third const-generic `ALPHA_SRC: bool` added to the existing `<BITS, ALPHA>` (or `<ALPHA>` for 16-bit) u16 RGBA templates across 2 kernel families: - high-bit BITS-generic: `yuv_420p_n_to_rgb_or_rgba_u16_row` - 16-bit: `yuv_420p16_to_rgb_or_rgba_u16_row` When `ALPHA_SRC = true`: - **High-bit (Yuva420p9/10)**: alpha is loaded + AND-masked with `bits_mask::<BITS>()` (same hardening as Y/U/V) and stored at native bit depth — no shift since both source and output are at BITS. - **16-bit (Yuva420p16)**: alpha is loaded directly as full-range u16 — no mask, no shift. Existing no-alpha / opaque-alpha wrappers stay backward-compat by passing `ALPHA_SRC = false, None`. AVX-512 16-bit's `write_rgba_u16_32` helper broadcasts a single 128-bit alpha lane, so the ALPHA_SRC = true branch inlines four `write_rgba_u16_8` calls with per-quarter alpha extraction instead. 
- **3 u16 RGBA dispatchers wired** in `src/row/mod.rs` (`yuva420p9_to_rgba_u16_row`, `yuva420p10_to_rgba_u16_row`, `yuva420p16_to_rgba_u16_row`) — replace the prior `let _ = use_simd` stubs with the standard `cfg_select!` per-arch route block, mirroring the Yuva444p10 u16 dispatchers' patterns from PR #34. - **Per-backend u16 RGBA equivalence tests** — 25 new `#[test]` functions across the 5 backend test modules (5 NEON, 5 each on SSE4.1 / AVX2 / AVX-512 / wasm simd128). Each new x86 test early-returns on `is_x86_feature_detected!` to satisfy CI sanitizer / Miri / non-feature-flagged runners. Pseudo-random alpha flushes lane-order corruption that solid alpha would mask. - Compile-time `const { assert!(!ALPHA_SRC || ALPHA) }` retained on every shared template — source alpha requires RGBA output. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon.rs | 190 +++++++++++++++++++--- src/row/arch/neon/tests.rs | 165 +++++++++++++++++++ src/row/arch/wasm_simd128.rs | 180 ++++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 143 +++++++++++++++++ src/row/arch/x86_avx2.rs | 196 ++++++++++++++++++++--- src/row/arch/x86_avx2/tests.rs | 145 +++++++++++++++++ src/row/arch/x86_avx512.rs | 244 +++++++++++++++++++++++++---- src/row/arch/x86_avx512/tests.rs | 145 +++++++++++++++++ src/row/arch/x86_sse41.rs | 191 ++++++++++++++++++---- src/row/arch/x86_sse41/tests.rs | 145 +++++++++++++++++ src/row/mod.rs | 192 +++++++++++++++++++++-- 11 files changed, 1791 insertions(+), 145 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index e1f8fdf8..f1726268 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -676,8 +676,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -702,16 +702,61 @@ pub(crate) 
unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } } -/// Shared NEON high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` -/// writes RGBA quads via `vst4q_u16` with constant alpha -/// `(1 << BITS) - 1`. +/// NEON YUVA 4:2:0 high-bit-depth → **native-depth `u16`** packed +/// RGBA with the per-pixel alpha element **sourced from `a_src`** +/// (already at the source's native bit depth — no depth conversion) +/// instead of being the opaque maximum `(1 << BITS) - 1`. Same +/// numerical contract as [`yuv_420p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "neon")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, + ); + } +} + +/// Shared NEON high-bit YUV 4:2:0 → native-depth `u16` kernel for +/// [`yuv_420p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// `vst3q_u16`), [`yuv_420p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`) and [`yuv_420p_n_to_rgba_u16_with_alpha_src_row`] +/// (`ALPHA = true, ALPHA_SRC = true`, `vst4q_u16` with the alpha lane +/// loaded from `a_src` and masked to native bit depth — no shift since +/// both the source alpha and the u16 output element are at the same +/// native bit depth). /// /// # Safety /// @@ -719,25 +764,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -819,8 +877,21 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -2907,8 +2984,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -2931,15 +3008,57 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// NEON 16-bit YUVA 4:2:0 → **native-depth `u16`** packed RGBA with +/// the per-pixel alpha element **sourced from `a_src`** (full-range +/// u16, no mask, no shift) instead of being constant `0xFFFF`. Same +/// numerical contract as [`yuv_420p16_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "neon")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared NEON 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` -/// writes RGBA quads via `vst4q_u16` with constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: `vst3q_u16`. +/// - `ALPHA = true, ALPHA_SRC = false`: `vst4q_u16` with constant +/// alpha `0xFFFF`. +/// - `ALPHA = true, ALPHA_SRC = true`: `vst4q_u16` with the alpha +/// lane loaded directly from `a_src` (full-range u16, no mask). /// /// # Safety /// @@ -2948,23 +3067,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3074,13 +3202,23 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( ); if ALPHA { + let (a_lo_v, a_hi_v) = if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 16 lanes directly, + // no mask or shift needed. + let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); + (vld1q_u16(a_ptr.add(x)), vld1q_u16(a_ptr.add(x + 8))) + } else { + (alpha_u16, alpha_u16) + }; vst4q_u16( out.as_mut_ptr().add(x * 4), - uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, a_lo_v), ); vst4q_u16( out.as_mut_ptr().add(x * 4 + 32), - uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, a_hi_v), ); } else { vst3q_u16( @@ -3101,7 +3239,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
+ let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index 4a5f27c9..6d9eff83 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -3234,3 +3234,168 @@ fn neon_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- +// +// Mirrors the 4:4:4 u16 alpha-source pattern for the 4:2:0 family — +// high-bit BITS-generic (Yuva420p9 / Yuva420p10) and 16-bit +// (Yuva420p16). 8-bit Yuva420p has no u16 RGBA path. Pseudo-random +// alpha + per-arch direct kernel call so `vst4q_u16` lane order is +// exercised regardless of the dispatcher tier on the runner. + +fn check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_neon, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: 
bool, + alpha_seed: usize, +) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width / 2, 53); + let v = p16_plane_neon(width / 2, 71); + let a_src = p16_plane_neon(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_neon, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + // BITS = 9, 10 — full matrix sweep × natural width. + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>(16, m, full, 89); + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p_n_rgba_u16_matches_scalar_widths() { + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p_n_rgba_u16_matches_scalar_random_alpha() { + // Different alpha seeds — confirms alpha lane order through + // `vst4q_u16` doesn't collide with R/G/B. 
+ for seed in [13usize, 41, 127, 211] { + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>( + 16, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>( + 34, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence(16, m, full, 89); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence(16, ColorMatrix::Bt601, true, seed); + } +} diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 0d5fc17c..9992274e 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -613,8 +613,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -638,16 +638,57 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 YUVA 4:2:0 high-bit-depth → **native-depth `u16`** 
+/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (masked to BITS, no shift) instead of being the opaque +/// maximum `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "simd128")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared wasm simd128 high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = false, ALPHA_SRC = false`: 2× `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: 2× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = true, ALPHA_SRC = true`: 2× `write_rgba_u16_8` with the +/// alpha lanes loaded from `a_src` and masked to BITS. /// /// # Safety /// @@ -656,25 +697,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. + const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -745,9 +799,21 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -3465,8 +3537,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -3489,16 +3561,57 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 16-bit YUVA 4:2:0 → **native-depth `u16`** packed +/// RGBA with the 
per-pixel alpha element **sourced from `a_src`** +/// (full-range u16, no mask, no shift) instead of being constant +/// `0xFFFF`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "simd128")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared wasm simd128 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// - `ALPHA = true, ALPHA_SRC = true`: `write_rgba_u16_8` with the +/// alpha lane loaded from `a_src` (full-range u16). /// /// # Safety /// @@ -3507,23 +3620,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. + const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3615,7 +3737,15 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( ); if ALPHA { - write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + let a_v = if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 8 lanes directly. + v128_load(a_src.as_ref().unwrap_unchecked().as_ptr().add(x).cast()) + } else { + alpha_u16 + }; + write_rgba_u16_8(r_u16, g_u16, b_u16, a_v, out.as_mut_ptr().add(x * 4)); } else { write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); } @@ -3628,7 +3758,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
+ let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index 1a6e36a3..1bfbf67b 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -2777,3 +2777,146 @@ fn simd128_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- + +fn check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_wasm = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_wasm, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_wasm, + "wasm simd128 Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let a_src = p16_plane_wasm(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_wasm = std::vec![0u16; 
width * 4]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_wasm, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_wasm, + "wasm simd128 Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn simd128_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence::<9>(16, m, full, 89); + check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +fn simd128_yuva420p_n_rgba_u16_matches_scalar_widths() { + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Bt601, + false, + 89, + ); + check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); + } +} + +#[test] +fn simd128_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence(16, m, full, 89); + } + } +} + +#[test] +fn simd128_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence(16, ColorMatrix::Bt601, true, seed); + } +} diff 
--git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 9d893fbf..425609b3 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -662,8 +662,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -687,16 +687,57 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// AVX2 YUVA 4:2:0 high-bit-depth → **native-depth `u16`** packed RGBA +/// with the per-pixel alpha element **sourced from `a_src`** (masked +/// to BITS, no shift) instead of being the opaque maximum +/// `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "avx2")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared AVX2 high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8` per -/// 32-pixel block; `ALPHA = true` writes RGBA quads via 4× -/// `write_rgba_u16_8` with constant alpha `(1 << BITS) - 1`. 
+/// - `ALPHA = false, ALPHA_SRC = false`: 4× `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: 4× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = true, ALPHA_SRC = true`: 4× `write_rgba_u16_8` with the +/// alpha lanes loaded from `a_src` and masked to BITS. /// /// # Safety /// @@ -705,25 +746,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -812,33 +866,51 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row(a_lo), + _mm256_castsi256_si128(a_hi), + _mm256_extracti128_si256::<1>(a_hi), + ) + } else { + (alpha_u16, alpha_u16, alpha_u16, alpha_u16) + }; let dst = out.as_mut_ptr().add(x * 4); write_rgba_u16_8( _mm256_castsi256_si128(r_lo), _mm256_castsi256_si128(g_lo), _mm256_castsi256_si128(b_lo), - alpha_u16, + a0_v, dst, ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_lo), _mm256_extracti128_si256::<1>(g_lo), _mm256_extracti128_si256::<1>(b_lo), - alpha_u16, + a1_v, dst.add(32), ); write_rgba_u16_8( _mm256_castsi256_si128(r_hi), _mm256_castsi256_si128(g_hi), _mm256_castsi256_si128(b_hi), - alpha_u16, + a2_v, dst.add(64), ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_hi), _mm256_extracti128_si256::<1>(g_hi), _mm256_extracti128_si256::<1>(b_hi), - alpha_u16, + a3_v, dst.add(96), ); } else { @@ -878,7 +950,13 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -3743,8 +3821,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -3767,15 +3845,56 @@ pub(crate) unsafe 
fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// AVX2 16-bit YUVA 4:2:0 → **native-depth `u16`** packed RGBA with +/// the per-pixel alpha element **sourced from `a_src`** (full-range +/// u16, no mask, no shift) instead of being constant `0xFFFF`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "avx2")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared AVX2 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA -/// quads with constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: 2× `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: 2× `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// - `ALPHA = true, ALPHA_SRC = true`: 2× `write_rgba_u16_8` with the +/// alpha lanes loaded from `a_src` (full-range u16). /// /// # Safety /// @@ -3784,23 +3903,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. + const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3896,19 +4024,33 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( // Write 16 pixels via two 8-pixel helper calls. if ALPHA { + let (a_lo_v, a_hi_v) = if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 16 lanes (one + // __m256i = 32 bytes), split into two 128-bit halves. 
+ let a_vec = + _mm256_loadu_si256(a_src.as_ref().unwrap_unchecked().as_ptr().add(x).cast()); + ( + _mm256_castsi256_si128(a_vec), + _mm256_extracti128_si256::<1>(a_vec), + ) + } else { + (alpha_u16, alpha_u16) + }; let dst = out.as_mut_ptr().add(x * 4); write_rgba_u16_8( _mm256_castsi256_si128(r_u16), _mm256_castsi256_si128(g_u16), _mm256_castsi256_si128(b_u16), - alpha_u16, + a_lo_v, dst, ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_u16), _mm256_extracti128_si256::<1>(g_u16), _mm256_extracti128_si256::<1>(b_u16), - alpha_u16, + a_hi_v, dst.add(32), ); } else { @@ -3936,7 +4078,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). + let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index d574487a..01ce31ff 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -3043,3 +3043,148 @@ fn avx2_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- + +fn check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + 
scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let a_src = p16_plane_avx2(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn avx2_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence::<9>(32, m, full, 89); + check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>(32, m, full, 89); + } + } +} + +#[test] +fn avx2_yuva420p_n_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [32usize, 34, 46, 
62, 1920, 1922] { + check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); + check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +fn avx2_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence(16, m, full, 89); + } + } +} + +#[test] +fn avx2_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence(16, ColorMatrix::Bt601, true, seed); + } +} diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 71de14f1..f9afa112 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -676,8 +676,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -701,16 +701,57 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 YUVA 4:2:0 high-bit-depth → **native-depth `u16`** packed +/// RGBA with the per-pixel alpha element **sourced from `a_src`** +/// 
(masked to BITS, no shift) instead of being the opaque maximum +/// `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared AVX-512 high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via 8× `write_quarter` per -/// 64-pixel block; `ALPHA = true` writes RGBA quads via 8× -/// `write_quarter_rgba` with constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = false, ALPHA_SRC = false`: 8× `write_quarter` per 64-pixel block. +/// - `ALPHA = true, ALPHA_SRC = false`: 8× `write_quarter_rgba` with +/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = true, ALPHA_SRC = true`: 8× `write_quarter_rgba` with +/// the alpha quarters extracted from `a_src` (masked to BITS). /// /// # Safety /// @@ -719,25 +760,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. + const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -829,15 +883,41 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row(a_lo), + _mm512_extracti32x4_epi32::<1>(a_lo), + _mm512_extracti32x4_epi32::<2>(a_lo), + _mm512_extracti32x4_epi32::<3>(a_lo), + _mm512_extracti32x4_epi32::<0>(a_hi), + _mm512_extracti32x4_epi32::<1>(a_hi), + _mm512_extracti32x4_epi32::<2>(a_hi), + _mm512_extracti32x4_epi32::<3>(a_hi), + ) + } else { + ( + alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, + ) + }; let dst = out.as_mut_ptr().add(x * 4); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, 
dst.add(192)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + write_quarter_rgba(r_lo, g_lo, b_lo, a0, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, a1, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, a2, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, a3, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, a4, 0, dst.add(128)); + write_quarter_rgba(r_hi, g_hi, b_hi, a5, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, a6, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, a7, 3, dst.add(224)); } else { let dst = out.as_mut_ptr().add(x * 3); write_quarter(r_lo, g_lo, b_lo, 0, dst); @@ -859,7 +939,13 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -3863,8 +3949,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -3887,16 +3973,57 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 16-bit YUVA 4:2:0 → **native-depth `u16`** packed RGBA +/// with the per-pixel alpha element **sourced from `a_src`** +/// (full-range u16, no mask, no shift) instead of being constant +/// `0xFFFF`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared AVX-512 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_32`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_32` with -/// constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: `write_rgb_u16_32`. +/// - `ALPHA = true, ALPHA_SRC = false`: `write_rgba_u16_32` with +/// constant alpha `0xFFFF` (broadcast 128-bit lane). +/// - `ALPHA = true, ALPHA_SRC = true`: 4× `write_rgba_u16_8` with the +/// alpha quarters loaded from `a_src` (full-range u16, no shift). /// /// # Safety /// @@ -3905,23 +4032,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -4044,7 +4180,53 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( // Write 32 pixels via the appropriate 4× 8-pixel helper. if ALPHA { - write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 32 lanes (one + // __m512i = 64 bytes), split into four 128-bit quarters + // and inline the 4× write_rgba_u16_8 calls (the standard + // `write_rgba_u16_32` helper broadcasts a single alpha + // 128-bit lane to all 4 quarters, which doesn't fit the + // per-pixel-source-alpha case). 
+ let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); + let a_vec = _mm512_loadu_si512(a_ptr.add(x).cast()); + let a0 = _mm512_extracti32x4_epi32::<0>(a_vec); + let a1 = _mm512_extracti32x4_epi32::<1>(a_vec); + let a2 = _mm512_extracti32x4_epi32::<2>(a_vec); + let a3 = _mm512_extracti32x4_epi32::<3>(a_vec); + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm512_castsi512_si128(r_u16), + _mm512_castsi512_si128(g_u16), + _mm512_castsi512_si128(b_u16), + a0, + dst, + ); + write_rgba_u16_8( + _mm512_extracti32x4_epi32::<1>(r_u16), + _mm512_extracti32x4_epi32::<1>(g_u16), + _mm512_extracti32x4_epi32::<1>(b_u16), + a1, + dst.add(32), + ); + write_rgba_u16_8( + _mm512_extracti32x4_epi32::<2>(r_u16), + _mm512_extracti32x4_epi32::<2>(g_u16), + _mm512_extracti32x4_epi32::<2>(b_u16), + a2, + dst.add(64), + ); + write_rgba_u16_8( + _mm512_extracti32x4_epi32::<3>(r_u16), + _mm512_extracti32x4_epi32::<3>(g_u16), + _mm512_extracti32x4_epi32::<3>(b_u16), + a3, + dst.add(96), + ); + } else { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } } else { write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); } @@ -4058,7 +4240,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
+ let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index e1a25967..b3d6af0e 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -3116,3 +3116,148 @@ fn avx512_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- + +fn check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let a_src = p16_plane_avx512(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 
4]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn avx512_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>(64, m, full, 89); + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>(64, m, full, 89); + } + } +} + +#[test] +fn avx512_yuva420p_n_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [64usize, 66, 78, 94, 1920, 1922] { + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +fn avx512_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence(32, m, full, 89); + } + } +} + +#[test] +fn avx512_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [32usize, 34, 46, 62, 78, 94, 1920, 1922] { + 
check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence(32, ColorMatrix::Bt601, true, seed); + } +} diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 37da1bb7..a8935652 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1044,8 +1044,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -1069,16 +1069,57 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 YUVA 4:2:0 high-bit-depth → **native-depth `u16`** packed +/// RGBA with the per-pixel alpha element **sourced from `a_src`** +/// (already at the source's native bit depth — masked to BITS, no +/// shift) instead of being the opaque maximum `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "sse4.1")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared SSE4.1 high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = false, ALPHA_SRC = false`: `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = true, ALPHA_SRC = true`: `write_rgba_u16_8` with the +/// alpha lane loaded from `a_src` and masked to BITS. /// /// # Safety /// @@ -1087,25 +1128,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1181,14 +1235,20 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -3255,8 +3321,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -3279,15 +3345,56 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 16-bit YUVA 4:2:0 → **native-depth `u16`** packed RGBA with +/// the per-pixel alpha element **sourced from `a_src`** (full-range +/// u16, no mask, no shift) instead of being constant `0xFFFF`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared SSE4.1 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA -/// quads with constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// - `ALPHA = true, ALPHA_SRC = true`: `write_rgba_u16_8` with the +/// alpha lane loaded from `a_src` (full-range u16). /// /// # Safety /// @@ -3296,23 +3403,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3430,13 +3546,16 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( ); if ALPHA { - write_rgba_u16_8( - r_lo_u16, - g_lo_u16, - b_lo_u16, - alpha_u16, - out.as_mut_ptr().add(x * 4), - ); + let a_v = if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 8 lanes (16 bytes) + // directly, no mask or shift. + _mm_loadu_si128(a_src.as_ref().unwrap_unchecked().as_ptr().add(x).cast()) + } else { + alpha_u16 + }; + write_rgba_u16_8(r_lo_u16, g_lo_u16, b_lo_u16, a_v, out.as_mut_ptr().add(x * 4)); } else { write_rgb_u16_8(r_lo_u16, g_lo_u16, b_lo_u16, out.as_mut_ptr().add(x * 3)); } @@ -3449,7 +3568,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
+ let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index 1fe7dd2a..9afc1363 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -3108,3 +3108,148 @@ fn sse41_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- + +fn check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let a_src = p16_plane(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + 
scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn sse41_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>(16, m, full, 89); + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +fn sse41_yuva420p_n_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +fn sse41_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence(16, m, full, 89); + } + } +} + +#[test] +fn sse41_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [16usize, 18, 30, 34, 1920, 1922] { + 
check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence(16, ColorMatrix::Bt601, true, seed); + } +} diff --git a/src/row/mod.rs b/src/row/mod.rs index bea53e5e..83bf088b 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -5391,10 +5391,8 @@ pub fn yuva420p9_to_rgba_row( /// source's native bit depth) instead of being the opaque maximum /// `511`. /// -/// # ⚠ Scalar-only as of Ship 8b‑2a -/// -/// This dispatcher routes to scalar regardless of `use_simd`. SIMD -/// wiring lands in Ship 8b‑2c. +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p9_to_rgba_u16_row`]'s pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuva420p9_to_rgba_u16_row( @@ -5416,7 +5414,63 @@ pub fn yuva420p9_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8b-2c PR. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); @@ -5517,10 +5571,8 @@ pub fn yuva420p10_to_rgba_row( /// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the /// per-pixel alpha element is **sourced from `a`** at native depth. /// -/// # ⚠ Scalar-only as of Ship 8b‑2a -/// -/// This dispatcher routes to scalar regardless of `use_simd`. SIMD -/// wiring lands in Ship 8b‑2c. +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p10_to_rgba_u16_row`]'s pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuva420p10_to_rgba_u16_row( @@ -5542,7 +5594,63 @@ pub fn yuva420p10_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8b-2c PR. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); @@ -5642,10 +5750,8 @@ pub fn yuva420p16_to_rgba_row( /// packed **RGBA** — full-range output in `[0, 65535]`; the per-pixel /// alpha element is **sourced from `a`** at native depth (no shift). /// -/// # ⚠ Scalar-only as of Ship 8b‑2a -/// -/// This dispatcher routes to scalar regardless of `use_simd`. SIMD -/// wiring lands in Ship 8b‑2c. +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p16_to_rgba_u16_row`]'s pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuva420p16_to_rgba_u16_row( @@ -5667,7 +5773,63 @@ pub fn yuva420p16_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8b-2c PR. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); From df00980da9fb3374a34647c1e42a87eb4d534b04 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:05:04 +1200 Subject: [PATCH 2/6] refactor(row): split mod.rs into dispatch/* submodules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `src/row/mod.rs` had grown to 7276 lines, dominating the entire row crate-private surface. 
Split the public dispatchers into 7 sibling files under `src/row/dispatch/` grouped by source-format family for readability: - `dispatch/yuv420.rs` (~2700 lines): yuv_420 (8-bit) + yuv420p9/10/12/14/16 + p010/p012/p016 — RGB + RGBA - `dispatch/yuv444.rs` (~1330 lines): yuv_444 (8-bit) + yuv444p9/10/12/14/16 (BITS-generic helpers + per-bit-depth wrappers) — RGB + RGBA - `dispatch/nv.rs` (~630 lines): NV12 / NV21 / NV24 / NV42 — RGB + RGBA - `dispatch/pn.rs` (~800 lines): P410 / P412 / P416 (semi-planar 4:4:4) — RGB + RGBA - `dispatch/yuva.rs` (~845 lines): Yuva444p10 + the Yuva420p family (8-bit + 9 / 10 / 16-bit) — RGBA + u16 RGBA - `dispatch/rgb_ops.rs` (~170 lines): rgb_to_hsv_row, bgr_to_rgb_row, rgb_to_bgr_row - `dispatch/bayer.rs` (~160 lines): Bayer dispatchers `mod.rs` keeps: - Module-level doc + `pub(crate) mod arch / scalar` - `mod dispatch;` + `pub use dispatch::*::*` re-exports (the public API at `crate::row::*` is unchanged) - Shared dispatcher helpers (`rgb_row_bytes`, `rgba_row_bytes`, `rgb_row_elems`, `rgba_row_elems`, `uv_full_row_elems`, `assert_color_transform_well_formed`, `MAX_FUSED_TRANSFORM_ABS`) — bumped from `fn` (private) to `pub(crate)` so dispatch submodules can call them. - Runtime CPU feature detection (`neon_available`, `avx2_available`, `sse41_available`, `avx512_available`, `simd128_available`) — also bumped to `pub(crate)`. - Inline tests (`mod overflow_tests`, `mod bayer_dispatcher_tests`). mod.rs reduces from 7276 lines to 770 lines. The dispatcher function bodies were extracted byte-for-byte via `sed -n` — no semantic changes. The only edits were swapping `fn` → `pub(crate) fn` on shared helpers, adding per-file `use crate::row::*` imports for `scalar`, `arch`, helpers, and the CPU-detection helpers, plus the `pub use dispatch::*::*` re-exports in `mod.rs`. 
Verified across aarch64-apple-darwin, x86_64-unknown-freebsd, and wasm32-unknown-unknown: - `cargo check --lib --tests`: clean - `RUSTFLAGS=-Dwarnings cargo clippy --lib --tests`: clean - `cargo test --lib` (host): 629 passed (same as before) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/bayer.rs | 162 + src/row/dispatch/mod.rs | 17 + src/row/dispatch/nv.rs | 629 ++++ src/row/dispatch/pn.rs | 796 +++++ src/row/dispatch/rgb_ops.rs | 171 + src/row/dispatch/yuv420.rs | 2698 ++++++++++++++ src/row/dispatch/yuv444.rs | 1333 +++++++ src/row/dispatch/yuva.rs | 845 +++++ src/row/mod.rs | 6608 +---------------------------------- 9 files changed, 6702 insertions(+), 6557 deletions(-) create mode 100644 src/row/dispatch/bayer.rs create mode 100644 src/row/dispatch/mod.rs create mode 100644 src/row/dispatch/nv.rs create mode 100644 src/row/dispatch/pn.rs create mode 100644 src/row/dispatch/rgb_ops.rs create mode 100644 src/row/dispatch/yuv420.rs create mode 100644 src/row/dispatch/yuv444.rs create mode 100644 src/row/dispatch/yuva.rs diff --git a/src/row/dispatch/bayer.rs b/src/row/dispatch/bayer.rs new file mode 100644 index 00000000..4f45857f --- /dev/null +++ b/src/row/dispatch/bayer.rs @@ -0,0 +1,162 @@ +//! Bayer dispatchers (`bayer_to_rgb_row`, `bayer16_to_rgb_row`, +//! `bayer16_to_rgb_u16_row`) extracted from `row::mod` for organization. +//! +//! `use_simd` is currently a no-op for all Bayer paths — they route to +//! scalar regardless. Per-arch SIMD backends ship in a follow-up; the +//! parameter is wired through so callers don't have to touch their +//! call sites when SIMD lands. + +use crate::row::scalar; +use crate::row::{assert_color_transform_well_formed, rgb_row_bytes, rgb_row_elems}; + +/// Converts one row of an 8-bit Bayer plane to packed RGB. +/// +/// Dispatches to the best available backend for the current target. 
+/// See [`scalar::bayer_to_rgb_row`] for the full semantic specification +/// (bilinear demosaic geometry, edge handling, output layout). +/// +/// `above` / `mid` / `below` are row-aligned slices into the source +/// Bayer plane via the **mirror-by-2** boundary contract: at the +/// top edge the caller supplies `above = mid_row(1)`, at the bottom +/// edge `below = mid_row(h - 2)`; replicate fallback only when +/// `height < 2`. See [`crate::raw::BayerRow::above`] for the full +/// rationale (CFA-parity preservation across boundaries). +/// `above` / `mid` / `below` must all be the same length — that +/// length is the row's pixel width. +/// +/// `m` is the precomputed `CCM · diag(wb)` 3×3 transform. Every +/// element must be finite (not NaN, not ±∞); the dispatcher +/// asserts this at the boundary so future unsafe SIMD kernels can +/// trust the contract. +/// +/// `rgb_out` must have at least `3 * mid.len()` bytes. +/// +/// **`use_simd` is currently a no-op.** All Bayer paths run the +/// scalar reference today; per-arch SIMD backends (NEON / SSE4.1 / +/// AVX2 / AVX-512 / wasm simd128) ship in a follow-up. The +/// parameter is wired through `MixedSinker` and the public +/// dispatchers now so callers don't have to touch their call sites +/// when SIMD lands. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn bayer_to_rgb_row( + above: &[u8], + mid: &[u8], + below: &[u8], + row_parity: u32, + pattern: crate::raw::BayerPattern, + demosaic: crate::raw::BayerDemosaic, + m: &[[f32; 3]; 3], + rgb_out: &mut [u8], + _use_simd: bool, +) { + // Release-mode preflight: future unsafe SIMD backends will rely on + // these invariants for bounds-free pointer arithmetic, so we + // validate here rather than only via `debug_assert!` inside the + // scalar kernel. Same pattern as `yuv_420_to_rgb_row`. 
+ let width = mid.len(); + assert_eq!(above.len(), width, "above row length must match mid"); + assert_eq!(below.len(), width, "below row length must match mid"); + let rgb_min = rgb_row_bytes(width); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + assert_color_transform_well_formed(m); + + scalar::bayer_to_rgb_row(above, mid, below, row_parity, pattern, demosaic, m, rgb_out); +} + +/// Converts one row of a 10/12/14/16-bit **low-packed** Bayer +/// plane to packed `u8` RGB. +/// +/// `BITS` ∈ {10, 12, 14, 16}; samples are low-packed `u16` (active +/// values in the low `BITS` bits, range `[0, (1 << BITS) - 1]`). +/// Direct row-API callers are responsible for upholding the +/// low-packed contract; samples whose value exceeds +/// `(1 << BITS) - 1` produce defined-but-saturated output (no +/// panic, no UB). The walker +/// [`crate::raw::bayer16_to`] never sees out-of-range input +/// because [`crate::frame::BayerFrame16::try_new`] validates every +/// active sample at frame-construction time. +/// +/// `m` is the unscaled `CCM · diag(wb)` — the kernel bakes the +/// input→u8 rescale (`255 / ((1 << BITS) - 1)`) at output time. +/// `above` / `mid` / `below` must all be the same length; +/// `rgb_out` must have at least `3 * mid.len()` bytes. +/// +/// **`use_simd` is currently a no-op** (see +/// [`bayer_to_rgb_row`] for the deferred-SIMD note). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn bayer16_to_rgb_row<const BITS: usize>( + above: &[u16], + mid: &[u16], + below: &[u16], + row_parity: u32, + pattern: crate::raw::BayerPattern, + demosaic: crate::raw::BayerDemosaic, + m: &[[f32; 3]; 3], + rgb_out: &mut [u8], + _use_simd: bool, +) { + const { + assert!( + BITS == 10 || BITS == 12 || BITS == 14 || BITS == 16, + "bayer16_to_rgb_row: BITS must be 10, 12, 14, or 16" + ) + }; + let width = mid.len(); + assert_eq!(above.len(), width, "above row length must match mid"); + assert_eq!(below.len(), width, "below row length must match mid"); + let rgb_min = rgb_row_bytes(width); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + assert_color_transform_well_formed(m); + + scalar::bayer16_to_rgb_row::<BITS>(above, mid, below, row_parity, pattern, demosaic, m, rgb_out); +} + +/// Converts one row of a 10/12/14/16-bit **low-packed** Bayer +/// plane to packed `u16` RGB (also low-packed at `BITS`). +/// +/// `BITS` ∈ {10, 12, 14, 16}. Input and output share the same +/// low-packed range `[0, (1 << BITS) - 1]` per channel — no +/// rescale, just clamp. `above` / `mid` / `below` must all be the +/// same length; `rgb_out` must have at least `3 * mid.len()` `u16` +/// elements. +/// +/// Direct row-API callers are responsible for upholding the +/// low-packed contract — see [`bayer16_to_rgb_row`] for the +/// full rationale on the safe path +/// ([`crate::frame::BayerFrame16::try_new`] + [`crate::raw::bayer16_to`]) +/// vs. the direct row API. +/// +/// **`use_simd` is currently a no-op** (see +/// [`bayer_to_rgb_row`] for the deferred-SIMD note). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn bayer16_to_rgb_u16_row<const BITS: usize>( + above: &[u16], + mid: &[u16], + below: &[u16], + row_parity: u32, + pattern: crate::raw::BayerPattern, + demosaic: crate::raw::BayerDemosaic, + m: &[[f32; 3]; 3], + rgb_out: &mut [u16], + _use_simd: bool, +) { + const { + assert!( + BITS == 10 || BITS == 12 || BITS == 14 || BITS == 16, + "bayer16_to_rgb_u16_row: BITS must be 10, 12, 14, or 16" + ) + }; + let width = mid.len(); + assert_eq!(above.len(), width, "above row length must match mid"); + assert_eq!(below.len(), width, "below row length must match mid"); + let rgb_min = rgb_row_elems(width); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + assert_color_transform_well_formed(m); + + scalar::bayer16_to_rgb_u16_row::<BITS>( + above, mid, below, row_parity, pattern, demosaic, m, rgb_out, + ); +} diff --git a/src/row/dispatch/mod.rs b/src/row/dispatch/mod.rs new file mode 100644 index 00000000..864123bd --- /dev/null +++ b/src/row/dispatch/mod.rs @@ -0,0 +1,17 @@ +//! Public row-dispatcher submodules. The dispatchers were extracted +//! from `row::mod` here so the parent module stays focused on +//! shared helpers, runtime CPU feature detection, and crate-private +//! `arch` / `scalar` glue. +//! +//! Submodules are gated `pub(super) mod` and re-exported via +//! `pub use` in `row::mod`, so the public API still appears at +//! `crate::row::*` (e.g. `crate::row::yuv_420_to_rgb_row`). Callers +//! see no API change from the split. + +pub(super) mod bayer; +pub(super) mod nv; +pub(super) mod pn; +pub(super) mod rgb_ops; +pub(super) mod yuv420; +pub(super) mod yuv444; +pub(super) mod yuva; diff --git a/src/row/dispatch/nv.rs b/src/row/dispatch/nv.rs new file mode 100644 index 00000000..b342e6e4 --- /dev/null +++ b/src/row/dispatch/nv.rs @@ -0,0 +1,629 @@ +//! NV-family dispatchers (NV12 / NV21 / NV24 / NV42, both RGB and +//! RGBA outputs) extracted from `row::mod` for organization. 
+ +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +/// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. +/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only +/// difference is UV source — NV12 delivers U and V interleaved in a +/// single `width`‑byte row (`U0, V0, U1, V1, …`). See +/// `scalar::nv12_to_rgb_row` for the reference implementation. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv12_to_rgb_row( + y: &[u8], + uv_half: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary (see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 3` multiplication). + assert_eq!(width & 1, 0, "NV12 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (checked above). + unsafe { + arch::neon::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. 
+ unsafe { + arch::x86_avx512::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present. + unsafe { + arch::x86_avx2::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + unsafe { + arch::x86_sse41::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: `simd128_available()` verified simd128 is on at + // compile time (WASM has no runtime CPU detection). + unsafe { + arch::wasm_simd128::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to +/// packed RGB. +/// +/// Same numerical contract as [`nv12_to_rgb_row`]; the only +/// difference is chroma byte order — NV21 stores `V0, U0, V1, U1, …` +/// instead of NV12's `U0, V0, U1, V1, …`. See `scalar::nv21_to_rgb_row` +/// for the reference implementation. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv21_to_rgb_row( + y: &[u8], + vu_half: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary. + assert_eq!(width & 1, 0, "NV21 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(vu_half.len() >= width, "vu_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of NV12 (semi‑planar 4:2:0) to packed **RGBA** +/// (8-bit). Same numerical contract as [`nv12_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgba_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. 
+ assert_eq!(width & 1, 0, "NV12 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to +/// packed **RGBA** (8-bit). Same numerical contract as +/// [`nv21_to_rgb_row`]; alpha defaults to `0xFF` (opaque). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "NV21 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(vu_half.len() >= width, "vu_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV24 (semi‑planar 4:4:4, UV‑ordered) to packed +/// RGB. Dispatches to the best available SIMD backend for the current +/// target (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), falling +/// back to scalar when no backend is available. +/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference +/// from NV12 is 4:4:4 chroma — one UV pair per Y pixel, no chroma +/// upsampling, and no width parity constraint. See +/// `scalar::nv24_to_rgb_row` for the reference implementation. 
+/// +/// `use_simd = false` forces the scalar reference path, bypassing any +/// SIMD backend. Benchmarks can flip this to compare scalar vs SIMD +/// directly on the same input; production code should pass `true`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv24_to_rgb_row( + y: &[u8], + uv: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + // NV24 chroma carries one UV pair per pixel = `2 * width` bytes. + // Use `checked_mul` — on 32-bit targets, `2 * width` can overflow + // `usize` at extreme widths and silently short-circuit the length + // check before entering unsafe SIMD paths. + let uv_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(uv.len() >= uv_min, "uv row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); +} + +/// Converts one row of NV42 (semi‑planar 4:4:4, VU‑ordered) to packed +/// RGB. Same as [`nv24_to_rgb_row`] but with swapped chroma byte order. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv42_to_rgb_row( + y: &[u8], + vu: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + let vu_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(vu.len() >= vu_min, "vu row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); +} + +/// Converts one row of NV24 (semi‑planar 4:4:4, UV-ordered) to packed +/// **RGBA** (8-bit). Same numerical contract as [`nv24_to_rgb_row`]; +/// alpha defaults to `0xFF` (opaque). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(uv.len() >= uv_min, "uv row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV42 (semi‑planar 4:4:4, VU-ordered) to packed +/// **RGBA** (8-bit). Same as [`nv24_to_rgba_row`] but with swapped +/// chroma byte order. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let vu_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(vu.len() >= vu_min, "vu row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs new file mode 100644 index 00000000..f2a143c4 --- /dev/null +++ b/src/row/dispatch/pn.rs @@ -0,0 +1,796 @@ +//! Semi-planar 4:4:4 (P410 / P412 / P416) dispatchers — RGB + RGBA +//! for both 8-bit and native-depth `u16` outputs. Extracted from +//! `row::mod` for organization. +//! +//! Internal `pub(crate)` helpers `p_n_444_to_rgb_row` / +//! `p_n_444_to_rgb_u16_row` provide the BITS-generic dispatch shared +//! by P410/P412 (`BITS = 10/12`); P416 has its own dedicated kernels +//! (full u16 range; the BITS-generic path doesn't apply). +//! +//! P010 / P012 / P016 (semi-planar 4:2:0) live in `dispatch::yuv420` +//! since they share the 4:2:0 chroma layout with the planar +//! yuv420p9/10/12/14/16 family. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, uv_full_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +// ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- +// +// Same shape as the 4:2:0 / 4:2:2 P-family kernels but with full-width +// interleaved UV (one `U, V` pair per pixel = `2 * width` u16 elements +// per row). BITS ∈ {10, 12} run on the const-generic Q15 i32 family; +// BITS = 16 runs on the dedicated parallel i64-chroma family +// (chroma multiply-add overflows i32 at 16-bit u16 output). + +/// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **u8** RGB +/// dispatcher. 
Const-generic over `BITS`; dispatches to the best +/// available backend (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), +/// falling back to scalar when no SIMD backend is available or +/// `use_simd` is false. +/// +/// Crate-private — public consumers go through the per-format +/// dispatchers (`p410_to_rgb_row`, `p412_to_rgb_row`) which fix +/// `BITS` to a literal. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn p_n_444_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX-512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile-time verified. 
+ unsafe { + arch::wasm_simd128::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → native-depth **u16** +/// RGB dispatcher. Output is low-bit-packed (active bits in low +/// `BITS` of each `u16`). Same dispatch shape as +/// [`p_n_444_to_rgb_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn p_n_444_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB dispatcher. +/// Y stays on i32 (output-range scaling keeps `coeff × u_d` within +/// i32 for u8 output); chroma multiply-add also stays on i32. +/// Dedicated entry point because the Q15 const-generic family is +/// pinned to BITS ∈ {10, 12}. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// P416 → native-depth **u16** RGB dispatcher (`[0, 65535]`). Chroma +/// multiply-add runs on i64 (overflow safety at 16-bit u16 output); +/// see scalar reference for the rationale. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// P410 → packed u8 RGB. Thin wrapper at `BITS = 10`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p_n_444_to_rgb_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); +} + +/// P410 → native-depth u16 RGB (10-bit low-packed output). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p_n_444_to_rgb_u16_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); +} + +/// P412 → packed u8 RGB. Thin wrapper at `BITS = 12`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p_n_444_to_rgb_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); +} + +/// P412 → native-depth u16 RGB (12-bit low-packed output). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p_n_444_to_rgb_u16_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); +} + +/// P410 (semi-planar 4:4:4, 10-bit high-packed) → packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P410 → **native-depth `u16`** packed **RGBA** — output is +/// low-bit-packed (`[0, 1023]`); alpha element is `1023`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P412 (semi-planar 4:4:4, 12-bit high-packed) → packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P412 → **native-depth `u16`** packed **RGBA** — output is +/// low-bit-packed (`[0, 4095]`); alpha element is `4095`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit** **RGBA** +/// (`R, G, B, 0xFF`). Routes through the dedicated 16-bit scalar +/// kernel (`scalar::p_n_444_16_to_rgba_row`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P416 → **native-depth `u16`** packed **RGBA** — full-range output +/// `[0, 65535]`; alpha element is `0xFFFF`. Routes through the +/// dedicated 16-bit u16-output scalar kernel +/// (`scalar::p_n_444_16_to_rgba_u16_row`) — i64 chroma multiply. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); +} + diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs new file mode 100644 index 00000000..c51257d8 --- /dev/null +++ b/src/row/dispatch/rgb_ops.rs @@ -0,0 +1,171 @@ +//! RGB→HSV and BGR↔RGB swap dispatchers extracted from `row::mod` for +//! organization. All three route through the standard +//! `cfg_select!` per-arch block; `use_simd = false` forces scalar. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; + +/// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit +/// encoding). See `scalar::rgb_to_hsv_row` for semantics. +/// +/// `use_simd = false` forces the scalar reference path, bypassing any +/// SIMD backend (same semantics as `yuv_420_to_rgb_row`). +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb_to_hsv_row( + rgb: &[u8], + h_out: &mut [u8], + s_out: &mut [u8], + v_out: &mut [u8], + width: usize, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary (see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 3` multiplication). 
+ let rgb_min = rgb_row_bytes(width); + assert!(rgb.len() >= rgb_min, "rgb row too short"); + assert!(h_out.len() >= width, "h_out row too short"); + assert!(s_out.len() >= width, "s_out row too short"); + assert!(v_out.len() >= width, "v_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + }, + _ => { + // Targets without a SIMD HSV backend fall through to scalar. + } + } + } + + scalar::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); +} + +/// Rewrites a row of packed BGR to packed RGB by swapping the outer +/// two channels (byte 0 ↔ byte 2) of every triple. `input` and +/// `output` must not alias. +/// +/// The underlying transformation is self‑inverse, so +/// [`rgb_to_bgr_row`] shares the same implementation — use whichever +/// name reads more naturally at the call site. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr_to_rgb_row(bgr: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { + swap_rb_channels_row(bgr, rgb_out, width, use_simd); +} + +/// Rewrites a row of packed RGB to packed BGR by swapping the outer +/// two channels. See [`bgr_to_rgb_row`] — this is an alias that reads +/// more naturally for the opposite direction. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb_to_bgr_row(rgb: &[u8], bgr_out: &mut [u8], width: usize, use_simd: bool) { + swap_rb_channels_row(rgb, bgr_out, width, use_simd); +} + +/// Shared dispatcher behind `bgr_to_rgb_row` / `rgb_to_bgr_row`. +#[cfg_attr(not(tarpaulin), inline(always))] +fn swap_rb_channels_row(input: &[u8], output: &mut [u8], width: usize, use_simd: bool) { + // Runtime asserts at the dispatcher boundary (see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 3` multiplication). + let rgb_min = rgb_row_bytes(width); + assert!(input.len() >= rgb_min, "input row too short"); + assert!(output.len() >= rgb_min, "output row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::bgr_rgb_swap_row(input, output, width); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::bgr_rgb_swap_row(input, output, width); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 just verified. + unsafe { + arch::x86_avx2::bgr_rgb_swap_row(input, output, width); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 just verified. + unsafe { + arch::x86_sse41::bgr_rgb_swap_row(input, output, width); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::bgr_rgb_swap_row(input, output, width); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::bgr_rgb_swap_row(input, output, width); +} diff --git a/src/row/dispatch/yuv420.rs b/src/row/dispatch/yuv420.rs new file mode 100644 index 00000000..8f34dca1 --- /dev/null +++ b/src/row/dispatch/yuv420.rs @@ -0,0 +1,2698 @@ +//! YUV 4:2:0 dispatchers (planar and P010/P012/P016 semi-planar) — +//! 8-bit YUV → RGB/RGBA, 9/10/12/14/16-bit planar yuv420p_n RGB+RGBA, +//! P010/P012/P016 semi-planar RGB+RGBA. Extracted from `row::mod` for +//! organization. +//! +//! All dispatchers route through the standard `cfg_select!` per-arch +//! block; `use_simd = false` forces scalar. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +/// Converts one row of 4:2:0 YUV to packed RGB. +/// +/// Dispatches to the best available backend for the current target. +/// See `scalar::yuv_420_to_rgb_row` for the full semantic +/// specification (range handling, matrix definitions, output layout). +/// +/// `use_simd = false` forces the scalar reference path, bypassing any +/// SIMD backend. Benchmarks flip this to compare scalar vs SIMD +/// directly on the same input; production code should pass `true`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgb_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary. 
The unsafe SIMD + // kernels below rely on these invariants for bounds‑free pointer + // arithmetic, so we validate in *release* builds too — not just + // under `debug_assert!`. Kernels keep their own `debug_assert!`s as + // internal sanity checks. + // + // `rgb_min` uses `checked_mul` because `3 * width` can wrap `usize` + // on 32‑bit targets (wasm32, i686) for extreme widths. Without the + // guard, a wrapped product could admit an undersized `rgb_out` and + // let the scalar loop's `x * 3` indexing or a SIMD kernel's + // pointer arithmetic run off the end. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference); they are checked + // with `debug_assert` in debug builds. + unsafe { + arch::neon::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + // Bounds / parity invariants are the caller's obligation. + unsafe { + arch::x86_avx512::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference); they are checked + // with `debug_assert` in debug builds. 
+ unsafe { + arch::x86_avx2::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + // Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference). + unsafe { + arch::x86_sse41::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + // Future x86_64 tiers (avx512 promoted above AVX2, ssse3 below + // SSE4.1) slot in here, each branch guarded by the matching + // `is_x86_feature_detected!` / `cfg!(target_feature = ...)` pair. + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: `simd128_available()` (compile‑time + // `cfg!(target_feature = "simd128")`) verified that simd128 + // is on. WASM has no runtime detection — the module's SIMD + // support is fixed at produce‑time. Bounds / parity + // invariants are the caller's obligation. + unsafe { + arch::wasm_simd128::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend (riscv64, powerpc, …) fall + // through to the scalar path below. + } + } + } + + scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). +/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). The first three bytes per pixel are +/// byte-identical to what [`yuv_420_to_rgb_row`] would write. +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the +/// scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present. + unsafe { + arch::x86_avx2::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + unsafe { + arch::x86_sse41::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time availability verified. 
+ unsafe { + arch::wasm_simd128::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 9 active bits in the low bits of each +/// element. Niche format (AVC High 9 profile only). Reuses the same +/// `yuv_420p_n_to_rgb_row` kernel family as 10/12/14-bit; the +/// only per-call difference is the const-generic `BITS = 9` which +/// fixes the AND-mask to `0x1FF` and the Q15 scale via +/// `range_params_n::<9, 8>`. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 10 active bits in the low bits of each +/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), +/// with the conversion clamping to `[0, 255]` — the native‑depth +/// path is [`yuv420p10_to_rgb_u16_row`]. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed +/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, +/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless +/// downstream HDR processing when the consumer expects low‑bit‑packed +/// samples. +/// +/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` +/// elements, each in `[0, 1023]` with the upper 6 bits zero. +/// +/// This is **not** the FFmpeg `p010` layout — `p010` stores samples +/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers +/// feeding this output into a p010 consumer must shift left by 6 +/// before handing off. +/// +/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ +/// packed — 10 active bits in the high 10 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// This is the HDR hardware‑decode keystone format: VideoToolbox, +/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit +/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic +/// specification. `use_simd = false` forces the scalar reference. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P010** to **native‑depth `u16`** packed RGB +/// (10 active bits in the **low** 10 of each output `u16`, matching +/// `yuv420p10le` convention — **not** the P010 high‑bit packing). +/// Callers feeding this output into a P010 consumer must shift left +/// by 6. +/// +/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. 
+/// `use_simd = false` forces the scalar reference. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 12 active bits in the low 12 bits of each +/// element (low‑bit‑packed `yuv420p12le` convention). 
Output is packed +/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The +/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching +/// `yuv420p12le` convention — upper 4 bits zero). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (14‑bit values in the low 14 of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ +/// packed — 12 active bits in the high 12 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and +/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but +/// `>> 4` instead of `>> 6` at each `u16` load. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** to **native‑depth `u16`** packed RGB +/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed +/// `yuv420p12le` convention, **not** P012's high‑bit packing). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. +/// +/// Samples are `u16` over the full 16-bit range (`[0, 65535]`). Runs +/// on the **i64 chroma** kernel family; see +/// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** +/// packed `u16` RGB (full-range output in `[0, 65535]`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to +/// packed **8-bit** RGB. At 16 bits there is no high-bit-packed +/// vs. low-bit-packed distinction (all bits are active). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P016 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed RGB +/// (full-range output in `[0, 65535]`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P016 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); +} +// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces +// the scalar reference path on every dispatcher. + +/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p9_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p10_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 10) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p12_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. 
See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p14_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. 
Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 12) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// Routes through the dedicated 16-bit scalar kernel +/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient +/// for u8 output even at 16-bit input. `use_simd = false` forces the +/// scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element +/// is `0xFFFF` (opaque maximum at 16-bit). +/// +/// Routes through the dedicated 16-bit u16-output scalar kernel +/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply +/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit +/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. +/// +/// Routes through the dedicated 16-bit P016 scalar kernel +/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar +/// reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed +/// **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. +/// +/// Routes through the dedicated 16-bit u16-output P016 scalar kernel +/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444.rs b/src/row/dispatch/yuv444.rs new file mode 100644 index 00000000..5bc3a960 --- /dev/null +++ b/src/row/dispatch/yuv444.rs @@ -0,0 +1,1333 @@ +//! YUV 4:4:4 dispatchers (planar 8-bit + high-bit 9/10/12/14/16-bit) +//! — RGB + RGBA. Extracted from `row::mod` for organization. +//! +//! Internal `pub(crate)` helpers `yuv_444p_n_to_rgb_row` / +//! `yuv_444p_n_to_rgb_u16_row` provide the BITS-generic dispatch +//! 
shared by 9/10/12/14-bit; 16-bit gets its own dedicated kernels. +//! +//! All dispatchers route through the standard `cfg_select!` per-arch +//! block; `use_simd = false` forces scalar. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +/// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches +/// to the best available SIMD backend for the current target. +/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference +/// is 4:4:4 chroma — one U / V pair per Y pixel, full-width chroma +/// planes, no chroma upsampling, no width parity constraint. See +/// `scalar::yuv_444_to_rgb_row` for the reference implementation. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgb_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX-512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); +} + +/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). +/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. +/// `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    unsafe {
+                        arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    unsafe {
+                        arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    unsafe {
+                        arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    unsafe {
+                        arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    unsafe {
+                        arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
+
+    scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+}
+
+/// YUV 4:4:4 planar 9/10/12/14-bit → **u8** RGB dispatcher. Const
+/// generic over `BITS ∈ {9, 10, 12, 14}`. Dispatches to the best
+/// available backend for the current target (NEON / SSE4.1 / AVX2 /
+/// AVX-512 / wasm simd128), falling back to scalar when no SIMD
+/// backend is available or `use_simd` is false.
+///
+/// Crate-private — external callers use the concrete
+/// [`yuv444p9_to_rgb_row`] / [`yuv444p10_to_rgb_row`] / [`yuv444p12_to_rgb_row`] /
+/// [`yuv444p14_to_rgb_row`] wrappers, which pin `BITS` to a
+/// supported value. This avoids the 16-bit footgun (`(1 << 16) - 1`
+/// truncates to `-1` when cast to `i16` in the SIMD clamp), and
+/// matches the [`yuv420p10_to_rgb_row`] family's convention of
+/// keeping the `<BITS>` generic internal.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn yuv_444p_n_to_rgb_row<const BITS: usize>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    let rgb_min = rgb_row_bytes(width);
+    assert!(y.len() >= width, "y row too short");
+    assert!(u.len() >= width, "u row too short");
+    assert!(v.len() >= width, "v row too short");
+    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
+
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX‑512BW verified.
+                    unsafe {
+                        arch::x86_avx512::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile‑time verified.
+                    unsafe {
+                        arch::wasm_simd128::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
+
+    scalar::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+}
+
+/// YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB dispatcher.
+/// Const generic over `BITS ∈ {9, 10, 12, 14}`. Low-bit-packed output.
+/// Dispatches to the best available backend (NEON / SSE4.1 / AVX2 /
+/// AVX-512 / wasm simd128), falling back to scalar when no SIMD
+/// backend is available or `use_simd` is false.
+///
+/// Crate-private — see the note on [`yuv_444p_n_to_rgb_row`]. The
+/// 16-bit path is [`yuv444p16_to_rgb_u16_row`], which uses a
+/// dedicated i64-chroma kernel family.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn yuv_444p_n_to_rgb_u16_row<const BITS: usize>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    let rgb_min = rgb_row_elems(width);
+    assert!(y.len() >= width, "y row too short");
+    assert!(u.len() >= width, "u row too short");
+    assert!(v.len() >= width, "v row too short");
+    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
+
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX‑512BW verified.
+                    unsafe {
+                        arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile‑time verified.
+                    unsafe {
+                        arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
+
+    scalar::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+}
+
+/// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the
+/// crate-internal `yuv_444p_n_to_rgb_row::<9>`.
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the +/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 12-bit → u8 RGB. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 14-bit → u8 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the +/// parallel 16-bit kernel family (same Q15 i32 output-range pipeline +/// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range +/// output in `[0, 65535]`). Widens chroma multiply-add + Y scale to +/// i64 to avoid i32 overflow at 16-bit limited range. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); +} +// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces +// the scalar reference path on every dispatcher. 
+ +/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the +/// per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_444p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha +/// element is `1023`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha +/// element is `4095`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha +/// element is `16383`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit +/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar +/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuva.rs b/src/row/dispatch/yuva.rs new file mode 100644 index 00000000..90399881 --- /dev/null +++ b/src/row/dispatch/yuva.rs @@ -0,0 +1,845 @@ +//! YUVA dispatchers — Yuva444p10 + the Yuva420p family +//! (Yuva420p / Yuva420p9 / Yuva420p10 / Yuva420p16) for both 8-bit +//! RGBA and native-depth `u16` RGBA outputs. Extracted from +//! `row::mod` for organization. + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +// ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- +// +// Per-row dispatchers for the YUVA source family (currently Yuva444p10 +// only). Both the u8 RGBA dispatcher (`yuva444p10_to_rgba_row`) and +// the u16 RGBA dispatcher (`yuva444p10_to_rgba_u16_row`) route through +// per-arch `yuv_444p_n_to_rgba*_with_alpha_src_row` SIMD wrappers, +// mirroring the `yuv444p10_to_rgba_row` / `yuv444p10_to_rgba_u16_row` +// dispatchers' patterns. + +/// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit** +/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family +/// that backs [`yuv444p10_to_rgba_row`]; the per-pixel alpha byte is +/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) +/// instead of being constant `0xFF`. 
+/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv444p10_to_rgba_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva444p10_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **10-bit** YUVA 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the +/// per-pixel alpha element is **sourced from `a`** (already at the +/// source's native bit depth) instead of being the opaque maximum +/// `1023`. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv444p10_to_rgba_u16_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva444p10_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); +} + +// ---- YUVA 4:2:0 RGBA dispatchers -------------------------------------- +// +// Per-row dispatchers for the YUVA 4:2:0 source family — Yuva420p +// (8-bit) plus Yuva420p9 / Yuva420p10 / Yuva420p16. The u8 RGBA +// dispatchers route through per-arch +// `yuv_420*_to_rgba*_with_alpha_src_row` SIMD wrappers (Ship 8b-2b), +// mirroring the non-alpha sibling dispatchers' `cfg_select!` blocks. +// The native-depth `u16` RGBA dispatchers below remain scalar pending +// Ship 8b-2c. + +/// Converts one row of 8‑bit YUVA 4:2:0 to packed **8‑bit** **RGBA**. +/// R / G / B are produced by the same Q15 i32 8‑bit kernel that backs +/// [`yuv_420_to_rgba_row`]; the per-pixel alpha byte is **sourced +/// from `a`** (one byte per pixel, full-width — alpha is at luma +/// resolution in 4:2:0, only chroma is subsampled). +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv_420_to_rgba_row`]'s pattern. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + a: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **9‑bit** YUVA 4:2:0 to packed **8‑bit** +/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family +/// that backs [`yuv420p9_to_rgba_row`]; the per-pixel alpha byte is +/// **sourced from `a`** (depth-converted via `a >> 1` to fit `u8`) +/// instead of being constant `0xFF`. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p9_to_rgba_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **9‑bit** YUVA 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 511]`); the +/// per-pixel alpha element is **sourced from `a`** (already at the +/// source's native bit depth) instead of being the opaque maximum +/// `511`. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p9_to_rgba_u16_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **10‑bit** YUVA 4:2:0 to packed **8‑bit** +/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family +/// that backs [`yuv420p10_to_rgba_row`]; the per-pixel alpha byte is +/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) +/// instead of being constant `0xFF`. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p10_to_rgba_row`]'s pattern. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **10‑bit** YUVA 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the +/// per-pixel alpha element is **sourced from `a`** at native depth. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p10_to_rgba_u16_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **16‑bit** YUVA 4:2:0 to packed **8‑bit** +/// **RGBA**. R / G / B are produced by the same i32 kernel that backs +/// [`yuv420p16_to_rgba_row`]; the per-pixel alpha byte is **sourced +/// from `a`** (depth-converted via `a >> 8` to fit `u8`). +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p16_to_rgba_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **16‑bit** YUVA 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — full-range output in `[0, 65535]`; the per-pixel +/// alpha element is **sourced from `a`** at native depth (no shift). +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p16_to_rgba_u16_row`]'s pattern. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} diff --git a/src/row/mod.rs b/src/row/mod.rs index 83bf088b..97704767 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -30,6398 +30,45 @@ //! //! Dispatcher `cfg_select!` requires Rust 1.95+ (stable, in the core //! prelude — no import needed). The crate's MSRV matches. +//! +//! # Submodule layout +//! +//! Public dispatchers are split across `dispatch::*` submodules by +//! source format family for readability — `yuv420` / `yuv444` / `nv` / +//! `pn` / `yuva` / `rgb_ops` / `bayer`. They are re-exported as +//! `pub use dispatch::*::*` here so the public API stays at +//! `crate::row::*` (e.g. `crate::row::yuv_420_to_rgb_row`). Callers +//! see no API change from the split. -pub(crate) mod arch; -pub(crate) mod scalar; - -// Re-exported only when a caller is compiled. The `MixedSinker` Strategy A -// fan-out is the sole consumer, and it lives in `crate::sinker::mixed` which -// is gated on `feature = "std"` / `feature = "alloc"` (needs `Vec`). Without -// either feature both this re-export and the underlying scalar function would -// be unused, which is a hard error under `cargo clippy -- -D warnings`. -#[cfg(any(feature = "std", feature = "alloc"))] -pub(crate) use scalar::expand_rgb_to_rgba_row; -#[cfg(any(feature = "std", feature = "alloc"))] -pub(crate) use scalar::expand_rgb_u16_to_rgba_u16_row; - -use crate::ColorMatrix; - -/// Converts one row of 4:2:0 YUV to packed RGB. -/// -/// Dispatches to the best available backend for the current target. -/// See `scalar::yuv_420_to_rgb_row` for the full semantic -/// specification (range handling, matrix definitions, output layout). 
-/// -/// `use_simd = false` forces the scalar reference path, bypassing any -/// SIMD backend. Benchmarks flip this to compare scalar vs SIMD -/// directly on the same input; production code should pass `true`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_420_to_rgb_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary. The unsafe SIMD - // kernels below rely on these invariants for bounds‑free pointer - // arithmetic, so we validate in *release* builds too — not just - // under `debug_assert!`. Kernels keep their own `debug_assert!`s as - // internal sanity checks. - // - // `rgb_min` uses `checked_mul` because `3 * width` can wrap `usize` - // on 32‑bit targets (wasm32, i686) for extreme widths. Without the - // guard, a wrapped product could admit an undersized `rgb_out` and - // let the scalar loop's `x * 3` indexing or a SIMD kernel's - // pointer arithmetic run off the end. - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present on this - // CPU. Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference); they are checked - // with `debug_assert` in debug builds. - unsafe { - arch::neon::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. 
- // Bounds / parity invariants are the caller's obligation. - unsafe { - arch::x86_avx512::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present on this - // CPU. Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference); they are checked - // with `debug_assert` in debug builds. - unsafe { - arch::x86_avx2::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - // Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference). - unsafe { - arch::x86_sse41::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - // Future x86_64 tiers (avx512 promoted above AVX2, ssse3 below - // SSE4.1) slot in here, each branch guarded by the matching - // `is_x86_feature_detected!` / `cfg!(target_feature = ...)` pair. - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: `simd128_available()` (compile‑time - // `cfg!(target_feature = "simd128")`) verified that simd128 - // is on. WASM has no runtime detection — the module's SIMD - // support is fixed at produce‑time. Bounds / parity - // invariants are the caller's obligation. - unsafe { - arch::wasm_simd128::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => { - // Targets without a SIMD backend (riscv64, powerpc, …) fall - // through to the scalar path below. - } - } - } - - scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). 
-/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel — sources without an alpha plane -/// produce opaque output). The first three bytes per pixel are -/// byte-identical to what [`yuv_420_to_rgb_row`] would write. -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the -/// scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_420_to_rgba_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary — see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 4` multiplication via [`rgba_row_bytes`]. - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present. 
- unsafe { - arch::x86_avx2::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - unsafe { - arch::x86_sse41::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time availability verified. - unsafe { - arch::wasm_simd128::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. -/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only -/// difference is UV source — NV12 delivers U and V interleaved in a -/// single `width`‑byte row (`U0, V0, U1, V1, …`). See -/// `scalar::nv12_to_rgb_row` for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv12_to_rgb_row( - y: &[u8], - uv_half: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary (see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 3` multiplication). - assert_eq!(width & 1, 0, "NV12 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present on this - // CPU. 
Bounds / parity invariants are the caller's obligation - // (checked above). - unsafe { - arch::neon::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present. - unsafe { - arch::x86_avx2::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - unsafe { - arch::x86_sse41::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: `simd128_available()` verified simd128 is on at - // compile time (WASM has no runtime CPU detection). - unsafe { - arch::wasm_simd128::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to -/// packed RGB. -/// -/// Same numerical contract as [`nv12_to_rgb_row`]; the only -/// difference is chroma byte order — NV21 stores `V0, U0, V1, U1, …` -/// instead of NV12's `U0, V0, U1, V1, …`. See `scalar::nv21_to_rgb_row` -/// for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv21_to_rgb_row( - y: &[u8], - vu_half: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary. 
- assert_eq!(width & 1, 0, "NV21 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(vu_half.len() >= width, "vu_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of NV12 (semi‑planar 4:2:0) to packed **RGBA** -/// (8-bit). Same numerical contract as [`nv12_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel — sources without an alpha plane -/// produce opaque output). -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv12_to_rgba_row( - y: &[u8], - uv_half: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary — see - // [`yuv_420_to_rgba_row`] for rationale, including the checked - // `width × 4` multiplication via [`rgba_row_bytes`]. - assert_eq!(width & 1, 0, "NV12 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to -/// packed **RGBA** (8-bit). 
Same numerical contract as -/// [`nv21_to_rgb_row`]; alpha defaults to `0xFF` (opaque). -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv21_to_rgba_row( - y: &[u8], - vu_half: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "NV21 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(vu_half.len() >= width, "vu_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of NV24 (semi‑planar 4:4:4, UV‑ordered) to packed -/// RGB. Dispatches to the best available SIMD backend for the current -/// target (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), falling -/// back to scalar when no backend is available. 
-/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference -/// from NV12 is 4:4:4 chroma — one UV pair per Y pixel, no chroma -/// upsampling, and no width parity constraint. See -/// `scalar::nv24_to_rgb_row` for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path, bypassing any -/// SIMD backend. Benchmarks can flip this to compare scalar vs SIMD -/// directly on the same input; production code should pass `true`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv24_to_rgb_row( - y: &[u8], - uv: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - // NV24 chroma carries one UV pair per pixel = `2 * width` bytes. - // Use `checked_mul` — on 32-bit targets, `2 * width` can overflow - // `usize` at extreme widths and silently short-circuit the length - // check before entering unsafe SIMD paths. - let uv_min = match width.checked_mul(2) { - Some(n) => n, - None => panic!("width ({width}) × 2 overflows usize"), - }; - assert!(y.len() >= width, "y row too short"); - assert!(uv.len() >= uv_min, "uv row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); -} - -/// Converts one row of NV42 (semi‑planar 4:4:4, VU‑ordered) to packed -/// RGB. Same as [`nv24_to_rgb_row`] but with swapped chroma byte order. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv42_to_rgb_row( - y: &[u8], - vu: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - let vu_min = match width.checked_mul(2) { - Some(n) => n, - None => panic!("width ({width}) × 2 overflows usize"), - }; - assert!(y.len() >= width, "y row too short"); - assert!(vu.len() >= vu_min, "vu row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); -} - -/// Converts one row of NV24 (semi‑planar 4:4:4, UV-ordered) to packed -/// **RGBA** (8-bit). Same numerical contract as [`nv24_to_rgb_row`]; -/// alpha defaults to `0xFF` (opaque). -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv24_to_rgba_row( - y: &[u8], - uv: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let uv_min = match width.checked_mul(2) { - Some(n) => n, - None => panic!("width ({width}) × 2 overflows usize"), - }; - assert!(y.len() >= width, "y row too short"); - assert!(uv.len() >= uv_min, "uv row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); -} - -/// Converts one row of NV42 (semi‑planar 4:4:4, VU-ordered) to packed -/// **RGBA** (8-bit). Same as [`nv24_to_rgba_row`] but with swapped -/// chroma byte order. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv42_to_rgba_row( - y: &[u8], - vu: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let vu_min = match width.checked_mul(2) { - Some(n) => n, - None => panic!("width ({width}) × 2 overflows usize"), - }; - assert!(y.len() >= width, "y row too short"); - assert!(vu.len() >= vu_min, "vu row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); -} - -/// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches -/// to the best available SIMD backend for the current target. -/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference -/// is 4:4:4 chroma — one U / V pair per Y pixel, full-width chroma -/// planes, no chroma upsampling, no width parity constraint. See -/// `scalar::yuv_444_to_rgb_row` for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_444_to_rgb_row( - y: &[u8], - u: &[u8], - v: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX-512BW verified. - unsafe { - arch::x86_avx512::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). -/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. -/// `use_simd = false` forces scalar. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_444_to_rgba_row( - y: &[u8], - u: &[u8], - v: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const -/// generic over `BITS ∈ {10, 12, 14}`. Dispatches to the best -/// available backend for the current target (NEON / SSE4.1 / AVX2 / -/// AVX-512 / wasm simd128), falling back to scalar when no SIMD -/// backend is available or `use_simd` is false. -/// -/// Crate-private — external callers use the concrete -/// [`yuv444p10_to_rgb_row`] / [`yuv444p12_to_rgb_row`] / -/// [`yuv444p14_to_rgb_row`] wrappers, which pin `BITS` to a -/// supported value. This avoids the 16-bit footgun (`(1 << 16) - 1` -/// truncates to `-1` when cast to `i16` in the SIMD clamp), and -/// matches the [`yuv420p10_to_rgb_row`] family's convention of -/// keeping the `` generic internal. 
-#[cfg_attr(not(tarpaulin), inline(always))]
-#[allow(clippy::too_many_arguments)]
-pub(crate) fn yuv_444p_n_to_rgb_row<const BITS: usize>(
-    y: &[u16],
-    u: &[u16],
-    v: &[u16],
-    rgb_out: &mut [u8],
-    width: usize,
-    matrix: ColorMatrix,
-    full_range: bool,
-    use_simd: bool,
-) {
-    let rgb_min = rgb_row_bytes(width);
-    assert!(y.len() >= width, "y row too short");
-    assert!(u.len() >= width, "u row too short");
-    assert!(v.len() >= width, "v row too short");
-    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
-
-    if use_simd {
-        cfg_select! {
-            target_arch = "aarch64" => {
-                if neon_available() {
-                    // SAFETY: NEON verified.
-                    unsafe {
-                        arch::neon::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            target_arch = "x86_64" => {
-                if avx512_available() {
-                    // SAFETY: AVX‑512BW verified.
-                    unsafe {
-                        arch::x86_avx512::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-                if avx2_available() {
-                    // SAFETY: AVX2 verified.
-                    unsafe {
-                        arch::x86_avx2::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-                if sse41_available() {
-                    // SAFETY: SSE4.1 verified.
-                    unsafe {
-                        arch::x86_sse41::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            target_arch = "wasm32" => {
-                if simd128_available() {
-                    // SAFETY: simd128 compile‑time verified.
-                    unsafe {
-                        arch::wasm_simd128::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            _ => {}
-        }
-    }
-
-    scalar::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-}
-
-/// YUV 4:4:4 planar 10/12/14-bit → **native-depth u16** RGB dispatcher.
-/// Const generic over `BITS ∈ {10, 12, 14}`. Low-bit-packed output.
-/// Dispatches to the best available backend (NEON / SSE4.1 / AVX2 /
-/// AVX-512 / wasm simd128), falling back to scalar when no SIMD
-/// backend is available or `use_simd` is false.
-///
-/// Crate-private — see the note on [`yuv_444p_n_to_rgb_row`]. The
-/// 16-bit path is [`yuv444p16_to_rgb_u16_row`], which uses a
-/// dedicated i64-chroma kernel family.
-#[cfg_attr(not(tarpaulin), inline(always))]
-#[allow(clippy::too_many_arguments)]
-pub(crate) fn yuv_444p_n_to_rgb_u16_row<const BITS: usize>(
-    y: &[u16],
-    u: &[u16],
-    v: &[u16],
-    rgb_out: &mut [u16],
-    width: usize,
-    matrix: ColorMatrix,
-    full_range: bool,
-    use_simd: bool,
-) {
-    let rgb_min = rgb_row_elems(width);
-    assert!(y.len() >= width, "y row too short");
-    assert!(u.len() >= width, "u row too short");
-    assert!(v.len() >= width, "v row too short");
-    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
-
-    if use_simd {
-        cfg_select! {
-            target_arch = "aarch64" => {
-                if neon_available() {
-                    // SAFETY: NEON verified.
-                    unsafe {
-                        arch::neon::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            target_arch = "x86_64" => {
-                if avx512_available() {
-                    // SAFETY: AVX‑512BW verified.
-                    unsafe {
-                        arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-                if avx2_available() {
-                    // SAFETY: AVX2 verified.
-                    unsafe {
-                        arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-                if sse41_available() {
-                    // SAFETY: SSE4.1 verified.
-                    unsafe {
-                        arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            target_arch = "wasm32" => {
-                if simd128_available() {
-                    // SAFETY: simd128 compile‑time verified.
-                    unsafe {
-                        arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            _ => {}
-        }
-    }
-
-    scalar::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-}
-
-/// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the
-/// crate-internal `yuv_444p_n_to_rgb_row::<9>`.
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 12-bit → u8 RGB. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 14-bit → u8 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the -/// parallel 16-bit kernel family (same Q15 i32 output-range pipeline -/// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range -/// output in `[0, 65535]`). Widens chroma multiply-add + Y scale to -/// i64 to avoid i32 overflow at 16-bit limited range. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 9 active bits in the low bits of each -/// element. Niche format (AVC High 9 profile only). 
Reuses the same -/// `yuv_420p_n_to_rgb_row` kernel family as 10/12/14-bit; the -/// only per-call difference is the const-generic `BITS = 9` which -/// fixes the AND-mask to `0x1FF` and the Q15 scale via -/// `range_params_n::<9, 8>`. -/// -/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 10 active bits in the low bits of each -/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), -/// with the conversion clamping to `[0, 255]` — the native‑depth -/// path is [`yuv420p10_to_rgb_u16_row`]. -/// -/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified on this CPU; bounds / parity are - // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed -/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, -/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless -/// downstream HDR processing when the consumer expects low‑bit‑packed -/// samples. -/// -/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` -/// elements, each in `[0, 1023]` with the upper 6 bits zero. -/// -/// This is **not** the FFmpeg `p010` layout — `p010` stores samples -/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers -/// feeding this output into a p010 consumer must shift left by 6 -/// before handing off. -/// -/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ -/// packed — 10 active bits in the high 10 of each `u16`) to packed -/// **8‑bit** RGB. -/// -/// This is the HDR hardware‑decode keystone format: VideoToolbox, -/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit -/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic -/// specification. `use_simd = false` forces the scalar reference. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P010 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P010** to **native‑depth `u16`** packed RGB -/// (10 active bits in the **low** 10 of each output `u16`, matching -/// `yuv420p10le` convention — **not** the P010 high‑bit packing). -/// Callers feeding this output into a P010 consumer must shift left -/// by 6. -/// -/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. -/// `use_simd = false` forces the scalar reference. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P010 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( - y, uv_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 12 active bits in the low 12 bits of each -/// element (low‑bit‑packed `yuv420p12le` convention). Output is packed -/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The -/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching -/// `yuv420p12le` convention — upper 4 bits zero). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (14‑bit values in the low 14 of each `u16`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ -/// packed — 12 active bits in the high 12 of each `u16`) to packed -/// **8‑bit** RGB. -/// -/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and -/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but -/// `>> 4` instead of `>> 6` at each `u16` load. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P012 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P012** to **native‑depth `u16`** packed RGB -/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed -/// `yuv420p12le` convention, **not** P012's high‑bit packing). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P012 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( - y, uv_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. -/// -/// Samples are `u16` over the full 16-bit range (`[0, 65535]`). Runs -/// on the **i64 chroma** kernel family; see -/// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** -/// packed `u16` RGB (full-range output in `[0, 65535]`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to -/// packed **8-bit** RGB. At 16 bits there is no high-bit-packed -/// vs. low-bit-packed distinction (all bits are active). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P016 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P016** to **native-depth `u16`** packed RGB -/// (full-range output in `[0, 65535]`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P016 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); -} - -// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- -// -// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch -// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces -// the scalar reference path on every dispatcher. - -/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p9_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified on this CPU; bounds / parity are - // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p10_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, -/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to -/// `0xFF` (opaque). -/// -/// See `scalar::p_n_to_rgba_row::<10>` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, -/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output -/// is low-bit-packed; alpha element is `(1 << 10) - 1`. -/// -/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p12_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. 
See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p14_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, -/// high-bit-packed) to packed **8-bit** **RGBA**. 
Alpha defaults to -/// `0xFF` (opaque). -/// -/// See `scalar::p_n_to_rgba_row::<12>` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, -/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output -/// is low-bit-packed; alpha element is `(1 << 12) - 1`. -/// -/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// Routes through the dedicated 16-bit scalar kernel -/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient -/// for u8 output even at 16-bit input. `use_simd = false` forces the -/// scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — full-range output `[0, 65535]`; alpha element -/// is `0xFFFF` (opaque maximum at 16-bit). -/// -/// Routes through the dedicated 16-bit u16-output scalar kernel -/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply -/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit -/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. -/// -/// Routes through the dedicated 16-bit P016 scalar kernel -/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar -/// reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P016** to **native-depth `u16`** packed -/// **RGBA** — full-range output `[0, 65535]`; alpha element is -/// `0xFFFF`. -/// -/// Routes through the dedicated 16-bit u16-output P016 scalar kernel -/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); -} - -// ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- -// -// Same shape as the 4:2:0 / 4:2:2 P-family kernels but with full-width -// interleaved UV (one `U, V` pair per pixel = `2 * width` u16 elements -// per row). BITS ∈ {10, 12} run on the const-generic Q15 i32 family; -// BITS = 16 runs on the dedicated parallel i64-chroma family -// (chroma multiply-add overflows i32 at 16-bit u16 output). 
- -/// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **u8** RGB -/// dispatcher. Const-generic over `BITS`; dispatches to the best -/// available backend (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), -/// falling back to scalar when no SIMD backend is available or -/// `use_simd` is false. -/// -/// Crate-private — public consumers go through the per-format -/// dispatchers (`p410_to_rgb_row`, `p412_to_rgb_row`) which fix -/// `BITS` to a literal. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub(crate) fn p_n_444_to_rgb_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX-512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile-time verified. 
- unsafe { - arch::wasm_simd128::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); -} - -/// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → native-depth **u16** -/// RGB dispatcher. Output is low-bit-packed (active bits in low -/// `BITS` of each `u16`). Same dispatch shape as -/// [`p_n_444_to_rgb_row`]. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub(crate) fn p_n_444_to_rgb_u16_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); -} - -/// P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB dispatcher. -/// Y stays on i32 (output-range scaling keeps `coeff × u_d` within -/// i32 for u8 output); chroma multiply-add also stays on i32. -/// Dedicated entry point because the Q15 const-generic family is -/// pinned to BITS ∈ {10, 12}. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p416_to_rgb_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); -} - -/// P416 → native-depth **u16** RGB dispatcher (`[0, 65535]`). Chroma -/// multiply-add runs on i64 (overflow safety at 16-bit u16 output); -/// see scalar reference for the rationale. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p416_to_rgb_u16_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); -} - -/// P410 → packed u8 RGB. Thin wrapper at `BITS = 10`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p410_to_rgb_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - p_n_444_to_rgb_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); -} - -/// P410 → native-depth u16 RGB (10-bit low-packed output). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p410_to_rgb_u16_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - p_n_444_to_rgb_u16_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); -} - -/// P412 → packed u8 RGB. Thin wrapper at `BITS = 12`. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p412_to_rgb_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - p_n_444_to_rgb_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); -} - -/// P412 → native-depth u16 RGB (12-bit low-packed output). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p412_to_rgb_u16_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - p_n_444_to_rgb_u16_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); -} - -// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- -// -// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch -// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces -// the scalar reference path on every dispatcher. - -/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the -/// per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_444p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha -/// element is `1023`. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha -/// element is `4095`. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha -/// element is `16383`. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit -/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is -/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar -/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); -} - -// ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- -// -// Per-row dispatchers for the YUVA source family (currently Yuva444p10 -// only). 
Both the u8 RGBA dispatcher (`yuva444p10_to_rgba_row`) and -// the u16 RGBA dispatcher (`yuva444p10_to_rgba_u16_row`) route through -// per-arch `yuv_444p_n_to_rgba*_with_alpha_src_row` SIMD wrappers, -// mirroring the `yuv444p10_to_rgba_row` / `yuv444p10_to_rgba_u16_row` -// dispatchers' patterns. - -/// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv444p10_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p10_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p10_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **10-bit** YUVA 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `1023`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p10_to_rgba_u16_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p10_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} - -// ---- YUVA 4:2:0 RGBA dispatchers -------------------------------------- -// -// Per-row dispatchers for the YUVA 4:2:0 source family — Yuva420p -// (8-bit) plus Yuva420p9 / Yuva420p10 / Yuva420p16. The u8 RGBA -// dispatchers route through per-arch -// `yuv_420*_to_rgba*_with_alpha_src_row` SIMD wrappers (Ship 8b-2b), -// mirroring the non-alpha sibling dispatchers' `cfg_select!` blocks. -// The native-depth `u16` RGBA dispatchers below remain scalar pending -// Ship 8b-2c. - -/// Converts one row of 8‑bit YUVA 4:2:0 to packed **8‑bit** **RGBA**. 
-/// R / G / B are produced by the same Q15 i32 8‑bit kernel that backs -/// [`yuv_420_to_rgba_row`]; the per-pixel alpha byte is **sourced -/// from `a`** (one byte per pixel, full-width — alpha is at luma -/// resolution in 4:2:0, only chroma is subsampled). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv_420_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p_to_rgba_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - a: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **9‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv420p9_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 1` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p9_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p9_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **9‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 511]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `511`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p9_to_rgba_u16_row`]'s pattern. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p9_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **10‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv420p10_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p10_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p10_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **10‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the -/// per-pixel alpha element is **sourced from `a`** at native depth. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p10_to_rgba_u16_row`]'s pattern. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p10_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **16‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same i32 kernel that backs -/// [`yuv420p16_to_rgba_row`]; the per-pixel alpha byte is **sourced -/// from `a`** (depth-converted via `a >> 8` to fit `u8`). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p16_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p16_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **16‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — full-range output in `[0, 65535]`; the per-pixel -/// alpha element is **sourced from `a`** at native depth (no shift). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p16_to_rgba_u16_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p16_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// P410 (semi-planar 4:4:4, 10-bit high-packed) → packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p410_to_rgba_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P410 → **native-depth `u16`** packed **RGBA** — output is -/// low-bit-packed (`[0, 1023]`); alpha element is `1023`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p410_to_rgba_u16_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P412 (semi-planar 4:4:4, 12-bit high-packed) → packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p412_to_rgba_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P412 → **native-depth `u16`** packed **RGBA** — output is -/// low-bit-packed (`[0, 4095]`); alpha element is `4095`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p412_to_rgba_u16_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit** **RGBA** -/// (`R, G, B, 0xFF`). Routes through the dedicated 16-bit scalar -/// kernel (`scalar::p_n_444_16_to_rgba_row`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p416_to_rgba_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P416 → **native-depth `u16`** packed **RGBA** — full-range output -/// `[0, 65535]`; alpha element is `0xFFFF`. Routes through the -/// dedicated 16-bit u16-output scalar kernel -/// (`scalar::p_n_444_16_to_rgba_u16_row`) — i64 chroma multiply. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p416_to_rgba_u16_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit -/// encoding). See `scalar::rgb_to_hsv_row` for semantics. -/// -/// `use_simd = false` forces the scalar reference path, bypassing any -/// SIMD backend (same semantics as `yuv_420_to_rgb_row`). -#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb_to_hsv_row( - rgb: &[u8], - h_out: &mut [u8], - s_out: &mut [u8], - v_out: &mut [u8], - width: usize, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary (see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 3` multiplication). - let rgb_min = rgb_row_bytes(width); - assert!(rgb.len() >= rgb_min, "rgb row too short"); - assert!(h_out.len() >= width, "h_out row too short"); - assert!(s_out.len() >= width, "s_out row too short"); - assert!(v_out.len() >= width, "v_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - }, - _ => { - // Targets without a SIMD HSV backend fall through to scalar. - } - } - } - - scalar::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); -} - -/// Rewrites a row of packed BGR to packed RGB by swapping the outer -/// two channels (byte 0 ↔ byte 2) of every triple. `input` and -/// `output` must not alias. -/// -/// The underlying transformation is self‑inverse, so -/// [`rgb_to_bgr_row`] shares the same implementation — use whichever -/// name reads more naturally at the call site. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr_to_rgb_row(bgr: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { - swap_rb_channels_row(bgr, rgb_out, width, use_simd); -} - -/// Rewrites a row of packed RGB to packed BGR by swapping the outer -/// two channels. See [`bgr_to_rgb_row`] — this is an alias that reads -/// more naturally for the opposite direction. 
-#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb_to_bgr_row(rgb: &[u8], bgr_out: &mut [u8], width: usize, use_simd: bool) { - swap_rb_channels_row(rgb, bgr_out, width, use_simd); -} - -/// Shared dispatcher behind `bgr_to_rgb_row` / `rgb_to_bgr_row`. -#[cfg_attr(not(tarpaulin), inline(always))] -fn swap_rb_channels_row(input: &[u8], output: &mut [u8], width: usize, use_simd: bool) { - // Runtime asserts at the dispatcher boundary (see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 3` multiplication). - let rgb_min = rgb_row_bytes(width); - assert!(input.len() >= rgb_min, "input row too short"); - assert!(output.len() >= rgb_min, "output row too short"); +pub(crate) mod arch; +pub(crate) mod scalar; +mod dispatch; - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::bgr_rgb_swap_row(input, output, width); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::bgr_rgb_swap_row(input, output, width); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 just verified. - unsafe { - arch::x86_avx2::bgr_rgb_swap_row(input, output, width); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 just verified. - unsafe { - arch::x86_sse41::bgr_rgb_swap_row(input, output, width); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::bgr_rgb_swap_row(input, output, width); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } +// Re-exported only when a caller is compiled. 
The `MixedSinker` Strategy A +// fan-out is the sole consumer, and it lives in `crate::sinker::mixed` which +// is gated on `feature = "std"` / `feature = "alloc"` (needs `Vec`). Without +// either feature both this re-export and the underlying scalar function would +// be unused, which is a hard error under `cargo clippy -- -D warnings`. +#[cfg(any(feature = "std", feature = "alloc"))] +pub(crate) use scalar::expand_rgb_to_rgba_row; +#[cfg(any(feature = "std", feature = "alloc"))] +pub(crate) use scalar::expand_rgb_u16_to_rgba_u16_row; - scalar::bgr_rgb_swap_row(input, output, width); -} +pub use dispatch::bayer::*; +pub use dispatch::nv::*; +pub use dispatch::pn::*; +pub use dispatch::rgb_ops::*; +pub use dispatch::yuv420::*; +pub use dispatch::yuv444::*; +pub use dispatch::yuva::*; + +// `yuv_444p_n_to_rgb_u16_row` is consumed by the 32-bit overflow test +// `yuv_444p_n_u16_dispatcher_rejects_width_times_3_overflow` below — +// the dispatch submodule keeps it as `pub(crate)`, so glob `pub use` +// doesn't pick it up. Gated on the same cfg the test uses to avoid +// `unused_imports` on builds that don't compile the test. +#[cfg(all(test, feature = "std", target_pointer_width = "32"))] +pub(crate) use dispatch::yuv444::yuv_444p_n_to_rgb_u16_row; // ---- shared dispatcher helpers --------------------------------------- @@ -6433,7 +80,7 @@ fn swap_rb_channels_row(input: &[u8], output: &mut [u8], width: usize, use_simd: /// multiplication here could admit an undersized buffer and trigger /// out‑of‑bounds writes downstream. #[cfg_attr(not(tarpaulin), inline(always))] -fn rgb_row_bytes(width: usize) -> usize { +pub(crate) fn rgb_row_bytes(width: usize) -> usize { match width.checked_mul(3) { Some(n) => n, None => panic!("width ({width}) × 3 overflows usize"), @@ -6444,7 +91,7 @@ fn rgb_row_bytes(width: usize) -> usize { /// checking. Same purpose as [`rgb_row_bytes`] for the 4-channel /// path used by the RGBA dispatchers. 
#[cfg_attr(not(tarpaulin), inline(always))] -fn rgba_row_bytes(width: usize) -> usize { +pub(crate) fn rgba_row_bytes(width: usize) -> usize { match width.checked_mul(4) { Some(n) => n, None => panic!("width ({width}) × 4 overflows usize"), @@ -6459,7 +106,7 @@ fn rgba_row_bytes(width: usize) -> usize { /// caller allocates, and downstream SIMD kernels index with it /// directly without re‑multiplying. #[cfg_attr(not(tarpaulin), inline(always))] -fn rgb_row_elems(width: usize) -> usize { +pub(crate) fn rgb_row_elems(width: usize) -> usize { match width.checked_mul(3) { Some(n) => n, None => panic!("width ({width}) × 3 overflows usize"), @@ -6471,7 +118,7 @@ fn rgb_row_elems(width: usize) -> usize { /// elements, not bytes. Callers use it to size `&mut [u16]` buffers /// for the high-bit-depth `u16` RGBA output path. #[cfg_attr(not(tarpaulin), inline(always))] -fn rgba_row_elems(width: usize) -> usize { +pub(crate) fn rgba_row_elems(width: usize) -> usize { match width.checked_mul(4) { Some(n) => n, None => panic!("width ({width}) × 4 overflows usize"), @@ -6514,7 +161,7 @@ pub(crate) const MAX_FUSED_TRANSFORM_ABS: f32 = 1.0e12; /// row-API callers and the dispatcher-level guarantee that /// matches what validated upstream inputs can produce. #[cfg_attr(not(tarpaulin), inline(always))] -fn assert_color_transform_well_formed(m: &[[f32; 3]; 3]) { +pub(crate) fn assert_color_transform_well_formed(m: &[[f32; 3]; 3]) { let mut row = 0; while row < 3 { let mut col = 0; @@ -6544,7 +191,7 @@ fn assert_color_transform_well_formed(m: &[[f32; 3]; 3]) { /// `assert!`, so an unchecked multiplication on 32-bit targets could /// silently admit an undersized buffer. 
#[cfg_attr(not(tarpaulin), inline(always))] -fn uv_full_row_elems(width: usize) -> usize { +pub(crate) fn uv_full_row_elems(width: usize) -> usize { match width.checked_mul(2) { Some(n) => n, None => panic!("width ({width}) × 2 overflows usize (UV row)"), @@ -6572,7 +219,7 @@ fn uv_full_row_elems(width: usize) -> usize { /// NEON availability on aarch64. #[cfg(all(target_arch = "aarch64", feature = "std"))] #[cfg_attr(not(tarpaulin), inline(always))] -fn neon_available() -> bool { +pub(crate) fn neon_available() -> bool { if cfg!(colconv_force_scalar) { return false; } @@ -6582,14 +229,14 @@ fn neon_available() -> bool { /// NEON availability on aarch64 — no‑std variant (compile‑time). #[cfg(all(target_arch = "aarch64", not(feature = "std")))] #[cfg_attr(not(tarpaulin), inline(always))] -const fn neon_available() -> bool { +pub(crate) const fn neon_available() -> bool { !cfg!(colconv_force_scalar) && cfg!(target_feature = "neon") } /// AVX2 availability on x86_64. #[cfg(all(target_arch = "x86_64", feature = "std"))] #[cfg_attr(not(tarpaulin), inline(always))] -fn avx2_available() -> bool { +pub(crate) fn avx2_available() -> bool { if cfg!(colconv_force_scalar) || cfg!(colconv_disable_avx2) { return false; } @@ -6599,14 +246,14 @@ fn avx2_available() -> bool { /// AVX2 availability on x86_64 — no‑std variant (compile‑time). #[cfg(all(target_arch = "x86_64", not(feature = "std")))] #[cfg_attr(not(tarpaulin), inline(always))] -const fn avx2_available() -> bool { +pub(crate) const fn avx2_available() -> bool { !cfg!(colconv_force_scalar) && !cfg!(colconv_disable_avx2) && cfg!(target_feature = "avx2") } /// SSE4.1 availability on x86_64. 
#[cfg(all(target_arch = "x86_64", feature = "std"))] #[cfg_attr(not(tarpaulin), inline(always))] -fn sse41_available() -> bool { +pub(crate) fn sse41_available() -> bool { if cfg!(colconv_force_scalar) { return false; } @@ -6616,14 +263,14 @@ fn sse41_available() -> bool { /// SSE4.1 availability on x86_64 — no‑std variant (compile‑time). #[cfg(all(target_arch = "x86_64", not(feature = "std")))] #[cfg_attr(not(tarpaulin), inline(always))] -const fn sse41_available() -> bool { +pub(crate) const fn sse41_available() -> bool { !cfg!(colconv_force_scalar) && cfg!(target_feature = "sse4.1") } /// AVX‑512 (F + BW) availability on x86_64. #[cfg(all(target_arch = "x86_64", feature = "std"))] #[cfg_attr(not(tarpaulin), inline(always))] -fn avx512_available() -> bool { +pub(crate) fn avx512_available() -> bool { if cfg!(colconv_force_scalar) || cfg!(colconv_disable_avx512) { return false; } @@ -6634,7 +281,7 @@ fn avx512_available() -> bool { /// (compile‑time). #[cfg(all(target_arch = "x86_64", not(feature = "std")))] #[cfg_attr(not(tarpaulin), inline(always))] -const fn avx512_available() -> bool { +pub(crate) const fn avx512_available() -> bool { !cfg!(colconv_force_scalar) && !cfg!(colconv_disable_avx512) && cfg!(target_feature = "avx512bw") } @@ -6643,162 +290,9 @@ const fn avx512_available() -> bool { /// a compile‑time check regardless of the `std` feature. #[cfg(target_arch = "wasm32")] #[cfg_attr(not(tarpaulin), inline(always))] -const fn simd128_available() -> bool { +pub(crate) const fn simd128_available() -> bool { !cfg!(colconv_force_scalar) && cfg!(target_feature = "simd128") } - -/// Converts one row of an 8-bit Bayer plane to packed RGB. -/// -/// Dispatches to the best available backend for the current target. -/// See [`scalar::bayer_to_rgb_row`] for the full semantic specification -/// (bilinear demosaic geometry, edge handling, output layout). 
-/// -/// `above` / `mid` / `below` are row-aligned slices into the source -/// Bayer plane via the **mirror-by-2** boundary contract: at the -/// top edge the caller supplies `above = mid_row(1)`, at the bottom -/// edge `below = mid_row(h - 2)`; replicate fallback only when -/// `height < 2`. See [`crate::raw::BayerRow::above`] for the full -/// rationale (CFA-parity preservation across boundaries). -/// `above` / `mid` / `below` must all be the same length — that -/// length is the row's pixel width. -/// -/// `m` is the precomputed `CCM · diag(wb)` 3×3 transform. Every -/// element must be finite (not NaN, not ±∞); the dispatcher -/// asserts this at the boundary so future unsafe SIMD kernels can -/// trust the contract. -/// -/// `rgb_out` must have at least `3 * mid.len()` bytes. -/// -/// **`use_simd` is currently a no-op.** All Bayer paths run the -/// scalar reference today; per-arch SIMD backends (NEON / SSE4.1 / -/// AVX2 / AVX-512 / wasm simd128) ship in a follow-up. The -/// parameter is wired through `MixedSinker` and the public -/// dispatchers now so callers don't have to touch their call sites -/// when SIMD lands. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn bayer_to_rgb_row( - above: &[u8], - mid: &[u8], - below: &[u8], - row_parity: u32, - pattern: crate::raw::BayerPattern, - demosaic: crate::raw::BayerDemosaic, - m: &[[f32; 3]; 3], - rgb_out: &mut [u8], - _use_simd: bool, -) { - // Release-mode preflight: future unsafe SIMD backends will rely on - // these invariants for bounds-free pointer arithmetic, so we - // validate here rather than only via `debug_assert!` inside the - // scalar kernel. Same pattern as `yuv_420_to_rgb_row`. 
- let width = mid.len(); - assert_eq!(above.len(), width, "above row length must match mid"); - assert_eq!(below.len(), width, "below row length must match mid"); - let rgb_min = rgb_row_bytes(width); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - assert_color_transform_well_formed(m); - - scalar::bayer_to_rgb_row(above, mid, below, row_parity, pattern, demosaic, m, rgb_out); -} - -/// Converts one row of a 10/12/14/16-bit **low-packed** Bayer -/// plane to packed `u8` RGB. -/// -/// `BITS` ∈ {10, 12, 14, 16}; samples are low-packed `u16` (active -/// values in the low `BITS` bits, range `[0, (1 << BITS) - 1]`). -/// Direct row-API callers are responsible for upholding the -/// low-packed contract; samples whose value exceeds -/// `(1 << BITS) - 1` produce defined-but-saturated output (no -/// panic, no UB). The walker -/// [`crate::raw::bayer16_to`] never sees out-of-range input -/// because [`crate::frame::BayerFrame16::try_new`] validates every -/// active sample at frame-construction time. -/// -/// `m` is the unscaled `CCM · diag(wb)` — the kernel bakes the -/// input→u8 rescale (`255 / ((1 << BITS) - 1)`) at output time. -/// `above` / `mid` / `below` must all be the same length; -/// `rgb_out` must have at least `3 * mid.len()` bytes. -/// -/// **`use_simd` is currently a no-op** (see -/// [`bayer_to_rgb_row`] for the deferred-SIMD note). 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn bayer16_to_rgb_row( - above: &[u16], - mid: &[u16], - below: &[u16], - row_parity: u32, - pattern: crate::raw::BayerPattern, - demosaic: crate::raw::BayerDemosaic, - m: &[[f32; 3]; 3], - rgb_out: &mut [u8], - _use_simd: bool, -) { - const { - assert!( - BITS == 10 || BITS == 12 || BITS == 14 || BITS == 16, - "bayer16_to_rgb_row: BITS must be 10, 12, 14, or 16" - ) - }; - let width = mid.len(); - assert_eq!(above.len(), width, "above row length must match mid"); - assert_eq!(below.len(), width, "below row length must match mid"); - let rgb_min = rgb_row_bytes(width); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - assert_color_transform_well_formed(m); - - scalar::bayer16_to_rgb_row::(above, mid, below, row_parity, pattern, demosaic, m, rgb_out); -} - -/// Converts one row of a 10/12/14/16-bit **low-packed** Bayer -/// plane to packed `u16` RGB (also low-packed at `BITS`). -/// -/// `BITS` ∈ {10, 12, 14, 16}. Input and output share the same -/// low-packed range `[0, (1 << BITS) - 1]` per channel — no -/// rescale, just clamp. `above` / `mid` / `below` must all be the -/// same length; `rgb_out` must have at least `3 * mid.len()` `u16` -/// elements. -/// -/// Direct row-API callers are responsible for upholding the -/// low-packed contract — see [`bayer16_to_rgb_row`] for the -/// full rationale on the safe path -/// ([`crate::frame::BayerFrame16::try_new`] + [`crate::raw::bayer16_to`]) -/// vs. the direct row API. -/// -/// **`use_simd` is currently a no-op** (see -/// [`bayer_to_rgb_row`] for the deferred-SIMD note). 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn bayer16_to_rgb_u16_row( - above: &[u16], - mid: &[u16], - below: &[u16], - row_parity: u32, - pattern: crate::raw::BayerPattern, - demosaic: crate::raw::BayerDemosaic, - m: &[[f32; 3]; 3], - rgb_out: &mut [u16], - _use_simd: bool, -) { - const { - assert!( - BITS == 10 || BITS == 12 || BITS == 14 || BITS == 16, - "bayer16_to_rgb_u16_row: BITS must be 10, 12, 14, or 16" - ) - }; - let width = mid.len(); - assert_eq!(above.len(), width, "above row length must match mid"); - assert_eq!(below.len(), width, "below row length must match mid"); - let rgb_min = rgb_row_elems(width); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - assert_color_transform_well_formed(m); - - scalar::bayer16_to_rgb_u16_row::( - above, mid, below, row_parity, pattern, demosaic, m, rgb_out, - ); -} - #[cfg(all(test, feature = "std"))] mod overflow_tests { //! 32-bit RGB-row-bytes overflow regressions for the public From fd9edbad4b783e06b24a7b4e33df8e977a2c8562 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:41:52 +1200 Subject: [PATCH 3/6] refactor(row): split dispatch/yuv420.rs and yuv444.rs into per-format directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `src/row/dispatch/yuv420.rs` (2698 lines) and `yuv444.rs` (1333 lines) were the two largest files left after the previous split. 
Split each into a subdirectory with one file per source format: ``` src/row/dispatch/yuv420/ mod.rs (re-exports + module decls, 31 lines) yuv_420.rs (8-bit YUV 4:2:0 RGB / RGBA, 222 lines) yuv420p9.rs (4 variants, 360 lines) yuv420p10.rs (4 variants, 367 lines) yuv420p12.rs (4 variants, 343 lines) yuv420p14.rs (4 variants, 332 lines) yuv420p16.rs (4 variants, 291 lines) p010.rs (P010 4:2:0 semi-planar, 312 lines) p012.rs (P012, 296 lines) p016.rs (P016, 279 lines) src/row/dispatch/yuv444/ mod.rs (re-exports + pub(crate) BITS-generic helpers `yuv_444p_n_to_rgb_row` / `yuv_444p_n_to_rgb_u16_row` shared by 9/10/12/14 wrappers, 197 lines) yuv_444.rs (8-bit YUV 4:4:4 RGB / RGBA, 159 lines) yuv444p9.rs (thin RGB wrappers + full RGBA dispatchers, 209 lines) yuv444p10.rs (193 lines) yuv444p12.rs (192 lines) yuv444p14.rs (192 lines) yuv444p16.rs (full dispatchers — BITS-generic template pinned to {9,10,12,14}, so 16-bit gets its own, 304 lines) ``` No semantic changes — function bodies were extracted byte-for-byte via `sed -n` from the prior single-file modules. The only edits were: - Per-file `use` lines trimmed to what each file actually needs (e.g. 8-bit dispatchers don't import `rgb_row_elems` / `rgba_row_elems`; the BITS-generic helper file in yuv444 doesn't need `rgba_row_*`). - `yuv444/p9.rs`-`p14.rs` add `use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row};` so the thin wrappers reach the helpers in the sibling `yuv444/mod.rs`. - Parent `dispatch/mod.rs` is unchanged — the existing `pub(super) mod yuv420; pub(super) mod yuv444;` declarations resolve to the new `yuv420/mod.rs` / `yuv444/mod.rs` files. The maximum file size in `src/row/dispatch/` is now 845 lines (`yuva.rs`); after dropping yuv420.rs/yuv444.rs the largest YUV files are 367 / 304 lines. 
Verified across aarch64-apple-darwin, x86_64-unknown-freebsd, and wasm32-unknown-unknown: - `cargo check --lib --tests`: clean - `RUSTFLAGS=-Dwarnings cargo clippy --lib --tests`: clean - `cargo test --lib` (host): 629 passed (same as before) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/yuv420.rs | 2698 -------------------------- src/row/dispatch/yuv420/mod.rs | 31 + src/row/dispatch/yuv420/p010.rs | 312 +++ src/row/dispatch/yuv420/p012.rs | 296 +++ src/row/dispatch/yuv420/p016.rs | 279 +++ src/row/dispatch/yuv420/yuv420p10.rs | 367 ++++ src/row/dispatch/yuv420/yuv420p12.rs | 343 ++++ src/row/dispatch/yuv420/yuv420p14.rs | 332 ++++ src/row/dispatch/yuv420/yuv420p16.rs | 291 +++ src/row/dispatch/yuv420/yuv420p9.rs | 360 ++++ src/row/dispatch/yuv420/yuv_420.rs | 222 +++ src/row/dispatch/yuv444.rs | 1333 ------------- src/row/dispatch/yuv444/mod.rs | 197 ++ src/row/dispatch/yuv444/yuv444p10.rs | 193 ++ src/row/dispatch/yuv444/yuv444p12.rs | 192 ++ src/row/dispatch/yuv444/yuv444p14.rs | 192 ++ src/row/dispatch/yuv444/yuv444p16.rs | 304 +++ src/row/dispatch/yuv444/yuv444p9.rs | 209 ++ src/row/dispatch/yuv444/yuv_444.rs | 159 ++ 19 files changed, 4279 insertions(+), 4031 deletions(-) delete mode 100644 src/row/dispatch/yuv420.rs create mode 100644 src/row/dispatch/yuv420/mod.rs create mode 100644 src/row/dispatch/yuv420/p010.rs create mode 100644 src/row/dispatch/yuv420/p012.rs create mode 100644 src/row/dispatch/yuv420/p016.rs create mode 100644 src/row/dispatch/yuv420/yuv420p10.rs create mode 100644 src/row/dispatch/yuv420/yuv420p12.rs create mode 100644 src/row/dispatch/yuv420/yuv420p14.rs create mode 100644 src/row/dispatch/yuv420/yuv420p16.rs create mode 100644 src/row/dispatch/yuv420/yuv420p9.rs create mode 100644 src/row/dispatch/yuv420/yuv_420.rs delete mode 100644 src/row/dispatch/yuv444.rs create mode 100644 src/row/dispatch/yuv444/mod.rs create mode 100644 src/row/dispatch/yuv444/yuv444p10.rs create mode 100644 
src/row/dispatch/yuv444/yuv444p12.rs create mode 100644 src/row/dispatch/yuv444/yuv444p14.rs create mode 100644 src/row/dispatch/yuv444/yuv444p16.rs create mode 100644 src/row/dispatch/yuv444/yuv444p9.rs create mode 100644 src/row/dispatch/yuv444/yuv_444.rs diff --git a/src/row/dispatch/yuv420.rs b/src/row/dispatch/yuv420.rs deleted file mode 100644 index 8f34dca1..00000000 --- a/src/row/dispatch/yuv420.rs +++ /dev/null @@ -1,2698 +0,0 @@ -//! YUV 4:2:0 dispatchers (planar and P010/P012/P016 semi-planar) — -//! 8-bit YUV → RGB/RGBA, 9/10/12/14/16-bit planar yuv420p_n RGB+RGBA, -//! P010/P012/P016 semi-planar RGB+RGBA. Extracted from `row::mod` for -//! organization. -//! -//! All dispatchers route through the standard `cfg_select!` per-arch -//! block; `use_simd = false` forces scalar. - -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; -#[cfg(target_arch = "aarch64")] -use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; -#[cfg(target_arch = "wasm32")] -use crate::row::simd128_available; -use crate::ColorMatrix; - -/// Converts one row of 4:2:0 YUV to packed RGB. -/// -/// Dispatches to the best available backend for the current target. -/// See `scalar::yuv_420_to_rgb_row` for the full semantic -/// specification (range handling, matrix definitions, output layout). -/// -/// `use_simd = false` forces the scalar reference path, bypassing any -/// SIMD backend. Benchmarks flip this to compare scalar vs SIMD -/// directly on the same input; production code should pass `true`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_420_to_rgb_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary. 
The unsafe SIMD - // kernels below rely on these invariants for bounds‑free pointer - // arithmetic, so we validate in *release* builds too — not just - // under `debug_assert!`. Kernels keep their own `debug_assert!`s as - // internal sanity checks. - // - // `rgb_min` uses `checked_mul` because `3 * width` can wrap `usize` - // on 32‑bit targets (wasm32, i686) for extreme widths. Without the - // guard, a wrapped product could admit an undersized `rgb_out` and - // let the scalar loop's `x * 3` indexing or a SIMD kernel's - // pointer arithmetic run off the end. - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present on this - // CPU. Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference); they are checked - // with `debug_assert` in debug builds. - unsafe { - arch::neon::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - // Bounds / parity invariants are the caller's obligation. - unsafe { - arch::x86_avx512::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present on this - // CPU. Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference); they are checked - // with `debug_assert` in debug builds. 
- unsafe { - arch::x86_avx2::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - // Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference). - unsafe { - arch::x86_sse41::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - // Future x86_64 tiers (avx512 promoted above AVX2, ssse3 below - // SSE4.1) slot in here, each branch guarded by the matching - // `is_x86_feature_detected!` / `cfg!(target_feature = ...)` pair. - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: `simd128_available()` (compile‑time - // `cfg!(target_feature = "simd128")`) verified that simd128 - // is on. WASM has no runtime detection — the module's SIMD - // support is fixed at produce‑time. Bounds / parity - // invariants are the caller's obligation. - unsafe { - arch::wasm_simd128::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => { - // Targets without a SIMD backend (riscv64, powerpc, …) fall - // through to the scalar path below. - } - } - } - - scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). -/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel — sources without an alpha plane -/// produce opaque output). The first three bytes per pixel are -/// byte-identical to what [`yuv_420_to_rgb_row`] would write. -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the -/// scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_420_to_rgba_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary — see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 4` multiplication via [`rgba_row_bytes`]. - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present. - unsafe { - arch::x86_avx2::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - unsafe { - arch::x86_sse41::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time availability verified. 
- unsafe { - arch::wasm_simd128::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 9 active bits in the low bits of each -/// element. Niche format (AVC High 9 profile only). Reuses the same -/// `yuv_420p_n_to_rgb_row` kernel family as 10/12/14-bit; the -/// only per-call difference is the const-generic `BITS = 9` which -/// fixes the AND-mask to `0x1FF` and the Q15 scale via -/// `range_params_n::<9, 8>`. -/// -/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 10 active bits in the low bits of each -/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), -/// with the conversion clamping to `[0, 255]` — the native‑depth -/// path is [`yuv420p10_to_rgb_u16_row`]. -/// -/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified on this CPU; bounds / parity are - // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed -/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, -/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless -/// downstream HDR processing when the consumer expects low‑bit‑packed -/// samples. -/// -/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` -/// elements, each in `[0, 1023]` with the upper 6 bits zero. -/// -/// This is **not** the FFmpeg `p010` layout — `p010` stores samples -/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers -/// feeding this output into a p010 consumer must shift left by 6 -/// before handing off. -/// -/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ -/// packed — 10 active bits in the high 10 of each `u16`) to packed -/// **8‑bit** RGB. -/// -/// This is the HDR hardware‑decode keystone format: VideoToolbox, -/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit -/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic -/// specification. `use_simd = false` forces the scalar reference. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P010 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P010** to **native‑depth `u16`** packed RGB -/// (10 active bits in the **low** 10 of each output `u16`, matching -/// `yuv420p10le` convention — **not** the P010 high‑bit packing). -/// Callers feeding this output into a P010 consumer must shift left -/// by 6. -/// -/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. 
-/// `use_simd = false` forces the scalar reference. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P010 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( - y, uv_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 12 active bits in the low 12 bits of each -/// element (low‑bit‑packed `yuv420p12le` convention). 
Output is packed -/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The -/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching -/// `yuv420p12le` convention — upper 4 bits zero). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (14‑bit values in the low 14 of each `u16`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ -/// packed — 12 active bits in the high 12 of each `u16`) to packed -/// **8‑bit** RGB. -/// -/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and -/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but -/// `>> 4` instead of `>> 6` at each `u16` load. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P012 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P012** to **native‑depth `u16`** packed RGB -/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed -/// `yuv420p12le` convention, **not** P012's high‑bit packing). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P012 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( - y, uv_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. -/// -/// Samples are `u16` over the full 16-bit range (`[0, 65535]`). Runs -/// on the **i64 chroma** kernel family; see -/// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** -/// packed `u16` RGB (full-range output in `[0, 65535]`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to -/// packed **8-bit** RGB. At 16 bits there is no high-bit-packed -/// vs. low-bit-packed distinction (all bits are active). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P016 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P016** to **native-depth `u16`** packed RGB -/// (full-range output in `[0, 65535]`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P016 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); -} -// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- -// -// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch -// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces -// the scalar reference path on every dispatcher. - -/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p9_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified on this CPU; bounds / parity are - // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p10_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, -/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to -/// `0xFF` (opaque). -/// -/// See `scalar::p_n_to_rgba_row::<10>` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, -/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output -/// is low-bit-packed; alpha element is `(1 << 10) - 1`. -/// -/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p12_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. 
See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p14_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, -/// high-bit-packed) to packed **8-bit** **RGBA**. 
Alpha defaults to -/// `0xFF` (opaque). -/// -/// See `scalar::p_n_to_rgba_row::<12>` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, -/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output -/// is low-bit-packed; alpha element is `(1 << 12) - 1`. -/// -/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// Routes through the dedicated 16-bit scalar kernel -/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient -/// for u8 output even at 16-bit input. `use_simd = false` forces the -/// scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — full-range output `[0, 65535]`; alpha element -/// is `0xFFFF` (opaque maximum at 16-bit). -/// -/// Routes through the dedicated 16-bit u16-output scalar kernel -/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply -/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit -/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. -/// -/// Routes through the dedicated 16-bit P016 scalar kernel -/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar -/// reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P016** to **native-depth `u16`** packed -/// **RGBA** — full-range output `[0, 65535]`; alpha element is -/// `0xFFFF`. -/// -/// Routes through the dedicated 16-bit u16-output P016 scalar kernel -/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); -} diff --git a/src/row/dispatch/yuv420/mod.rs b/src/row/dispatch/yuv420/mod.rs new file mode 100644 index 00000000..972210e7 --- /dev/null +++ b/src/row/dispatch/yuv420/mod.rs @@ -0,0 +1,31 @@ +//! YUV 4:2:0 dispatchers, split per source format for readability. +//! +//! - `yuv_420` — 8-bit YUV 4:2:0 → RGB / RGBA. +//! - `yuv420p9` / `yuv420p10` / `yuv420p12` / `yuv420p14` / +//! `yuv420p16` — high-bit planar 4:2:0 (4 variants per format: +//! RGB, RGB-u16, RGBA, RGBA-u16). +//! 
- `p010` / `p012` / `p016` — high-bit semi-planar 4:2:0 +//! (4 variants per format). +//! +//! Public functions re-exported up to `crate::row::*` via parent +//! `dispatch/mod.rs`. + +pub(super) mod p010; +pub(super) mod p012; +pub(super) mod p016; +pub(super) mod yuv420p10; +pub(super) mod yuv420p12; +pub(super) mod yuv420p14; +pub(super) mod yuv420p16; +pub(super) mod yuv420p9; +pub(super) mod yuv_420; + +pub use p010::*; +pub use p012::*; +pub use p016::*; +pub use yuv420p10::*; +pub use yuv420p12::*; +pub use yuv420p14::*; +pub use yuv420p16::*; +pub use yuv420p9::*; +pub use yuv_420::*; diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs new file mode 100644 index 00000000..35f9e548 --- /dev/null +++ b/src/row/dispatch/yuv420/p010.rs @@ -0,0 +1,312 @@ +//! P010 (semi-planar 4:2:0, 10-bit high-packed) dispatchers — 4 +//! variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ +/// packed — 10 active bits in the high 10 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// This is the HDR hardware‑decode keystone format: VideoToolbox, +/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit +/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic +/// specification. `use_simd = false` forces the scalar reference. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P010** to **native‑depth `u16`** packed RGB +/// (10 active bits in the **low** 10 of each output `u16`, matching +/// `yuv420p10le` convention — **not** the P010 high‑bit packing). +/// Callers feeding this output into a P010 consumer must shift left +/// by 6. +/// +/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. 
+/// `use_simd = false` forces the scalar reference. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 10) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs new file mode 100644 index 00000000..618bc8f6 --- /dev/null +++ b/src/row/dispatch/yuv420/p012.rs @@ -0,0 +1,296 @@ +//! P012 (semi-planar 4:2:0, 12-bit high-packed) dispatchers — 4 +//! variants. 
 +
+use crate::row::scalar;
+use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems};
+#[cfg(target_arch = "aarch64")]
+use crate::row::neon_available;
+#[cfg(target_arch = "x86_64")]
+use crate::row::{avx2_available, avx512_available, sse41_available};
+#[cfg(target_arch = "wasm32")]
+use crate::row::simd128_available;
+use crate::ColorMatrix;
+
+
+/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑
+/// packed — 12 active bits in the high 12 of each `u16`) to packed
+/// **8‑bit** RGB.
+///
+/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and
+/// VP9 Profile 2 hardware decoders. Same shift semantics as P010 but
+/// `>> 4` instead of `>> 6` at each `u16` load.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn p012_to_rgb_row(
+    y: &[u16],
+    uv_half: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    assert_eq!(width & 1, 0, "P012 requires even width");
+    let rgb_min = rgb_row_bytes(width);
+    assert!(y.len() >= width, "y row too short");
+    assert!(uv_half.len() >= width, "uv_half row too short");
+    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
+
+    if use_simd {
+        cfg_select!
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** to **native‑depth `u16`** packed RGB +/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed +/// `yuv420p12le` convention, **not** P012's high‑bit packing). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 12) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs new file mode 100644 index 00000000..128aaf17 --- /dev/null +++ b/src/row/dispatch/yuv420/p016.rs @@ -0,0 +1,279 @@ +//! P016 (semi-planar 4:2:0, 16-bit) dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to +/// packed **8-bit** RGB. At 16 bits there is no high-bit-packed +/// vs. low-bit-packed distinction (all bits are active). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P016 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed RGB +/// (full-range output in `[0, 65535]`). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P016 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit +/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. +/// +/// Routes through the dedicated 16-bit P016 scalar kernel +/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar +/// reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed +/// **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. +/// +/// Routes through the dedicated 16-bit u16-output P016 scalar kernel +/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs new file mode 100644 index 00000000..27f7a1ff --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -0,0 +1,367 @@ +//! 10-bit planar YUV 4:2:0 dispatchers — 4 variants. 
+ +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 10 active bits in the low bits of each +/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), +/// with the conversion clamping to `[0, 255]` — the native‑depth +/// path is [`yuv420p10_to_rgb_u16_row`]. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed +/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, +/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless +/// downstream HDR processing when the consumer expects low‑bit‑packed +/// samples. +/// +/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` +/// elements, each in `[0, 1023]` with the upper 6 bits zero. +/// +/// This is **not** the FFmpeg `p010` layout — `p010` stores samples +/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers +/// feeding this output into a p010 consumer must shift left by 6 +/// before handing off. +/// +/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p10_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs new file mode 100644 index 00000000..9d250c9a --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -0,0 +1,343 @@ +//! 12-bit planar YUV 4:2:0 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 12 active bits in the low 12 bits of each +/// element (low‑bit‑packed `yuv420p12le` convention). 
Output is packed +/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The +/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching +/// `yuv420p12le` convention — upper 4 bits zero). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p12_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs new file mode 100644 index 00000000..a1c8024f --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -0,0 +1,332 @@ +//! 14-bit planar YUV 4:2:0 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (14‑bit values in the low 14 of each `u16`). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). 
+/// +/// Same numerical contract as [`yuv420p14_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs new file mode 100644 index 00000000..7b324e7d --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -0,0 +1,291 @@ +//! 16-bit planar YUV 4:2:0 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. +/// +/// Samples are `u16` over the full 16-bit range (`[0, 65535]`). Runs +/// on the **i64 chroma** kernel family; see +/// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** +/// packed `u16` RGB (full-range output in `[0, 65535]`). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// Routes through the dedicated 16-bit scalar kernel +/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient +/// for u8 output even at 16-bit input. `use_simd = false` forces the +/// scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element +/// is `0xFFFF` (opaque maximum at 16-bit). 
+/// +/// Routes through the dedicated 16-bit u16-output scalar kernel +/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply +/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p9.rs 
b/src/row/dispatch/yuv420/yuv420p9.rs new file mode 100644 index 00000000..c28da34a --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -0,0 +1,360 @@ +//! 9-bit planar YUV 4:2:0 dispatchers — 4 variants (RGB, RGB-u16, +//! RGBA, RGBA-u16). + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 9 active bits in the low bits of each +/// element. Niche format (AVC High 9 profile only). Reuses the same +/// `yuv_420p_n_to_rgb_row` kernel family as 10/12/14-bit; the +/// only per-call difference is the const-generic `BITS = 9` which +/// fixes the AND-mask to `0x1FF` and the Q15 scale via +/// `range_params_n::<9, 8>`. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces +// the scalar reference path on every dispatcher. + +/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p9_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv_420.rs b/src/row/dispatch/yuv420/yuv_420.rs new file mode 100644 index 00000000..6428c008 --- /dev/null +++ b/src/row/dispatch/yuv420/yuv_420.rs @@ -0,0 +1,222 @@ +//! 8-bit YUV 4:2:0 → RGB / RGBA dispatchers (`yuv_420_to_rgb_row`, +//! `yuv_420_to_rgba_row`). Extracted from the parent `dispatch::yuv420` +//! module per source format for organization. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of 4:2:0 YUV to packed RGB. +/// +/// Dispatches to the best available backend for the current target. +/// See `scalar::yuv_420_to_rgb_row` for the full semantic +/// specification (range handling, matrix definitions, output layout). +/// +/// `use_simd = false` forces the scalar reference path, bypassing any +/// SIMD backend. Benchmarks flip this to compare scalar vs SIMD +/// directly on the same input; production code should pass `true`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgb_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary. The unsafe SIMD + // kernels below rely on these invariants for bounds‑free pointer + // arithmetic, so we validate in *release* builds too — not just + // under `debug_assert!`. Kernels keep their own `debug_assert!`s as + // internal sanity checks. + // + // `rgb_min` uses `checked_mul` because `3 * width` can wrap `usize` + // on 32‑bit targets (wasm32, i686) for extreme widths. Without the + // guard, a wrapped product could admit an undersized `rgb_out` and + // let the scalar loop's `x * 3` indexing or a SIMD kernel's + // pointer arithmetic run off the end. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference); they are checked + // with `debug_assert` in debug builds. + unsafe { + arch::neon::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + // Bounds / parity invariants are the caller's obligation. 
+ unsafe { + arch::x86_avx512::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference); they are checked + // with `debug_assert` in debug builds. + unsafe { + arch::x86_avx2::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + // Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference). + unsafe { + arch::x86_sse41::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + // Future x86_64 tiers (avx512 promoted above AVX2, ssse3 below + // SSE4.1) slot in here, each branch guarded by the matching + // `is_x86_feature_detected!` / `cfg!(target_feature = ...)` pair. + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: `simd128_available()` (compile‑time + // `cfg!(target_feature = "simd128")`) verified that simd128 + // is on. WASM has no runtime detection — the module's SIMD + // support is fixed at produce‑time. Bounds / parity + // invariants are the caller's obligation. + unsafe { + arch::wasm_simd128::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend (riscv64, powerpc, …) fall + // through to the scalar path below. + } + } + } + + scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). 
+/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). The first three bytes per pixel are +/// byte-identical to what [`yuv_420_to_rgb_row`] would write. +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the +/// scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present. 
+ unsafe { + arch::x86_avx2::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + unsafe { + arch::x86_sse41::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time availability verified. + unsafe { + arch::wasm_simd128::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444.rs b/src/row/dispatch/yuv444.rs deleted file mode 100644 index 5bc3a960..00000000 --- a/src/row/dispatch/yuv444.rs +++ /dev/null @@ -1,1333 +0,0 @@ -//! YUV 4:4:4 dispatchers (planar 8-bit + high-bit 9/10/12/14/16-bit) -//! — RGB + RGBA. Extracted from `row::mod` for organization. -//! -//! Internal `pub(crate)` helpers `yuv_444p_n_to_rgb_row` / -//! `yuv_444p_n_to_rgb_u16_row` provide the BITS-generic dispatch -//! shared by 9/10/12/14-bit; 16-bit gets its own dedicated kernels. -//! -//! All dispatchers route through the standard `cfg_select!` per-arch -//! block; `use_simd = false` forces scalar. - -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; -#[cfg(target_arch = "aarch64")] -use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; -#[cfg(target_arch = "wasm32")] -use crate::row::simd128_available; -use crate::ColorMatrix; - -/// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches -/// to the best available SIMD backend for the current target. 
-/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference -/// is 4:4:4 chroma — one U / V pair per Y pixel, full-width chroma -/// planes, no chroma upsampling, no width parity constraint. See -/// `scalar::yuv_444_to_rgb_row` for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_444_to_rgb_row( - y: &[u8], - u: &[u8], - v: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX-512BW verified. - unsafe { - arch::x86_avx512::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. 
- unsafe { - arch::wasm_simd128::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). -/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. -/// `use_simd = false` forces scalar. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_444_to_rgba_row( - y: &[u8], - u: &[u8], - v: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const -/// generic over `BITS ∈ {10, 12, 14}`. Dispatches to the best -/// available backend for the current target (NEON / SSE4.1 / AVX2 / -/// AVX-512 / wasm simd128), falling back to scalar when no SIMD -/// backend is available or `use_simd` is false. -/// -/// Crate-private — external callers use the concrete -/// [`yuv444p10_to_rgb_row`] / [`yuv444p12_to_rgb_row`] / -/// [`yuv444p14_to_rgb_row`] wrappers, which pin `BITS` to a -/// supported value. This avoids the 16-bit footgun (`(1 << 16) - 1` -/// truncates to `-1` when cast to `i16` in the SIMD clamp), and -/// matches the [`yuv420p10_to_rgb_row`] family's convention of -/// keeping the `` generic internal. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p_n_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar 10/12/14-bit → **native-depth u16** RGB dispatcher. -/// Const generic over `BITS ∈ {10, 12, 14}`. Low-bit-packed output. -/// Dispatches to the best available backend (NEON / SSE4.1 / AVX2 / -/// AVX-512 / wasm simd128), falling back to scalar when no SIMD -/// backend is available or `use_simd` is false. 
-/// -/// Crate-private — see the note on [`yuv_444p_n_to_rgb_row`]. The -/// 16-bit path is [`yuv444p16_to_rgb_u16_row`], which uses a -/// dedicated i64-chroma kernel family. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p_n_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<9>`. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 12-bit → u8 RGB. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 14-bit → u8 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the -/// parallel 16-bit kernel family (same Q15 i32 output-range pipeline -/// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range -/// output in `[0, 65535]`). Widens chroma multiply-add + Y scale to -/// i64 to avoid i32 overflow at 16-bit limited range. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); -} -// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- -// -// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch -// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces -// the scalar reference path on every dispatcher. 
- -/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the -/// per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_444p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha -/// element is `1023`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha -/// element is `4095`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha -/// element is `16383`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit -/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is -/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar -/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); -} diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs new file mode 100644 index 00000000..4db06906 --- /dev/null +++ b/src/row/dispatch/yuv444/mod.rs @@ -0,0 +1,197 @@ +//! YUV 4:4:4 dispatchers, split per source format for readability. +//! +//! - `yuv_444` — 8-bit YUV 4:4:4 → RGB / RGBA. +//! - `yuv444p9` / `yuv444p10` / `yuv444p12` / `yuv444p14` — +//! high-bit planar (4 variants per format). RGB / RGB-u16 paths +//! are thin wrappers over the BITS-generic helpers below; the +//! RGBA / RGBA-u16 paths are full dispatchers. +//! - `yuv444p16` — 16-bit planar with its own dedicated dispatchers +//! (the BITS-generic template is pinned to {9, 10, 12, 14}). +//! +//! `yuv_444p_n_to_rgb_row` / `yuv_444p_n_to_rgb_u16_row` +//! are the BITS-generic dispatchers shared by the 9 / 10 / 12 / 14-bit +//! RGB wrappers above. They stay `pub(crate)` and live here at the +//! `yuv444` module root so siblings can reach them via `super::*`. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const +/// generic over `BITS ∈ {10, 12, 14}`. 
Dispatches to the best +/// available backend for the current target (NEON / SSE4.1 / AVX2 / +/// AVX-512 / wasm simd128), falling back to scalar when no SIMD +/// backend is available or `use_simd` is false. +/// +/// Crate-private — external callers use the concrete +/// [`yuv444p10_to_rgb_row`] / [`yuv444p12_to_rgb_row`] / +/// [`yuv444p14_to_rgb_row`] wrappers, which pin `BITS` to a +/// supported value. This avoids the 16-bit footgun (`(1 << 16) - 1` +/// truncates to `-1` when cast to `i16` in the SIMD clamp), and +/// matches the [`yuv420p10_to_rgb_row`] family's convention of +/// keeping the `` generic internal. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn yuv_444p_n_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar 10/12/14-bit → **native-depth u16** RGB dispatcher. +/// Const generic over `BITS ∈ {10, 12, 14}`. Low-bit-packed output. +/// Dispatches to the best available backend (NEON / SSE4.1 / AVX2 / +/// AVX-512 / wasm simd128), falling back to scalar when no SIMD +/// backend is available or `use_simd` is false. +/// +/// Crate-private — see the note on [`yuv_444p_n_to_rgb_row`]. The +/// 16-bit path is [`yuv444p16_to_rgb_u16_row`], which uses a +/// dedicated i64-chroma kernel family. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn yuv_444p_n_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +pub(super) mod yuv444p10; +pub(super) mod yuv444p12; +pub(super) mod yuv444p14; +pub(super) mod yuv444p16; +pub(super) mod yuv444p9; +pub(super) mod yuv_444; + +pub use yuv444p10::*; +pub use yuv444p12::*; +pub use yuv444p14::*; +pub use yuv444p16::*; +pub use yuv444p9::*; +pub use yuv_444::*; diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs new file mode 100644 index 00000000..770f286e --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -0,0 +1,193 @@ +//! 10-bit planar YUV 4:4:4 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; + + +/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the +/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + + +/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha +/// element is `1023`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs new file mode 100644 index 00000000..15edca7c --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -0,0 +1,192 @@ +//! 12-bit planar YUV 4:4:4 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; + + +/// YUV 4:4:4 planar 12-bit → u8 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + + +/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha +/// element is `4095`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs new file mode 100644 index 00000000..50f39021 --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -0,0 +1,192 @@ +//! 14-bit planar YUV 4:4:4 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; + + +/// YUV 4:4:4 planar 14-bit → u8 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + + +/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha +/// element is `16383`. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs new file mode 100644 index 00000000..adfe2c35 --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -0,0 +1,304 @@ +//! 16-bit planar YUV 4:4:4 dispatchers — 4 variants. The BITS-generic +//! 
helpers in `super::*` are pinned to {9,10,12,14}, so 16-bit gets +//! its own dedicated dispatchers (i64 chroma at native u16 output). + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the +/// parallel 16-bit kernel family (same Q15 i32 output-range pipeline +/// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range +/// output in `[0, 65535]`). Widens chroma multiply-add + Y scale to +/// i64 to avoid i32 overflow at 16-bit limited range. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit +/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar +/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs new file mode 100644 index 00000000..2cff1f05 --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -0,0 +1,209 @@ +//! 9-bit planar YUV 4:4:4 dispatchers — 4 variants. The RGB / RGB-u16 +//! paths are thin wrappers over the BITS-generic helpers in +//! `super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}`; the +//! RGBA / RGBA-u16 paths are full dispatchers (the BITS-generic +//! template doesn't apply for the alpha-fill case). + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; + + +/// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the +/// crate-internal `yuv_444p_n_to_rgb_row::<9>`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces +// the scalar reference path on every dispatcher. + +/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the +/// per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_444p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv_444.rs b/src/row/dispatch/yuv444/yuv_444.rs new file mode 100644 index 00000000..8f4352a2 --- /dev/null +++ b/src/row/dispatch/yuv444/yuv_444.rs @@ -0,0 +1,159 @@ +//! 8-bit YUV 4:4:4 → RGB / RGBA dispatchers (`yuv_444_to_rgb_row`, +//! `yuv_444_to_rgba_row`). Extracted from the parent +//! `dispatch::yuv444` module per source format for organization. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches +/// to the best available SIMD backend for the current target. 
+/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference +/// is 4:4:4 chroma — one U / V pair per Y pixel, full-width chroma +/// planes, no chroma upsampling, no width parity constraint. See +/// `scalar::yuv_444_to_rgb_row` for the reference implementation. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgb_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX-512BW verified. + unsafe { + arch::x86_avx512::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); +} + +/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). +/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. +/// `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} From c1a07310f3727cac298053602090ce728b6e17b4 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:44:43 +1200 Subject: [PATCH 4/6] finish scalar impl for yuv420p --- src/row/arch/x86_avx2.rs | 3 +-- src/row/arch/x86_avx512/tests.rs | 14 ++++++++++++-- src/row/arch/x86_sse41.rs | 8 +++++++- src/row/arch/x86_sse41/tests.rs | 14 ++++++++++++-- src/row/dispatch/bayer.rs | 3 +-- src/row/dispatch/nv.rs | 11 ++++++----- src/row/dispatch/pn.rs | 14 ++++++++------ src/row/dispatch/rgb_ops.rs | 7 +++---- src/row/dispatch/yuv420/mod.rs | 4 ++-- src/row/dispatch/yuv420/p010.rs | 13 ++++++------- src/row/dispatch/yuv420/p012.rs | 13 ++++++------- src/row/dispatch/yuv420/p016.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p10.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p12.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p14.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p16.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p9.rs | 12 ++++++------ src/row/dispatch/yuv420/yuv_420.rs | 12 ++++++------ 
src/row/dispatch/yuv444/mod.rs | 16 ++++++++-------- src/row/dispatch/yuv444/yuv444p10.rs | 13 ++++++------- src/row/dispatch/yuv444/yuv444p12.rs | 13 ++++++------- src/row/dispatch/yuv444/yuv444p14.rs | 13 ++++++------- src/row/dispatch/yuv444/yuv444p16.rs | 13 ++++++------- src/row/dispatch/yuv444/yuv444p9.rs | 12 ++++++------ src/row/dispatch/yuv444/yuv_444.rs | 12 ++++++------ src/row/dispatch/yuva.rs | 11 ++++++----- src/row/mod.rs | 10 ++-------- 27 files changed, 158 insertions(+), 148 deletions(-) diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 425609b3..e106b2c5 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -4029,8 +4029,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row(a_vec), diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index b3d6af0e..a2fc56e6 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -3225,8 +3225,18 @@ fn avx512_yuva420p_n_rgba_u16_matches_scalar_widths() { return; } for w in [64usize, 66, 78, 94, 1920, 1922] { - check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); - check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Bt601, + false, + 89, + ); + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); } } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index a8935652..a69ea18c 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -3555,7 +3555,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row(w, ColorMatrix::Bt601, false, 89); - check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Bt601, + false, + 89, + 
); + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); } } diff --git a/src/row/dispatch/bayer.rs b/src/row/dispatch/bayer.rs index 4f45857f..9af7b199 100644 --- a/src/row/dispatch/bayer.rs +++ b/src/row/dispatch/bayer.rs @@ -6,8 +6,7 @@ //! parameter is wired through so callers don't have to touch their //! call sites when SIMD lands. -use crate::row::scalar; -use crate::row::{assert_color_transform_well_formed, rgb_row_bytes, rgb_row_elems}; +use crate::row::{assert_color_transform_well_formed, rgb_row_bytes, rgb_row_elems, scalar}; /// Converts one row of an 8-bit Bayer plane to packed RGB. /// diff --git a/src/row/dispatch/nv.rs b/src/row/dispatch/nv.rs index b342e6e4..236b1401 100644 --- a/src/row/dispatch/nv.rs +++ b/src/row/dispatch/nv.rs @@ -1,15 +1,16 @@ //! NV-family dispatchers (NV12 / NV21 / NV24 / NV42, both RGB and //! RGBA outputs) extracted from `row::mod` for organization. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, +}; /// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. /// diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs index f2a143c4..19df9ed8 100644 --- a/src/row/dispatch/pn.rs +++ b/src/row/dispatch/pn.rs @@ -11,15 +11,18 @@ //! since they share the 4:2:0 chroma layout with the planar //! yuv420p9/10/12/14/16 family. 
-use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, uv_full_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{ + arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems, + }, +}; // ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- // @@ -793,4 +796,3 @@ pub fn p416_to_rgba_u16_row( scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); } - diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index c51257d8..24b3a087 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -2,14 +2,13 @@ //! organization. All three route through the standard //! `cfg_select!` per-arch block; `use_simd = false` forces scalar. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; +use crate::row::{arch, rgb_row_bytes, scalar}; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. 
diff --git a/src/row/dispatch/yuv420/mod.rs b/src/row/dispatch/yuv420/mod.rs index 972210e7..57727688 100644 --- a/src/row/dispatch/yuv420/mod.rs +++ b/src/row/dispatch/yuv420/mod.rs @@ -23,9 +23,9 @@ pub(super) mod yuv_420; pub use p010::*; pub use p012::*; pub use p016::*; +pub use yuv_420::*; +pub use yuv420p9::*; pub use yuv420p10::*; pub use yuv420p12::*; pub use yuv420p14::*; pub use yuv420p16::*; -pub use yuv420p9::*; -pub use yuv_420::*; diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs index 35f9e548..67ad9c95 100644 --- a/src/row/dispatch/yuv420/p010.rs +++ b/src/row/dispatch/yuv420/p010.rs @@ -1,16 +1,16 @@ //! P010 (semi-planar 4:2:0, 10-bit high-packed) dispatchers — 4 //! variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ /// packed — 10 active bits in the high 10 of each `u16`) to packed @@ -164,7 +164,6 @@ pub fn p010_to_rgb_u16_row( scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs index 618bc8f6..c3058425 100644 --- a/src/row/dispatch/yuv420/p012.rs +++ b/src/row/dispatch/yuv420/p012.rs @@ -1,16 +1,16 @@ //! 
P012 (semi-planar 4:2:0, 12-bit high-packed) dispatchers — 4 //! variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ /// packed — 12 active bits in the high 12 of each `u16`) to packed @@ -148,7 +148,6 @@ pub fn p012_to_rgb_u16_row( scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs index 128aaf17..765cf596 100644 --- a/src/row/dispatch/yuv420/p016.rs +++ b/src/row/dispatch/yuv420/p016.rs @@ -1,15 +1,15 @@ //! P016 (semi-planar 4:2:0, 16-bit) dispatchers — 4 variants. 
-use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to /// packed **8-bit** RGB. At 16 bits there is no high-bit-packed @@ -140,7 +140,6 @@ pub fn p016_to_rgb_u16_row( scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit /// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. /// diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 27f7a1ff..8083d5a5 100644 --- a/src/row/dispatch/yuv420/yuv420p10.rs +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -1,15 +1,15 @@ //! 10-bit planar YUV 4:2:0 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. 
/// @@ -195,7 +195,6 @@ pub fn yuv420p10_to_rgb_u16_row( scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the /// source has no alpha plane). diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index 9d250c9a..761b51c0 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -1,15 +1,15 @@ //! 12-bit planar YUV 4:2:0 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. /// @@ -171,7 +171,6 @@ pub fn yuv420p12_to_rgb_u16_row( scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the /// source has no alpha plane). diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index a1c8024f..f9fad7af 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -1,15 +1,15 @@ //! 14-bit planar YUV 4:2:0 dispatchers — 4 variants. 
-use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. #[cfg_attr(not(tarpaulin), inline(always))] @@ -160,7 +160,6 @@ pub fn yuv420p14_to_rgb_u16_row( scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the /// source has no alpha plane). diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index 7b324e7d..b248ce95 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -1,15 +1,15 @@ //! 16-bit planar YUV 4:2:0 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. 
/// @@ -146,7 +146,6 @@ pub fn yuv420p16_to_rgb_u16_row( scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). /// diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index c28da34a..69cfb983 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -1,16 +1,16 @@ //! 9-bit planar YUV 4:2:0 dispatchers — 4 variants (RGB, RGB-u16, //! RGBA, RGBA-u16). -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. /// diff --git a/src/row/dispatch/yuv420/yuv_420.rs b/src/row/dispatch/yuv420/yuv_420.rs index 6428c008..19a89147 100644 --- a/src/row/dispatch/yuv420/yuv_420.rs +++ b/src/row/dispatch/yuv420/yuv_420.rs @@ -2,16 +2,16 @@ //! `yuv_420_to_rgba_row`). Extracted from the parent `dispatch::yuv420` //! module per source format for organization. 
-use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, +}; /// Converts one row of 4:2:0 YUV to packed RGB. /// diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs index 4db06906..8d7778d9 100644 --- a/src/row/dispatch/yuv444/mod.rs +++ b/src/row/dispatch/yuv444/mod.rs @@ -13,16 +13,16 @@ //! RGB wrappers above. They stay `pub(crate)` and live here at the //! `yuv444` module root so siblings can reach them via `super::*`. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, scalar}, +}; /// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const /// generic over `BITS ∈ {10, 12, 14}`. 
Dispatches to the best @@ -189,9 +189,9 @@ pub(super) mod yuv444p16; pub(super) mod yuv444p9; pub(super) mod yuv_444; +pub use yuv_444::*; +pub use yuv444p9::*; pub use yuv444p10::*; pub use yuv444p12::*; pub use yuv444p14::*; pub use yuv444p16::*; -pub use yuv444p9::*; -pub use yuv_444::*; diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs index 770f286e..118bb23d 100644 --- a/src/row/dispatch/yuv444/yuv444p10.rs +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -1,18 +1,18 @@ //! 10-bit planar YUV 4:4:4 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; - /// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the /// crate-internal `yuv_444p_n_to_rgb_row::<10>`. #[cfg_attr(not(tarpaulin), inline(always))] @@ -46,7 +46,6 @@ pub fn yuv444p10_to_rgb_u16_row( yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); } - /// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). /// diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs index 15edca7c..6c1d5787 100644 --- a/src/row/dispatch/yuv444/yuv444p12.rs +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -1,18 +1,18 @@ //! 12-bit planar YUV 4:4:4 dispatchers — 4 variants. 
-use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; - /// YUV 4:4:4 planar 12-bit → u8 RGB. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -45,7 +45,6 @@ pub fn yuv444p12_to_rgb_u16_row( yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); } - /// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). /// diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs index 50f39021..0ffa3912 100644 --- a/src/row/dispatch/yuv444/yuv444p14.rs +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -1,18 +1,18 @@ //! 14-bit planar YUV 4:4:4 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; - /// YUV 4:4:4 planar 14-bit → u8 RGB. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -45,7 +45,6 @@ pub fn yuv444p14_to_rgb_u16_row( yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); } - /// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). /// diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs index adfe2c35..0352eb74 100644 --- a/src/row/dispatch/yuv444/yuv444p16.rs +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -2,16 +2,16 @@ //! helpers in `super::*` are pinned to {9,10,12,14}, so 16-bit gets //! its own dedicated dispatchers (i64 chroma at native u16 output). -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the /// parallel 16-bit kernel family (same Q15 i32 output-range pipeline @@ -155,7 +155,6 @@ pub fn yuv444p16_to_rgb_u16_row( scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); } - /// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit /// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs index 2cff1f05..e0f02e16 100644 --- a/src/row/dispatch/yuv444/yuv444p9.rs +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -4,19 +4,19 @@ //! 
RGBA / RGBA-u16 paths are full dispatchers (the BITS-generic //! template doesn't apply for the alpha-fill case). -use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; - /// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the /// crate-internal `yuv_444p_n_to_rgb_row::<9>`. #[cfg_attr(not(tarpaulin), inline(always))] diff --git a/src/row/dispatch/yuv444/yuv_444.rs b/src/row/dispatch/yuv444/yuv_444.rs index 8f4352a2..625ab38f 100644 --- a/src/row/dispatch/yuv444/yuv_444.rs +++ b/src/row/dispatch/yuv444/yuv_444.rs @@ -2,16 +2,16 @@ //! `yuv_444_to_rgba_row`). Extracted from the parent //! `dispatch::yuv444` module per source format for organization. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, +}; /// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches /// to the best available SIMD backend for the current target. 
diff --git a/src/row/dispatch/yuva.rs b/src/row/dispatch/yuva.rs index 90399881..ac2cef2e 100644 --- a/src/row/dispatch/yuva.rs +++ b/src/row/dispatch/yuva.rs @@ -3,15 +3,16 @@ //! RGBA and native-depth `u16` RGBA outputs. Extracted from //! `row::mod` for organization. -use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; // ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- // diff --git a/src/row/mod.rs b/src/row/mod.rs index 97704767..299e7e25 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -41,8 +41,8 @@ //! see no API change from the split. pub(crate) mod arch; -pub(crate) mod scalar; mod dispatch; +pub(crate) mod scalar; // Re-exported only when a caller is compiled. 
The `MixedSinker` Strategy A // fan-out is the sole consumer, and it lives in `crate::sinker::mixed` which @@ -54,13 +54,7 @@ pub(crate) use scalar::expand_rgb_to_rgba_row; #[cfg(any(feature = "std", feature = "alloc"))] pub(crate) use scalar::expand_rgb_u16_to_rgba_u16_row; -pub use dispatch::bayer::*; -pub use dispatch::nv::*; -pub use dispatch::pn::*; -pub use dispatch::rgb_ops::*; -pub use dispatch::yuv420::*; -pub use dispatch::yuv444::*; -pub use dispatch::yuva::*; +pub use dispatch::{bayer::*, nv::*, pn::*, rgb_ops::*, yuv420::*, yuv444::*, yuva::*}; // `yuv_444p_n_to_rgb_u16_row` is consumed by the 32-bit overflow test // `yuv_444p_n_u16_dispatcher_rejects_width_times_3_overflow` below — From b8b5aea85352b5e3a9cb565076d0d53aa1b1d70b Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:56:33 +1200 Subject: [PATCH 5/6] fix(row/dispatch): gate `arch` import on supported SIMD targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-format split landed `use crate::row::arch;` (folded into the `row::{arch, ...}` import group) in every dispatch sub-file. On targets without a per-arch SIMD backend — i686, powerpc64, riscv64, s390x, etc. — the `cfg_select!` body falls through to the scalar path, every `arch::*` reference is gated out, and clippy's `-D warnings` flag promotes the resulting `unused_imports` to a hard error. CI fails: `miri-tb-i686`, `miri-sb-powerpc64`, `cross (i686-linux-android)`. Fix: lift `arch` out of the bundled `row::{...}` import block in each dispatch file and re-import it under `#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))]`. The three-target gate matches the set that has a SIMD backend in `crate::row::arch::*`. Tested via `RUSTFLAGS=-Dwarnings cargo check --target i686-unknown-linux-gnu --lib` (now clean) plus the host aarch64 / x86_64-freebsd / wasm32 suites still passing 629 tests. 
Touches every dispatch file that imports `arch`: bayer.rs is intentionally untouched (the Bayer dispatchers are still scalar-only and never reference `arch::*`). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/nv.rs | 4 +++- src/row/dispatch/pn.rs | 4 +++- src/row/dispatch/rgb_ops.rs | 4 +++- src/row/dispatch/yuv420/p010.rs | 4 +++- src/row/dispatch/yuv420/p012.rs | 4 +++- src/row/dispatch/yuv420/p016.rs | 4 +++- src/row/dispatch/yuv420/yuv420p10.rs | 4 +++- src/row/dispatch/yuv420/yuv420p12.rs | 4 +++- src/row/dispatch/yuv420/yuv420p14.rs | 4 +++- src/row/dispatch/yuv420/yuv420p16.rs | 4 +++- src/row/dispatch/yuv420/yuv420p9.rs | 4 +++- src/row/dispatch/yuv420/yuv_420.rs | 4 +++- src/row/dispatch/yuv444/mod.rs | 4 +++- src/row/dispatch/yuv444/yuv444p10.rs | 4 +++- src/row/dispatch/yuv444/yuv444p12.rs | 4 +++- src/row/dispatch/yuv444/yuv444p14.rs | 4 +++- src/row/dispatch/yuv444/yuv444p16.rs | 4 +++- src/row/dispatch/yuv444/yuv444p9.rs | 4 +++- src/row/dispatch/yuv444/yuv_444.rs | 4 +++- src/row/dispatch/yuva.rs | 4 +++- 20 files changed, 60 insertions(+), 20 deletions(-) diff --git a/src/row/dispatch/nv.rs b/src/row/dispatch/nv.rs index 236b1401..cec38348 100644 --- a/src/row/dispatch/nv.rs +++ b/src/row/dispatch/nv.rs @@ -1,6 +1,8 @@ //! NV-family dispatchers (NV12 / NV21 / NV24 / NV42, both RGB and //! RGBA outputs) extracted from `row::mod` for organization. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -9,7 +11,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, + row::{rgb_row_bytes, rgba_row_bytes, scalar}, }; /// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. 
diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs index 19df9ed8..c0c72363 100644 --- a/src/row/dispatch/pn.rs +++ b/src/row/dispatch/pn.rs @@ -11,6 +11,8 @@ //! since they share the 4:2:0 chroma layout with the planar //! yuv420p9/10/12/14/16 family. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -20,7 +22,7 @@ use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, row::{ - arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems, + rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems, }, }; diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index 24b3a087..86ccd52e 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -2,11 +2,13 @@ //! organization. All three route through the standard //! `cfg_select!` per-arch block; `use_simd = false` forces scalar. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::row::{arch, rgb_row_bytes, scalar}; +use crate::row::{rgb_row_bytes, scalar}; #[cfg(target_arch = "x86_64")] use crate::row::{avx2_available, avx512_available, sse41_available}; diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs index 67ad9c95..46bfbb45 100644 --- a/src/row/dispatch/yuv420/p010.rs +++ b/src/row/dispatch/yuv420/p010.rs @@ -1,6 +1,8 @@ //! P010 (semi-planar 4:2:0, 10-bit high-packed) dispatchers — 4 //! variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -9,7 +11,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs index c3058425..8b231627 100644 --- a/src/row/dispatch/yuv420/p012.rs +++ b/src/row/dispatch/yuv420/p012.rs @@ -1,6 +1,8 @@ //! P012 (semi-planar 4:2:0, 12-bit high-packed) dispatchers — 4 //! variants. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -9,7 +11,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs index 765cf596..049f9d3a 100644 --- a/src/row/dispatch/yuv420/p016.rs +++ b/src/row/dispatch/yuv420/p016.rs @@ -1,5 +1,7 @@ //! P016 (semi-planar 4:2:0, 16-bit) dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 8083d5a5..3a24eacb 100644 --- a/src/row/dispatch/yuv420/yuv420p10.rs +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -1,5 +1,7 @@ //! 10-bit planar YUV 4:2:0 dispatchers — 4 variants. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index 761b51c0..5fc011fa 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -1,5 +1,7 @@ //! 12-bit planar YUV 4:2:0 dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index f9fad7af..7097b181 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -1,5 +1,7 @@ //! 14-bit planar YUV 4:2:0 dispatchers — 4 variants. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index b248ce95..c5967784 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -1,5 +1,7 @@ //! 16-bit planar YUV 4:2:0 dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index 69cfb983..9a48e31e 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -1,6 +1,8 @@ //! 9-bit planar YUV 4:2:0 dispatchers — 4 variants (RGB, RGB-u16, //! RGBA, RGBA-u16). +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -9,7 +11,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv_420.rs b/src/row/dispatch/yuv420/yuv_420.rs index 19a89147..7aafbdf7 100644 --- a/src/row/dispatch/yuv420/yuv_420.rs +++ b/src/row/dispatch/yuv420/yuv_420.rs @@ -2,6 +2,8 @@ //! `yuv_420_to_rgba_row`). Extracted from the parent `dispatch::yuv420` //! module per source format for organization. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -10,7 +12,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, + row::{rgb_row_bytes, rgba_row_bytes, scalar}, }; /// Converts one row of 4:2:0 YUV to packed RGB. diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs index 8d7778d9..fe27cc55 100644 --- a/src/row/dispatch/yuv444/mod.rs +++ b/src/row/dispatch/yuv444/mod.rs @@ -13,6 +13,8 @@ //! RGB wrappers above. They stay `pub(crate)` and live here at the //! `yuv444` module root so siblings can reach them via `super::*`. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -21,7 +23,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, scalar}, }; /// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs index 118bb23d..21a8f1c9 100644 --- a/src/row/dispatch/yuv444/yuv444p10.rs +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -1,5 +1,7 @@ //! 10-bit planar YUV 4:4:4 dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs index 6c1d5787..7ecc7066 100644 --- a/src/row/dispatch/yuv444/yuv444p12.rs +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -1,5 +1,7 @@ //! 12-bit planar YUV 4:4:4 dispatchers — 4 variants. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs index 0ffa3912..8d6ea884 100644 --- a/src/row/dispatch/yuv444/yuv444p14.rs +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -1,5 +1,7 @@ //! 14-bit planar YUV 4:4:4 dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs index 0352eb74..bed355b7 100644 --- a/src/row/dispatch/yuv444/yuv444p16.rs +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -2,6 +2,8 @@ //! helpers in `super::*` are pinned to {9,10,12,14}, so 16-bit gets //! its own dedicated dispatchers (i64 chroma at native u16 output). +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -10,7 +12,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs index e0f02e16..d4ff1b87 100644 --- a/src/row/dispatch/yuv444/yuv444p9.rs +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -4,6 +4,8 @@ //! RGBA / RGBA-u16 paths are full dispatchers (the BITS-generic //! template doesn't apply for the alpha-fill case). 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -12,7 +14,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; diff --git a/src/row/dispatch/yuv444/yuv_444.rs b/src/row/dispatch/yuv444/yuv_444.rs index 625ab38f..b4cc6298 100644 --- a/src/row/dispatch/yuv444/yuv_444.rs +++ b/src/row/dispatch/yuv444/yuv_444.rs @@ -2,6 +2,8 @@ //! `yuv_444_to_rgba_row`). Extracted from the parent //! `dispatch::yuv444` module per source format for organization. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -10,7 +12,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, + row::{rgb_row_bytes, rgba_row_bytes, scalar}, }; /// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches diff --git a/src/row/dispatch/yuva.rs b/src/row/dispatch/yuva.rs index ac2cef2e..06bf96d0 100644 --- a/src/row/dispatch/yuva.rs +++ b/src/row/dispatch/yuva.rs @@ -3,6 +3,8 @@ //! RGBA and native-depth `u16` RGBA outputs. Extracted from //! `row::mod` for organization. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -11,7 +13,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; // ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- From 56be621efb99f5610025613c85668e0c47779246 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:57:28 +1200 Subject: [PATCH 6/6] finish scalar impl for yuv420p --- .github/workflows/ci.yml | 2 +- src/row/dispatch/nv.rs | 6 +++++- src/row/dispatch/pn.rs | 10 ++++++---- src/row/dispatch/rgb_ops.rs | 8 ++++++-- src/row/dispatch/yuv420/p010.rs | 6 +++++- src/row/dispatch/yuv420/p012.rs | 6 +++++- src/row/dispatch/yuv420/p016.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p10.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p12.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p14.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p16.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p9.rs | 6 +++++- src/row/dispatch/yuv420/yuv_420.rs | 6 +++++- src/row/dispatch/yuv444/mod.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p10.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p12.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p14.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p16.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p9.rs | 6 +++++- src/row/dispatch/yuv444/yuv_444.rs | 6 +++++- src/row/dispatch/yuva.rs | 6 +++++- 21 files changed, 103 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fdf5548c..0b77ea04 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -186,7 +186,7 @@ jobs: - name: Install Rust run: rustup update stable --no-self-update && rustup default 
stable - name: Install Intel SDE - uses: petarpetrovt/setup-sde@v3.0 + uses: petarpetrovt/setup-sde@v4.0 with: sdeVersion: 9.33.0 environmentVariableName: SDE_PATH diff --git a/src/row/dispatch/nv.rs b/src/row/dispatch/nv.rs index cec38348..2ec2f153 100644 --- a/src/row/dispatch/nv.rs +++ b/src/row/dispatch/nv.rs @@ -1,7 +1,11 @@ //! NV-family dispatchers (NV12 / NV21 / NV24 / NV42, both RGB and //! RGBA outputs) extracted from `row::mod` for organization. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs index c0c72363..534bfd64 100644 --- a/src/row/dispatch/pn.rs +++ b/src/row/dispatch/pn.rs @@ -11,7 +11,11 @@ //! since they share the 4:2:0 chroma layout with the planar //! yuv420p9/10/12/14/16 family. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; @@ -21,9 +25,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{ - rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems, - }, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems}, }; // ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index 86ccd52e..0c98bc35 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -2,15 +2,19 @@ //! organization. All three route through the standard //! 
`cfg_select!` per-arch block; `use_simd = false` forces scalar. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::row::{rgb_row_bytes, scalar}; #[cfg(target_arch = "x86_64")] use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::row::{rgb_row_bytes, scalar}; /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs index 46bfbb45..ba9d95b8 100644 --- a/src/row/dispatch/yuv420/p010.rs +++ b/src/row/dispatch/yuv420/p010.rs @@ -1,7 +1,11 @@ //! P010 (semi-planar 4:2:0, 10-bit high-packed) dispatchers — 4 //! variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs index 8b231627..ef1c1301 100644 --- a/src/row/dispatch/yuv420/p012.rs +++ b/src/row/dispatch/yuv420/p012.rs @@ -1,7 +1,11 @@ //! P012 (semi-planar 4:2:0, 12-bit high-packed) dispatchers — 4 //! variants. 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs index 049f9d3a..abdf59d1 100644 --- a/src/row/dispatch/yuv420/p016.rs +++ b/src/row/dispatch/yuv420/p016.rs @@ -1,6 +1,10 @@ //! P016 (semi-planar 4:2:0, 16-bit) dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 3a24eacb..349d0623 100644 --- a/src/row/dispatch/yuv420/yuv420p10.rs +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -1,6 +1,10 @@ //! 10-bit planar YUV 4:2:0 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index 5fc011fa..3b503b74 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -1,6 +1,10 @@ //! 12-bit planar YUV 4:2:0 dispatchers — 4 variants. 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index 7097b181..50427e59 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -1,6 +1,10 @@ //! 14-bit planar YUV 4:2:0 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index c5967784..c681c48b 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -1,6 +1,10 @@ //! 16-bit planar YUV 4:2:0 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index 9a48e31e..09cb0156 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -1,7 +1,11 @@ //! 9-bit planar YUV 4:2:0 dispatchers — 4 variants (RGB, RGB-u16, //! RGBA, RGBA-u16). 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv_420.rs b/src/row/dispatch/yuv420/yuv_420.rs index 7aafbdf7..80bdea23 100644 --- a/src/row/dispatch/yuv420/yuv_420.rs +++ b/src/row/dispatch/yuv420/yuv_420.rs @@ -2,7 +2,11 @@ //! `yuv_420_to_rgba_row`). Extracted from the parent `dispatch::yuv420` //! module per source format for organization. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs index fe27cc55..01ca3861 100644 --- a/src/row/dispatch/yuv444/mod.rs +++ b/src/row/dispatch/yuv444/mod.rs @@ -13,7 +13,11 @@ //! RGB wrappers above. They stay `pub(crate)` and live here at the //! `yuv444` module root so siblings can reach them via `super::*`. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs index 21a8f1c9..b6836e8e 100644 --- a/src/row/dispatch/yuv444/yuv444p10.rs +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -1,6 +1,10 @@ //! 10-bit planar YUV 4:4:4 dispatchers — 4 variants. 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs index 7ecc7066..c4f3e0f4 100644 --- a/src/row/dispatch/yuv444/yuv444p12.rs +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -1,6 +1,10 @@ //! 12-bit planar YUV 4:4:4 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs index 8d6ea884..8b7b7e7b 100644 --- a/src/row/dispatch/yuv444/yuv444p14.rs +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -1,6 +1,10 @@ //! 14-bit planar YUV 4:4:4 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs index bed355b7..87d69fc9 100644 --- a/src/row/dispatch/yuv444/yuv444p16.rs +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -2,7 +2,11 @@ //! helpers in `super::*` are pinned to {9,10,12,14}, so 16-bit gets //! its own dedicated dispatchers (i64 chroma at native u16 output). 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs index d4ff1b87..784ed036 100644 --- a/src/row/dispatch/yuv444/yuv444p9.rs +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -4,7 +4,11 @@ //! RGBA / RGBA-u16 paths are full dispatchers (the BITS-generic //! template doesn't apply for the alpha-fill case). -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv_444.rs b/src/row/dispatch/yuv444/yuv_444.rs index b4cc6298..25174964 100644 --- a/src/row/dispatch/yuv444/yuv_444.rs +++ b/src/row/dispatch/yuv444/yuv_444.rs @@ -2,7 +2,11 @@ //! `yuv_444_to_rgba_row`). Extracted from the parent //! `dispatch::yuv444` module per source format for organization. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuva.rs b/src/row/dispatch/yuva.rs index 06bf96d0..34477473 100644 --- a/src/row/dispatch/yuva.rs +++ b/src/row/dispatch/yuva.rs @@ -3,7 +3,11 @@ //! RGBA and native-depth `u16` RGBA outputs. Extracted from //! `row::mod` for organization. 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available;