diff --git a/src/row/arch/neon/tests/v210.rs b/src/row/arch/neon/tests/v210.rs
index b82bdab4..be979537 100644
--- a/src/row/arch/neon/tests/v210.rs
+++ b/src/row/arch/neon/tests/v210.rs
@@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::v210_to_rgb_or_rgba_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::v210_to_rgb_or_rgba_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -82,9 +82,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::v210_to_luma_row(&p, &mut s, width);
+  scalar::v210_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_row(&p, &mut k, width);
+    v210_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "NEON v210→luma diverges (width={width})");
 }
@@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::v210_to_luma_u16_row(&p, &mut s, width);
+  scalar::v210_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_u16_row(&p, &mut k, width);
+    v210_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "NEON v210→luma u16 diverges (width={width})");
 }
@@ -213,7 +213,7 @@ fn neon_v210_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order (u16, no shift loss)
   let mut luma = std::vec![0u16; W];
   unsafe {
-    v210_to_luma_u16_row(&packed, &mut luma, W);
+    v210_to_luma_u16_row::<false>(&packed, &mut luma, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(luma, expected_luma, "neon v210 luma reorder bug");
@@ -222,9 +222,15 @@ fn neon_v210_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u8; W * 3];
   let mut scalar_rgb = std::vec![0u8; W * 3];
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false);
+    v210_to_rgb_or_rgba_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      crate::ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::v210_to_rgb_or_rgba_row::<false>(
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/neon/tests/y216.rs b/src/row/arch/neon/tests/y216.rs
index 8d379a2d..1a982f4d 100644
--- a/src/row/arch/neon/tests/y216.rs
+++ b/src/row/arch/neon/tests/y216.rs
@@ -15,9 +15,9 @@ fn check_rgb<const ALPHA: bool>(width: usize, matrix: ColorMatrix, full_range: b
   let bpp = if ALPHA { 4 } else { 3 };
   let mut s = std::vec![0u8; width * bpp];
   let mut k = std::vec![0u8; width * bpp];
-  scalar::y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s,
@@ -32,9 +32,9 @@ fn check_rgb_u16<const ALPHA: bool>(width: usize, matrix: ColorMatrix, full_rang
   let bpp = if ALPHA { 4 } else { 3 };
   let mut s = std::vec![0u16; width * bpp];
   let mut k = std::vec![0u16; width * bpp];
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s,
@@ -48,9 +48,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y216_to_luma_row(&p, &mut s, width);
+  scalar::y216_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_row(&p, &mut k, width);
+    y216_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "NEON y216→luma diverges (width={width})");
 }
@@ -59,9 +59,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y216_to_luma_u16_row(&p, &mut s, width);
+  scalar::y216_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_u16_row(&p, &mut k, width);
+    y216_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "NEON y216→luma u16 diverges (width={width})");
 }
@@ -142,7 +142,7 @@ fn neon_y216_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order at u16
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y216_to_luma_u16_row(&packed, &mut luma_u16, W);
+    y216_to_luma_u16_row::<false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(luma_u16, expected_luma, "NEON y216 luma_u16 reorder bug");
@@ -151,9 +151,15 @@ fn neon_y216_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<false>(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false);
+    y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false>(
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/neon/tests/y2xx.rs b/src/row/arch/neon/tests/y2xx.rs
index 892e0d14..d12a51d4 100644
--- a/src/row/arch/neon/tests/y2xx.rs
+++ b/src/row/arch/neon/tests/y2xx.rs
@@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits).
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&packed, &mut luma_u16, W);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(
@@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
       &packed,
       &mut simd_rgb,
       W,
@@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
       false,
     );
   }
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
     &packed,
     &mut scalar_rgb,
     W,
@@ -95,9 +95,9 @@ fn check_rgb<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: boo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -109,9 +109,9 @@ fn check_rgba<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -123,9 +123,11 @@ fn check_rgb_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range:
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -137,9 +139,11 @@ fn check_rgba_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -151,9 +155,9 @@ fn check_luma<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y2xx_n_to_luma_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "NEON y2xx<{BITS}>→luma diverges (width={width})");
 }
@@ -162,9 +166,9 @@ fn check_luma_u16<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y2xx_n_to_luma_u16_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "NEON y2xx<{BITS}>→luma u16 diverges (width={width})");
 }
@@ -225,15 +229,15 @@ fn neon_y212_matches_scalar_widths() {
     let p = pseudo_random_y212(w, 0xAA55);
     let mut s = std::vec![0u8; w * 3];
     let mut k = std::vec![0u8; w * 3];
-    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
+    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
     unsafe {
-      y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
+      y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
     }
     assert_eq!(s, k, "NEON y2xx<12>→RGB diverges (width={w})");
 
     let mut s_u16 = std::vec![0u16; w * 4];
     let mut k_u16 = std::vec![0u16; w * 4];
-    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
       &p,
       &mut s_u16,
       w,
@@ -241,7 +245,7 @@ fn neon_y212_matches_scalar_widths() {
       true,
     );
     unsafe {
-      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
         &p,
         &mut k_u16,
         w,
@@ -253,17 +257,17 @@ fn neon_y212_matches_scalar_widths() {
 
     let mut sl = std::vec![0u8; w];
     let mut kl = std::vec![0u8; w];
-    scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w);
+    scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w);
     unsafe {
-      y2xx_n_to_luma_row::<12>(&p, &mut kl, w);
+      y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w);
     }
     assert_eq!(sl, kl, "NEON y2xx<12>→luma diverges (width={w})");
 
     let mut slu = std::vec![0u16; w];
     let mut klu = std::vec![0u16; w];
-    scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w);
+    scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w);
     unsafe {
-      y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w);
+      y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w);
     }
     assert_eq!(slu, klu, "NEON y2xx<12>→luma u16 diverges (width={w})");
   }
diff --git a/src/row/arch/neon/v210.rs b/src/row/arch/neon/v210.rs
index 0d9f9748..ba406d7a 100644
--- a/src/row/arch/neon/v210.rs
+++ b/src/row/arch/neon/v210.rs
@@ -18,34 +18,9 @@
 
 use core::arch::aarch64::*;
 
-use super::*;
+use super::{endian::load_endian_u32x4, *};
 use crate::{ColorMatrix, row::scalar};
 
-/// Loads 16 bytes as 4 × `u32` in **little-endian** order regardless
-/// of host endianness. v210 words are documented LE; on big-endian
-/// aarch64 (rare — `aarch64_be-*` custom targets) the plain
-/// `vld1q_u32` would put bytes in reversed positions within each
-/// lane and corrupt every subsequent shift-and-mask. Mirrors the
-/// `x2_load_le_u32x4` helper in `packed_rgb.rs` (X2RGB10 / X2BGR10
-/// share the same LE-word constraint). Defining a local helper
-/// avoids cross-file visibility hassle since `x2_load_le_u32x4` is
-/// `pub(super) fn` but not re-exported via the mod's glob.
-///
-/// # Safety
-///
-/// Caller must ensure `ptr` has at least 16 bytes readable.
-#[inline(always)]
-unsafe fn v210_load_le_u32x4(ptr: *const u8) -> uint32x4_t {
-  unsafe {
-    let raw = vld1q_u32(ptr as *const u32);
-    if cfg!(target_endian = "big") {
-      vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(raw)))
-    } else {
-      raw
-    }
-  }
-}
-
 /// Unpacks one 16-byte v210 word into three u16x8 vectors holding
 /// 10-bit samples in their low bits:
 /// - `y_vec`: lanes 0..6 = Y0..Y5 (lanes 6, 7 are don't-care).
@@ -65,10 +40,12 @@ unsafe fn v210_load_le_u32x4(ptr: *const u8) -> uint32x4_t {
 /// Caller must ensure `ptr` has at least 16 bytes readable.
 #[inline]
 #[target_feature(enable = "neon")]
-unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint16x8_t) {
+unsafe fn unpack_v210_word_neon<const BE: bool>(
+  ptr: *const u8,
+) -> (uint16x8_t, uint16x8_t, uint16x8_t) {
   // SAFETY: caller obligation — `ptr` has 16 bytes readable.
   unsafe {
-    let words = v210_load_le_u32x4(ptr);
+    let words = load_endian_u32x4::<BE>(ptr);
     let mask10 = vdupq_n_u32(0x3FF);
     let low10 = vandq_u32(words, mask10);
     let mid10 = vandq_u32(vshrq_n_u32::<10>(words), mask10);
@@ -132,12 +109,12 @@ unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint
   }
 }
 
-/// NEON v210 → packed RGB / RGBA (u8). Const-generic on `ALPHA`:
-/// `false` writes 3 bytes per pixel, `true` writes 4 bytes per
-/// pixel with `α = 0xFF`. Output bit depth is u8 (downshifted from
+/// NEON v210 → packed RGB / RGBA (u8). Const-generic on `ALPHA` (`false`:
+/// 3 bytes/px; `true`: 4 bytes/px, `α = 0xFF`) and `BE` (`true`: each 32-bit
+/// packed word is stored BE on the wire). Output bit depth is u8 (downshifted from
 /// the native 10-bit Q15 pipeline via `range_params_n::<10, 8>`).
 ///
-/// Byte-identical to `scalar::v210_to_rgb_or_rgba_row::<ALPHA>` for
+/// Byte-identical to `scalar::v210_to_rgb_or_rgba_row::<ALPHA, BE>` for
 /// every input.
 ///
 /// # Safety
@@ -148,7 +125,7 @@ unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u8],
   width: usize,
@@ -185,7 +162,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
     let cbv = vdupq_n_s32(coeffs.b_v());
 
     for w in 0..full_words {
-      let (y_vec, u_vec, v_vec) = unpack_v210_word_neon(packed.as_ptr().add(w * 16));
+      let (y_vec, u_vec, v_vec) = unpack_v210_word_neon::<BE>(packed.as_ptr().add(w * 16));
 
       let y_i16 = vreinterpretq_s16_u16(y_vec);
 
@@ -255,14 +232,21 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
       let tail_packed = &packed[full_words * 16..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::v210_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
 
 /// NEON v210 → packed `u16` RGB / RGBA at native 10-bit depth
-/// (low-bit-packed). Byte-identical to
-/// `scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA>`.
+/// (low-bit-packed). `BE = true` selects big-endian u32 word decoding.
+/// Byte-identical to
+/// `scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>`.
 ///
 /// # Safety
 ///
@@ -272,7 +256,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u16],
   width: usize,
@@ -309,7 +293,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
     let cbv = vdupq_n_s32(coeffs.b_v());
 
     for w in 0..full_words {
-      let (y_vec, u_vec, v_vec) = unpack_v210_word_neon(packed.as_ptr().add(w * 16));
+      let (y_vec, u_vec, v_vec) = unpack_v210_word_neon::<BE>(packed.as_ptr().add(w * 16));
 
       let y_i16 = vreinterpretq_s16_u16(y_vec);
       let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v);
@@ -362,7 +346,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
       let tail_packed = &packed[full_words * 16..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -375,7 +359,8 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
 /// NEON v210 → 8-bit luma. Y values are downshifted from 10-bit to
 /// 8-bit via `>> 2`. Bypasses the YUV → RGB pipeline entirely.
-/// Byte-identical to `scalar::v210_to_luma_row`.
+/// `BE = true` selects big-endian u32 word decoding.
+/// Byte-identical to `scalar::v210_to_luma_row::<BE>`.
 ///
 /// # Safety
 ///
@@ -385,7 +370,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn v210_to_luma_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let full_words = width / 6;
@@ -395,7 +384,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
     for w in 0..full_words {
-      let (y_vec, _, _) = unpack_v210_word_neon(packed.as_ptr().add(w * 16));
+      let (y_vec, _, _) = unpack_v210_word_neon::<BE>(packed.as_ptr().add(w * 16));
       // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x8.
       let y_u8 = vqmovn_u16(vshrq_n_u16::<2>(y_vec));
       // Store 6 of the 8 lanes: stack buffer + copy_from_slice.
@@ -408,14 +397,15 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
       let tail_packed = &packed[full_words * 16..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
 
 /// NEON v210 → native-depth `u16` luma (low-bit-packed). Each output
 /// `u16` carries the source's 10-bit Y value in its low 10 bits.
-/// Byte-identical to `scalar::v210_to_luma_u16_row`.
+/// `BE = true` selects big-endian u32 word decoding.
+/// Byte-identical to `scalar::v210_to_luma_u16_row::<BE>`.
 ///
 /// # Safety
 ///
@@ -425,7 +415,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn v210_to_luma_u16_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let full_words = width / 6;
@@ -435,7 +429,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
     for w in 0..full_words {
-      let (y_vec, _, _) = unpack_v210_word_neon(packed.as_ptr().add(w * 16));
+      let (y_vec, _, _) = unpack_v210_word_neon::<BE>(packed.as_ptr().add(w * 16));
       // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice.
       let mut tmp = [0u16; 8];
       vst1q_u16(tmp.as_mut_ptr(), y_vec);
@@ -446,7 +440,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
       let tail_packed = &packed[full_words * 16..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/neon/y216.rs b/src/row/arch/neon/y216.rs
index 8aaa8664..01a26e62 100644
--- a/src/row/arch/neon/y216.rs
+++ b/src/row/arch/neon/y216.rs
@@ -32,8 +32,9 @@ use crate::{ColorMatrix, row::scalar};
 // ---- u8 output (i32 chroma, 16 px/iter) ---------------------------------
 
 /// NEON Y216 → packed u8 RGB or RGBA.
+/// `BE = true` bypasses NEON and uses scalar for the full row.
 ///
-/// Byte-identical to `scalar::y216_to_rgb_or_rgba_row::<ALPHA>`.
+/// Byte-identical to `scalar::y216_to_rgb_or_rgba_row::<ALPHA, BE>`.
 ///
 /// # Safety
 ///
@@ -43,7 +44,7 @@ use crate::{ColorMatrix, row::scalar};
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -61,128 +62,137 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
   const RND: i32 = 1 << 14;
 
   unsafe {
-    let rnd_v = vdupq_n_s32(RND);
-    // For the u8 output path: `scale_y_u16_to_i16` takes i32x4 y_off.
-    // Y values are full u16 (0..65535), so we must use u16-aware widening
-    // rather than reinterpreting as i16 (which would corrupt values > 32767).
-    let y_off_v = vdupq_n_s32(y_off);
-    let y_scale_v = vdupq_n_s32(y_scale);
-    let c_scale_v = vdupq_n_s32(c_scale);
-    let bias_v = vdupq_n_s16(bias as i16);
-    let cru = vdupq_n_s32(coeffs.r_u());
-    let crv = vdupq_n_s32(coeffs.r_v());
-    let cgu = vdupq_n_s32(coeffs.g_u());
-    let cgv = vdupq_n_s32(coeffs.g_v());
-    let cbu = vdupq_n_s32(coeffs.b_u());
-    let cbv = vdupq_n_s32(coeffs.b_v());
-
+    // BE=true skips the NEON loop: `x` stays 0, so the scalar call below converts the full row.
     let mut x = 0usize;
-    while x + 16 <= width {
-      // Two vld2q_u16 calls: each deinterleaves 8 px (16 u16).
-      // ptr offset x*2 u16 for lo-group, x*2+16 u16 for hi-group.
-      let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2));
-      let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16));
-
-      // Extract U and V from interleaved chroma via vuzp.
-      // pair_lo.1 = [U0,V0,U1,V1,U2,V2,U3,V3]
-      // vuzp1q_u16(c,c) = [U0,U1,U2,U3, U0,U1,U2,U3] — low 4 valid.
-      // vuzp2q_u16(c,c) = [V0,V1,V2,V3, V0,V1,V2,V3] — low 4 valid.
-      let u_lo_vec = vuzp1q_u16(pair_lo.1, pair_lo.1);
-      let v_lo_vec = vuzp2q_u16(pair_lo.1, pair_lo.1);
-      let u_hi_vec = vuzp1q_u16(pair_hi.1, pair_hi.1);
-      let v_hi_vec = vuzp2q_u16(pair_hi.1, pair_hi.1);
-
-      // Chroma bias subtraction: chroma ∈ [0,65535], bias=32768, so
-      // (chroma - bias) ∈ [-32768, 32767] which fits exactly in i16.
-      let u_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(u_lo_vec), bias_v);
-      let v_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(v_lo_vec), bias_v);
-      let u_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(u_hi_vec), bias_v);
-      let v_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(v_hi_vec), bias_v);
-
-      // Widen to i32x4 for Q15 multiply.
-      // _0 = low 4 (valid), _1 = high 4 (duplicates; don't-care outputs
-      // discarded by vzip1q_s16 below which only uses lanes 0..3).
-      let u_lo_i32_0 = vmovl_s16(vget_low_s16(u_lo_i16));
-      let u_lo_i32_1 = vmovl_s16(vget_high_s16(u_lo_i16));
-      let v_lo_i32_0 = vmovl_s16(vget_low_s16(v_lo_i16));
-      let v_lo_i32_1 = vmovl_s16(vget_high_s16(v_lo_i16));
-      let u_hi_i32_0 = vmovl_s16(vget_low_s16(u_hi_i16));
-      let u_hi_i32_1 = vmovl_s16(vget_high_s16(u_hi_i16));
-      let v_hi_i32_0 = vmovl_s16(vget_low_s16(v_hi_i16));
-      let v_hi_i32_1 = vmovl_s16(vget_high_s16(v_hi_i16));
-
-      // Q15 chroma scale.
-      let u_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_0, c_scale_v), rnd_v));
-      let u_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_1, c_scale_v), rnd_v));
-      let v_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_0, c_scale_v), rnd_v));
-      let v_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_1, c_scale_v), rnd_v));
-      let u_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_0, c_scale_v), rnd_v));
-      let u_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_1, c_scale_v), rnd_v));
-      let v_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_0, c_scale_v), rnd_v));
-      let v_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_1, c_scale_v), rnd_v));
-
-      // Build 8-lane chroma vectors (4 valid in lo + 4 duplicate in hi;
-      // `chroma_i16x8` produces lanes 0..3 correct, lanes 4..7 don't-care).
-      let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v);
-      let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v);
-      let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v);
-      let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v);
-      let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v);
-      let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v);
-
-      // Duplicate chroma into Y-pair slots (4:2:2):
-      // vzip1q_s16([c0,c1,c2,c3, …dup…], same) = [c0,c0,c1,c1,c2,c2,c3,c3]
-      let r_dup_lo = vzip1q_s16(r_chroma_lo, r_chroma_lo);
-      let g_dup_lo = vzip1q_s16(g_chroma_lo, g_chroma_lo);
-      let b_dup_lo = vzip1q_s16(b_chroma_lo, b_chroma_lo);
-      let r_dup_hi = vzip1q_s16(r_chroma_hi, r_chroma_hi);
-      let g_dup_hi = vzip1q_s16(g_chroma_hi, g_chroma_hi);
-      let b_dup_hi = vzip1q_s16(b_chroma_hi, b_chroma_hi);
-
-      // Y scale using u16-aware helper: unsigned-widens u16 → i32, applies
-      // (y - y_off) * y_scale Q15, narrows to i16x8.  Avoids the i16
-      // overflow that `scale_y` would cause for Y values > 32767.
-      let y_lo_scaled = scale_y_u16_to_i16(pair_lo.0, y_off_v, y_scale_v, rnd_v);
-      let y_hi_scaled = scale_y_u16_to_i16(pair_hi.0, y_off_v, y_scale_v, rnd_v);
-
-      // Saturating add; narrow to u8x8.
-      let r_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, r_dup_lo));
-      let g_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, g_dup_lo));
-      let b_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, b_dup_lo));
-      let r_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, r_dup_hi));
-      let g_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, g_dup_hi));
-      let b_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, b_dup_hi));
-
-      if ALPHA {
-        let alpha = vdup_n_u8(0xFF);
-        vst4_u8(
-          out.as_mut_ptr().add(x * 4),
-          uint8x8x4_t(r_lo_u8, g_lo_u8, b_lo_u8, alpha),
-        );
-        vst4_u8(
-          out.as_mut_ptr().add(x * 4 + 32),
-          uint8x8x4_t(r_hi_u8, g_hi_u8, b_hi_u8, alpha),
-        );
-      } else {
-        vst3_u8(
-          out.as_mut_ptr().add(x * 3),
-          uint8x8x3_t(r_lo_u8, g_lo_u8, b_lo_u8),
-        );
-        vst3_u8(
-          out.as_mut_ptr().add(x * 3 + 24),
-          uint8x8x3_t(r_hi_u8, g_hi_u8, b_hi_u8),
-        );
+    if !BE {
+      let rnd_v = vdupq_n_s32(RND);
+      // For the u8 output path: `scale_y_u16_to_i16` takes i32x4 y_off.
+      // Y values are full u16 (0..65535), so we must use u16-aware widening
+      // rather than reinterpreting as i16 (which would corrupt values > 32767).
+      let y_off_v = vdupq_n_s32(y_off);
+      let y_scale_v = vdupq_n_s32(y_scale);
+      let c_scale_v = vdupq_n_s32(c_scale);
+      let bias_v = vdupq_n_s16(bias as i16);
+      let cru = vdupq_n_s32(coeffs.r_u());
+      let crv = vdupq_n_s32(coeffs.r_v());
+      let cgu = vdupq_n_s32(coeffs.g_u());
+      let cgv = vdupq_n_s32(coeffs.g_v());
+      let cbu = vdupq_n_s32(coeffs.b_u());
+      let cbv = vdupq_n_s32(coeffs.b_v());
+
+      while x + 16 <= width {
+        // Two vld2q_u16 calls: each deinterleaves 8 px (16 u16).
+        // ptr offset x*2 u16 for lo-group, x*2+16 u16 for hi-group.
+        let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2));
+        let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16));
+
+        // Extract U and V from interleaved chroma via vuzp.
+        // pair_lo.1 = [U0,V0,U1,V1,U2,V2,U3,V3]
+        // vuzp1q_u16(c,c) = [U0,U1,U2,U3, U0,U1,U2,U3] — low 4 valid.
+        // vuzp2q_u16(c,c) = [V0,V1,V2,V3, V0,V1,V2,V3] — low 4 valid.
+        let u_lo_vec = vuzp1q_u16(pair_lo.1, pair_lo.1);
+        let v_lo_vec = vuzp2q_u16(pair_lo.1, pair_lo.1);
+        let u_hi_vec = vuzp1q_u16(pair_hi.1, pair_hi.1);
+        let v_hi_vec = vuzp2q_u16(pair_hi.1, pair_hi.1);
+
+        // Chroma bias subtraction: chroma ∈ [0,65535], bias=32768, so
+        // (chroma - bias) ∈ [-32768, 32767] which fits exactly in i16.
+        let u_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(u_lo_vec), bias_v);
+        let v_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(v_lo_vec), bias_v);
+        let u_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(u_hi_vec), bias_v);
+        let v_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(v_hi_vec), bias_v);
+
+        // Widen to i32x4 for Q15 multiply.
+        // _0 = low 4 (valid), _1 = high 4 (duplicates; don't-care outputs
+        // discarded by vzip1q_s16 below which only uses lanes 0..3).
+        let u_lo_i32_0 = vmovl_s16(vget_low_s16(u_lo_i16));
+        let u_lo_i32_1 = vmovl_s16(vget_high_s16(u_lo_i16));
+        let v_lo_i32_0 = vmovl_s16(vget_low_s16(v_lo_i16));
+        let v_lo_i32_1 = vmovl_s16(vget_high_s16(v_lo_i16));
+        let u_hi_i32_0 = vmovl_s16(vget_low_s16(u_hi_i16));
+        let u_hi_i32_1 = vmovl_s16(vget_high_s16(u_hi_i16));
+        let v_hi_i32_0 = vmovl_s16(vget_low_s16(v_hi_i16));
+        let v_hi_i32_1 = vmovl_s16(vget_high_s16(v_hi_i16));
+
+        // Q15 chroma scale.
+        let u_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_0, c_scale_v), rnd_v));
+        let u_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_1, c_scale_v), rnd_v));
+        let v_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_0, c_scale_v), rnd_v));
+        let v_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_1, c_scale_v), rnd_v));
+        let u_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_0, c_scale_v), rnd_v));
+        let u_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_1, c_scale_v), rnd_v));
+        let v_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_0, c_scale_v), rnd_v));
+        let v_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_1, c_scale_v), rnd_v));
+
+        // Build 8-lane chroma vectors (4 valid in lo + 4 duplicate in hi;
+        // `chroma_i16x8` produces lanes 0..3 correct, lanes 4..7 don't-care).
+        let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v);
+        let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v);
+        let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v);
+        let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v);
+        let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v);
+        let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v);
+
+        // Duplicate chroma into Y-pair slots (4:2:2):
+        // vzip1q_s16([c0,c1,c2,c3, …dup…], same) = [c0,c0,c1,c1,c2,c2,c3,c3]
+        let r_dup_lo = vzip1q_s16(r_chroma_lo, r_chroma_lo);
+        let g_dup_lo = vzip1q_s16(g_chroma_lo, g_chroma_lo);
+        let b_dup_lo = vzip1q_s16(b_chroma_lo, b_chroma_lo);
+        let r_dup_hi = vzip1q_s16(r_chroma_hi, r_chroma_hi);
+        let g_dup_hi = vzip1q_s16(g_chroma_hi, g_chroma_hi);
+        let b_dup_hi = vzip1q_s16(b_chroma_hi, b_chroma_hi);
+
+        // Y scale using u16-aware helper: unsigned-widens u16 → i32, applies
+        // (y - y_off) * y_scale Q15, narrows to i16x8.  Avoids the i16
+        // overflow that `scale_y` would cause for Y values > 32767.
+        let y_lo_scaled = scale_y_u16_to_i16(pair_lo.0, y_off_v, y_scale_v, rnd_v);
+        let y_hi_scaled = scale_y_u16_to_i16(pair_hi.0, y_off_v, y_scale_v, rnd_v);
+
+        // Saturating add; narrow to u8x8.
+        let r_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, r_dup_lo));
+        let g_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, g_dup_lo));
+        let b_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, b_dup_lo));
+        let r_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, r_dup_hi));
+        let g_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, g_dup_hi));
+        let b_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, b_dup_hi));
+
+        if ALPHA {
+          let alpha = vdup_n_u8(0xFF);
+          vst4_u8(
+            out.as_mut_ptr().add(x * 4),
+            uint8x8x4_t(r_lo_u8, g_lo_u8, b_lo_u8, alpha),
+          );
+          vst4_u8(
+            out.as_mut_ptr().add(x * 4 + 32),
+            uint8x8x4_t(r_hi_u8, g_hi_u8, b_hi_u8, alpha),
+          );
+        } else {
+          vst3_u8(
+            out.as_mut_ptr().add(x * 3),
+            uint8x8x3_t(r_lo_u8, g_lo_u8, b_lo_u8),
+          );
+          vst3_u8(
+            out.as_mut_ptr().add(x * 3 + 24),
+            uint8x8x3_t(r_hi_u8, g_hi_u8, b_hi_u8),
+          );
+        }
+
+        x += 16;
       }
+    } // end if !BE
 
-      x += 16;
-    }
-
-    // Scalar tail — remaining < 16 pixels.
+    // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::y216_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -192,7 +202,8 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// NEON Y216 → packed native-depth u16 RGB or RGBA.
 ///
 /// Uses i64 chroma (`chroma_i64x4`) to avoid overflow at 16-bit scales.
-/// Byte-identical to `scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>`.
+/// `BE = true` skips the NEON loop; the scalar kernel converts the full row.
+/// Byte-identical to `scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>`.
 ///
 /// ## Pipeline
 ///
@@ -211,7 +222,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements).
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -229,180 +240,183 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
   const RND: i32 = 1 << 14;
 
   unsafe {
-    let alpha_u16 = vdupq_n_u16(0xFFFF);
-    let rnd_v = vdupq_n_s32(RND);
-    let rnd64 = vdupq_n_s64(RND as i64);
-    let y_off_v = vdupq_n_s32(y_off);
-    let y_scale_d = vdup_n_s32(y_scale); // int32x2_t for vmull_s32
-    let c_scale_v = vdupq_n_s32(c_scale);
-    let bias_v = vdupq_n_s32(bias);
-    let cru = vdupq_n_s32(coeffs.r_u());
-    let crv = vdupq_n_s32(coeffs.r_v());
-    let cgu = vdupq_n_s32(coeffs.g_u());
-    let cgv = vdupq_n_s32(coeffs.g_v());
-    let cbu = vdupq_n_s32(coeffs.b_u());
-    let cbv = vdupq_n_s32(coeffs.b_v());
-
+    // BE=true skips the NEON loop: `x` stays 0, so the scalar call below converts the full row.
     let mut x = 0usize;
-    while x + 16 <= width {
-      // Two vld2q_u16: each deinterleaves 8 px → 8 Y + [UV…] pairs.
-      let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2));
-      let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16));
-
-      // Extract U/V from chroma via vuzp.
-      // vuzp1q_u16(c,c) = [U0..U3, U0..U3]; use vget_low for 4 valid.
-      let u_lo_raw = vuzp1q_u16(pair_lo.1, pair_lo.1);
-      let v_lo_raw = vuzp2q_u16(pair_lo.1, pair_lo.1);
-      let u_hi_raw = vuzp1q_u16(pair_hi.1, pair_hi.1);
-      let v_hi_raw = vuzp2q_u16(pair_hi.1, pair_hi.1);
-
-      // Widen 4 valid chroma samples, subtract bias, apply c_scale → u_d.
-      let u_d_lo = q15_shift(vaddq_s32(
-        vmulq_s32(
-          vsubq_s32(
-            vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_lo_raw))),
-            bias_v,
+    if !BE {
+      let alpha_u16 = vdupq_n_u16(0xFFFF);
+      let rnd_v = vdupq_n_s32(RND);
+      let rnd64 = vdupq_n_s64(RND as i64);
+      let y_off_v = vdupq_n_s32(y_off);
+      let y_scale_d = vdup_n_s32(y_scale); // int32x2_t for vmull_s32
+      let c_scale_v = vdupq_n_s32(c_scale);
+      let bias_v = vdupq_n_s32(bias);
+      let cru = vdupq_n_s32(coeffs.r_u());
+      let crv = vdupq_n_s32(coeffs.r_v());
+      let cgu = vdupq_n_s32(coeffs.g_u());
+      let cgv = vdupq_n_s32(coeffs.g_v());
+      let cbu = vdupq_n_s32(coeffs.b_u());
+      let cbv = vdupq_n_s32(coeffs.b_v());
+
+      while x + 16 <= width {
+        // Two vld2q_u16: each deinterleaves 8 px → 8 Y + [UV…] pairs.
+        let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2));
+        let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16));
+
+        // Extract U/V from chroma via vuzp.
+        // vuzp1q_u16(c,c) = [U0..U3, U0..U3]; use vget_low for 4 valid.
+        let u_lo_raw = vuzp1q_u16(pair_lo.1, pair_lo.1);
+        let v_lo_raw = vuzp2q_u16(pair_lo.1, pair_lo.1);
+        let u_hi_raw = vuzp1q_u16(pair_hi.1, pair_hi.1);
+        let v_hi_raw = vuzp2q_u16(pair_hi.1, pair_hi.1);
+
+        // Widen 4 valid chroma samples, subtract bias, apply c_scale → u_d.
+        let u_d_lo = q15_shift(vaddq_s32(
+          vmulq_s32(
+            vsubq_s32(
+              vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_lo_raw))),
+              bias_v,
+            ),
+            c_scale_v,
           ),
-          c_scale_v,
-        ),
-        rnd_v,
-      ));
-      let v_d_lo = q15_shift(vaddq_s32(
-        vmulq_s32(
-          vsubq_s32(
-            vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_lo_raw))),
-            bias_v,
+          rnd_v,
+        ));
+        let v_d_lo = q15_shift(vaddq_s32(
+          vmulq_s32(
+            vsubq_s32(
+              vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_lo_raw))),
+              bias_v,
+            ),
+            c_scale_v,
           ),
-          c_scale_v,
-        ),
-        rnd_v,
-      ));
-      let u_d_hi = q15_shift(vaddq_s32(
-        vmulq_s32(
-          vsubq_s32(
-            vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_hi_raw))),
-            bias_v,
+          rnd_v,
+        ));
+        let u_d_hi = q15_shift(vaddq_s32(
+          vmulq_s32(
+            vsubq_s32(
+              vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_hi_raw))),
+              bias_v,
+            ),
+            c_scale_v,
           ),
-          c_scale_v,
-        ),
-        rnd_v,
-      ));
-      let v_d_hi = q15_shift(vaddq_s32(
-        vmulq_s32(
-          vsubq_s32(
-            vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_hi_raw))),
-            bias_v,
+          rnd_v,
+        ));
+        let v_d_hi = q15_shift(vaddq_s32(
+          vmulq_s32(
+            vsubq_s32(
+              vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_hi_raw))),
+              bias_v,
+            ),
+            c_scale_v,
           ),
-          c_scale_v,
-        ),
-        rnd_v,
-      ));
-
-      // i64 chroma: 4 values → i32x4 (vmull_s32 widening to avoid i32 overflow).
-      let r_ch_lo = chroma_i64x4(cru, crv, u_d_lo, v_d_lo, rnd64);
-      let g_ch_lo = chroma_i64x4(cgu, cgv, u_d_lo, v_d_lo, rnd64);
-      let b_ch_lo = chroma_i64x4(cbu, cbv, u_d_lo, v_d_lo, rnd64);
-      let r_ch_hi = chroma_i64x4(cru, crv, u_d_hi, v_d_hi, rnd64);
-      let g_ch_hi = chroma_i64x4(cgu, cgv, u_d_hi, v_d_hi, rnd64);
-      let b_ch_hi = chroma_i64x4(cbu, cbv, u_d_hi, v_d_hi, rnd64);
-
-      // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2).
-      // vzip1q_s32([c0,c1,c2,c3], same) = [c0,c0,c1,c1] → Y0,Y1,Y2,Y3
-      // vzip2q_s32([c0,c1,c2,c3], same) = [c2,c2,c3,c3] → Y4,Y5,Y6,Y7
-      let r_cd_lo0 = vzip1q_s32(r_ch_lo, r_ch_lo);
-      let r_cd_lo1 = vzip2q_s32(r_ch_lo, r_ch_lo);
-      let g_cd_lo0 = vzip1q_s32(g_ch_lo, g_ch_lo);
-      let g_cd_lo1 = vzip2q_s32(g_ch_lo, g_ch_lo);
-      let b_cd_lo0 = vzip1q_s32(b_ch_lo, b_ch_lo);
-      let b_cd_lo1 = vzip2q_s32(b_ch_lo, b_ch_lo);
-      let r_cd_hi0 = vzip1q_s32(r_ch_hi, r_ch_hi);
-      let r_cd_hi1 = vzip2q_s32(r_ch_hi, r_ch_hi);
-      let g_cd_hi0 = vzip1q_s32(g_ch_hi, g_ch_hi);
-      let g_cd_hi1 = vzip2q_s32(g_ch_hi, g_ch_hi);
-      let b_cd_hi0 = vzip1q_s32(b_ch_hi, b_ch_hi);
-      let b_cd_hi1 = vzip2q_s32(b_ch_hi, b_ch_hi);
-
-      // i64 Y scale: (y - y_off) * y_scale can reach ~2.35×10⁹ at limited range.
-      // Split each 8-lane Y into two i32x4 halves for scale_y_u16_i64.
-      // y_lo_0 = Y0..Y3, y_lo_1 = Y4..Y7; y_hi_0 = Y8..Y11, y_hi_1 = Y12..Y15.
-      let y_lo_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_lo.0)));
-      let y_lo_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_lo.0)));
-      let y_hi_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_hi.0)));
-      let y_hi_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_hi.0)));
-      let ys_lo_0 = scale_y_u16_i64(y_lo_0, y_off_v, y_scale_d, rnd64);
-      let ys_lo_1 = scale_y_u16_i64(y_lo_1, y_off_v, y_scale_d, rnd64);
-      let ys_hi_0 = scale_y_u16_i64(y_hi_0, y_off_v, y_scale_d, rnd64);
-      let ys_hi_1 = scale_y_u16_i64(y_hi_1, y_off_v, y_scale_d, rnd64);
-
-      // Y + chroma; vqmovun_s32 saturates i32 → u16 (clamps [0, 65535]).
-      //
-      // Alignment:
-      //   ys_lo_0 = [Y0,Y1,Y2,Y3]   r_cd_lo0 = [c0,c0,c1,c1]  → pixels 0..3
-      //   ys_lo_1 = [Y4,Y5,Y6,Y7]   r_cd_lo1 = [c2,c2,c3,c3]  → pixels 4..7
-      //   ys_hi_0 = [Y8,Y9,Y10,Y11] r_cd_hi0 = [c4,c4,c5,c5]  → pixels 8..11
-      //   ys_hi_1 = [Y12..Y15]       r_cd_hi1 = [c6,c6,c7,c7]  → pixels 12..15
-      //
-      // vcombine_u16(A, B) packs two u16x4 into one u16x8.
-      let r_lo_u16 = vcombine_u16(
-        vqmovun_s32(vaddq_s32(ys_lo_0, r_cd_lo0)),
-        vqmovun_s32(vaddq_s32(ys_lo_1, r_cd_lo1)),
-      );
-      let g_lo_u16 = vcombine_u16(
-        vqmovun_s32(vaddq_s32(ys_lo_0, g_cd_lo0)),
-        vqmovun_s32(vaddq_s32(ys_lo_1, g_cd_lo1)),
-      );
-      let b_lo_u16 = vcombine_u16(
-        vqmovun_s32(vaddq_s32(ys_lo_0, b_cd_lo0)),
-        vqmovun_s32(vaddq_s32(ys_lo_1, b_cd_lo1)),
-      );
-      // hi group (Y8..Y15)
-      let r_hi_u16 = vcombine_u16(
-        vqmovun_s32(vaddq_s32(ys_hi_0, r_cd_hi0)),
-        vqmovun_s32(vaddq_s32(ys_hi_1, r_cd_hi1)),
-      );
-      let g_hi_u16 = vcombine_u16(
-        vqmovun_s32(vaddq_s32(ys_hi_0, g_cd_hi0)),
-        vqmovun_s32(vaddq_s32(ys_hi_1, g_cd_hi1)),
-      );
-      let b_hi_u16 = vcombine_u16(
-        vqmovun_s32(vaddq_s32(ys_hi_0, b_cd_hi0)),
-        vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)),
-      );
-
-      // Each u16x8 covers 8 pixels.  Two stores per format (lo + hi).
-      // For ALPHA: each vst4q_u16 writes 8 RGBA pixels (8 × 4 × 2 = 64 bytes).
-      //   Offset for lo: x*4 u16. Offset for hi: x*4+32 u16.
-      // For RGB:  each vst3q_u16 writes 8 RGB pixels (8 × 3 × 2 = 48 bytes).
-      //   Offset for lo: x*3 u16. Offset for hi: x*3+24 u16.
-      if ALPHA {
-        vst4q_u16(
-          out.as_mut_ptr().add(x * 4),
-          uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16),
+          rnd_v,
+        ));
+
+        // i64 chroma: 4 values → i32x4 (vmull_s32 widening to avoid i32 overflow).
+        let r_ch_lo = chroma_i64x4(cru, crv, u_d_lo, v_d_lo, rnd64);
+        let g_ch_lo = chroma_i64x4(cgu, cgv, u_d_lo, v_d_lo, rnd64);
+        let b_ch_lo = chroma_i64x4(cbu, cbv, u_d_lo, v_d_lo, rnd64);
+        let r_ch_hi = chroma_i64x4(cru, crv, u_d_hi, v_d_hi, rnd64);
+        let g_ch_hi = chroma_i64x4(cgu, cgv, u_d_hi, v_d_hi, rnd64);
+        let b_ch_hi = chroma_i64x4(cbu, cbv, u_d_hi, v_d_hi, rnd64);
+
+        // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2).
+        // vzip1q_s32([c0,c1,c2,c3], same) = [c0,c0,c1,c1] → Y0,Y1,Y2,Y3
+        // vzip2q_s32([c0,c1,c2,c3], same) = [c2,c2,c3,c3] → Y4,Y5,Y6,Y7
+        let r_cd_lo0 = vzip1q_s32(r_ch_lo, r_ch_lo);
+        let r_cd_lo1 = vzip2q_s32(r_ch_lo, r_ch_lo);
+        let g_cd_lo0 = vzip1q_s32(g_ch_lo, g_ch_lo);
+        let g_cd_lo1 = vzip2q_s32(g_ch_lo, g_ch_lo);
+        let b_cd_lo0 = vzip1q_s32(b_ch_lo, b_ch_lo);
+        let b_cd_lo1 = vzip2q_s32(b_ch_lo, b_ch_lo);
+        let r_cd_hi0 = vzip1q_s32(r_ch_hi, r_ch_hi);
+        let r_cd_hi1 = vzip2q_s32(r_ch_hi, r_ch_hi);
+        let g_cd_hi0 = vzip1q_s32(g_ch_hi, g_ch_hi);
+        let g_cd_hi1 = vzip2q_s32(g_ch_hi, g_ch_hi);
+        let b_cd_hi0 = vzip1q_s32(b_ch_hi, b_ch_hi);
+        let b_cd_hi1 = vzip2q_s32(b_ch_hi, b_ch_hi);
+
+        // i64 Y scale: (y - y_off) * y_scale can reach ~2.35×10⁹ at limited range.
+        // Split each 8-lane Y into two i32x4 halves for scale_y_u16_i64.
+        // y_lo_0 = Y0..Y3, y_lo_1 = Y4..Y7; y_hi_0 = Y8..Y11, y_hi_1 = Y12..Y15.
+        let y_lo_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_lo.0)));
+        let y_lo_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_lo.0)));
+        let y_hi_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_hi.0)));
+        let y_hi_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_hi.0)));
+        let ys_lo_0 = scale_y_u16_i64(y_lo_0, y_off_v, y_scale_d, rnd64);
+        let ys_lo_1 = scale_y_u16_i64(y_lo_1, y_off_v, y_scale_d, rnd64);
+        let ys_hi_0 = scale_y_u16_i64(y_hi_0, y_off_v, y_scale_d, rnd64);
+        let ys_hi_1 = scale_y_u16_i64(y_hi_1, y_off_v, y_scale_d, rnd64);
+
+        // Y + chroma; vqmovun_s32 saturates i32 → u16 (clamps [0, 65535]).
+        //
+        // Alignment:
+        //   ys_lo_0 = [Y0,Y1,Y2,Y3]   r_cd_lo0 = [c0,c0,c1,c1]  → pixels 0..3
+        //   ys_lo_1 = [Y4,Y5,Y6,Y7]   r_cd_lo1 = [c2,c2,c3,c3]  → pixels 4..7
+        //   ys_hi_0 = [Y8,Y9,Y10,Y11] r_cd_hi0 = [c4,c4,c5,c5]  → pixels 8..11
+        //   ys_hi_1 = [Y12..Y15]       r_cd_hi1 = [c6,c6,c7,c7]  → pixels 12..15
+        //
+        // vcombine_u16(A, B) packs two u16x4 into one u16x8.
+        let r_lo_u16 = vcombine_u16(
+          vqmovun_s32(vaddq_s32(ys_lo_0, r_cd_lo0)),
+          vqmovun_s32(vaddq_s32(ys_lo_1, r_cd_lo1)),
         );
-        vst4q_u16(
-          out.as_mut_ptr().add(x * 4 + 32),
-          uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16),
+        let g_lo_u16 = vcombine_u16(
+          vqmovun_s32(vaddq_s32(ys_lo_0, g_cd_lo0)),
+          vqmovun_s32(vaddq_s32(ys_lo_1, g_cd_lo1)),
         );
-      } else {
-        vst3q_u16(
-          out.as_mut_ptr().add(x * 3),
-          uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16),
+        let b_lo_u16 = vcombine_u16(
+          vqmovun_s32(vaddq_s32(ys_lo_0, b_cd_lo0)),
+          vqmovun_s32(vaddq_s32(ys_lo_1, b_cd_lo1)),
         );
-        vst3q_u16(
-          out.as_mut_ptr().add(x * 3 + 24),
-          uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16),
+        // hi group (Y8..Y15)
+        let r_hi_u16 = vcombine_u16(
+          vqmovun_s32(vaddq_s32(ys_hi_0, r_cd_hi0)),
+          vqmovun_s32(vaddq_s32(ys_hi_1, r_cd_hi1)),
+        );
+        let g_hi_u16 = vcombine_u16(
+          vqmovun_s32(vaddq_s32(ys_hi_0, g_cd_hi0)),
+          vqmovun_s32(vaddq_s32(ys_hi_1, g_cd_hi1)),
+        );
+        let b_hi_u16 = vcombine_u16(
+          vqmovun_s32(vaddq_s32(ys_hi_0, b_cd_hi0)),
+          vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)),
         );
-      }
 
-      x += 16;
-    }
+        // Each u16x8 covers 8 pixels.  Two stores per format (lo + hi).
+        // For ALPHA: each vst4q_u16 writes 8 RGBA pixels (8 × 4 × 2 = 64 bytes).
+        //   Offset for lo: x*4 u16. Offset for hi: x*4+32 u16.
+        // For RGB:  each vst3q_u16 writes 8 RGB pixels (8 × 3 × 2 = 48 bytes).
+        //   Offset for lo: x*3 u16. Offset for hi: x*3+24 u16.
+        if ALPHA {
+          vst4q_u16(
+            out.as_mut_ptr().add(x * 4),
+            uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16),
+          );
+          vst4q_u16(
+            out.as_mut_ptr().add(x * 4 + 32),
+            uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16),
+          );
+        } else {
+          vst3q_u16(
+            out.as_mut_ptr().add(x * 3),
+            uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16),
+          );
+          vst3q_u16(
+            out.as_mut_ptr().add(x * 3 + 24),
+            uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16),
+          );
+        }
+
+        x += 16;
+      }
+    } // end if !BE
 
-    // Scalar tail — remaining < 16 pixels.
+    // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -416,8 +430,9 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 // ---- Luma u8 (16 px/iter) -----------------------------------------------
 
 /// NEON Y216 → u8 luma. Extracts Y via `>> 8`.
+/// `BE = true` skips the NEON loop; the scalar kernel handles the full row.
 ///
-/// Byte-identical to `scalar::y216_to_luma_row`.
+/// Byte-identical to `scalar::y216_to_luma_row::<BE>`.
 ///
 /// # Safety
 ///
@@ -427,29 +442,35 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `out.len() >= width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) {
+pub(crate) unsafe fn y216_to_luma_row<const BE: bool>(
+  packed: &[u16],
+  out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
 
   unsafe {
     let mut x = 0usize;
-    while x + 16 <= width {
-      // Two vld2q_u16: pair.0 = 8 Y lanes each; chroma discarded.
-      let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2));
-      let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16));
-      // >> 8 narrows u16 → u8 (high byte of each Y sample).
-      let y_lo_u8 = vshrn_n_u16::<8>(pair_lo.0);
-      let y_hi_u8 = vshrn_n_u16::<8>(pair_hi.0);
-      vst1_u8(out.as_mut_ptr().add(x), y_lo_u8);
-      vst1_u8(out.as_mut_ptr().add(x + 8), y_hi_u8);
-      x += 16;
+    if !BE { // BE=true: loop skipped, x stays 0, scalar call below takes the full row.
+      while x + 16 <= width {
+        // Two vld2q_u16: pair.0 = 8 Y lanes each; chroma discarded.
+        let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2));
+        let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16));
+        // >> 8 narrows u16 → u8 (high byte of each Y sample).
+        let y_lo_u8 = vshrn_n_u16::<8>(pair_lo.0);
+        let y_hi_u8 = vshrn_n_u16::<8>(pair_hi.0);
+        vst1_u8(out.as_mut_ptr().add(x), y_lo_u8);
+        vst1_u8(out.as_mut_ptr().add(x + 8), y_hi_u8);
+        x += 16;
+      }
     }
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -457,8 +478,9 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi
 // ---- Luma u16 (16 px/iter) ----------------------------------------------
 
 /// NEON Y216 → u16 luma. Direct copy of Y samples (no shift).
+/// `BE = true` skips the NEON loop; the scalar kernel handles the full row.
 ///
-/// Byte-identical to `scalar::y216_to_luma_u16_row`.
+/// Byte-identical to `scalar::y216_to_luma_u16_row::<BE>`.
 ///
 /// # Safety
 ///
@@ -468,26 +490,32 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi
 /// 4. `out.len() >= width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) {
+pub(crate) unsafe fn y216_to_luma_u16_row<const BE: bool>(
+  packed: &[u16],
+  out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
 
   unsafe {
     let mut x = 0usize;
-    while x + 16 <= width {
-      let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2));
-      let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16));
-      // Direct copy — Y samples are already full 16-bit (no shift needed).
-      vst1q_u16(out.as_mut_ptr().add(x), pair_lo.0);
-      vst1q_u16(out.as_mut_ptr().add(x + 8), pair_hi.0);
-      x += 16;
+    if !BE { // BE=true: loop skipped, x stays 0, scalar call below takes the full row.
+      while x + 16 <= width {
+        let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2));
+        let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16));
+        // Direct copy — Y samples are already full 16-bit (no shift needed).
+        vst1q_u16(out.as_mut_ptr().add(x), pair_lo.0);
+        vst1q_u16(out.as_mut_ptr().add(x + 8), pair_hi.0);
+        x += 16;
+      }
     }
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/neon/y2xx.rs b/src/row/arch/neon/y2xx.rs
index 0c02365f..72920362 100644
--- a/src/row/arch/neon/y2xx.rs
+++ b/src/row/arch/neon/y2xx.rs
@@ -83,11 +83,12 @@ unsafe fn unpack_y2xx_8px_neon(
 }
 
 /// NEON Y2xx → packed RGB / RGBA u8. Const‑generic over
-/// `BITS ∈ {10, 12}` and `ALPHA ∈ {false, true}`. Output bit depth is
-/// u8 (downshifted from the native BITS Q15 pipeline via
-/// `range_params_n::<BITS, 8>`).
+/// `BITS ∈ {10, 12}`, `ALPHA ∈ {false, true}`, and `BE ∈ {false, true}`.
+/// `BE = true` selects big-endian u16 decoding for the input samples.
+/// When `BE = true` the SIMD path is bypassed and the scalar kernel
+/// handles the full row (the NEON loop only handles native-endian data).
 ///
-/// Byte‑identical to `scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA>`
+/// Byte‑identical to `scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA, BE>`
 /// for every input.
 ///
 /// # Safety
@@ -98,7 +99,11 @@ unsafe fn unpack_y2xx_8px_neon(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -126,86 +131,90 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
   // by the `while x + 8 <= width` loop and the caller-promised slice
   // lengths checked above.
   unsafe {
-    let rnd_v = vdupq_n_s32(RND);
-    let y_off_v = vdupq_n_s16(y_off as i16);
-    let y_scale_v = vdupq_n_s32(y_scale);
-    let c_scale_v = vdupq_n_s32(c_scale);
-    let bias_v = vdupq_n_s16(bias as i16);
-    let shr_count = vdupq_n_s16(-((16 - BITS) as i16));
-    let cru = vdupq_n_s32(coeffs.r_u());
-    let crv = vdupq_n_s32(coeffs.r_v());
-    let cgu = vdupq_n_s32(coeffs.g_u());
-    let cgv = vdupq_n_s32(coeffs.g_v());
-    let cbu = vdupq_n_s32(coeffs.b_u());
-    let cbv = vdupq_n_s32(coeffs.b_v());
-
+    // BE=true: NEON path skipped; scalar handles all pixels below.
     let mut x = 0usize;
-    while x + 8 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_neon(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = vreinterpretq_s16_u16(y_vec);
-
-      // Subtract chroma bias (e.g. 512 for 10‑bit) — fits i16 since
-      // each chroma sample is ≤ 2^BITS - 1 ≤ 4095.
-      let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v);
-      let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v);
-
-      // Widen 8‑lane i16 chroma to two i32x4 halves for the Q15
-      // multiplies. Only lanes 0..3 of `_lo` are valid; `_hi` is
-      // entirely don't-care (duplicate of `_lo`). We feed both
-      // halves through `chroma_i16x8` to recycle the helper exactly;
-      // the don't-care output lanes are discarded by `vzip1q_s16`
-      // below (which only consumes lanes 0..3).
-      let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16));
-      let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16));
-      let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16));
-      let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16));
-
-      let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v));
-      let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v));
-      let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v));
-      let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v));
-
-      // 8‑lane chroma vectors with valid data in lanes 0..3.
-      let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-
-      // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via
-      // `vzip1q_s16` so lanes 0..7 of `r_dup` align with Y0..Y7.
-      // `vzip1q_s16` interleaves the low 4 lanes of each operand:
-      //   [c0, c0, c1, c1, c2, c2, c3, c3]
-      let r_dup = vzip1q_s16(r_chroma, r_chroma);
-      let g_dup = vzip1q_s16(g_chroma, g_chroma);
-      let b_dup = vzip1q_s16(b_chroma, b_chroma);
-
-      // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8.
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
-
-      // u8 narrow with saturation. 8 valid lanes per channel.
-      let r_u8 = vqmovun_s16(vqaddq_s16(y_scaled, r_dup));
-      let g_u8 = vqmovun_s16(vqaddq_s16(y_scaled, g_dup));
-      let b_u8 = vqmovun_s16(vqaddq_s16(y_scaled, b_dup));
-
-      if ALPHA {
-        let alpha = vdup_n_u8(0xFF);
-        vst4_u8(
-          out.as_mut_ptr().add(x * 4),
-          uint8x8x4_t(r_u8, g_u8, b_u8, alpha),
-        );
-      } else {
-        vst3_u8(out.as_mut_ptr().add(x * 3), uint8x8x3_t(r_u8, g_u8, b_u8));
+    if !BE {
+      let rnd_v = vdupq_n_s32(RND);
+      let y_off_v = vdupq_n_s16(y_off as i16);
+      let y_scale_v = vdupq_n_s32(y_scale);
+      let c_scale_v = vdupq_n_s32(c_scale);
+      let bias_v = vdupq_n_s16(bias as i16);
+      let shr_count = vdupq_n_s16(-((16 - BITS) as i16));
+      let cru = vdupq_n_s32(coeffs.r_u());
+      let crv = vdupq_n_s32(coeffs.r_v());
+      let cgu = vdupq_n_s32(coeffs.g_u());
+      let cgv = vdupq_n_s32(coeffs.g_v());
+      let cbu = vdupq_n_s32(coeffs.b_u());
+      let cbv = vdupq_n_s32(coeffs.b_v());
+
+      while x + 8 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_neon(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = vreinterpretq_s16_u16(y_vec);
+
+        // Subtract chroma bias (e.g. 512 for 10‑bit) — fits i16 since
+        // each chroma sample is ≤ 2^BITS - 1 ≤ 4095.
+        let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v);
+        let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v);
+
+        // Widen 8‑lane i16 chroma to two i32x4 halves for the Q15
+        // multiplies. Only lanes 0..3 of `_lo` are valid; `_hi` is
+        // entirely don't-care (duplicate of `_lo`). We feed both
+        // halves through `chroma_i16x8` to recycle the helper exactly;
+        // the don't-care output lanes are discarded by `vzip1q_s16`
+        // below (which only consumes lanes 0..3).
+        let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16));
+        let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16));
+        let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16));
+        let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16));
+
+        let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v));
+        let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v));
+        let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v));
+        let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v));
+
+        // 8‑lane chroma vectors with valid data in lanes 0..3.
+        let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+
+        // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via
+        // `vzip1q_s16` so lanes 0..7 of `r_dup` align with Y0..Y7.
+        // `vzip1q_s16` interleaves the low 4 lanes of each operand:
+        //   [c0, c0, c1, c1, c2, c2, c3, c3]
+        let r_dup = vzip1q_s16(r_chroma, r_chroma);
+        let g_dup = vzip1q_s16(g_chroma, g_chroma);
+        let b_dup = vzip1q_s16(b_chroma, b_chroma);
+
+        // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8.
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
+
+        // u8 narrow with saturation. 8 valid lanes per channel.
+        let r_u8 = vqmovun_s16(vqaddq_s16(y_scaled, r_dup));
+        let g_u8 = vqmovun_s16(vqaddq_s16(y_scaled, g_dup));
+        let b_u8 = vqmovun_s16(vqaddq_s16(y_scaled, b_dup));
+
+        if ALPHA {
+          let alpha = vdup_n_u8(0xFF);
+          vst4_u8(
+            out.as_mut_ptr().add(x * 4),
+            uint8x8x4_t(r_u8, g_u8, b_u8, alpha),
+          );
+        } else {
+          vst3_u8(out.as_mut_ptr().add(x * 3), uint8x8x3_t(r_u8, g_u8, b_u8));
+        }
+
+        x += 8;
       }
-
-      x += 8;
     }
 
-    // Scalar tail — remaining < 8 pixels (always even per 4:2:2).
+    // Scalar tail — remaining < 8 pixels (always even per 4:2:2), or
+    // full-row fallback when BE=true.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -218,10 +227,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
 
 /// NEON Y2xx → packed `u16` RGB / RGBA at native BITS depth
 /// (low‑bit‑packed: BITS active bits in the low N of each `u16`).
-/// Const‑generic over `BITS ∈ {10, 12}`.
+/// Const‑generic over `BITS ∈ {10, 12}`, `ALPHA`, and `BE`.
+/// `BE = true` bypasses NEON and uses the scalar kernel for the full row.
 ///
 /// Byte‑identical to
-/// `scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA>`.
+/// `scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA, BE>`.
 ///
 /// # Safety
 ///
@@ -231,7 +241,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -257,71 +271,74 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let rnd_v = vdupq_n_s32(RND);
-    let y_off_v = vdupq_n_s16(y_off as i16);
-    let y_scale_v = vdupq_n_s32(y_scale);
-    let c_scale_v = vdupq_n_s32(c_scale);
-    let bias_v = vdupq_n_s16(bias as i16);
-    let shr_count = vdupq_n_s16(-((16 - BITS) as i16));
-    let max_v = vdupq_n_s16(out_max);
-    let zero_v = vdupq_n_s16(0);
-    let cru = vdupq_n_s32(coeffs.r_u());
-    let crv = vdupq_n_s32(coeffs.r_v());
-    let cgu = vdupq_n_s32(coeffs.g_u());
-    let cgv = vdupq_n_s32(coeffs.g_v());
-    let cbu = vdupq_n_s32(coeffs.b_u());
-    let cbv = vdupq_n_s32(coeffs.b_v());
-
+    // BE=true: bypass NEON; scalar handles full row below.
     let mut x = 0usize;
-    while x + 8 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_neon(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = vreinterpretq_s16_u16(y_vec);
-      let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v);
-      let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v);
-
-      let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16));
-      let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16));
-      let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16));
-      let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16));
-
-      let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v));
-      let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v));
-      let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v));
-      let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v));
-
-      let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-
-      let r_dup = vzip1q_s16(r_chroma, r_chroma);
-      let g_dup = vzip1q_s16(g_chroma, g_chroma);
-      let b_dup = vzip1q_s16(b_chroma, b_chroma);
-
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
-
-      // Native‑depth output: clamp to [0, (1 << BITS) - 1]. `vqaddq_s16`
-      // saturates at i16 bounds (no‑op here since |sum| stays well
-      // inside i16 for BITS ≤ 12), then max/min clamps to the BITS range.
-      let r = clamp_u16_max(vqaddq_s16(y_scaled, r_dup), zero_v, max_v);
-      let g = clamp_u16_max(vqaddq_s16(y_scaled, g_dup), zero_v, max_v);
-      let b = clamp_u16_max(vqaddq_s16(y_scaled, b_dup), zero_v, max_v);
-
-      if ALPHA {
-        let alpha = vdupq_n_u16(out_max as u16);
-        vst4q_u16(out.as_mut_ptr().add(x * 4), uint16x8x4_t(r, g, b, alpha));
-      } else {
-        vst3q_u16(out.as_mut_ptr().add(x * 3), uint16x8x3_t(r, g, b));
+    if !BE {
+      let rnd_v = vdupq_n_s32(RND);
+      let y_off_v = vdupq_n_s16(y_off as i16);
+      let y_scale_v = vdupq_n_s32(y_scale);
+      let c_scale_v = vdupq_n_s32(c_scale);
+      let bias_v = vdupq_n_s16(bias as i16);
+      let shr_count = vdupq_n_s16(-((16 - BITS) as i16));
+      let max_v = vdupq_n_s16(out_max);
+      let zero_v = vdupq_n_s16(0);
+      let cru = vdupq_n_s32(coeffs.r_u());
+      let crv = vdupq_n_s32(coeffs.r_v());
+      let cgu = vdupq_n_s32(coeffs.g_u());
+      let cgv = vdupq_n_s32(coeffs.g_v());
+      let cbu = vdupq_n_s32(coeffs.b_u());
+      let cbv = vdupq_n_s32(coeffs.b_v());
+
+      while x + 8 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_neon(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = vreinterpretq_s16_u16(y_vec);
+        let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v);
+        let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v);
+
+        let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16));
+        let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16));
+        let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16));
+        let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16));
+
+        let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v));
+        let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v));
+        let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v));
+        let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v));
+
+        let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+
+        let r_dup = vzip1q_s16(r_chroma, r_chroma);
+        let g_dup = vzip1q_s16(g_chroma, g_chroma);
+        let b_dup = vzip1q_s16(b_chroma, b_chroma);
+
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
+
+        // Native‑depth output: clamp to [0, (1 << BITS) - 1]. `vqaddq_s16`
+        // saturates at i16 bounds (no‑op here since |sum| stays well
+        // inside i16 for BITS ≤ 12), then max/min clamps to the BITS range.
+        let r = clamp_u16_max(vqaddq_s16(y_scaled, r_dup), zero_v, max_v);
+        let g = clamp_u16_max(vqaddq_s16(y_scaled, g_dup), zero_v, max_v);
+        let b = clamp_u16_max(vqaddq_s16(y_scaled, b_dup), zero_v, max_v);
+
+        if ALPHA {
+          let alpha = vdupq_n_u16(out_max as u16);
+          vst4q_u16(out.as_mut_ptr().add(x * 4), uint16x8x4_t(r, g, b, alpha));
+        } else {
+          vst3q_u16(out.as_mut_ptr().add(x * 3), uint16x8x3_t(r, g, b));
+        }
+
+        x += 8;
       }
-
-      x += 8;
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -335,9 +352,9 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 /// NEON Y2xx → 8‑bit luma. Y values are downshifted from BITS to 8
 /// via `>> (BITS - 8)` after the `>> (16 - BITS)` MSB‑alignment, i.e.
 /// a single `>> 8` from the raw u16 sample. Bypasses the YUV → RGB
-/// pipeline entirely.
+/// pipeline entirely. `BE = true` skips the NEON loop; scalar handles the full row.
 ///
-/// Byte‑identical to `scalar::y2xx_n_to_luma_row::<BITS>`.
+/// Byte‑identical to `scalar::y2xx_n_to_luma_row::<BITS, BE>`.
 ///
 /// # Safety
 ///
@@ -347,7 +364,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u8],
   width: usize,
@@ -365,29 +382,32 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
     let mut x = 0usize;
-    while x + 8 <= width {
-      // `vld2q_u16` deinterleaves; `pair.0` is 8 raw Y u16 samples
-      // (still MSB‑aligned at BITS ≤ 12, low bits zero).
-      let pair = vld2q_u16(packed.as_ptr().add(x * 2));
-      // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8`
-      // for any BITS ∈ {10, 12} — the constant fold gives the same
-      // result whether we shift in two stages or one.
-      let y_u8 = vshrn_n_u16::<8>(pair.0);
-      vst1_u8(luma_out.as_mut_ptr().add(x), y_u8);
-      x += 8;
+    if !BE {
+      while x + 8 <= width {
+        // `vld2q_u16` deinterleaves; `pair.0` is 8 raw Y u16 samples
+        // (still MSB‑aligned at BITS ≤ 12, low bits zero).
+        let pair = vld2q_u16(packed.as_ptr().add(x * 2));
+        // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8`
+        // for any BITS ∈ {10, 12} — the constant fold gives the same
+        // result whether we shift in two stages or one.
+        let y_u8 = vshrn_n_u16::<8>(pair.0);
+        vst1_u8(luma_out.as_mut_ptr().add(x), y_u8);
+        x += 8;
+      }
     }
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
 
 /// NEON Y2xx → native‑depth `u16` luma (low‑bit‑packed). Each output
 /// `u16` carries the source's BITS-bit Y value in its low BITS bits.
-/// Byte‑identical to `scalar::y2xx_n_to_luma_u16_row::<BITS>`.
+/// `BE = true` skips the NEON loop; the scalar kernel handles the full row.
+/// Byte‑identical to `scalar::y2xx_n_to_luma_u16_row::<BITS, BE>`.
 ///
 /// # Safety
 ///
@@ -397,7 +417,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u16],
   width: usize,
@@ -414,21 +434,23 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let shr_count = vdupq_n_s16(-((16 - BITS) as i16));
     let mut x = 0usize;
-    while x + 8 <= width {
-      let pair = vld2q_u16(packed.as_ptr().add(x * 2));
-      // Right‑shift by `(16 - BITS)` to bring MSB‑aligned samples
-      // into low‑bit‑packed form for the native‑depth u16 output.
-      let y_low = vshlq_u16(pair.0, shr_count);
-      vst1q_u16(luma_out.as_mut_ptr().add(x), y_low);
-      x += 8;
+    if !BE {
+      let shr_count = vdupq_n_s16(-((16 - BITS) as i16));
+      while x + 8 <= width {
+        let pair = vld2q_u16(packed.as_ptr().add(x * 2));
+        // Right‑shift by `(16 - BITS)` to bring MSB‑aligned samples
+        // into low‑bit‑packed form for the native‑depth u16 output.
+        let y_low = vshlq_u16(pair.0, shr_count);
+        vst1q_u16(luma_out.as_mut_ptr().add(x), y_low);
+        x += 8;
+      }
     }
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_u16_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_u16_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/wasm_simd128/tests/v210.rs b/src/row/arch/wasm_simd128/tests/v210.rs
index ac7455c2..d4d51116 100644
--- a/src/row/arch/wasm_simd128/tests/v210.rs
+++ b/src/row/arch/wasm_simd128/tests/v210.rs
@@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::v210_to_rgb_or_rgba_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::v210_to_rgb_or_rgba_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -82,9 +82,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::v210_to_luma_row(&p, &mut s, width);
+  scalar::v210_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_row(&p, &mut k, width);
+    v210_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "simd128 v210→luma diverges (width={width})");
 }
@@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::v210_to_luma_u16_row(&p, &mut s, width);
+  scalar::v210_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_u16_row(&p, &mut k, width);
+    v210_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "simd128 v210→luma u16 diverges (width={width})");
 }
@@ -227,7 +227,7 @@ fn wasm_simd128_v210_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order (u16, no shift loss)
   let mut luma_out = std::vec![0u16; W];
   unsafe {
-    v210_to_luma_u16_row(&packed, &mut luma_out, W);
+    v210_to_luma_u16_row::<false>(&packed, &mut luma_out, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(
@@ -239,9 +239,15 @@ fn wasm_simd128_v210_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u8; W * 3];
   let mut scalar_rgb = std::vec![0u8; W * 3];
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false);
+    v210_to_rgb_or_rgba_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      crate::ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::v210_to_rgb_or_rgba_row::<false>(
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/wasm_simd128/tests/y216.rs b/src/row/arch/wasm_simd128/tests/y216.rs
index 8441d72c..034f029b 100644
--- a/src/row/arch/wasm_simd128/tests/y216.rs
+++ b/src/row/arch/wasm_simd128/tests/y216.rs
@@ -15,9 +15,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_y216(width, 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::y216_to_rgb_or_rgba_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_or_rgba_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_or_rgba_row::<false>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_or_rgba_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -29,9 +29,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_y216(width, 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::y216_to_rgb_or_rgba_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_or_rgba_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_or_rgba_row::<true>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_or_rgba_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -43,9 +43,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_y216(width, 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -57,9 +57,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_y216(width, 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -71,9 +71,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y216_to_luma_row(&p, &mut s, width);
+  scalar::y216_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_row(&p, &mut k, width);
+    y216_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "simd128 y216→luma u8 diverges (width={width})");
 }
@@ -82,9 +82,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y216_to_luma_u16_row(&p, &mut s, width);
+  scalar::y216_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_u16_row(&p, &mut k, width);
+    y216_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "simd128 y216→luma u16 diverges (width={width})");
 }
@@ -183,7 +183,7 @@ fn wasm_simd128_y216_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order at u16
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y216_to_luma_u16_row(&packed, &mut luma_u16, W);
+    y216_to_luma_u16_row::<false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(
@@ -195,9 +195,15 @@ fn wasm_simd128_y216_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<false>(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false);
+    y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false>(
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/wasm_simd128/tests/y2xx.rs b/src/row/arch/wasm_simd128/tests/y2xx.rs
index 08a484ce..ad31d2f1 100644
--- a/src/row/arch/wasm_simd128/tests/y2xx.rs
+++ b/src/row/arch/wasm_simd128/tests/y2xx.rs
@@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits).
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&packed, &mut luma_u16, W);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(
@@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
       &packed,
       &mut simd_rgb,
       W,
@@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
       false,
     );
   }
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
     &packed,
     &mut scalar_rgb,
     W,
@@ -101,9 +101,9 @@ fn check_rgb<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: boo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -115,9 +115,9 @@ fn check_rgba<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -129,9 +129,11 @@ fn check_rgb_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range:
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -143,9 +145,11 @@ fn check_rgba_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -157,9 +161,9 @@ fn check_luma<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y2xx_n_to_luma_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "simd128 y2xx<{BITS}>→luma diverges (width={width})");
 }
@@ -168,9 +172,9 @@ fn check_luma_u16<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y2xx_n_to_luma_u16_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(
     s, k,
@@ -251,15 +255,15 @@ fn wasm_simd128_y212_matches_scalar_widths() {
     let p = pseudo_random_y212(w, 0xAA55);
     let mut s = std::vec![0u8; w * 3];
     let mut k = std::vec![0u8; w * 3];
-    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
+    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
     unsafe {
-      y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
+      y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
     }
     assert_eq!(s, k, "simd128 y2xx<12>→RGB diverges (width={w})");
 
     let mut s_u16 = std::vec![0u16; w * 4];
     let mut k_u16 = std::vec![0u16; w * 4];
-    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
       &p,
       &mut s_u16,
       w,
@@ -267,7 +271,7 @@ fn wasm_simd128_y212_matches_scalar_widths() {
       true,
     );
     unsafe {
-      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
         &p,
         &mut k_u16,
         w,
@@ -282,17 +286,17 @@ fn wasm_simd128_y212_matches_scalar_widths() {
 
     let mut sl = std::vec![0u8; w];
     let mut kl = std::vec![0u8; w];
-    scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w);
+    scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w);
     unsafe {
-      y2xx_n_to_luma_row::<12>(&p, &mut kl, w);
+      y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w);
     }
     assert_eq!(sl, kl, "simd128 y2xx<12>→luma diverges (width={w})");
 
     let mut slu = std::vec![0u16; w];
     let mut klu = std::vec![0u16; w];
-    scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w);
+    scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w);
     unsafe {
-      y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w);
+      y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w);
     }
     assert_eq!(slu, klu, "simd128 y2xx<12>→luma u16 diverges (width={w})");
   }
diff --git a/src/row/arch/wasm_simd128/v210.rs b/src/row/arch/wasm_simd128/v210.rs
index dba59ca9..264ca1c4 100644
--- a/src/row/arch/wasm_simd128/v210.rs
+++ b/src/row/arch/wasm_simd128/v210.rs
@@ -16,7 +16,7 @@
 
 use core::arch::wasm32::*;
 
-use super::*;
+use super::{endian::load_endian_u32x4, *};
 use crate::{ColorMatrix, row::scalar};
 
 /// Unpacks one 16-byte v210 word into three `v128` vectors holding
@@ -45,11 +45,11 @@ use crate::{ColorMatrix, row::scalar};
 /// wasm).
 #[inline]
 #[target_feature(enable = "simd128")]
-unsafe fn unpack_v210_word_wasm(ptr: *const u8) -> (v128, v128, v128) {
+unsafe fn unpack_v210_word_wasm<const BE: bool>(ptr: *const u8) -> (v128, v128, v128) {
   // SAFETY: caller obligation — `ptr` has 16 bytes readable; simd128
   // is enabled at compile time.
   unsafe {
-    let words = v128_load(ptr.cast());
+    let words = load_endian_u32x4::<BE>(ptr);
     let mask10 = i32x4_splat(0x3FF);
     let low10 = v128_and(words, mask10);
     let mid10 = v128_and(u32x4_shr(words, 10), mask10);
@@ -146,7 +146,7 @@ unsafe fn unpack_v210_word_wasm(ptr: *const u8) -> (v128, v128, v128) {
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u8],
   width: usize,
@@ -183,7 +183,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
     let cbv = i32x4_splat(coeffs.b_v());
 
     for w in 0..words {
-      let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16));
+      let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm::<BE>(packed.as_ptr().add(w * 16));
 
       let y_i16 = y_vec;
 
@@ -270,7 +270,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
       let tail_packed = &packed[words * 16..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::v210_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -287,7 +293,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u16],
   width: usize,
@@ -324,7 +330,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
     let cbv = i32x4_splat(coeffs.b_v());
 
     for w in 0..words {
-      let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16));
+      let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm::<BE>(packed.as_ptr().add(w * 16));
 
       let y_i16 = y_vec;
       let u_i16 = i16x8_sub(u_vec, bias_v);
@@ -391,7 +397,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
       let tail_packed = &packed[words * 16..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -414,7 +420,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn v210_to_luma_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let words = width / 6;
@@ -424,7 +434,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
     for w in 0..words {
-      let (y_vec, _, _) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16));
+      let (y_vec, _, _) = unpack_v210_word_wasm::<BE>(packed.as_ptr().add(w * 16));
       // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x16 via
       // saturating narrow (Y ≤ 1023 stays well inside [0, 255] post-shift).
       let y_shr = u16x8_shr(y_vec, 2);
@@ -439,7 +449,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
       let tail_packed = &packed[words * 16..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -456,7 +466,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn v210_to_luma_u16_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let words = width / 6;
@@ -466,7 +480,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
     for w in 0..words {
-      let (y_vec, _, _) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16));
+      let (y_vec, _, _) = unpack_v210_word_wasm::<BE>(packed.as_ptr().add(w * 16));
       // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice.
       let mut tmp = [0u16; 8];
       v128_store(tmp.as_mut_ptr().cast(), y_vec);
@@ -477,7 +491,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
       let tail_packed = &packed[words * 16..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/wasm_simd128/y216.rs b/src/row/arch/wasm_simd128/y216.rs
index 5beb78f2..7bdf6363 100644
--- a/src/row/arch/wasm_simd128/y216.rs
+++ b/src/row/arch/wasm_simd128/y216.rs
@@ -107,7 +107,7 @@ unsafe fn unpack_y216_8px_wasm(ptr: *const u16) -> (v128, v128, v128) {
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -124,102 +124,111 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
   const RND: i32 = 1 << 14;
 
   unsafe {
-    let rnd_v = i32x4_splat(RND);
-    let y_off32_v = i32x4_splat(y_off);
-    let y_scale_v = i32x4_splat(y_scale);
-    let c_scale_v = i32x4_splat(c_scale);
-    // Bias = 32768 = 0x8000; as i16 this wraps to -32768.
-    // Using the wrapping trick (i16x8_sub with bias16 = -32768) correctly
-    // maps full-u16 chroma [0, 65535] to [-32768, 32767].
-    let bias16_v = i16x8_splat(-32768i16);
-    let alpha_u8 = u8x16_splat(0xFF);
-    let cru = i32x4_splat(coeffs.r_u());
-    let crv = i32x4_splat(coeffs.r_v());
-    let cgu = i32x4_splat(coeffs.g_u());
-    let cgv = i32x4_splat(coeffs.g_v());
-    let cbu = i32x4_splat(coeffs.b_u());
-    let cbv = i32x4_splat(coeffs.b_v());
-
     let mut x = 0usize;
-    // 16 px/iter: two groups of 8 (lo = Y0..Y7, hi = Y8..Y15).
-    while x + 16 <= width {
-      let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2));
-      let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2 + 16));
-
-      // Chroma bias subtraction (wrapping trick for full-u16 range).
-      let u_lo_i16 = i16x8_sub(u_lo_vec, bias16_v);
-      let v_lo_i16 = i16x8_sub(v_lo_vec, bias16_v);
-      let u_hi_i16 = i16x8_sub(u_hi_vec, bias16_v);
-      let v_hi_i16 = i16x8_sub(v_hi_vec, bias16_v);
-
-      // Widen to i32x4 halves; only lo halves (lanes 0..3) are valid.
-      // Hi halves hold zeros (from the swizzle mask) — don't-care since
-      // `chroma_i16x8` discards lanes 4..7 after `dup_lo`.
-      let u_lo_lo = i32x4_extend_low_i16x8(u_lo_i16);
-      let u_lo_hi = i32x4_extend_high_i16x8(u_lo_i16);
-      let v_lo_lo = i32x4_extend_low_i16x8(v_lo_i16);
-      let v_lo_hi = i32x4_extend_high_i16x8(v_lo_i16);
-      let u_hi_lo = i32x4_extend_low_i16x8(u_hi_i16);
-      let u_hi_hi = i32x4_extend_high_i16x8(u_hi_i16);
-      let v_hi_lo = i32x4_extend_low_i16x8(v_hi_i16);
-      let v_hi_hi = i32x4_extend_high_i16x8(v_hi_i16);
-
-      // Q15 chroma scale → i32x4 (scaled chroma deltas).
-      let u_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_lo, c_scale_v), rnd_v));
-      let u_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(u_lo_hi, c_scale_v), rnd_v));
-      let v_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_lo, c_scale_v), rnd_v));
-      let v_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(v_lo_hi, c_scale_v), rnd_v));
-      let u_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(u_hi_lo, c_scale_v), rnd_v));
-      let u_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_hi, c_scale_v), rnd_v));
-      let v_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(v_hi_lo, c_scale_v), rnd_v));
-      let v_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_hi, c_scale_v), rnd_v));
-
-      // 8-lane i16 chroma vectors (valid in lanes 0..3; lanes 4..7 don't-care).
-      let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
-      let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
-      let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
-      let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v);
-      let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v);
-      let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v);
-
-      // Duplicate chroma into Y-pair slots (4:2:2 nearest-neighbor upsample).
-      let r_dup_lo = dup_lo(r_chroma_lo);
-      let g_dup_lo = dup_lo(g_chroma_lo);
-      let b_dup_lo = dup_lo(b_chroma_lo);
-      let r_dup_hi = dup_lo(r_chroma_hi);
-      let g_dup_hi = dup_lo(g_chroma_hi);
-      let b_dup_hi = dup_lo(b_chroma_hi);
-
-      // Y scale via unsigned widening (Y216 has full u16 range; i16 would
-      // overflow for Y > 32767).
-      let y_lo_scaled = scale_y_u16_wasm(y_lo_vec, y_off32_v, y_scale_v, rnd_v);
-      let y_hi_scaled = scale_y_u16_wasm(y_hi_vec, y_off32_v, y_scale_v, rnd_v);
-
-      // Saturating add → saturating narrow to u8x16.
-      let r_lo = i16x8_add_sat(y_lo_scaled, r_dup_lo);
-      let r_hi = i16x8_add_sat(y_hi_scaled, r_dup_hi);
-      let g_lo = i16x8_add_sat(y_lo_scaled, g_dup_lo);
-      let g_hi = i16x8_add_sat(y_hi_scaled, g_dup_hi);
-      let b_lo = i16x8_add_sat(y_lo_scaled, b_dup_lo);
-      let b_hi = i16x8_add_sat(y_hi_scaled, b_dup_hi);
-      let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi);
-      let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi);
-      let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi);
-
-      if ALPHA {
-        write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+    if !BE {
+      let rnd_v = i32x4_splat(RND);
+      let y_off32_v = i32x4_splat(y_off);
+      let y_scale_v = i32x4_splat(y_scale);
+      let c_scale_v = i32x4_splat(c_scale);
+      // Bias = 32768 = 0x8000; as i16 this wraps to -32768.
+      // Using the wrapping trick (i16x8_sub with bias16 = -32768) correctly
+      // maps full-u16 chroma [0, 65535] to [-32768, 32767].
+      let bias16_v = i16x8_splat(-32768i16);
+      let alpha_u8 = u8x16_splat(0xFF);
+      let cru = i32x4_splat(coeffs.r_u());
+      let crv = i32x4_splat(coeffs.r_v());
+      let cgu = i32x4_splat(coeffs.g_u());
+      let cgv = i32x4_splat(coeffs.g_v());
+      let cbu = i32x4_splat(coeffs.b_u());
+      let cbv = i32x4_splat(coeffs.b_v());
+
+      // 16 px/iter: two groups of 8 (lo = Y0..Y7, hi = Y8..Y15).
+      while x + 16 <= width {
+        let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2));
+        let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2 + 16));
+
+        // Chroma bias subtraction (wrapping trick for full-u16 range).
+        let u_lo_i16 = i16x8_sub(u_lo_vec, bias16_v);
+        let v_lo_i16 = i16x8_sub(v_lo_vec, bias16_v);
+        let u_hi_i16 = i16x8_sub(u_hi_vec, bias16_v);
+        let v_hi_i16 = i16x8_sub(v_hi_vec, bias16_v);
+
+        // Widen to i32x4 halves; only lo halves (lanes 0..3) are valid.
+        // Hi halves hold zeros (from the swizzle mask) — don't-care since
+        // `chroma_i16x8` discards lanes 4..7 after `dup_lo`.
+        let u_lo_lo = i32x4_extend_low_i16x8(u_lo_i16);
+        let u_lo_hi = i32x4_extend_high_i16x8(u_lo_i16);
+        let v_lo_lo = i32x4_extend_low_i16x8(v_lo_i16);
+        let v_lo_hi = i32x4_extend_high_i16x8(v_lo_i16);
+        let u_hi_lo = i32x4_extend_low_i16x8(u_hi_i16);
+        let u_hi_hi = i32x4_extend_high_i16x8(u_hi_i16);
+        let v_hi_lo = i32x4_extend_low_i16x8(v_hi_i16);
+        let v_hi_hi = i32x4_extend_high_i16x8(v_hi_i16);
+
+        // Q15 chroma scale → i32x4 (scaled chroma deltas).
+        let u_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_lo, c_scale_v), rnd_v));
+        let u_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(u_lo_hi, c_scale_v), rnd_v));
+        let v_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_lo, c_scale_v), rnd_v));
+        let v_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(v_lo_hi, c_scale_v), rnd_v));
+        let u_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(u_hi_lo, c_scale_v), rnd_v));
+        let u_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_hi, c_scale_v), rnd_v));
+        let v_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(v_hi_lo, c_scale_v), rnd_v));
+        let v_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_hi, c_scale_v), rnd_v));
+
+        // 8-lane i16 chroma vectors (valid in lanes 0..3; lanes 4..7 don't-care).
+        let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
+        let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
+        let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
+        let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v);
+        let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v);
+        let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v);
+
+        // Duplicate chroma into Y-pair slots (4:2:2 nearest-neighbor upsample).
+        let r_dup_lo = dup_lo(r_chroma_lo);
+        let g_dup_lo = dup_lo(g_chroma_lo);
+        let b_dup_lo = dup_lo(b_chroma_lo);
+        let r_dup_hi = dup_lo(r_chroma_hi);
+        let g_dup_hi = dup_lo(g_chroma_hi);
+        let b_dup_hi = dup_lo(b_chroma_hi);
+
+        // Y scale via unsigned widening (Y216 has full u16 range; i16 would
+        // overflow for Y > 32767).
+        let y_lo_scaled = scale_y_u16_wasm(y_lo_vec, y_off32_v, y_scale_v, rnd_v);
+        let y_hi_scaled = scale_y_u16_wasm(y_hi_vec, y_off32_v, y_scale_v, rnd_v);
+
+        // Saturating add → saturating narrow to u8x16.
+        let r_lo = i16x8_add_sat(y_lo_scaled, r_dup_lo);
+        let r_hi = i16x8_add_sat(y_hi_scaled, r_dup_hi);
+        let g_lo = i16x8_add_sat(y_lo_scaled, g_dup_lo);
+        let g_hi = i16x8_add_sat(y_hi_scaled, g_dup_hi);
+        let b_lo = i16x8_add_sat(y_lo_scaled, b_dup_lo);
+        let b_hi = i16x8_add_sat(y_hi_scaled, b_dup_hi);
+        let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi);
+        let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi);
+        let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi);
+
+        if ALPHA {
+          write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+        }
+        x += 16;
       }
-      x += 16;
     }
 
     // Scalar tail — remaining < 16 pixels.
+    // When BE=true the SIMD loop above is skipped (x stays 0), so this converts the whole row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::y216_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -237,7 +246,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements).
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -255,101 +264,104 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
   const RND_I32: i32 = 1 << 14;
 
   unsafe {
-    let alpha_u16 = u16x8_splat(0xFFFF);
-    let rnd_i64 = i64x2_splat(RND_I64);
-    let rnd_i32 = i32x4_splat(RND_I32);
-    let y_off32 = i32x4_splat(y_off);
-    let y_scale_i64 = i64x2_splat(y_scale as i64);
-    let c_scale_i32 = i32x4_splat(c_scale);
-    // Wrapping 0x8000 bias trick for full-u16 chroma.
-    let bias16 = i16x8_splat(-32768i16);
-    // Coefficients widened once to i64x2.
-    let cru = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_u()));
-    let crv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_v()));
-    let cgu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_u()));
-    let cgv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_v()));
-    let cbu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_u()));
-    let cbv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_v()));
-
     let mut x = 0usize;
-    // 8 px/iter: one call to unpack_y216_8px_wasm gives Y0..Y7 and 4 UV pairs.
-    while x + 8 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2));
-
-      // Chroma bias (wrapping trick).
-      let u_i16 = i16x8_sub(u_vec, bias16);
-      let v_i16 = i16x8_sub(v_vec, bias16);
-
-      // Widen low 4 lanes to i32x4 (high 4 are zeroed don't-cares).
-      let u_i32 = i32x4_extend_low_i16x8(u_i16);
-      let v_i32 = i32x4_extend_low_i16x8(v_i16);
-
-      // Q15 scale → 4 × i32 chroma deltas.
-      let u_d = i32x4_shr(i32x4_add(i32x4_mul(u_i32, c_scale_i32), rnd_i32), 15);
-      let v_d = i32x4_shr(i32x4_add(i32x4_mul(v_i32, c_scale_i32), rnd_i32), 15);
-
-      // Widen to 2 × i64x2 for i64 chroma pipeline.
-      let u_d_lo = i64x2_extend_low_i32x4(u_d);
-      let u_d_hi = i64x2_extend_high_i32x4(u_d);
-      let v_d_lo = i64x2_extend_low_i32x4(v_d);
-      let v_d_hi = i64x2_extend_high_i32x4(v_d);
-
-      let r_ch_lo = chroma_i64x2_wasm(cru, crv, u_d_lo, v_d_lo, rnd_i64);
-      let r_ch_hi = chroma_i64x2_wasm(cru, crv, u_d_hi, v_d_hi, rnd_i64);
-      let g_ch_lo = chroma_i64x2_wasm(cgu, cgv, u_d_lo, v_d_lo, rnd_i64);
-      let g_ch_hi = chroma_i64x2_wasm(cgu, cgv, u_d_hi, v_d_hi, rnd_i64);
-      let b_ch_lo = chroma_i64x2_wasm(cbu, cbv, u_d_lo, v_d_lo, rnd_i64);
-      let b_ch_hi = chroma_i64x2_wasm(cbu, cbv, u_d_hi, v_d_hi, rnd_i64);
-
-      // Combine each i64x2 pair → i32x4 [c0, c1, c2, c3].
-      let r_ch_i32 = combine_i64x2_pair_to_i32x4(r_ch_lo, r_ch_hi);
-      let g_ch_i32 = combine_i64x2_pair_to_i32x4(g_ch_lo, g_ch_hi);
-      let b_ch_i32 = combine_i64x2_pair_to_i32x4(b_ch_lo, b_ch_hi);
-
-      // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2).
-      // chroma_dup_i32x4_u16([c0,c1,c2,c3]) →
-      //   lo = [c0,c0,c1,c1], hi = [c2,c2,c3,c3]
-      let (r_dup_lo, r_dup_hi) = chroma_dup_i32x4_u16(r_ch_i32);
-      let (g_dup_lo, g_dup_hi) = chroma_dup_i32x4_u16(g_ch_i32);
-      let (b_dup_lo, b_dup_hi) = chroma_dup_i32x4_u16(b_ch_i32);
-
-      // Y: unsigned widen 8 u16 → 2 × i32x4, subtract y_off, scale in i64.
-      let y_lo_u32 = u32x4_extend_low_u16x8(y_vec);
-      let y_hi_u32 = u32x4_extend_high_u16x8(y_vec);
-      let y_lo_i32 = i32x4_sub(y_lo_u32, y_off32);
-      let y_hi_i32 = i32x4_sub(y_hi_u32, y_off32);
-
-      let y_lo_scaled = scale_y_i32x4_i64_wasm(y_lo_i32, y_scale_i64, rnd_i64);
-      let y_hi_scaled = scale_y_i32x4_i64_wasm(y_hi_i32, y_scale_i64, rnd_i64);
-
-      // Add Y + chroma, saturating narrow i32 → u16 (clamps [0, 65535]).
-      let r_u16 = u16x8_narrow_i32x4(
-        i32x4_add(y_lo_scaled, r_dup_lo),
-        i32x4_add(y_hi_scaled, r_dup_hi),
-      );
-      let g_u16 = u16x8_narrow_i32x4(
-        i32x4_add(y_lo_scaled, g_dup_lo),
-        i32x4_add(y_hi_scaled, g_dup_hi),
-      );
-      let b_u16 = u16x8_narrow_i32x4(
-        i32x4_add(y_lo_scaled, b_dup_lo),
-        i32x4_add(y_hi_scaled, b_dup_hi),
-      );
-
-      if ALPHA {
-        write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3));
+    if !BE {
+      let alpha_u16 = u16x8_splat(0xFFFF);
+      let rnd_i64 = i64x2_splat(RND_I64);
+      let rnd_i32 = i32x4_splat(RND_I32);
+      let y_off32 = i32x4_splat(y_off);
+      let y_scale_i64 = i64x2_splat(y_scale as i64);
+      let c_scale_i32 = i32x4_splat(c_scale);
+      // Wrapping 0x8000 bias trick for full-u16 chroma.
+      let bias16 = i16x8_splat(-32768i16);
+      // Coefficients widened once to i64x2.
+      let cru = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_u()));
+      let crv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_v()));
+      let cgu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_u()));
+      let cgv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_v()));
+      let cbu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_u()));
+      let cbv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_v()));
+
+      // 8 px/iter: one call to unpack_y216_8px_wasm gives Y0..Y7 and 4 UV pairs.
+      while x + 8 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2));
+
+        // Chroma bias (wrapping trick).
+        let u_i16 = i16x8_sub(u_vec, bias16);
+        let v_i16 = i16x8_sub(v_vec, bias16);
+
+        // Widen low 4 lanes to i32x4 (high 4 are zeroed don't-cares).
+        let u_i32 = i32x4_extend_low_i16x8(u_i16);
+        let v_i32 = i32x4_extend_low_i16x8(v_i16);
+
+        // Q15 scale → 4 × i32 chroma deltas.
+        let u_d = i32x4_shr(i32x4_add(i32x4_mul(u_i32, c_scale_i32), rnd_i32), 15);
+        let v_d = i32x4_shr(i32x4_add(i32x4_mul(v_i32, c_scale_i32), rnd_i32), 15);
+
+        // Widen to 2 × i64x2 for i64 chroma pipeline.
+        let u_d_lo = i64x2_extend_low_i32x4(u_d);
+        let u_d_hi = i64x2_extend_high_i32x4(u_d);
+        let v_d_lo = i64x2_extend_low_i32x4(v_d);
+        let v_d_hi = i64x2_extend_high_i32x4(v_d);
+
+        let r_ch_lo = chroma_i64x2_wasm(cru, crv, u_d_lo, v_d_lo, rnd_i64);
+        let r_ch_hi = chroma_i64x2_wasm(cru, crv, u_d_hi, v_d_hi, rnd_i64);
+        let g_ch_lo = chroma_i64x2_wasm(cgu, cgv, u_d_lo, v_d_lo, rnd_i64);
+        let g_ch_hi = chroma_i64x2_wasm(cgu, cgv, u_d_hi, v_d_hi, rnd_i64);
+        let b_ch_lo = chroma_i64x2_wasm(cbu, cbv, u_d_lo, v_d_lo, rnd_i64);
+        let b_ch_hi = chroma_i64x2_wasm(cbu, cbv, u_d_hi, v_d_hi, rnd_i64);
+
+        // Combine each i64x2 pair → i32x4 [c0, c1, c2, c3].
+        let r_ch_i32 = combine_i64x2_pair_to_i32x4(r_ch_lo, r_ch_hi);
+        let g_ch_i32 = combine_i64x2_pair_to_i32x4(g_ch_lo, g_ch_hi);
+        let b_ch_i32 = combine_i64x2_pair_to_i32x4(b_ch_lo, b_ch_hi);
+
+        // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2).
+        // chroma_dup_i32x4_u16([c0,c1,c2,c3]) →
+        //   lo = [c0,c0,c1,c1], hi = [c2,c2,c3,c3]
+        let (r_dup_lo, r_dup_hi) = chroma_dup_i32x4_u16(r_ch_i32);
+        let (g_dup_lo, g_dup_hi) = chroma_dup_i32x4_u16(g_ch_i32);
+        let (b_dup_lo, b_dup_hi) = chroma_dup_i32x4_u16(b_ch_i32);
+
+        // Y: unsigned widen 8 u16 → 2 × i32x4, subtract y_off, scale in i64.
+        let y_lo_u32 = u32x4_extend_low_u16x8(y_vec);
+        let y_hi_u32 = u32x4_extend_high_u16x8(y_vec);
+        let y_lo_i32 = i32x4_sub(y_lo_u32, y_off32);
+        let y_hi_i32 = i32x4_sub(y_hi_u32, y_off32);
+
+        let y_lo_scaled = scale_y_i32x4_i64_wasm(y_lo_i32, y_scale_i64, rnd_i64);
+        let y_hi_scaled = scale_y_i32x4_i64_wasm(y_hi_i32, y_scale_i64, rnd_i64);
+
+        // Add Y + chroma, saturating narrow i32 → u16 (clamps [0, 65535]).
+        let r_u16 = u16x8_narrow_i32x4(
+          i32x4_add(y_lo_scaled, r_dup_lo),
+          i32x4_add(y_hi_scaled, r_dup_hi),
+        );
+        let g_u16 = u16x8_narrow_i32x4(
+          i32x4_add(y_lo_scaled, g_dup_lo),
+          i32x4_add(y_hi_scaled, g_dup_hi),
+        );
+        let b_u16 = u16x8_narrow_i32x4(
+          i32x4_add(y_lo_scaled, b_dup_lo),
+          i32x4_add(y_hi_scaled, b_dup_hi),
+        );
+
+        if ALPHA {
+          write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3));
+        }
+        x += 8;
       }
-      x += 8;
     }
 
     // Scalar tail.
+    // When BE=true the SIMD loop above is skipped (x stays 0), so this converts the whole row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -373,48 +385,56 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn y216_to_luma_row<const BE: bool>(
+  packed: &[u16],
+  luma_out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(luma_out.len() >= width);
 
   unsafe {
-    // Y permute: even u16 lanes → low 8 bytes; zeroed high.
-    let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    // 16 px/iter: two groups of 8 Y samples.
-    while x + 16 <= width {
-      // lo group: Y0..Y7 from bytes x*2 .. x*2+32.
-      let lo0 = v128_load(packed.as_ptr().add(x * 2).cast());
-      let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast());
-      let y_lo0 = u8x16_swizzle(lo0, y_idx);
-      let y_lo1 = u8x16_swizzle(lo1, y_idx);
-      let y_lo =
-        i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1);
-
-      // hi group: Y8..Y15 from bytes x*2+32 .. x*2+64.
-      let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast());
-      let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast());
-      let y_hi0 = u8x16_swizzle(hi0, y_idx);
-      let y_hi1 = u8x16_swizzle(hi1, y_idx);
-      let y_hi =
-        i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1);
-
-      // >> 8: extract high byte of each u16 Y sample.
-      let y_shr_lo = u16x8_shr(y_lo, 8);
-      let y_shr_hi = u16x8_shr(y_hi, 8);
-      // Narrow 16 i16 → 16 u8 (no saturation needed; values ≤ 255).
-      let y_u8 = u8x16_narrow_i16x8(y_shr_lo, y_shr_hi);
-      v128_store(luma_out.as_mut_ptr().add(x).cast(), y_u8);
-      x += 16;
+    if !BE {
+      // Y permute: even u16 lanes → low 8 bytes; zeroed high.
+      let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      // 16 px/iter: two groups of 8 Y samples.
+      while x + 16 <= width {
+        // lo group: Y0..Y7 from u16 elements x*2 .. x*2+16 (32 bytes).
+        let lo0 = v128_load(packed.as_ptr().add(x * 2).cast());
+        let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast());
+        let y_lo0 = u8x16_swizzle(lo0, y_idx);
+        let y_lo1 = u8x16_swizzle(lo1, y_idx);
+        let y_lo =
+          i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1);
+
+        // hi group: Y8..Y15 from u16 elements x*2+16 .. x*2+32 (32 bytes).
+        let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast());
+        let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast());
+        let y_hi0 = u8x16_swizzle(hi0, y_idx);
+        let y_hi1 = u8x16_swizzle(hi1, y_idx);
+        let y_hi =
+          i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1);
+
+        // >> 8: extract high byte of each u16 Y sample.
+        let y_shr_lo = u16x8_shr(y_lo, 8);
+        let y_shr_hi = u16x8_shr(y_hi, 8);
+        // Narrow 16 i16 → 16 u8 (no saturation needed; values ≤ 255).
+        let y_u8 = u8x16_narrow_i16x8(y_shr_lo, y_shr_hi);
+        v128_store(luma_out.as_mut_ptr().add(x).cast(), y_u8);
+        x += 16;
+      }
     }
 
+    // Scalar tail — remaining < 16 pixels.
+    // When BE=true the full row is covered here.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -432,44 +452,52 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn y216_to_luma_u16_row<const BE: bool>(
+  packed: &[u16],
+  luma_out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(luma_out.len() >= width);
 
   unsafe {
-    let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    // 16 px/iter: two groups of 8 Y samples (u16 direct copy, no shift).
-    while x + 16 <= width {
-      // lo group: Y0..Y7
-      let lo0 = v128_load(packed.as_ptr().add(x * 2).cast());
-      let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast());
-      let y_lo0 = u8x16_swizzle(lo0, y_idx);
-      let y_lo1 = u8x16_swizzle(lo1, y_idx);
-      let y_lo =
-        i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1);
-
-      // hi group: Y8..Y15
-      let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast());
-      let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast());
-      let y_hi0 = u8x16_swizzle(hi0, y_idx);
-      let y_hi1 = u8x16_swizzle(hi1, y_idx);
-      let y_hi =
-        i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1);
-
-      // Direct store — full 16-bit Y, no shift.
-      v128_store(luma_out.as_mut_ptr().add(x).cast(), y_lo);
-      v128_store(luma_out.as_mut_ptr().add(x + 8).cast(), y_hi);
-      x += 16;
+    if !BE {
+      let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      // 16 px/iter: two groups of 8 Y samples (u16 direct copy, no shift).
+      while x + 16 <= width {
+        // lo group: Y0..Y7
+        let lo0 = v128_load(packed.as_ptr().add(x * 2).cast());
+        let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast());
+        let y_lo0 = u8x16_swizzle(lo0, y_idx);
+        let y_lo1 = u8x16_swizzle(lo1, y_idx);
+        let y_lo =
+          i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1);
+
+        // hi group: Y8..Y15
+        let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast());
+        let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast());
+        let y_hi0 = u8x16_swizzle(hi0, y_idx);
+        let y_hi1 = u8x16_swizzle(hi1, y_idx);
+        let y_hi =
+          i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1);
+
+        // Direct store — full 16-bit Y, no shift.
+        v128_store(luma_out.as_mut_ptr().add(x).cast(), y_lo);
+        v128_store(luma_out.as_mut_ptr().add(x + 8).cast(), y_hi);
+        x += 16;
+      }
     }
 
+    // Scalar tail — remaining < 16 pixels.
+    // When BE=true the full row is covered here.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/wasm_simd128/y2xx.rs b/src/row/arch/wasm_simd128/y2xx.rs
index 91e77803..83c4a6eb 100644
--- a/src/row/arch/wasm_simd128/y2xx.rs
+++ b/src/row/arch/wasm_simd128/y2xx.rs
@@ -137,7 +137,11 @@ unsafe fn unpack_y2xx_8px_wasm(ptr: *const u16, shr_count: u32) -> (v128, v128,
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -165,112 +169,115 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
   // adds are bounded by the `while x + 8 <= width` loop and the
   // caller-promised slice lengths checked above.
   unsafe {
-    let rnd_v = i32x4_splat(RND);
-    let y_off_v = i16x8_splat(y_off as i16);
-    let y_scale_v = i32x4_splat(y_scale);
-    let c_scale_v = i32x4_splat(c_scale);
-    let bias_v = i16x8_splat(bias as i16);
-    // Loop-invariant runtime shift count for `u16x8_shr`, see
-    // module-level note.
-    let shr_count: u32 = 16 - BITS;
-    let cru = i32x4_splat(coeffs.r_u());
-    let crv = i32x4_splat(coeffs.r_v());
-    let cgu = i32x4_splat(coeffs.g_u());
-    let cgv = i32x4_splat(coeffs.g_v());
-    let cbu = i32x4_splat(coeffs.b_u());
-    let cbv = i32x4_splat(coeffs.b_v());
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_wasm(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = y_vec;
-
-      // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since
-      // each chroma sample is ≤ 2^BITS - 1 ≤ 4095.
-      let u_i16 = i16x8_sub(u_vec, bias_v);
-      let v_i16 = i16x8_sub(v_vec, bias_v);
-
-      // Widen 8-lane i16 chroma to two i32x4 halves so the Q15
-      // multiplies don't overflow. Only lanes 0..3 of `_lo` are
-      // valid; `_hi` is entirely don't-care. We feed both halves
-      // through `chroma_i16x8` to recycle the helper exactly; the
-      // don't-care output lanes are discarded by the [`dup_lo`]
-      // duplicate step below (which only consumes lanes 0..3).
-      let u_lo_i32 = i32x4_extend_low_i16x8(u_i16);
-      let u_hi_i32 = i32x4_extend_high_i16x8(u_i16);
-      let v_lo_i32 = i32x4_extend_low_i16x8(v_i16);
-      let v_hi_i32 = i32x4_extend_high_i16x8(v_i16);
-
-      let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v));
-      let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v));
-      let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v));
-      let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v));
-
-      // 8-lane chroma vectors with valid data in lanes 0..3.
-      let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-
-      // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via
-      // [`dup_lo`] so lanes 0..7 of `r_dup` align with Y0..Y7. Lane
-      // order: [c0, c0, c1, c1, c2, c2, c3, c3].
-      let r_dup = dup_lo(r_chroma);
-      let g_dup = dup_lo(g_chroma);
-      let b_dup = dup_lo(b_chroma);
-
-      // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8.
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
-
-      // u8 narrow with saturation. `u8x16_narrow_i16x8(lo, hi)` emits
-      // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` so the low
-      // 8 bytes of the result hold the saturated u8 of the input
-      // i16x8. Only the first 8 bytes per channel matter.
-      let r_sum = i16x8_add_sat(y_scaled, r_dup);
-      let g_sum = i16x8_add_sat(y_scaled, g_dup);
-      let b_sum = i16x8_add_sat(y_scaled, b_dup);
-      let r_u8 = u8x16_narrow_i16x8(r_sum, r_sum);
-      let g_u8 = u8x16_narrow_i16x8(g_sum, g_sum);
-      let b_u8 = u8x16_narrow_i16x8(b_sum, b_sum);
-
-      // 8-pixel partial store: wasm-simd128's [`write_rgb_16`] /
-      // [`write_rgba_16`] emit 16-pixel output (48 / 64 bytes), so
-      // for the 8-px-iter body we use the v210-style stack-buffer +
-      // scalar interleave pattern. (8 px × 3 = 24 bytes RGB,
-      // 8 px × 4 = 32 bytes RGBA.)
-      let mut r_tmp = [0u8; 16];
-      let mut g_tmp = [0u8; 16];
-      let mut b_tmp = [0u8; 16];
-      v128_store(r_tmp.as_mut_ptr().cast(), r_u8);
-      v128_store(g_tmp.as_mut_ptr().cast(), g_u8);
-      v128_store(b_tmp.as_mut_ptr().cast(), b_u8);
-
-      if ALPHA {
-        let dst = &mut out[x * 4..x * 4 + 8 * 4];
-        for i in 0..8 {
-          dst[i * 4] = r_tmp[i];
-          dst[i * 4 + 1] = g_tmp[i];
-          dst[i * 4 + 2] = b_tmp[i];
-          dst[i * 4 + 3] = 0xFF;
-        }
-      } else {
-        let dst = &mut out[x * 3..x * 3 + 8 * 3];
-        for i in 0..8 {
-          dst[i * 3] = r_tmp[i];
-          dst[i * 3 + 1] = g_tmp[i];
-          dst[i * 3 + 2] = b_tmp[i];
+    if !BE {
+      let rnd_v = i32x4_splat(RND);
+      let y_off_v = i16x8_splat(y_off as i16);
+      let y_scale_v = i32x4_splat(y_scale);
+      let c_scale_v = i32x4_splat(c_scale);
+      let bias_v = i16x8_splat(bias as i16);
+      // Loop-invariant runtime shift count for `u16x8_shr`, see
+      // module-level note.
+      let shr_count: u32 = 16 - BITS;
+      let cru = i32x4_splat(coeffs.r_u());
+      let crv = i32x4_splat(coeffs.r_v());
+      let cgu = i32x4_splat(coeffs.g_u());
+      let cgv = i32x4_splat(coeffs.g_v());
+      let cbu = i32x4_splat(coeffs.b_u());
+      let cbv = i32x4_splat(coeffs.b_v());
+
+      while x + 8 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_wasm(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = y_vec;
+
+        // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since
+        // each chroma sample is ≤ 2^BITS - 1 ≤ 4095.
+        let u_i16 = i16x8_sub(u_vec, bias_v);
+        let v_i16 = i16x8_sub(v_vec, bias_v);
+
+        // Widen 8-lane i16 chroma to two i32x4 halves so the Q15
+        // multiplies don't overflow. Only lanes 0..3 of `_lo` are
+        // valid; `_hi` is entirely don't-care. We feed both halves
+        // through `chroma_i16x8` to recycle the helper exactly; the
+        // don't-care output lanes are discarded by the [`dup_lo`]
+        // duplicate step below (which only consumes lanes 0..3).
+        let u_lo_i32 = i32x4_extend_low_i16x8(u_i16);
+        let u_hi_i32 = i32x4_extend_high_i16x8(u_i16);
+        let v_lo_i32 = i32x4_extend_low_i16x8(v_i16);
+        let v_hi_i32 = i32x4_extend_high_i16x8(v_i16);
+
+        let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v));
+        let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v));
+        let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v));
+        let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v));
+
+        // 8-lane chroma vectors with valid data in lanes 0..3.
+        let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+
+        // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via
+        // [`dup_lo`] so lanes 0..7 of `r_dup` align with Y0..Y7. Lane
+        // order: [c0, c0, c1, c1, c2, c2, c3, c3].
+        let r_dup = dup_lo(r_chroma);
+        let g_dup = dup_lo(g_chroma);
+        let b_dup = dup_lo(b_chroma);
+
+        // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8.
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
+
+        // u8 narrow with saturation. `u8x16_narrow_i16x8(lo, hi)` emits
+        // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` so the low
+        // 8 bytes of the result hold the saturated u8 of the input
+        // i16x8. Only the first 8 bytes per channel matter.
+        let r_sum = i16x8_add_sat(y_scaled, r_dup);
+        let g_sum = i16x8_add_sat(y_scaled, g_dup);
+        let b_sum = i16x8_add_sat(y_scaled, b_dup);
+        let r_u8 = u8x16_narrow_i16x8(r_sum, r_sum);
+        let g_u8 = u8x16_narrow_i16x8(g_sum, g_sum);
+        let b_u8 = u8x16_narrow_i16x8(b_sum, b_sum);
+
+        // 8-pixel partial store: wasm-simd128's [`write_rgb_16`] /
+        // [`write_rgba_16`] emit 16-pixel output (48 / 64 bytes), so
+        // for the 8-px-iter body we use the v210-style stack-buffer +
+        // scalar interleave pattern. (8 px × 3 = 24 bytes RGB,
+        // 8 px × 4 = 32 bytes RGBA.)
+        let mut r_tmp = [0u8; 16];
+        let mut g_tmp = [0u8; 16];
+        let mut b_tmp = [0u8; 16];
+        v128_store(r_tmp.as_mut_ptr().cast(), r_u8);
+        v128_store(g_tmp.as_mut_ptr().cast(), g_u8);
+        v128_store(b_tmp.as_mut_ptr().cast(), b_u8);
+
+        if ALPHA {
+          let dst = &mut out[x * 4..x * 4 + 8 * 4];
+          for i in 0..8 {
+            dst[i * 4] = r_tmp[i];
+            dst[i * 4 + 1] = g_tmp[i];
+            dst[i * 4 + 2] = b_tmp[i];
+            dst[i * 4 + 3] = 0xFF;
+          }
+        } else {
+          let dst = &mut out[x * 3..x * 3 + 8 * 3];
+          for i in 0..8 {
+            dst[i * 3] = r_tmp[i];
+            dst[i * 3 + 1] = g_tmp[i];
+            dst[i * 3 + 2] = b_tmp[i];
+          }
         }
-      }
 
-      x += 8;
+        x += 8;
+      }
     }
 
     // Scalar tail — remaining < 8 pixels (always even per 4:2:2).
+    // When BE=true the full row is covered here.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -296,7 +303,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -322,72 +333,76 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let rnd_v = i32x4_splat(RND);
-    let y_off_v = i16x8_splat(y_off as i16);
-    let y_scale_v = i32x4_splat(y_scale);
-    let c_scale_v = i32x4_splat(c_scale);
-    let bias_v = i16x8_splat(bias as i16);
-    let shr_count: u32 = 16 - BITS;
-    let max_v = i16x8_splat(out_max);
-    let zero_v = i16x8_splat(0);
-    let cru = i32x4_splat(coeffs.r_u());
-    let crv = i32x4_splat(coeffs.r_v());
-    let cgu = i32x4_splat(coeffs.g_u());
-    let cgv = i32x4_splat(coeffs.g_v());
-    let cbu = i32x4_splat(coeffs.b_u());
-    let cbv = i32x4_splat(coeffs.b_v());
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_wasm(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = y_vec;
-      let u_i16 = i16x8_sub(u_vec, bias_v);
-      let v_i16 = i16x8_sub(v_vec, bias_v);
-
-      let u_lo_i32 = i32x4_extend_low_i16x8(u_i16);
-      let u_hi_i32 = i32x4_extend_high_i16x8(u_i16);
-      let v_lo_i32 = i32x4_extend_low_i16x8(v_i16);
-      let v_hi_i32 = i32x4_extend_high_i16x8(v_i16);
-
-      let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v));
-      let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v));
-      let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v));
-      let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v));
-
-      let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-
-      let r_dup = dup_lo(r_chroma);
-      let g_dup = dup_lo(g_chroma);
-      let b_dup = dup_lo(b_chroma);
-
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
-
-      // Native-depth output: clamp to [0, (1 << BITS) - 1].
-      // `i16x8_add_sat` saturates at i16 bounds (no-op here since
-      // |sum| stays well inside i16 for BITS ≤ 12), then min/max
-      // clamps to the BITS range.
-      let r = clamp_u16_max_wasm(i16x8_add_sat(y_scaled, r_dup), zero_v, max_v);
-      let g = clamp_u16_max_wasm(i16x8_add_sat(y_scaled, g_dup), zero_v, max_v);
-      let b = clamp_u16_max_wasm(i16x8_add_sat(y_scaled, b_dup), zero_v, max_v);
-
-      if ALPHA {
-        let alpha = i16x8_splat(out_max);
-        write_rgba_u16_8(r, g, b, alpha, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_u16_8(r, g, b, out.as_mut_ptr().add(x * 3));
-      }
+    if !BE {
+      let rnd_v = i32x4_splat(RND);
+      let y_off_v = i16x8_splat(y_off as i16);
+      let y_scale_v = i32x4_splat(y_scale);
+      let c_scale_v = i32x4_splat(c_scale);
+      let bias_v = i16x8_splat(bias as i16);
+      let shr_count: u32 = 16 - BITS;
+      let max_v = i16x8_splat(out_max);
+      let zero_v = i16x8_splat(0);
+      let cru = i32x4_splat(coeffs.r_u());
+      let crv = i32x4_splat(coeffs.r_v());
+      let cgu = i32x4_splat(coeffs.g_u());
+      let cgv = i32x4_splat(coeffs.g_v());
+      let cbu = i32x4_splat(coeffs.b_u());
+      let cbv = i32x4_splat(coeffs.b_v());
+
+      while x + 8 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_wasm(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = y_vec;
+        let u_i16 = i16x8_sub(u_vec, bias_v);
+        let v_i16 = i16x8_sub(v_vec, bias_v);
+
+        let u_lo_i32 = i32x4_extend_low_i16x8(u_i16);
+        let u_hi_i32 = i32x4_extend_high_i16x8(u_i16);
+        let v_lo_i32 = i32x4_extend_low_i16x8(v_i16);
+        let v_hi_i32 = i32x4_extend_high_i16x8(v_i16);
+
+        let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v));
+        let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v));
+        let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v));
+        let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v));
+
+        let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+
+        let r_dup = dup_lo(r_chroma);
+        let g_dup = dup_lo(g_chroma);
+        let b_dup = dup_lo(b_chroma);
+
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
+
+        // Native-depth output: clamp to [0, (1 << BITS) - 1].
+        // `i16x8_add_sat` saturates at i16 bounds (no-op here since
+        // |sum| stays well inside i16 for BITS ≤ 12), then min/max
+        // clamps to the BITS range.
+        let r = clamp_u16_max_wasm(i16x8_add_sat(y_scaled, r_dup), zero_v, max_v);
+        let g = clamp_u16_max_wasm(i16x8_add_sat(y_scaled, g_dup), zero_v, max_v);
+        let b = clamp_u16_max_wasm(i16x8_add_sat(y_scaled, b_dup), zero_v, max_v);
+
+        if ALPHA {
+          let alpha = i16x8_splat(out_max);
+          write_rgba_u16_8(r, g, b, alpha, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_u16_8(r, g, b, out.as_mut_ptr().add(x * 3));
+        }
 
-      x += 8;
+        x += 8;
+      }
     }
 
+    // Scalar tail — remaining < 8 pixels (always even per 4:2:2).
+    // When BE=true the full row is covered here.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -413,7 +428,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u8],
   width: usize,
@@ -430,40 +445,44 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    // Y permute mask: pick even u16 lanes (low byte at [0], high byte
-    // at [1]) into the low 8 bytes; high 8 bytes zeroed.
-    let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      let lo = v128_load(packed.as_ptr().add(x * 2).cast());
-      let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast());
-      let y_lo = u8x16_swizzle(lo, y_idx); // [Y0..Y3, _, _, _, _]
-      let y_hi = u8x16_swizzle(hi, y_idx); // [Y4..Y7, _, _, _, _]
-      // Concatenate low halves: same `_mm_unpacklo_epi64` pattern as
-      // the 4:2:2 unpack helper.
-      let y_vec =
-        i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); // [Y0..Y7] MSB-aligned
-
-      // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
-      // any BITS ∈ {10, 12} — same single-shift simplification used
-      // by NEON's `vshrn_n_u16::<8>` and SSE4.1's `_mm_srli_epi16::<8>`.
-      let y_shr = u16x8_shr(y_vec, 8);
-      // Pack 8 i16 lanes to u8 — only low 8 bytes used.
-      let y_u8 = u8x16_narrow_i16x8(y_shr, y_shr);
-      // Store low 8 bytes via stack buffer + copy_from_slice.
-      let mut tmp = [0u8; 16];
-      v128_store(tmp.as_mut_ptr().cast(), y_u8);
-      luma_out[x..x + 8].copy_from_slice(&tmp[..8]);
-
-      x += 8;
+    if !BE {
+      // Y permute mask: pick even u16 lanes (low byte at [0], high byte
+      // at [1]) into the low 8 bytes; high 8 bytes zeroed.
+      let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      while x + 8 <= width {
+        let lo = v128_load(packed.as_ptr().add(x * 2).cast());
+        let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast());
+        let y_lo = u8x16_swizzle(lo, y_idx); // [Y0..Y3, _, _, _, _]
+        let y_hi = u8x16_swizzle(hi, y_idx); // [Y4..Y7, _, _, _, _]
+        // Concatenate low halves: same `_mm_unpacklo_epi64` pattern as
+        // the 4:2:2 unpack helper.
+        let y_vec =
+          i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); // [Y0..Y7] MSB-aligned
+
+        // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
+        // any BITS ∈ {10, 12} — same single-shift simplification used
+        // by NEON's `vshrn_n_u16::<8>` and SSE4.1's `_mm_srli_epi16::<8>`.
+        let y_shr = u16x8_shr(y_vec, 8);
+        // Pack 8 i16 lanes to u8 — only low 8 bytes used.
+        let y_u8 = u8x16_narrow_i16x8(y_shr, y_shr);
+        // Store low 8 bytes via stack buffer + copy_from_slice.
+        let mut tmp = [0u8; 16];
+        v128_store(tmp.as_mut_ptr().cast(), y_u8);
+        luma_out[x..x + 8].copy_from_slice(&tmp[..8]);
+
+        x += 8;
+      }
     }
 
+    // Scalar tail — remaining < 8 pixels (always even per 4:2:2).
+    // When BE=true the full row is covered here.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -480,7 +499,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u16],
   width: usize,
@@ -497,29 +516,33 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let shr_count: u32 = 16 - BITS;
-    let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      let lo = v128_load(packed.as_ptr().add(x * 2).cast());
-      let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast());
-      let y_lo = u8x16_swizzle(lo, y_idx);
-      let y_hi = u8x16_swizzle(hi, y_idx);
-      let y_vec =
-        i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi);
-      // Right-shift by `(16 - BITS)` to bring MSB-aligned samples
-      // into low-bit-packed form for the native-depth u16 output.
-      let y_low = u16x8_shr(y_vec, shr_count);
-      v128_store(luma_out.as_mut_ptr().add(x).cast(), y_low);
-      x += 8;
+    if !BE {
+      let shr_count: u32 = 16 - BITS;
+      let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      while x + 8 <= width {
+        let lo = v128_load(packed.as_ptr().add(x * 2).cast());
+        let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast());
+        let y_lo = u8x16_swizzle(lo, y_idx);
+        let y_hi = u8x16_swizzle(hi, y_idx);
+        let y_vec =
+          i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi);
+        // Right-shift by `(16 - BITS)` to bring MSB-aligned samples
+        // into low-bit-packed form for the native-depth u16 output.
+        let y_low = u16x8_shr(y_vec, shr_count);
+        v128_store(luma_out.as_mut_ptr().add(x).cast(), y_low);
+        x += 8;
+      }
     }
 
+    // Scalar tail — remaining < 8 pixels (always even per 4:2:2).
+    // When BE=true the full row is covered here.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_u16_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_u16_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_avx2/tests/v210.rs b/src/row/arch/x86_avx2/tests/v210.rs
index 9c1f8315..d6bf96ae 100644
--- a/src/row/arch/x86_avx2/tests/v210.rs
+++ b/src/row/arch/x86_avx2/tests/v210.rs
@@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::v210_to_rgb_or_rgba_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::v210_to_rgb_or_rgba_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -82,9 +82,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::v210_to_luma_row(&p, &mut s, width);
+  scalar::v210_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_row(&p, &mut k, width);
+    v210_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX2 v210→luma diverges (width={width})");
 }
@@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::v210_to_luma_u16_row(&p, &mut s, width);
+  scalar::v210_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_u16_row(&p, &mut k, width);
+    v210_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX2 v210→luma u16 diverges (width={width})");
 }
@@ -238,7 +238,7 @@ fn avx2_v210_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order (u16, no shift loss)
   let mut luma = std::vec![0u16; W];
   unsafe {
-    v210_to_luma_u16_row(&packed, &mut luma, W);
+    v210_to_luma_u16_row::<false>(&packed, &mut luma, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(luma, expected_luma, "avx2 v210 luma reorder bug");
@@ -247,9 +247,15 @@ fn avx2_v210_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u8; W * 3];
   let mut scalar_rgb = std::vec![0u8; W * 3];
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false);
+    v210_to_rgb_or_rgba_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      crate::ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::v210_to_rgb_or_rgba_row::<false>(
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/x86_avx2/tests/y216.rs b/src/row/arch/x86_avx2/tests/y216.rs
index f7428a32..34cd1b89 100644
--- a/src/row/arch/x86_avx2/tests/y216.rs
+++ b/src/row/arch/x86_avx2/tests/y216.rs
@@ -16,9 +16,9 @@ fn check_rgb<const ALPHA: bool>(width: usize, matrix: ColorMatrix, full_range: b
   let bpp = if ALPHA { 4 } else { 3 };
   let mut s = std::vec![0u8; width * bpp];
   let mut k = std::vec![0u8; width * bpp];
-  scalar::y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s,
@@ -33,9 +33,9 @@ fn check_rgb_u16<const ALPHA: bool>(width: usize, matrix: ColorMatrix, full_rang
   let bpp = if ALPHA { 4 } else { 3 };
   let mut s = std::vec![0u16; width * bpp];
   let mut k = std::vec![0u16; width * bpp];
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s,
@@ -49,9 +49,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y216_to_luma_row(&p, &mut s, width);
+  scalar::y216_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_row(&p, &mut k, width);
+    y216_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX2 y216→luma u8 diverges (width={width})");
 }
@@ -60,9 +60,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y216_to_luma_u16_row(&p, &mut s, width);
+  scalar::y216_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_u16_row(&p, &mut k, width);
+    y216_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX2 y216→luma u16 diverges (width={width})");
 }
@@ -169,7 +169,7 @@ fn avx2_y216_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order at u16
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y216_to_luma_u16_row(&packed, &mut luma_u16, W);
+    y216_to_luma_u16_row::<false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(luma_u16, expected_luma, "AVX2 y216 luma_u16 reorder bug");
@@ -178,9 +178,15 @@ fn avx2_y216_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<false>(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false);
+    y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false>(
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/x86_avx2/tests/y2xx.rs b/src/row/arch/x86_avx2/tests/y2xx.rs
index de7fcd45..26825f38 100644
--- a/src/row/arch/x86_avx2/tests/y2xx.rs
+++ b/src/row/arch/x86_avx2/tests/y2xx.rs
@@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits).
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&packed, &mut luma_u16, W);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(
@@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
       &packed,
       &mut simd_rgb,
       W,
@@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
       false,
     );
   }
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
     &packed,
     &mut scalar_rgb,
     W,
@@ -107,9 +107,9 @@ fn check_rgb<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: boo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -121,9 +121,9 @@ fn check_rgba<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -135,9 +135,11 @@ fn check_rgb_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range:
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -149,9 +151,11 @@ fn check_rgba_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -163,9 +167,9 @@ fn check_luma<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y2xx_n_to_luma_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX2 y2xx<{BITS}>→luma diverges (width={width})");
 }
@@ -174,9 +178,9 @@ fn check_luma_u16<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y2xx_n_to_luma_u16_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX2 y2xx<{BITS}>→luma u16 diverges (width={width})");
 }
@@ -262,15 +266,15 @@ fn avx2_y212_matches_scalar_widths() {
     let p = pseudo_random_y212(w, 0xAA55);
     let mut s = std::vec![0u8; w * 3];
     let mut k = std::vec![0u8; w * 3];
-    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
+    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
     unsafe {
-      y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
+      y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
     }
     assert_eq!(s, k, "AVX2 y2xx<12>→RGB diverges (width={w})");
 
     let mut s_u16 = std::vec![0u16; w * 4];
     let mut k_u16 = std::vec![0u16; w * 4];
-    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
       &p,
       &mut s_u16,
       w,
@@ -278,7 +282,7 @@ fn avx2_y212_matches_scalar_widths() {
       true,
     );
     unsafe {
-      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
         &p,
         &mut k_u16,
         w,
@@ -290,17 +294,17 @@ fn avx2_y212_matches_scalar_widths() {
 
     let mut sl = std::vec![0u8; w];
     let mut kl = std::vec![0u8; w];
-    scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w);
+    scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w);
     unsafe {
-      y2xx_n_to_luma_row::<12>(&p, &mut kl, w);
+      y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w);
     }
     assert_eq!(sl, kl, "AVX2 y2xx<12>→luma diverges (width={w})");
 
     let mut slu = std::vec![0u16; w];
     let mut klu = std::vec![0u16; w];
-    scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w);
+    scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w);
     unsafe {
-      y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w);
+      y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w);
     }
     assert_eq!(slu, klu, "AVX2 y2xx<12>→luma u16 diverges (width={w})");
   }
diff --git a/src/row/arch/x86_avx2/v210.rs b/src/row/arch/x86_avx2/v210.rs
index 49407edd..13164309 100644
--- a/src/row/arch/x86_avx2/v210.rs
+++ b/src/row/arch/x86_avx2/v210.rs
@@ -34,7 +34,7 @@
 
 use core::arch::x86_64::*;
 
-use super::*;
+use super::{endian::load_endian_u32x8, *};
 use crate::{ColorMatrix, row::scalar};
 
 /// Unpacks two consecutive 16-byte v210 words (= 12 pixels) into
@@ -63,11 +63,11 @@ use crate::{ColorMatrix, row::scalar};
 /// `target_feature` includes AVX2 (which implies AVX, SSSE3, etc.).
 #[inline]
 #[target_feature(enable = "avx2")]
-unsafe fn unpack_v210_2words_avx2(ptr: *const u8) -> (__m256i, __m256i, __m256i) {
+unsafe fn unpack_v210_2words_avx2<const BE: bool>(ptr: *const u8) -> (__m256i, __m256i, __m256i) {
   // SAFETY: caller obligation — `ptr` has 32 bytes readable; AVX2
   // (and thus SSSE3) is available.
   unsafe {
-    let words = _mm256_loadu_si256(ptr.cast());
+    let words = load_endian_u32x8::<BE>(ptr);
     let mask10 = _mm256_set1_epi32(0x3FF);
     let low10 = _mm256_and_si256(words, mask10);
     let mid10 = _mm256_and_si256(_mm256_srli_epi32::<10>(words), mask10);
@@ -224,7 +224,7 @@ unsafe fn unpack_v210_2words_avx2(ptr: *const u8) -> (__m256i, __m256i, __m256i)
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u8],
   width: usize,
@@ -263,7 +263,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
     // Main loop: 12 pixels (2 v210 words = 32 bytes) per iteration.
     let pairs = words / 2;
     for p in 0..pairs {
-      let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32));
+      let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2::<BE>(packed.as_ptr().add(p * 32));
 
       let y_i16 = y_vec;
 
@@ -369,7 +369,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
       let tail_packed = &packed[pairs * 32..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::v210_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -386,7 +392,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u16],
   width: usize,
@@ -424,7 +430,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
     let pairs = words / 2;
     for p in 0..pairs {
-      let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32));
+      let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2::<BE>(packed.as_ptr().add(p * 32));
 
       let y_i16 = y_vec;
       let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
@@ -503,7 +509,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
       let tail_packed = &packed[pairs * 32..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -526,7 +532,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn v210_to_luma_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let words = width / 6;
@@ -537,7 +547,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
   unsafe {
     let pairs = words / 2;
     for p in 0..pairs {
-      let (y_vec, _, _) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32));
+      let (y_vec, _, _) = unpack_v210_2words_avx2::<BE>(packed.as_ptr().add(p * 32));
       // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x32 via packus.
       let y_shr = _mm256_srli_epi16::<2>(y_vec);
       let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256());
@@ -554,7 +564,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
       let tail_packed = &packed[pairs * 32..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -571,7 +581,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn v210_to_luma_u16_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let words = width / 6;
@@ -582,7 +596,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
   unsafe {
     let pairs = words / 2;
     for p in 0..pairs {
-      let (y_vec, _, _) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32));
+      let (y_vec, _, _) = unpack_v210_2words_avx2::<BE>(packed.as_ptr().add(p * 32));
       // Store first 12 of the 16 u16 lanes via stack buffer + copy_from_slice.
       let mut tmp = [0u16; 16];
       _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_vec);
@@ -596,7 +610,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
       let tail_packed = &packed[pairs * 32..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_avx2/y216.rs b/src/row/arch/x86_avx2/y216.rs
index 4184b3bb..cf850e18 100644
--- a/src/row/arch/x86_avx2/y216.rs
+++ b/src/row/arch/x86_avx2/y216.rs
@@ -109,7 +109,7 @@ unsafe fn unpack_y216_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i)
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -128,137 +128,146 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 
   // SAFETY: AVX2 availability is the caller's obligation.
   unsafe {
-    let rnd_v = _mm256_set1_epi32(RND);
-    // y_off as i32 — scale_y_u16_avx2 takes i32x8 y_off.
-    let y_off_v = _mm256_set1_epi32(y_off);
-    let y_scale_v = _mm256_set1_epi32(y_scale);
-    let c_scale_v = _mm256_set1_epi32(c_scale);
-    // Chroma bias: 32768 via wrapping 0x8000 = -32768i16.
-    let bias16_v = _mm256_set1_epi16(-32768i16);
-    let cru = _mm256_set1_epi32(coeffs.r_u());
-    let crv = _mm256_set1_epi32(coeffs.r_v());
-    let cgu = _mm256_set1_epi32(coeffs.g_u());
-    let cgv = _mm256_set1_epi32(coeffs.g_v());
-    let cbu = _mm256_set1_epi32(coeffs.b_u());
-    let cbv = _mm256_set1_epi32(coeffs.b_v());
-    let alpha_u8 = _mm256_set1_epi8(-1i8);
-
     let mut x = 0usize;
-    while x + 32 <= width {
-      // --- lo group: pixels x..x+15 (two 256-bit loads, 16 pixels) ------
-      let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2));
-
-      // Chroma bias subtraction (wrapping).
-      let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias16_v);
-      let v_lo_i16 = _mm256_sub_epi16(v_lo_vec, bias16_v);
-
-      // Widen 8 valid chroma i16 lanes to two i32x8 halves.
-      // Only the low 128 bits of u_lo_vec carry valid U0..U7;
-      // the high 128 bits are zeroed by the 0x88 permute (don't-care).
-      let u_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_lo_i16));
-      let u_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_lo_i16));
-      let v_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_lo_i16));
-      let v_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_lo_i16));
-
-      let u_d_lo_a = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_lo_a, c_scale_v),
-        rnd_v,
-      ));
-      let u_d_lo_b = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_lo_b, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_lo_a = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_lo_a, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_lo_b = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_lo_b, c_scale_v),
-        rnd_v,
-      ));
-
-      // chroma_i16x16: 16-lane vector with valid data in lanes 0..7 (lo).
-      let r_chroma_lo = chroma_i16x16(cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v);
-      let g_chroma_lo = chroma_i16x16(cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v);
-      let b_chroma_lo = chroma_i16x16(cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v);
-
-      // Duplicate each chroma into its 4:2:2 Y-pair slot.
-      // chroma_dup returns (lo16, hi16); only lo16 (lanes 0..15) is used
-      // here since we have only 8 chroma samples per 16-px half.
-      let (r_dup_lo, _) = chroma_dup(r_chroma_lo);
-      let (g_dup_lo, _) = chroma_dup(g_chroma_lo);
-      let (b_dup_lo, _) = chroma_dup(b_chroma_lo);
-
-      // Y scale: unsigned-widened to avoid i16 overflow for Y > 32767.
-      let y_lo_scaled = scale_y_u16_avx2(y_lo_vec, y_off_v, y_scale_v, rnd_v);
-
-      // --- hi group: pixels x+16..x+31 -----------------------------------
-      let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2 + 32));
-
-      let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias16_v);
-      let v_hi_i16 = _mm256_sub_epi16(v_hi_vec, bias16_v);
-
-      let u_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_hi_i16));
-      let u_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_hi_i16));
-      let v_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_hi_i16));
-      let v_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_hi_i16));
-
-      let u_d_hi_a = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_hi_a, c_scale_v),
-        rnd_v,
-      ));
-      let u_d_hi_b = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_hi_b, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_hi_a = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_hi_a, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_hi_b = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_hi_b, c_scale_v),
-        rnd_v,
-      ));
-
-      let r_chroma_hi = chroma_i16x16(cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v);
-      let g_chroma_hi = chroma_i16x16(cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v);
-      let b_chroma_hi = chroma_i16x16(cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v);
-
-      let (r_dup_hi, _) = chroma_dup(r_chroma_hi);
-      let (g_dup_hi, _) = chroma_dup(g_chroma_hi);
-      let (b_dup_hi, _) = chroma_dup(b_chroma_hi);
-
-      let y_hi_scaled = scale_y_u16_avx2(y_hi_vec, y_off_v, y_scale_v, rnd_v);
-
-      // Saturating add + narrow to u8x32 (32 pixels per channel).
-      let r_u8 = narrow_u8x32(
-        _mm256_adds_epi16(y_lo_scaled, r_dup_lo),
-        _mm256_adds_epi16(y_hi_scaled, r_dup_hi),
-      );
-      let g_u8 = narrow_u8x32(
-        _mm256_adds_epi16(y_lo_scaled, g_dup_lo),
-        _mm256_adds_epi16(y_hi_scaled, g_dup_hi),
-      );
-      let b_u8 = narrow_u8x32(
-        _mm256_adds_epi16(y_lo_scaled, b_dup_lo),
-        _mm256_adds_epi16(y_hi_scaled, b_dup_hi),
-      );
+    if !BE {
+      let rnd_v = _mm256_set1_epi32(RND);
+      // y_off as i32 — scale_y_u16_avx2 takes i32x8 y_off.
+      let y_off_v = _mm256_set1_epi32(y_off);
+      let y_scale_v = _mm256_set1_epi32(y_scale);
+      let c_scale_v = _mm256_set1_epi32(c_scale);
+      // Chroma bias: 32768 via wrapping 0x8000 = -32768i16.
+      let bias16_v = _mm256_set1_epi16(-32768i16);
+      let cru = _mm256_set1_epi32(coeffs.r_u());
+      let crv = _mm256_set1_epi32(coeffs.r_v());
+      let cgu = _mm256_set1_epi32(coeffs.g_u());
+      let cgv = _mm256_set1_epi32(coeffs.g_v());
+      let cbu = _mm256_set1_epi32(coeffs.b_u());
+      let cbv = _mm256_set1_epi32(coeffs.b_v());
+      let alpha_u8 = _mm256_set1_epi8(-1i8);
+
+      while x + 32 <= width {
+        // --- lo group: pixels x..x+15 (two 256-bit loads, 16 pixels) ----
+        let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2));
+
+        // Chroma bias subtraction (wrapping).
+        let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias16_v);
+        let v_lo_i16 = _mm256_sub_epi16(v_lo_vec, bias16_v);
+
+        // Widen 8 valid chroma i16 lanes to two i32x8 halves.
+        // Only the low 128 bits of u_lo_vec carry valid U0..U7;
+        // the high 128 bits are zeroed by the 0x88 permute (don't-care).
+        let u_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_lo_i16));
+        let u_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_lo_i16));
+        let v_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_lo_i16));
+        let v_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_lo_i16));
+
+        let u_d_lo_a = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_lo_a, c_scale_v),
+          rnd_v,
+        ));
+        let u_d_lo_b = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_lo_b, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_lo_a = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_lo_a, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_lo_b = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_lo_b, c_scale_v),
+          rnd_v,
+        ));
+
+        // chroma_i16x16: 16-lane vector with valid data in lanes 0..7 (lo).
+        let r_chroma_lo = chroma_i16x16(cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v);
+        let g_chroma_lo = chroma_i16x16(cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v);
+        let b_chroma_lo = chroma_i16x16(cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v);
+
+        // Duplicate each chroma into its 4:2:2 Y-pair slot.
+        // chroma_dup returns (lo16, hi16); only lo16 (lanes 0..15) is used
+        // here since we have only 8 chroma samples per 16-px half.
+        let (r_dup_lo, _) = chroma_dup(r_chroma_lo);
+        let (g_dup_lo, _) = chroma_dup(g_chroma_lo);
+        let (b_dup_lo, _) = chroma_dup(b_chroma_lo);
+
+        // Y scale: unsigned-widened to avoid i16 overflow for Y > 32767.
+        let y_lo_scaled = scale_y_u16_avx2(y_lo_vec, y_off_v, y_scale_v, rnd_v);
+
+        // --- hi group: pixels x+16..x+31 -----------------------------------
+        let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2 + 32));
+
+        let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias16_v);
+        let v_hi_i16 = _mm256_sub_epi16(v_hi_vec, bias16_v);
+
+        let u_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_hi_i16));
+        let u_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_hi_i16));
+        let v_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_hi_i16));
+        let v_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_hi_i16));
+
+        let u_d_hi_a = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_hi_a, c_scale_v),
+          rnd_v,
+        ));
+        let u_d_hi_b = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_hi_b, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_hi_a = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_hi_a, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_hi_b = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_hi_b, c_scale_v),
+          rnd_v,
+        ));
+
+        let r_chroma_hi = chroma_i16x16(cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v);
+        let g_chroma_hi = chroma_i16x16(cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v);
+        let b_chroma_hi = chroma_i16x16(cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v);
+
+        let (r_dup_hi, _) = chroma_dup(r_chroma_hi);
+        let (g_dup_hi, _) = chroma_dup(g_chroma_hi);
+        let (b_dup_hi, _) = chroma_dup(b_chroma_hi);
+
+        let y_hi_scaled = scale_y_u16_avx2(y_hi_vec, y_off_v, y_scale_v, rnd_v);
+
+        // Saturating add + narrow to u8x32 (32 pixels per channel).
+        let r_u8 = narrow_u8x32(
+          _mm256_adds_epi16(y_lo_scaled, r_dup_lo),
+          _mm256_adds_epi16(y_hi_scaled, r_dup_hi),
+        );
+        let g_u8 = narrow_u8x32(
+          _mm256_adds_epi16(y_lo_scaled, g_dup_lo),
+          _mm256_adds_epi16(y_hi_scaled, g_dup_hi),
+        );
+        let b_u8 = narrow_u8x32(
+          _mm256_adds_epi16(y_lo_scaled, b_dup_lo),
+          _mm256_adds_epi16(y_hi_scaled, b_dup_hi),
+        );
 
-      if ALPHA {
-        write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
-      }
+        if ALPHA {
+          write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+        }
 
-      x += 32;
+        x += 32;
+      }
     }
 
     // Scalar tail — remaining < 32 pixels.
+    // BE=true skips the SIMD loop (x stays 0), so this scalar call converts the whole row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::y216_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -280,7 +289,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements).
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -298,132 +307,135 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
   // SAFETY: AVX2 availability is the caller's obligation.
   unsafe {
-    let alpha_u16 = _mm_set1_epi16(-1i16);
-    let rnd_v = _mm256_set1_epi64x(RND);
-    let rnd32_v = _mm256_set1_epi32(1 << 14);
-    let y_off_v = _mm256_set1_epi32(y_off);
-    let y_scale_v = _mm256_set1_epi32(y_scale);
-    let c_scale_v = _mm256_set1_epi32(c_scale);
-    // Chroma bias via wrapping 0x8000 trick.
-    let bias16_v = _mm256_set1_epi16(-32768i16);
-    let cru = _mm256_set1_epi32(coeffs.r_u());
-    let crv = _mm256_set1_epi32(coeffs.r_v());
-    let cgu = _mm256_set1_epi32(coeffs.g_u());
-    let cgv = _mm256_set1_epi32(coeffs.g_v());
-    let cbu = _mm256_set1_epi32(coeffs.b_u());
-    let cbv = _mm256_set1_epi32(coeffs.b_v());
-
     let mut x = 0usize;
-    while x + 16 <= width {
-      // Two 256-bit loads → 16 pixels, 8 UV pairs.
-      let (y_vec, u_vec, v_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2));
-
-      // Subtract chroma bias.
-      let u_i16 = _mm256_sub_epi16(u_vec, bias16_v);
-      let v_i16 = _mm256_sub_epi16(v_vec, bias16_v);
-
-      // Widen 8 valid chroma i16 lanes to i32x8.
-      // Low 128 of u_vec / v_vec hold U0..U7 / V0..V7 after 0x88 permute.
-      let u_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
-      let v_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
-
-      // Scale UV in i32 (8 lanes; |chroma_centered × c_scale| fits i32).
-      let u_d = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_i32, c_scale_v),
-        rnd32_v,
-      ));
-      let v_d = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_i32, c_scale_v),
-        rnd32_v,
-      ));
-
-      // i64 chroma: even/odd i32 lanes via 0xF5 shuffle.
-      let u_d_odd = _mm256_shuffle_epi32::<0xF5>(u_d);
-      let v_d_odd = _mm256_shuffle_epi32::<0xF5>(v_d);
-
-      let r_ch_even = chroma_i64x4_avx2(cru, crv, u_d, v_d, rnd_v);
-      let r_ch_odd = chroma_i64x4_avx2(cru, crv, u_d_odd, v_d_odd, rnd_v);
-      let g_ch_even = chroma_i64x4_avx2(cgu, cgv, u_d, v_d, rnd_v);
-      let g_ch_odd = chroma_i64x4_avx2(cgu, cgv, u_d_odd, v_d_odd, rnd_v);
-      let b_ch_even = chroma_i64x4_avx2(cbu, cbv, u_d, v_d, rnd_v);
-      let b_ch_odd = chroma_i64x4_avx2(cbu, cbv, u_d_odd, v_d_odd, rnd_v);
-
-      // Reassemble i64x4 pairs → i32x8 [c0..c7].
-      let r_ch_i32 = reassemble_i64x4_to_i32x8(r_ch_even, r_ch_odd);
-      let g_ch_i32 = reassemble_i64x4_to_i32x8(g_ch_even, g_ch_odd);
-      let b_ch_i32 = reassemble_i64x4_to_i32x8(b_ch_even, b_ch_odd);
-
-      // Duplicate each of 8 chroma values into 2 per-pixel slots (4:2:2).
-      let (r_dup_lo, r_dup_hi) = chroma_dup_i32(r_ch_i32);
-      let (g_dup_lo, g_dup_hi) = chroma_dup_i32(g_ch_i32);
-      let (b_dup_lo, b_dup_hi) = chroma_dup_i32(b_ch_i32);
-
-      // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64.
-      // y_vec from unpack_y216_16px_avx2 is __m256i with 16 u16 lanes.
-      let y_lo_u16 = _mm256_castsi256_si128(y_vec);
-      let y_hi_u16 = _mm256_extracti128_si256::<1>(y_vec);
-      let y_lo_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_lo_u16), y_off_v);
-      let y_hi_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_hi_u16), y_off_v);
-
-      let y_lo_scaled = scale_y_i32x8_i64(y_lo_i32, y_scale_v, rnd_v);
-      let y_hi_scaled = scale_y_i32x8_i64(y_hi_i32, y_scale_v, rnd_v);
-
-      // Add Y + chroma, saturate to u16 via _mm256_packus_epi32 + 0xD8 fixup.
-      let r_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(
-        _mm256_add_epi32(y_lo_scaled, r_dup_lo),
-        _mm256_add_epi32(y_hi_scaled, r_dup_hi),
-      ));
-      let g_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(
-        _mm256_add_epi32(y_lo_scaled, g_dup_lo),
-        _mm256_add_epi32(y_hi_scaled, g_dup_hi),
-      ));
-      let b_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(
-        _mm256_add_epi32(y_lo_scaled, b_dup_lo),
-        _mm256_add_epi32(y_hi_scaled, b_dup_hi),
-      ));
-
-      // Write 16 pixels via two 8-pixel helpers.
-      if ALPHA {
-        let dst = out.as_mut_ptr().add(x * 4);
-        write_rgba_u16_8(
-          _mm256_castsi256_si128(r_u16),
-          _mm256_castsi256_si128(g_u16),
-          _mm256_castsi256_si128(b_u16),
-          alpha_u16,
-          dst,
-        );
-        write_rgba_u16_8(
-          _mm256_extracti128_si256::<1>(r_u16),
-          _mm256_extracti128_si256::<1>(g_u16),
-          _mm256_extracti128_si256::<1>(b_u16),
-          alpha_u16,
-          dst.add(32),
-        );
-      } else {
-        let dst = out.as_mut_ptr().add(x * 3);
-        write_rgb_u16_8(
-          _mm256_castsi256_si128(r_u16),
-          _mm256_castsi256_si128(g_u16),
-          _mm256_castsi256_si128(b_u16),
-          dst,
-        );
-        write_rgb_u16_8(
-          _mm256_extracti128_si256::<1>(r_u16),
-          _mm256_extracti128_si256::<1>(g_u16),
-          _mm256_extracti128_si256::<1>(b_u16),
-          dst.add(24),
-        );
+    if !BE {
+      let alpha_u16 = _mm_set1_epi16(-1i16);
+      let rnd_v = _mm256_set1_epi64x(RND);
+      let rnd32_v = _mm256_set1_epi32(1 << 14);
+      let y_off_v = _mm256_set1_epi32(y_off);
+      let y_scale_v = _mm256_set1_epi32(y_scale);
+      let c_scale_v = _mm256_set1_epi32(c_scale);
+      // Chroma bias via wrapping 0x8000 trick.
+      let bias16_v = _mm256_set1_epi16(-32768i16);
+      let cru = _mm256_set1_epi32(coeffs.r_u());
+      let crv = _mm256_set1_epi32(coeffs.r_v());
+      let cgu = _mm256_set1_epi32(coeffs.g_u());
+      let cgv = _mm256_set1_epi32(coeffs.g_v());
+      let cbu = _mm256_set1_epi32(coeffs.b_u());
+      let cbv = _mm256_set1_epi32(coeffs.b_v());
+
+      while x + 16 <= width {
+        // Two 256-bit loads → 16 pixels, 8 UV pairs.
+        let (y_vec, u_vec, v_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2));
+
+        // Subtract chroma bias.
+        let u_i16 = _mm256_sub_epi16(u_vec, bias16_v);
+        let v_i16 = _mm256_sub_epi16(v_vec, bias16_v);
+
+        // Widen 8 valid chroma i16 lanes to i32x8.
+        // Low 128 of u_vec / v_vec hold U0..U7 / V0..V7 after 0x88 permute.
+        let u_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
+        let v_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
+
+        // Scale UV in i32 (8 lanes; |chroma_centered × c_scale| fits i32).
+        let u_d = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_i32, c_scale_v),
+          rnd32_v,
+        ));
+        let v_d = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_i32, c_scale_v),
+          rnd32_v,
+        ));
+
+        // i64 chroma: even/odd i32 lanes via 0xF5 shuffle.
+        let u_d_odd = _mm256_shuffle_epi32::<0xF5>(u_d);
+        let v_d_odd = _mm256_shuffle_epi32::<0xF5>(v_d);
+
+        let r_ch_even = chroma_i64x4_avx2(cru, crv, u_d, v_d, rnd_v);
+        let r_ch_odd = chroma_i64x4_avx2(cru, crv, u_d_odd, v_d_odd, rnd_v);
+        let g_ch_even = chroma_i64x4_avx2(cgu, cgv, u_d, v_d, rnd_v);
+        let g_ch_odd = chroma_i64x4_avx2(cgu, cgv, u_d_odd, v_d_odd, rnd_v);
+        let b_ch_even = chroma_i64x4_avx2(cbu, cbv, u_d, v_d, rnd_v);
+        let b_ch_odd = chroma_i64x4_avx2(cbu, cbv, u_d_odd, v_d_odd, rnd_v);
+
+        // Reassemble i64x4 pairs → i32x8 [c0..c7].
+        let r_ch_i32 = reassemble_i64x4_to_i32x8(r_ch_even, r_ch_odd);
+        let g_ch_i32 = reassemble_i64x4_to_i32x8(g_ch_even, g_ch_odd);
+        let b_ch_i32 = reassemble_i64x4_to_i32x8(b_ch_even, b_ch_odd);
+
+        // Duplicate each of 8 chroma values into 2 per-pixel slots (4:2:2).
+        let (r_dup_lo, r_dup_hi) = chroma_dup_i32(r_ch_i32);
+        let (g_dup_lo, g_dup_hi) = chroma_dup_i32(g_ch_i32);
+        let (b_dup_lo, b_dup_hi) = chroma_dup_i32(b_ch_i32);
+
+        // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64.
+        // y_vec from unpack_y216_16px_avx2 is __m256i with 16 u16 lanes.
+        let y_lo_u16 = _mm256_castsi256_si128(y_vec);
+        let y_hi_u16 = _mm256_extracti128_si256::<1>(y_vec);
+        let y_lo_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_lo_u16), y_off_v);
+        let y_hi_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_hi_u16), y_off_v);
+
+        let y_lo_scaled = scale_y_i32x8_i64(y_lo_i32, y_scale_v, rnd_v);
+        let y_hi_scaled = scale_y_i32x8_i64(y_hi_i32, y_scale_v, rnd_v);
+
+        // Add Y + chroma, saturate to u16 via _mm256_packus_epi32 + 0xD8 fixup.
+        let r_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(
+          _mm256_add_epi32(y_lo_scaled, r_dup_lo),
+          _mm256_add_epi32(y_hi_scaled, r_dup_hi),
+        ));
+        let g_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(
+          _mm256_add_epi32(y_lo_scaled, g_dup_lo),
+          _mm256_add_epi32(y_hi_scaled, g_dup_hi),
+        ));
+        let b_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(
+          _mm256_add_epi32(y_lo_scaled, b_dup_lo),
+          _mm256_add_epi32(y_hi_scaled, b_dup_hi),
+        ));
+
+        // Write 16 pixels via two 8-pixel helpers.
+        if ALPHA {
+          let dst = out.as_mut_ptr().add(x * 4);
+          write_rgba_u16_8(
+            _mm256_castsi256_si128(r_u16),
+            _mm256_castsi256_si128(g_u16),
+            _mm256_castsi256_si128(b_u16),
+            alpha_u16,
+            dst,
+          );
+          write_rgba_u16_8(
+            _mm256_extracti128_si256::<1>(r_u16),
+            _mm256_extracti128_si256::<1>(g_u16),
+            _mm256_extracti128_si256::<1>(b_u16),
+            alpha_u16,
+            dst.add(32),
+          );
+        } else {
+          let dst = out.as_mut_ptr().add(x * 3);
+          write_rgb_u16_8(
+            _mm256_castsi256_si128(r_u16),
+            _mm256_castsi256_si128(g_u16),
+            _mm256_castsi256_si128(b_u16),
+            dst,
+          );
+          write_rgb_u16_8(
+            _mm256_extracti128_si256::<1>(r_u16),
+            _mm256_extracti128_si256::<1>(g_u16),
+            _mm256_extracti128_si256::<1>(b_u16),
+            dst.add(24),
+          );
+        }
+
+        x += 16;
       }
-
-      x += 16;
     }
 
     // Scalar tail — remaining < 16 pixels.
+    // With `BE` the SIMD loop above is skipped (`x == 0`), so the scalar path converts the full row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -450,62 +462,69 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) {
+pub(crate) unsafe fn y216_to_luma_row<const BE: bool>(
+  packed: &[u16],
+  out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
 
   // SAFETY: AVX2 availability is the caller's obligation.
   unsafe {
-    // Per-lane Y permute mask: pick even u16 lanes (low byte first) into
-    // the low 8 bytes of each 128-bit lane; high 8 bytes zeroed.
-    let split_idx = _mm256_setr_epi8(
-      0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
-      0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
-    );
-
     let mut x = 0usize;
-    while x + 32 <= width {
-      // Four 256-bit loads: v0/v1 for pixels x..x+15, v2/v3 for x+16..x+31.
-      let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
-      let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
-      let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast());
-      let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast());
-
-      // Per-lane shuffle → Y into low 64-bit chunk of each 128-bit lane.
-      let v0s = _mm256_shuffle_epi8(v0, split_idx);
-      let v1s = _mm256_shuffle_epi8(v1, split_idx);
-      let v2s = _mm256_shuffle_epi8(v2, split_idx);
-      let v3s = _mm256_shuffle_epi8(v3, split_idx);
-
-      // 0x88 = [0, 2, 0, 2]: pack low 64-bit chunks (lane0 + lane1) into low 128 bits.
-      let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
-      let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
-      let v2p = _mm256_permute4x64_epi64::<0x88>(v2s);
-      let v3p = _mm256_permute4x64_epi64::<0x88>(v3s);
-
-      // Cross-vector merge: lo 128 of v0p + lo 128 of v1p → Y0..Y15 (16 u16).
-      let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15]
-      let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31]
-
-      // `>> 8` to obtain u8 luma (high byte of each Y u16 sample).
-      // `_mm256_srli_epi16::<8>` has a literal const count.
-      let y_lo_shr = _mm256_srli_epi16::<8>(y_lo);
-      let y_hi_shr = _mm256_srli_epi16::<8>(y_hi);
-
-      // Narrow 32 × i16 → 32 × u8. narrow_u8x32 already applies 0xD8 lane fixup.
-      let y_u8 = narrow_u8x32(y_lo_shr, y_hi_shr);
-      _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_u8);
-
-      x += 32;
+    if !BE {
+      // Per-lane Y permute mask: pick even u16 lanes (low byte first) into
+      // the low 8 bytes of each 128-bit lane; high 8 bytes zeroed.
+      let split_idx = _mm256_setr_epi8(
+        0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
+        0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
+      );
+
+      while x + 32 <= width {
+        // Four 256-bit loads: v0/v1 for pixels x..x+15, v2/v3 for x+16..x+31.
+        let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
+        let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
+        let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast());
+        let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast());
+
+        // Per-lane shuffle → Y into low 64-bit chunk of each 128-bit lane.
+        let v0s = _mm256_shuffle_epi8(v0, split_idx);
+        let v1s = _mm256_shuffle_epi8(v1, split_idx);
+        let v2s = _mm256_shuffle_epi8(v2, split_idx);
+        let v3s = _mm256_shuffle_epi8(v3, split_idx);
+
+        // 0x88 = [0, 2, 0, 2]: pack low 64-bit chunks (lane0 + lane1) into low 128 bits.
+        let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
+        let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
+        let v2p = _mm256_permute4x64_epi64::<0x88>(v2s);
+        let v3p = _mm256_permute4x64_epi64::<0x88>(v3s);
+
+        // Cross-vector merge: lo 128 of v0p + lo 128 of v1p → Y0..Y15 (16 u16).
+        let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15]
+        let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31]
+
+        // `>> 8` to obtain u8 luma (high byte of each Y u16 sample).
+        // `_mm256_srli_epi16::<8>` has a literal const count.
+        let y_lo_shr = _mm256_srli_epi16::<8>(y_lo);
+        let y_hi_shr = _mm256_srli_epi16::<8>(y_hi);
+
+        // Narrow 32 × i16 → 32 × u8. narrow_u8x32 already applies 0xD8 lane fixup.
+        let y_u8 = narrow_u8x32(y_lo_shr, y_hi_shr);
+        _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_u8);
+
+        x += 32;
+      }
     }
 
     // Scalar tail — remaining < 32 pixels.
+    // With `BE` the SIMD loop above is skipped (`x == 0`), so the scalar path converts the full row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -526,52 +545,59 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi
 /// 4. `out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) {
+pub(crate) unsafe fn y216_to_luma_u16_row<const BE: bool>(
+  packed: &[u16],
+  out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
 
   // SAFETY: AVX2 availability is the caller's obligation.
   unsafe {
-    // Per-lane Y permute mask (same as luma_row above).
-    let split_idx = _mm256_setr_epi8(
-      0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1,
-      -1, -1, -1, -1, -1, -1,
-    );
-
     let mut x = 0usize;
-    while x + 32 <= width {
-      let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
-      let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
-      let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast());
-      let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast());
-
-      let v0s = _mm256_shuffle_epi8(v0, split_idx);
-      let v1s = _mm256_shuffle_epi8(v1, split_idx);
-      let v2s = _mm256_shuffle_epi8(v2, split_idx);
-      let v3s = _mm256_shuffle_epi8(v3, split_idx);
-
-      let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
-      let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
-      let v2p = _mm256_permute4x64_epi64::<0x88>(v2s);
-      let v3p = _mm256_permute4x64_epi64::<0x88>(v3s);
-
-      let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15]
-      let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31]
-
-      // Direct store — full 16-bit Y values, no shift.
-      _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_lo);
-      _mm256_storeu_si256(out.as_mut_ptr().add(x + 16).cast(), y_hi);
-
-      x += 32;
+    if !BE {
+      // Per-lane Y permute mask (same as luma_row above).
+      let split_idx = _mm256_setr_epi8(
+        0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1,
+        -1, -1, -1, -1, -1, -1,
+      );
+
+      while x + 32 <= width {
+        let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
+        let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
+        let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast());
+        let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast());
+
+        let v0s = _mm256_shuffle_epi8(v0, split_idx);
+        let v1s = _mm256_shuffle_epi8(v1, split_idx);
+        let v2s = _mm256_shuffle_epi8(v2, split_idx);
+        let v3s = _mm256_shuffle_epi8(v3, split_idx);
+
+        let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
+        let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
+        let v2p = _mm256_permute4x64_epi64::<0x88>(v2s);
+        let v3p = _mm256_permute4x64_epi64::<0x88>(v3s);
+
+        let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15]
+        let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31]
+
+        // Direct store — full 16-bit Y values, no shift.
+        _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_lo);
+        _mm256_storeu_si256(out.as_mut_ptr().add(x + 16).cast(), y_hi);
+
+        x += 32;
+      }
     }
 
     // Scalar tail — remaining < 32 pixels.
+    // With `BE` the SIMD loop above is skipped (`x == 0`), so the scalar path converts the full row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_avx2/y2xx.rs b/src/row/arch/x86_avx2/y2xx.rs
index 1b9d76f3..bc3c5bb1 100644
--- a/src/row/arch/x86_avx2/y2xx.rs
+++ b/src/row/arch/x86_avx2/y2xx.rs
@@ -164,7 +164,11 @@ unsafe fn unpack_y2xx_16px_avx2(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -192,122 +196,125 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
   // by the `while x + 16 <= width` loop and the caller-promised slice
   // lengths checked above.
   unsafe {
-    let rnd_v = _mm256_set1_epi32(RND);
-    let y_off_v = _mm256_set1_epi16(y_off as i16);
-    let y_scale_v = _mm256_set1_epi32(y_scale);
-    let c_scale_v = _mm256_set1_epi32(c_scale);
-    let bias_v = _mm256_set1_epi16(bias as i16);
-    // Loop-invariant runtime shift count for `_mm256_srl_epi16` — see
-    // module-level note.
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let cru = _mm256_set1_epi32(coeffs.r_u());
-    let crv = _mm256_set1_epi32(coeffs.r_v());
-    let cgu = _mm256_set1_epi32(coeffs.g_u());
-    let cgv = _mm256_set1_epi32(coeffs.g_v());
-    let cbu = _mm256_set1_epi32(coeffs.b_u());
-    let cbv = _mm256_set1_epi32(coeffs.b_v());
-
     let mut x = 0usize;
-    while x + 16 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_16px_avx2(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = y_vec;
-
-      // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since
-      // each chroma sample is ≤ 2^BITS - 1 ≤ 4095.
-      let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
-      let v_i16 = _mm256_sub_epi16(v_vec, bias_v);
-
-      // Widen 8-valid-lane i16 chroma to two i32x8 halves so the Q15
-      // multiplies don't overflow. Only lanes 0..7 of `_lo` are
-      // valid; `_hi` is entirely don't-care. We feed both halves
-      // through `chroma_i16x16` to recycle the helper exactly; the
-      // don't-care output lanes are discarded by the
-      // `chroma_dup` step below (which only consumes lanes 0..7 in
-      // its `lo16` return).
-      let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
-      let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16));
-      let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
-      let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16));
-
-      let u_d_lo = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_lo_i32, c_scale_v),
-        rnd_v,
-      ));
-      let u_d_hi = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_hi_i32, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_lo = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_lo_i32, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_hi = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_hi_i32, c_scale_v),
-        rnd_v,
-      ));
-
-      // 16-lane chroma vectors with valid data in lanes 0..7.
-      let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-
-      // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via
-      // `chroma_dup` so lanes 0..15 of `_dup_lo` align with Y0..Y15.
-      // `_dup_hi` is don't-care (covers Y16..Y31 if input had 32
-      // chroma; we have only 8).
-      let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma);
-      let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma);
-      let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma);
-
-      // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x16.
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
-
-      // u8 narrow with saturation. `narrow_u8x32(lo, hi)` emits 32 u8
-      // lanes from 32 i16 lanes; we feed `lo` and zero for `hi` so
-      // the low 16 bytes hold the saturated u8 of our 16 valid lanes.
-      let zero = _mm256_setzero_si256();
-      let r_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, r_dup_lo), zero);
-      let g_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, g_dup_lo), zero);
-      let b_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, b_dup_lo), zero);
-
-      // 16-pixel partial store: `write_rgb_32` / `write_rgba_32` emit
-      // 32-pixel output (96 / 128 bytes) — too wide for our 16-pixel
-      // iter. Use the v210-style stack-buffer + scalar interleave
-      // pattern. (16 px × 3 = 48 bytes RGB, 16 px × 4 = 64 bytes RGBA.)
-      let mut r_tmp = [0u8; 32];
-      let mut g_tmp = [0u8; 32];
-      let mut b_tmp = [0u8; 32];
-      _mm256_storeu_si256(r_tmp.as_mut_ptr().cast(), r_u8);
-      _mm256_storeu_si256(g_tmp.as_mut_ptr().cast(), g_u8);
-      _mm256_storeu_si256(b_tmp.as_mut_ptr().cast(), b_u8);
-
-      if ALPHA {
-        let dst = &mut out[x * 4..x * 4 + 16 * 4];
-        for i in 0..16 {
-          dst[i * 4] = r_tmp[i];
-          dst[i * 4 + 1] = g_tmp[i];
-          dst[i * 4 + 2] = b_tmp[i];
-          dst[i * 4 + 3] = 0xFF;
-        }
-      } else {
-        let dst = &mut out[x * 3..x * 3 + 16 * 3];
-        for i in 0..16 {
-          dst[i * 3] = r_tmp[i];
-          dst[i * 3 + 1] = g_tmp[i];
-          dst[i * 3 + 2] = b_tmp[i];
+    if !BE {
+      let rnd_v = _mm256_set1_epi32(RND);
+      let y_off_v = _mm256_set1_epi16(y_off as i16);
+      let y_scale_v = _mm256_set1_epi32(y_scale);
+      let c_scale_v = _mm256_set1_epi32(c_scale);
+      let bias_v = _mm256_set1_epi16(bias as i16);
+      // Loop-invariant runtime shift count for `_mm256_srl_epi16` — see
+      // module-level note.
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let cru = _mm256_set1_epi32(coeffs.r_u());
+      let crv = _mm256_set1_epi32(coeffs.r_v());
+      let cgu = _mm256_set1_epi32(coeffs.g_u());
+      let cgv = _mm256_set1_epi32(coeffs.g_v());
+      let cbu = _mm256_set1_epi32(coeffs.b_u());
+      let cbv = _mm256_set1_epi32(coeffs.b_v());
+
+      while x + 16 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_16px_avx2(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = y_vec;
+
+        // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since
+        // each chroma sample is ≤ 2^BITS - 1 ≤ 4095.
+        let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
+        let v_i16 = _mm256_sub_epi16(v_vec, bias_v);
+
+        // Widen 8-valid-lane i16 chroma to two i32x8 halves so the Q15
+        // multiplies don't overflow. Only lanes 0..7 of `_lo` are
+        // valid; `_hi` is entirely don't-care. We feed both halves
+        // through `chroma_i16x16` to recycle the helper exactly; the
+        // don't-care output lanes are discarded by the
+        // `chroma_dup` step below (which only consumes lanes 0..7 in
+        // its `lo16` return).
+        let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
+        let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16));
+        let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
+        let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16));
+
+        let u_d_lo = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_lo_i32, c_scale_v),
+          rnd_v,
+        ));
+        let u_d_hi = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_hi_i32, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_lo = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_lo_i32, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_hi = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_hi_i32, c_scale_v),
+          rnd_v,
+        ));
+
+        // 16-lane chroma vectors with valid data in lanes 0..7.
+        let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+
+        // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via
+        // `chroma_dup` so lanes 0..15 of `_dup_lo` align with Y0..Y15.
+        // `_dup_hi` is don't-care (it would cover Y16..Y31 if all 16
+        // chroma lanes were valid; only lanes 0..7 are).
+        let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma);
+        let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma);
+        let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma);
+
+        // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x16.
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
+
+        // u8 narrow with saturation. `narrow_u8x32(lo, hi)` emits 32 u8
+        // lanes from 32 i16 lanes; we feed `lo` and zero for `hi` so
+        // the low 16 bytes hold the saturated u8 of our 16 valid lanes.
+        let zero = _mm256_setzero_si256();
+        let r_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, r_dup_lo), zero);
+        let g_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, g_dup_lo), zero);
+        let b_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, b_dup_lo), zero);
+
+        // 16-pixel partial store: `write_rgb_32` / `write_rgba_32` emit
+        // 32-pixel output (96 / 128 bytes) — too wide for our 16-pixel
+        // iter. Use the v210-style stack-buffer + scalar interleave
+        // pattern. (16 px × 3 = 48 bytes RGB, 16 px × 4 = 64 bytes RGBA.)
+        let mut r_tmp = [0u8; 32];
+        let mut g_tmp = [0u8; 32];
+        let mut b_tmp = [0u8; 32];
+        _mm256_storeu_si256(r_tmp.as_mut_ptr().cast(), r_u8);
+        _mm256_storeu_si256(g_tmp.as_mut_ptr().cast(), g_u8);
+        _mm256_storeu_si256(b_tmp.as_mut_ptr().cast(), b_u8);
+
+        if ALPHA {
+          let dst = &mut out[x * 4..x * 4 + 16 * 4];
+          for i in 0..16 {
+            dst[i * 4] = r_tmp[i];
+            dst[i * 4 + 1] = g_tmp[i];
+            dst[i * 4 + 2] = b_tmp[i];
+            dst[i * 4 + 3] = 0xFF;
+          }
+        } else {
+          let dst = &mut out[x * 3..x * 3 + 16 * 3];
+          for i in 0..16 {
+            dst[i * 3] = r_tmp[i];
+            dst[i * 3 + 1] = g_tmp[i];
+            dst[i * 3 + 2] = b_tmp[i];
+          }
         }
-      }
 
-      x += 16;
+        x += 16;
+      }
     }
 
     // Scalar tail — remaining < 16 pixels (always even per 4:2:2).
+    // With `BE` the SIMD loop above is skipped (`x == 0`), so the scalar path converts the full row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -334,7 +341,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -360,112 +371,114 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let rnd_v = _mm256_set1_epi32(RND);
-    let y_off_v = _mm256_set1_epi16(y_off as i16);
-    let y_scale_v = _mm256_set1_epi32(y_scale);
-    let c_scale_v = _mm256_set1_epi32(c_scale);
-    let bias_v = _mm256_set1_epi16(bias as i16);
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let max_v = _mm256_set1_epi16(out_max);
-    let zero_v = _mm256_set1_epi16(0);
-    let cru = _mm256_set1_epi32(coeffs.r_u());
-    let crv = _mm256_set1_epi32(coeffs.r_v());
-    let cgu = _mm256_set1_epi32(coeffs.g_u());
-    let cgv = _mm256_set1_epi32(coeffs.g_v());
-    let cbu = _mm256_set1_epi32(coeffs.b_u());
-    let cbv = _mm256_set1_epi32(coeffs.b_v());
-
     let mut x = 0usize;
-    while x + 16 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_16px_avx2(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = y_vec;
-      let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
-      let v_i16 = _mm256_sub_epi16(v_vec, bias_v);
-
-      let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
-      let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16));
-      let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
-      let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16));
-
-      let u_d_lo = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_lo_i32, c_scale_v),
-        rnd_v,
-      ));
-      let u_d_hi = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(u_hi_i32, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_lo = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_lo_i32, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_hi = q15_shift(_mm256_add_epi32(
-        _mm256_mullo_epi32(v_hi_i32, c_scale_v),
-        rnd_v,
-      ));
-
-      let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-
-      let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma);
-      let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma);
-      let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma);
-
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
-
-      // Native-depth output: clamp to [0, (1 << BITS) - 1]. The AVX2
-      // `clamp_u16_max_x16` mirrors SSE4.1's `clamp_u16_max`.
-      let r = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v);
-      let g = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v);
-      let b = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v);
-
-      // 16-pixel u16 store: split each i16x16 channel into two
-      // 128-bit halves and use the SSE4.1 u16 interleave helpers
-      // (`write_rgb_u16_8` / `write_rgba_u16_8`) — same pattern as
-      // the AVX2 high-bit YUV planar u16 path.
-      if ALPHA {
-        let alpha_u16 = _mm_set1_epi16(out_max);
-        let dst = out.as_mut_ptr().add(x * 4);
-        write_rgba_u16_8(
-          _mm256_castsi256_si128(r),
-          _mm256_castsi256_si128(g),
-          _mm256_castsi256_si128(b),
-          alpha_u16,
-          dst,
-        );
-        write_rgba_u16_8(
-          _mm256_extracti128_si256::<1>(r),
-          _mm256_extracti128_si256::<1>(g),
-          _mm256_extracti128_si256::<1>(b),
-          alpha_u16,
-          dst.add(32),
-        );
-      } else {
-        let dst = out.as_mut_ptr().add(x * 3);
-        write_rgb_u16_8(
-          _mm256_castsi256_si128(r),
-          _mm256_castsi256_si128(g),
-          _mm256_castsi256_si128(b),
-          dst,
-        );
-        write_rgb_u16_8(
-          _mm256_extracti128_si256::<1>(r),
-          _mm256_extracti128_si256::<1>(g),
-          _mm256_extracti128_si256::<1>(b),
-          dst.add(24),
-        );
-      }
+    if !BE {
+      let rnd_v = _mm256_set1_epi32(RND);
+      let y_off_v = _mm256_set1_epi16(y_off as i16);
+      let y_scale_v = _mm256_set1_epi32(y_scale);
+      let c_scale_v = _mm256_set1_epi32(c_scale);
+      let bias_v = _mm256_set1_epi16(bias as i16);
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let max_v = _mm256_set1_epi16(out_max);
+      let zero_v = _mm256_set1_epi16(0);
+      let cru = _mm256_set1_epi32(coeffs.r_u());
+      let crv = _mm256_set1_epi32(coeffs.r_v());
+      let cgu = _mm256_set1_epi32(coeffs.g_u());
+      let cgv = _mm256_set1_epi32(coeffs.g_v());
+      let cbu = _mm256_set1_epi32(coeffs.b_u());
+      let cbv = _mm256_set1_epi32(coeffs.b_v());
+
+      while x + 16 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_16px_avx2(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = y_vec;
+        let u_i16 = _mm256_sub_epi16(u_vec, bias_v);
+        let v_i16 = _mm256_sub_epi16(v_vec, bias_v);
+
+        let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16));
+        let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16));
+        let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16));
+        let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16));
+
+        let u_d_lo = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_lo_i32, c_scale_v),
+          rnd_v,
+        ));
+        let u_d_hi = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(u_hi_i32, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_lo = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_lo_i32, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_hi = q15_shift(_mm256_add_epi32(
+          _mm256_mullo_epi32(v_hi_i32, c_scale_v),
+          rnd_v,
+        ));
+
+        let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+
+        let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma);
+        let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma);
+        let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma);
+
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
+
+        // Native-depth output: clamp to [0, (1 << BITS) - 1]. The AVX2
+        // `clamp_u16_max_x16` mirrors SSE4.1's `clamp_u16_max`.
+        let r = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v);
+        let g = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v);
+        let b = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v);
+
+        // 16-pixel u16 store: split each i16x16 channel into two
+        // 128-bit halves and use the SSE4.1 u16 interleave helpers
+        // (`write_rgb_u16_8` / `write_rgba_u16_8`) — same pattern as
+        // the AVX2 high-bit YUV planar u16 path.
+        if ALPHA {
+          let alpha_u16 = _mm_set1_epi16(out_max);
+          let dst = out.as_mut_ptr().add(x * 4);
+          write_rgba_u16_8(
+            _mm256_castsi256_si128(r),
+            _mm256_castsi256_si128(g),
+            _mm256_castsi256_si128(b),
+            alpha_u16,
+            dst,
+          );
+          write_rgba_u16_8(
+            _mm256_extracti128_si256::<1>(r),
+            _mm256_extracti128_si256::<1>(g),
+            _mm256_extracti128_si256::<1>(b),
+            alpha_u16,
+            dst.add(32),
+          );
+        } else {
+          let dst = out.as_mut_ptr().add(x * 3);
+          write_rgb_u16_8(
+            _mm256_castsi256_si128(r),
+            _mm256_castsi256_si128(g),
+            _mm256_castsi256_si128(b),
+            dst,
+          );
+          write_rgb_u16_8(
+            _mm256_extracti128_si256::<1>(r),
+            _mm256_extracti128_si256::<1>(g),
+            _mm256_extracti128_si256::<1>(b),
+            dst.add(24),
+          );
+        }
 
-      x += 16;
+        x += 16;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -491,7 +504,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u8],
   width: usize,
@@ -508,50 +521,52 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    // Per-lane Y permute mask: pick even u16 lanes (low byte at [0],
-    // high byte at [1]) into the low 8 bytes; high 8 bytes zeroed.
-    let split_idx = _mm256_setr_epi8(
-      0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
-      0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
-    );
-
     let mut x = 0usize;
-    while x + 16 <= width {
-      let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
-      let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
-      let v0s = _mm256_shuffle_epi8(v0, split_idx);
-      let v1s = _mm256_shuffle_epi8(v1, split_idx);
-      // After per-lane shuffle: each 256-bit vector has 8 valid u16 Y
-      // values in its two lanes' low 64 bits. Pack lane0_low and
-      // lane1_low into the low 128 bits of each vector via
-      // `_mm256_permute4x64_epi64::<0x88>` (= [0, 2, 0, 2]).
-      let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
-      let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
-      // Low 128 of v0p = [Y0..Y7] (8 u16 = 16 bytes).
-      // Low 128 of v1p = [Y8..Y15].
-      // Combine via `_mm256_permute2x128_si256::<0x20>` (low | low).
-      let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p);
-
-      // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
-      // any BITS ∈ {10, 12} — same single-shift simplification used
-      // by NEON's `vshrn_n_u16::<8>`. `_mm256_srli_epi16::<8>` has a
-      // literal const count, so it works without runtime-count helper.
-      let y_shr = _mm256_srli_epi16::<8>(y_vec);
-      // Pack 16 i16 lanes to u8 — only low 16 bytes used.
-      let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256());
-      // Store low 16 bytes via stack buffer + copy_from_slice.
-      let mut tmp = [0u8; 32];
-      _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_u8);
-      luma_out[x..x + 16].copy_from_slice(&tmp[..16]);
-
-      x += 16;
+    if !BE {
+      // Per-lane Y permute mask: pick even u16 lanes (low byte at [0],
+      // high byte at [1]) into the low 8 bytes; high 8 bytes zeroed.
+      let split_idx = _mm256_setr_epi8(
+        0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
+        0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
+      );
+
+      while x + 16 <= width {
+        let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
+        let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
+        let v0s = _mm256_shuffle_epi8(v0, split_idx);
+        let v1s = _mm256_shuffle_epi8(v1, split_idx);
+        // After per-lane shuffle: each 256-bit vector has 8 valid u16 Y
+        // values in its two lanes' low 64 bits. Pack lane0_low and
+        // lane1_low into the low 128 bits of each vector via
+        // `_mm256_permute4x64_epi64::<0x88>` (= [0, 2, 0, 2]).
+        let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
+        let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
+        // Low 128 of v0p = [Y0..Y7] (8 u16 = 16 bytes).
+        // Low 128 of v1p = [Y8..Y15].
+        // Combine via `_mm256_permute2x128_si256::<0x20>` (low | low).
+        let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p);
+
+        // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
+        // any BITS ∈ {10, 12} — same single-shift simplification used
+        // by NEON's `vshrn_n_u16::<8>`. `_mm256_srli_epi16::<8>` has a
+        // literal const count, so it works without a runtime-count helper.
+        let y_shr = _mm256_srli_epi16::<8>(y_vec);
+        // Pack 16 i16 lanes to u8 — only low 16 bytes used.
+        let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256());
+        // Store low 16 bytes via stack buffer + copy_from_slice.
+        let mut tmp = [0u8; 32];
+        _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_u8);
+        luma_out[x..x + 16].copy_from_slice(&tmp[..16]);
+
+        x += 16;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -569,7 +584,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u16],
   width: usize,
@@ -586,33 +601,35 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let split_idx = _mm256_setr_epi8(
-      0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
-      0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
-    );
-
     let mut x = 0usize;
-    while x + 16 <= width {
-      let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
-      let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
-      let v0s = _mm256_shuffle_epi8(v0, split_idx);
-      let v1s = _mm256_shuffle_epi8(v1, split_idx);
-      let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
-      let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
-      let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p);
-      // Right-shift by `(16 - BITS)` to bring MSB-aligned samples
-      // into low-bit-packed form for the native-depth u16 output.
-      let y_low = _mm256_srl_epi16(y_vec, shr_count);
-      _mm256_storeu_si256(luma_out.as_mut_ptr().add(x).cast(), y_low);
-      x += 16;
+    if !BE {
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let split_idx = _mm256_setr_epi8(
+        0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
+        0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
+      );
+
+      while x + 16 <= width {
+        let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
+        let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
+        let v0s = _mm256_shuffle_epi8(v0, split_idx);
+        let v1s = _mm256_shuffle_epi8(v1, split_idx);
+        let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
+        let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
+        let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p);
+        // Right-shift by `(16 - BITS)` to bring MSB-aligned samples
+        // into low-bit-packed form for the native-depth u16 output.
+        let y_low = _mm256_srl_epi16(y_vec, shr_count);
+        _mm256_storeu_si256(luma_out.as_mut_ptr().add(x).cast(), y_low);
+        x += 16;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_u16_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_u16_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_avx512/tests/v210.rs b/src/row/arch/x86_avx512/tests/v210.rs
index 0abf4bae..f2652cb8 100644
--- a/src/row/arch/x86_avx512/tests/v210.rs
+++ b/src/row/arch/x86_avx512/tests/v210.rs
@@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::v210_to_rgb_or_rgba_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::v210_to_rgb_or_rgba_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -82,9 +82,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::v210_to_luma_row(&p, &mut s, width);
+  scalar::v210_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_row(&p, &mut k, width);
+    v210_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX-512 v210→luma diverges (width={width})");
 }
@@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::v210_to_luma_u16_row(&p, &mut s, width);
+  scalar::v210_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_u16_row(&p, &mut k, width);
+    v210_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX-512 v210→luma u16 diverges (width={width})");
 }
@@ -250,7 +250,7 @@ fn avx512_v210_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order (u16, no shift loss)
   let mut luma = std::vec![0u16; W];
   unsafe {
-    v210_to_luma_u16_row(&packed, &mut luma, W);
+    v210_to_luma_u16_row::<false>(&packed, &mut luma, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(luma, expected_luma, "avx512 v210 luma reorder bug");
@@ -259,9 +259,15 @@ fn avx512_v210_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u8; W * 3];
   let mut scalar_rgb = std::vec![0u8; W * 3];
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false);
+    v210_to_rgb_or_rgba_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      crate::ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::v210_to_rgb_or_rgba_row::<false>(
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/x86_avx512/tests/y216.rs b/src/row/arch/x86_avx512/tests/y216.rs
index 93fa76cc..ae8b2bc7 100644
--- a/src/row/arch/x86_avx512/tests/y216.rs
+++ b/src/row/arch/x86_avx512/tests/y216.rs
@@ -16,9 +16,9 @@ fn check_rgb<const ALPHA: bool>(width: usize, matrix: ColorMatrix, full_range: b
   let bpp = if ALPHA { 4 } else { 3 };
   let mut s = std::vec![0u8; width * bpp];
   let mut k = std::vec![0u8; width * bpp];
-  scalar::y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s,
@@ -33,9 +33,9 @@ fn check_rgb_u16<const ALPHA: bool>(width: usize, matrix: ColorMatrix, full_rang
   let bpp = if ALPHA { 4 } else { 3 };
   let mut s = std::vec![0u16; width * bpp];
   let mut k = std::vec![0u16; width * bpp];
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s,
@@ -49,9 +49,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y216_to_luma_row(&p, &mut s, width);
+  scalar::y216_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_row(&p, &mut k, width);
+    y216_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX-512 y216→luma u8 diverges (width={width})");
 }
@@ -60,9 +60,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y216_to_luma_u16_row(&p, &mut s, width);
+  scalar::y216_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_u16_row(&p, &mut k, width);
+    y216_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX-512 y216→luma u16 diverges (width={width})");
 }
@@ -178,7 +178,7 @@ fn avx512_y216_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order at u16
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y216_to_luma_u16_row(&packed, &mut luma_u16, W);
+    y216_to_luma_u16_row::<false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(luma_u16, expected_luma, "AVX-512 y216 luma_u16 reorder bug");
@@ -187,9 +187,15 @@ fn avx512_y216_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<false>(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false);
+    y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false>(
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/x86_avx512/tests/y2xx.rs b/src/row/arch/x86_avx512/tests/y2xx.rs
index fd5ccbad..dc609f8d 100644
--- a/src/row/arch/x86_avx512/tests/y2xx.rs
+++ b/src/row/arch/x86_avx512/tests/y2xx.rs
@@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits).
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&packed, &mut luma_u16, W);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(
@@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
       &packed,
       &mut simd_rgb,
       W,
@@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
       false,
     );
   }
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
     &packed,
     &mut scalar_rgb,
     W,
@@ -111,9 +111,9 @@ fn check_rgb<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: boo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -125,9 +125,9 @@ fn check_rgba<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -139,9 +139,11 @@ fn check_rgb_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range:
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -153,9 +155,11 @@ fn check_rgba_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -167,9 +171,9 @@ fn check_luma<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y2xx_n_to_luma_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "AVX-512 y2xx<{BITS}>→luma diverges (width={width})");
 }
@@ -178,9 +182,9 @@ fn check_luma_u16<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y2xx_n_to_luma_u16_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(
     s, k,
@@ -278,15 +282,15 @@ fn avx512_y212_matches_scalar_widths() {
     let p = pseudo_random_y212(w, 0xAA55);
     let mut s = std::vec![0u8; w * 3];
     let mut k = std::vec![0u8; w * 3];
-    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
+    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
     unsafe {
-      y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
+      y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
     }
     assert_eq!(s, k, "AVX-512 y2xx<12>→RGB diverges (width={w})");
 
     let mut s_u16 = std::vec![0u16; w * 4];
     let mut k_u16 = std::vec![0u16; w * 4];
-    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
       &p,
       &mut s_u16,
       w,
@@ -294,7 +298,7 @@ fn avx512_y212_matches_scalar_widths() {
       true,
     );
     unsafe {
-      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
         &p,
         &mut k_u16,
         w,
@@ -309,17 +313,17 @@ fn avx512_y212_matches_scalar_widths() {
 
     let mut sl = std::vec![0u8; w];
     let mut kl = std::vec![0u8; w];
-    scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w);
+    scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w);
     unsafe {
-      y2xx_n_to_luma_row::<12>(&p, &mut kl, w);
+      y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w);
     }
     assert_eq!(sl, kl, "AVX-512 y2xx<12>→luma diverges (width={w})");
 
     let mut slu = std::vec![0u16; w];
     let mut klu = std::vec![0u16; w];
-    scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w);
+    scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w);
     unsafe {
-      y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w);
+      y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w);
     }
     assert_eq!(slu, klu, "AVX-512 y2xx<12>→luma u16 diverges (width={w})");
   }
diff --git a/src/row/arch/x86_avx512/v210.rs b/src/row/arch/x86_avx512/v210.rs
index e5a77eb4..8c68f16d 100644
--- a/src/row/arch/x86_avx512/v210.rs
+++ b/src/row/arch/x86_avx512/v210.rs
@@ -40,7 +40,7 @@
 
 use core::arch::x86_64::*;
 
-use super::*;
+use super::{endian::load_endian_u32x16, *};
 use crate::{ColorMatrix, row::scalar};
 
 // ---- Static permute index tables --------------------------------------
@@ -187,11 +187,11 @@ static V_FROM_MID: [i16; 32] = [
 /// `permutexvar` op `vpermw`).
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-unsafe fn unpack_v210_4words_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512i) {
+unsafe fn unpack_v210_4words_avx512<const BE: bool>(ptr: *const u8) -> (__m512i, __m512i, __m512i) {
   // SAFETY: caller obligation — `ptr` has 64 bytes readable; AVX-512F
   // + AVX-512BW are available.
   unsafe {
-    let words = _mm512_loadu_si512(ptr.cast());
+    let words = load_endian_u32x16::<BE>(ptr);
     let mask10 = _mm512_set1_epi32(0x3FF);
     let low10 = _mm512_and_si512(words, mask10);
     let mid10 = _mm512_and_si512(_mm512_srli_epi32::<10>(words), mask10);
@@ -247,7 +247,7 @@ unsafe fn unpack_v210_4words_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u8],
   width: usize,
@@ -290,7 +290,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
     // Main loop: 24 pixels (4 v210 words = 64 bytes) per iteration.
     let quads = words / 4;
     for q in 0..quads {
-      let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64));
+      let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512::<BE>(packed.as_ptr().add(q * 64));
 
       let y_i16 = y_vec;
 
@@ -392,7 +392,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
       let tail_packed = &packed[quads * 64..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::v210_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -409,7 +415,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u16],
   width: usize,
@@ -451,7 +457,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
     let quads = words / 4;
     for q in 0..quads {
-      let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64));
+      let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512::<BE>(packed.as_ptr().add(q * 64));
 
       let y_i16 = y_vec;
       let u_i16 = _mm512_sub_epi16(u_vec, bias_v);
@@ -529,7 +535,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
       let tail_packed = &packed[quads * 64..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -552,7 +558,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn v210_to_luma_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let words = width / 6;
@@ -566,7 +576,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
 
     let quads = words / 4;
     for q in 0..quads {
-      let (y_vec, _, _) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64));
+      let (y_vec, _, _) = unpack_v210_4words_avx512::<BE>(packed.as_ptr().add(q * 64));
       // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x64 via packus
       // (only first 32 lanes carry data, paired with a zero hi half;
       // first 24 bytes of the result are valid Y0..Y23).
@@ -585,7 +595,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
       let tail_packed = &packed[quads * 64..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -602,7 +612,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn v210_to_luma_u16_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let words = width / 6;
@@ -613,7 +627,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
   unsafe {
     let quads = words / 4;
     for q in 0..quads {
-      let (y_vec, _, _) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64));
+      let (y_vec, _, _) = unpack_v210_4words_avx512::<BE>(packed.as_ptr().add(q * 64));
       // Store first 24 of the 32 u16 lanes via stack buffer + copy_from_slice.
       let mut tmp = [0u16; 32];
       _mm512_storeu_si512(tmp.as_mut_ptr().cast(), y_vec);
@@ -627,7 +641,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
       let tail_packed = &packed[quads * 64..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_avx512/y216.rs b/src/row/arch/x86_avx512/y216.rs
index 2a60b3b3..be564433 100644
--- a/src/row/arch/x86_avx512/y216.rs
+++ b/src/row/arch/x86_avx512/y216.rs
@@ -118,7 +118,7 @@ unsafe fn unpack_y216_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -137,150 +137,160 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 
   // SAFETY: AVX-512F + AVX-512BW is the caller's obligation.
   unsafe {
-    let rnd_v = _mm512_set1_epi32(RND);
-    let y_off_v = _mm512_set1_epi32(y_off);
-    let y_scale_v = _mm512_set1_epi32(y_scale);
-    let c_scale_v = _mm512_set1_epi32(c_scale);
-    // Chroma bias: 32768 via wrapping -32768 i16.
-    let bias16_v = _mm512_set1_epi16(-32768i16);
-    let cru = _mm512_set1_epi32(coeffs.r_u());
-    let crv = _mm512_set1_epi32(coeffs.r_v());
-    let cgu = _mm512_set1_epi32(coeffs.g_u());
-    let cgv = _mm512_set1_epi32(coeffs.g_v());
-    let cbu = _mm512_set1_epi32(coeffs.b_u());
-    let cbv = _mm512_set1_epi32(coeffs.b_v());
-    let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-    let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
-    let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);
-
     let mut x = 0usize;
-    while x + 64 <= width {
-      // --- lo group: pixels x..x+31 (32 pixels) --------------------------
-      let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2));
-
-      let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias16_v);
-      let v_lo_i16 = _mm512_sub_epi16(v_lo_vec, bias16_v);
-
-      // Widen 16 valid U/V i16 lanes to two i32x16 halves.
-      let u_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_lo_i16));
-      let u_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_lo_i16));
-      let v_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_lo_i16));
-      let v_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_lo_i16));
-
-      let u_d_lo_a = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_lo_a, c_scale_v),
-        rnd_v,
-      ));
-      let u_d_lo_b = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_lo_b, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_lo_a = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_lo_a, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_lo_b = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_lo_b, c_scale_v),
-        rnd_v,
-      ));
-
-      // chroma_i16x32: 32-lane vector, valid data in lanes 0..16.
-      let r_chroma_lo = chroma_i16x32(
-        cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
-      );
-      let g_chroma_lo = chroma_i16x32(
-        cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
-      );
-      let b_chroma_lo = chroma_i16x32(
-        cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
-      );
-
-      // Duplicate each chroma sample into its 4:2:2 Y-pair slot.
-      // 16 valid chroma → lo32 covers all 32 Y lanes.
-      let (r_dup_lo, _) = chroma_dup(r_chroma_lo, dup_lo_idx, dup_hi_idx);
-      let (g_dup_lo, _) = chroma_dup(g_chroma_lo, dup_lo_idx, dup_hi_idx);
-      let (b_dup_lo, _) = chroma_dup(b_chroma_lo, dup_lo_idx, dup_hi_idx);
-
-      // scale_y_u16_avx512: unsigned-widens Y to avoid i16 overflow for Y > 32767.
-      let y_lo_scaled = scale_y_u16_avx512(y_lo_vec, y_off_v, y_scale_v, rnd_v, pack_fixup);
-
-      // --- hi group: pixels x+32..x+63 (32 pixels) ----------------------
-      let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2 + 64));
-
-      let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias16_v);
-      let v_hi_i16 = _mm512_sub_epi16(v_hi_vec, bias16_v);
-
-      let u_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_hi_i16));
-      let u_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_hi_i16));
-      let v_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_hi_i16));
-      let v_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_hi_i16));
-
-      let u_d_hi_a = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_hi_a, c_scale_v),
-        rnd_v,
-      ));
-      let u_d_hi_b = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_hi_b, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_hi_a = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_hi_a, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_hi_b = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_hi_b, c_scale_v),
-        rnd_v,
-      ));
-
-      let r_chroma_hi = chroma_i16x32(
-        cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
-      );
-      let g_chroma_hi = chroma_i16x32(
-        cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
-      );
-      let b_chroma_hi = chroma_i16x32(
-        cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
-      );
-
-      let (r_dup_hi, _) = chroma_dup(r_chroma_hi, dup_lo_idx, dup_hi_idx);
-      let (g_dup_hi, _) = chroma_dup(g_chroma_hi, dup_lo_idx, dup_hi_idx);
-      let (b_dup_hi, _) = chroma_dup(b_chroma_hi, dup_lo_idx, dup_hi_idx);
-
-      let y_hi_scaled = scale_y_u16_avx512(y_hi_vec, y_off_v, y_scale_v, rnd_v, pack_fixup);
-
-      // Saturating i16 add + narrow to u8x64 per channel.
-      let r_u8 = narrow_u8x64(
-        _mm512_adds_epi16(y_lo_scaled, r_dup_lo),
-        _mm512_adds_epi16(y_hi_scaled, r_dup_hi),
-        pack_fixup,
-      );
-      let g_u8 = narrow_u8x64(
-        _mm512_adds_epi16(y_lo_scaled, g_dup_lo),
-        _mm512_adds_epi16(y_hi_scaled, g_dup_hi),
-        pack_fixup,
-      );
-      let b_u8 = narrow_u8x64(
-        _mm512_adds_epi16(y_lo_scaled, b_dup_lo),
-        _mm512_adds_epi16(y_hi_scaled, b_dup_hi),
-        pack_fixup,
-      );
-
-      if ALPHA {
-        let alpha = _mm512_set1_epi8(-1);
-        write_rgba_64(r_u8, g_u8, b_u8, alpha, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+    if !BE {
+      let rnd_v = _mm512_set1_epi32(RND);
+      let y_off_v = _mm512_set1_epi32(y_off);
+      let y_scale_v = _mm512_set1_epi32(y_scale);
+      let c_scale_v = _mm512_set1_epi32(c_scale);
+      // Chroma bias: 32768 via wrapping -32768 i16.
+      let bias16_v = _mm512_set1_epi16(-32768i16);
+      let cru = _mm512_set1_epi32(coeffs.r_u());
+      let crv = _mm512_set1_epi32(coeffs.r_v());
+      let cgu = _mm512_set1_epi32(coeffs.g_u());
+      let cgv = _mm512_set1_epi32(coeffs.g_v());
+      let cbu = _mm512_set1_epi32(coeffs.b_u());
+      let cbv = _mm512_set1_epi32(coeffs.b_v());
+      let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
+      let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);
+
+      while x + 64 <= width {
+        // --- lo group: pixels x..x+31 (32 pixels) ------------------------
+        let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2));
+
+        let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias16_v);
+        let v_lo_i16 = _mm512_sub_epi16(v_lo_vec, bias16_v);
+
+        // Widen 16 valid U/V i16 lanes to two i32x16 halves.
+        let u_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_lo_i16));
+        let u_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_lo_i16));
+        let v_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_lo_i16));
+        let v_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_lo_i16));
+
+        let u_d_lo_a = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_lo_a, c_scale_v),
+          rnd_v,
+        ));
+        let u_d_lo_b = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_lo_b, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_lo_a = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_lo_a, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_lo_b = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_lo_b, c_scale_v),
+          rnd_v,
+        ));
+
+        // chroma_i16x32: 32-lane vector, valid data in lanes 0..16.
+        let r_chroma_lo = chroma_i16x32(
+          cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
+        );
+        let g_chroma_lo = chroma_i16x32(
+          cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
+        );
+        let b_chroma_lo = chroma_i16x32(
+          cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
+        );
+
+        // Duplicate each chroma sample into its 4:2:2 Y-pair slot.
+        // 16 valid chroma → lo32 covers all 32 Y lanes.
+        let (r_dup_lo, _) = chroma_dup(r_chroma_lo, dup_lo_idx, dup_hi_idx);
+        let (g_dup_lo, _) = chroma_dup(g_chroma_lo, dup_lo_idx, dup_hi_idx);
+        let (b_dup_lo, _) = chroma_dup(b_chroma_lo, dup_lo_idx, dup_hi_idx);
+
+        // scale_y_u16_avx512: unsigned-widens Y to avoid i16 overflow for Y > 32767.
+        let y_lo_scaled = scale_y_u16_avx512(y_lo_vec, y_off_v, y_scale_v, rnd_v, pack_fixup);
+
+        // --- hi group: pixels x+32..x+63 (32 pixels) ----------------------
+        let (y_hi_vec, u_hi_vec, v_hi_vec) =
+          unpack_y216_32px_avx512(packed.as_ptr().add(x * 2 + 64));
+
+        let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias16_v);
+        let v_hi_i16 = _mm512_sub_epi16(v_hi_vec, bias16_v);
+
+        let u_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_hi_i16));
+        let u_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_hi_i16));
+        let v_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_hi_i16));
+        let v_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_hi_i16));
+
+        let u_d_hi_a = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_hi_a, c_scale_v),
+          rnd_v,
+        ));
+        let u_d_hi_b = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_hi_b, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_hi_a = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_hi_a, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_hi_b = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_hi_b, c_scale_v),
+          rnd_v,
+        ));
+
+        let r_chroma_hi = chroma_i16x32(
+          cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
+        );
+        let g_chroma_hi = chroma_i16x32(
+          cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
+        );
+        let b_chroma_hi = chroma_i16x32(
+          cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
+        );
+
+        let (r_dup_hi, _) = chroma_dup(r_chroma_hi, dup_lo_idx, dup_hi_idx);
+        let (g_dup_hi, _) = chroma_dup(g_chroma_hi, dup_lo_idx, dup_hi_idx);
+        let (b_dup_hi, _) = chroma_dup(b_chroma_hi, dup_lo_idx, dup_hi_idx);
+
+        let y_hi_scaled = scale_y_u16_avx512(y_hi_vec, y_off_v, y_scale_v, rnd_v, pack_fixup);
+
+        // Saturating i16 add + narrow to u8x64 per channel.
+        let r_u8 = narrow_u8x64(
+          _mm512_adds_epi16(y_lo_scaled, r_dup_lo),
+          _mm512_adds_epi16(y_hi_scaled, r_dup_hi),
+          pack_fixup,
+        );
+        let g_u8 = narrow_u8x64(
+          _mm512_adds_epi16(y_lo_scaled, g_dup_lo),
+          _mm512_adds_epi16(y_hi_scaled, g_dup_hi),
+          pack_fixup,
+        );
+        let b_u8 = narrow_u8x64(
+          _mm512_adds_epi16(y_lo_scaled, b_dup_lo),
+          _mm512_adds_epi16(y_hi_scaled, b_dup_hi),
+          pack_fixup,
+        );
+
+        if ALPHA {
+          let alpha = _mm512_set1_epi8(-1);
+          write_rgba_64(r_u8, g_u8, b_u8, alpha, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+        }
+
+        x += 64;
       }
-
-      x += 64;
     }
 
     // Scalar tail — remaining < 64 pixels (always even per 4:2:2).
+    // BE=true skips the SIMD loop above (x stays 0), so this scalar path converts the full row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::y216_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -301,7 +311,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements).
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -320,125 +330,130 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
   // SAFETY: AVX-512F + AVX-512BW is the caller's obligation.
   unsafe {
-    let alpha_u16 = _mm_set1_epi16(-1i16);
-    let rnd_i64_v = _mm512_set1_epi64(RND_I64);
-    let rnd_i32_v = _mm512_set1_epi32(RND_I32);
-    let y_off_v = _mm512_set1_epi32(y_off);
-    let y_scale_v = _mm512_set1_epi32(y_scale);
-    let c_scale_v = _mm512_set1_epi32(c_scale);
-    let bias16_v = _mm512_set1_epi16(-32768i16);
-    let cru = _mm512_set1_epi32(coeffs.r_u());
-    let crv = _mm512_set1_epi32(coeffs.r_v());
-    let cgu = _mm512_set1_epi32(coeffs.g_u());
-    let cgv = _mm512_set1_epi32(coeffs.g_v());
-    let cbu = _mm512_set1_epi32(coeffs.b_u());
-    let cbv = _mm512_set1_epi32(coeffs.b_v());
-
-    // Permute indices built once.
-    // dup_{lo,hi}_idx: duplicate 16 chroma i32 lanes into 32 slots.
-    let dup_lo_idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
-    let dup_hi_idx = _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15);
-    // interleave_idx: even i32x8 + odd i32x8 → i32x16 [e0,o0,e1,o1,...].
-    let interleave_idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-    let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-
     let mut x = 0usize;
-    while x + 32 <= width {
-      // One deinterleave gives 32 Y + 16 UV pairs.
-      let (y_vec, u_vec, v_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2));
-
-      // Subtract chroma bias (wrapping i16 sub of -32768 = +32768 mod 2^16).
-      let u_i16 = _mm512_sub_epi16(u_vec, bias16_v);
-      let v_i16 = _mm512_sub_epi16(v_vec, bias16_v);
-
-      // Widen 16 valid i16 lanes (low 256 bits) to i32x16 for Q15 scale.
-      // High 256 bits of u_vec / v_vec hold don't-care values after the
-      // U/V split permute; they won't reach chroma_i64x8_avx512.
-      let u_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16));
-      let v_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16));
-
-      // Scale UV in i32: |u_centered| ≤ 32768, |c_scale| ≤ ~38300 →
-      // product ≤ ~1.26·10⁹ — fits i32.
-      let u_d = _mm512_srai_epi32::<15>(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_i32, c_scale_v),
-        rnd_i32_v,
-      ));
-      let v_d = _mm512_srai_epi32::<15>(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_i32, c_scale_v),
-        rnd_i32_v,
-      ));
-
-      // i64 chroma: even and odd i32 lanes separately.
-      let u_d_odd = _mm512_shuffle_epi32::<0xF5>(u_d);
-      let v_d_odd = _mm512_shuffle_epi32::<0xF5>(v_d);
-
-      let r_ch_even = chroma_i64x8_avx512(cru, crv, u_d, v_d, rnd_i64_v);
-      let r_ch_odd = chroma_i64x8_avx512(cru, crv, u_d_odd, v_d_odd, rnd_i64_v);
-      let g_ch_even = chroma_i64x8_avx512(cgu, cgv, u_d, v_d, rnd_i64_v);
-      let g_ch_odd = chroma_i64x8_avx512(cgu, cgv, u_d_odd, v_d_odd, rnd_i64_v);
-      let b_ch_even = chroma_i64x8_avx512(cbu, cbv, u_d, v_d, rnd_i64_v);
-      let b_ch_odd = chroma_i64x8_avx512(cbu, cbv, u_d_odd, v_d_odd, rnd_i64_v);
-
-      // Reassemble i64x8 pairs → i32x16 [c0..c15].
-      let r_ch_i32 = reassemble_i32x16(r_ch_even, r_ch_odd, interleave_idx);
-      let g_ch_i32 = reassemble_i32x16(g_ch_even, g_ch_odd, interleave_idx);
-      let b_ch_i32 = reassemble_i32x16(b_ch_even, b_ch_odd, interleave_idx);
-
-      // Duplicate 16 chroma values → 32 slots (4:2:2 upsampling).
-      let r_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, r_ch_i32);
-      let r_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, r_ch_i32);
-      let g_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, g_ch_i32);
-      let g_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, g_ch_i32);
-      let b_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, b_ch_i32);
-      let b_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, b_ch_i32);
-
-      // Y: unsigned-widen 32 u16 → two i32x16 halves, subtract y_off, scale i64.
-      let y_lo_u16 = _mm512_castsi512_si256(y_vec);
-      let y_hi_u16 = _mm512_extracti64x4_epi64::<1>(y_vec);
-      let y_lo_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_lo_u16), y_off_v);
-      let y_hi_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_hi_u16), y_off_v);
-
-      let y_lo_scaled = scale_y_i32x16_i64(y_lo_i32, y_scale_v, rnd_i64_v, interleave_idx);
-      let y_hi_scaled = scale_y_i32x16_i64(y_hi_i32, y_scale_v, rnd_i64_v, interleave_idx);
-
-      // Y + chroma → pack with unsigned saturation to u16x32.
-      let r_u16 = _mm512_permutexvar_epi64(
-        pack_fixup,
-        _mm512_packus_epi32(
-          _mm512_add_epi32(y_lo_scaled, r_dup_lo),
-          _mm512_add_epi32(y_hi_scaled, r_dup_hi),
-        ),
-      );
-      let g_u16 = _mm512_permutexvar_epi64(
-        pack_fixup,
-        _mm512_packus_epi32(
-          _mm512_add_epi32(y_lo_scaled, g_dup_lo),
-          _mm512_add_epi32(y_hi_scaled, g_dup_hi),
-        ),
-      );
-      let b_u16 = _mm512_permutexvar_epi64(
-        pack_fixup,
-        _mm512_packus_epi32(
-          _mm512_add_epi32(y_lo_scaled, b_dup_lo),
-          _mm512_add_epi32(y_hi_scaled, b_dup_hi),
-        ),
-      );
-
-      if ALPHA {
-        write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3));
+    if !BE {
+      let alpha_u16 = _mm_set1_epi16(-1i16);
+      let rnd_i64_v = _mm512_set1_epi64(RND_I64);
+      let rnd_i32_v = _mm512_set1_epi32(RND_I32);
+      let y_off_v = _mm512_set1_epi32(y_off);
+      let y_scale_v = _mm512_set1_epi32(y_scale);
+      let c_scale_v = _mm512_set1_epi32(c_scale);
+      let bias16_v = _mm512_set1_epi16(-32768i16);
+      let cru = _mm512_set1_epi32(coeffs.r_u());
+      let crv = _mm512_set1_epi32(coeffs.r_v());
+      let cgu = _mm512_set1_epi32(coeffs.g_u());
+      let cgv = _mm512_set1_epi32(coeffs.g_v());
+      let cbu = _mm512_set1_epi32(coeffs.b_u());
+      let cbv = _mm512_set1_epi32(coeffs.b_v());
+
+      // Permute indices built once.
+      // dup_{lo,hi}_idx: duplicate 16 chroma i32 lanes into 32 slots.
+      let dup_lo_idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
+      let dup_hi_idx =
+        _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15);
+      // interleave_idx: even i32x8 + odd i32x8 → i32x16 [e0,o0,e1,o1,...].
+      let interleave_idx =
+        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+      let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+
+      while x + 32 <= width {
+        // One deinterleave gives 32 Y + 16 UV pairs.
+        let (y_vec, u_vec, v_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2));
+
+        // Subtract chroma bias (wrapping i16 sub of -32768 = +32768 mod 2^16).
+        let u_i16 = _mm512_sub_epi16(u_vec, bias16_v);
+        let v_i16 = _mm512_sub_epi16(v_vec, bias16_v);
+
+        // Widen 16 valid i16 lanes (low 256 bits) to i32x16 for Q15 scale.
+        // High 256 bits of u_vec / v_vec hold don't-care values after the
+        // U/V split permute; they won't reach chroma_i64x8_avx512.
+        let u_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16));
+        let v_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16));
+
+        // Scale UV in i32: |u_centered| ≤ 32768, |c_scale| ≤ ~38300 →
+        // product ≤ ~1.26·10⁹ — fits i32.
+        let u_d = _mm512_srai_epi32::<15>(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_i32, c_scale_v),
+          rnd_i32_v,
+        ));
+        let v_d = _mm512_srai_epi32::<15>(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_i32, c_scale_v),
+          rnd_i32_v,
+        ));
+
+        // i64 chroma: even and odd i32 lanes separately.
+        let u_d_odd = _mm512_shuffle_epi32::<0xF5>(u_d);
+        let v_d_odd = _mm512_shuffle_epi32::<0xF5>(v_d);
+
+        let r_ch_even = chroma_i64x8_avx512(cru, crv, u_d, v_d, rnd_i64_v);
+        let r_ch_odd = chroma_i64x8_avx512(cru, crv, u_d_odd, v_d_odd, rnd_i64_v);
+        let g_ch_even = chroma_i64x8_avx512(cgu, cgv, u_d, v_d, rnd_i64_v);
+        let g_ch_odd = chroma_i64x8_avx512(cgu, cgv, u_d_odd, v_d_odd, rnd_i64_v);
+        let b_ch_even = chroma_i64x8_avx512(cbu, cbv, u_d, v_d, rnd_i64_v);
+        let b_ch_odd = chroma_i64x8_avx512(cbu, cbv, u_d_odd, v_d_odd, rnd_i64_v);
+
+        // Reassemble i64x8 pairs → i32x16 [c0..c15].
+        let r_ch_i32 = reassemble_i32x16(r_ch_even, r_ch_odd, interleave_idx);
+        let g_ch_i32 = reassemble_i32x16(g_ch_even, g_ch_odd, interleave_idx);
+        let b_ch_i32 = reassemble_i32x16(b_ch_even, b_ch_odd, interleave_idx);
+
+        // Duplicate 16 chroma values → 32 slots (4:2:2 upsampling).
+        let r_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, r_ch_i32);
+        let r_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, r_ch_i32);
+        let g_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, g_ch_i32);
+        let g_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, g_ch_i32);
+        let b_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, b_ch_i32);
+        let b_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, b_ch_i32);
+
+        // Y: unsigned-widen 32 u16 → two i32x16 halves, subtract y_off, scale i64.
+        let y_lo_u16 = _mm512_castsi512_si256(y_vec);
+        let y_hi_u16 = _mm512_extracti64x4_epi64::<1>(y_vec);
+        let y_lo_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_lo_u16), y_off_v);
+        let y_hi_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_hi_u16), y_off_v);
+
+        let y_lo_scaled = scale_y_i32x16_i64(y_lo_i32, y_scale_v, rnd_i64_v, interleave_idx);
+        let y_hi_scaled = scale_y_i32x16_i64(y_hi_i32, y_scale_v, rnd_i64_v, interleave_idx);
+
+        // Y + chroma → pack with unsigned saturation to u16x32.
+        let r_u16 = _mm512_permutexvar_epi64(
+          pack_fixup,
+          _mm512_packus_epi32(
+            _mm512_add_epi32(y_lo_scaled, r_dup_lo),
+            _mm512_add_epi32(y_hi_scaled, r_dup_hi),
+          ),
+        );
+        let g_u16 = _mm512_permutexvar_epi64(
+          pack_fixup,
+          _mm512_packus_epi32(
+            _mm512_add_epi32(y_lo_scaled, g_dup_lo),
+            _mm512_add_epi32(y_hi_scaled, g_dup_hi),
+          ),
+        );
+        let b_u16 = _mm512_permutexvar_epi64(
+          pack_fixup,
+          _mm512_packus_epi32(
+            _mm512_add_epi32(y_lo_scaled, b_dup_lo),
+            _mm512_add_epi32(y_hi_scaled, b_dup_hi),
+          ),
+        );
+
+        if ALPHA {
+          write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3));
+        }
+
+        x += 32;
       }
-
-      x += 32;
     }
 
     // Scalar tail — remaining < 32 pixels.
+    // BE=true skips the SIMD loop above (x stays 0), so this scalar path converts the full row.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -464,43 +479,49 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) {
+pub(crate) unsafe fn y216_to_luma_row<const BE: bool>(
+  packed: &[u16],
+  out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
 
   // SAFETY: AVX-512F + AVX-512BW is the caller's obligation.
   unsafe {
-    let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-    let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast());
-
     let mut x = 0usize;
-    while x + 64 <= width {
-      // lo group: pixels x..x+31
-      let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast());
-      let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast());
-      let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1);
-      let y_lo_shr = _mm512_srli_epi16::<8>(y_lo);
-
-      // hi group: pixels x+32..x+63
-      let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast());
-      let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast());
-      let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3);
-      let y_hi_shr = _mm512_srli_epi16::<8>(y_hi);
-
-      // Pack 64 × i16 → 64 × u8 with natural order.
-      let y_u8 = narrow_u8x64(y_lo_shr, y_hi_shr, pack_fixup);
-      // Store all 64 bytes at once.
-      _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_u8);
-
-      x += 64;
+    if !BE {
+      let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast());
+
+      while x + 64 <= width {
+        // lo group: pixels x..x+31
+        let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast());
+        let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast());
+        let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1);
+        let y_lo_shr = _mm512_srli_epi16::<8>(y_lo);
+
+        // hi group: pixels x+32..x+63
+        let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast());
+        let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast());
+        let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3);
+        let y_hi_shr = _mm512_srli_epi16::<8>(y_hi);
+
+        // Pack 64 × i16 → 64 × u8 with natural order.
+        let y_u8 = narrow_u8x64(y_lo_shr, y_hi_shr, pack_fixup);
+        // Store all 64 bytes at once.
+        _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_u8);
+
+        x += 64;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -520,39 +541,45 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi
 /// 4. `out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) {
+pub(crate) unsafe fn y216_to_luma_u16_row<const BE: bool>(
+  packed: &[u16],
+  out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
 
   // SAFETY: AVX-512F + AVX-512BW is the caller's obligation.
   unsafe {
-    let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast());
-
     let mut x = 0usize;
-    while x + 64 <= width {
-      // lo group: pixels x..x+31
-      let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast());
-      let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast());
-      let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1);
-
-      // hi group: pixels x+32..x+63
-      let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast());
-      let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast());
-      let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3);
-
-      // Direct store — full 16-bit Y values, no shift.
-      _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_lo);
-      _mm512_storeu_si512(out.as_mut_ptr().add(x + 32).cast(), y_hi);
-
-      x += 64;
+    if !BE {
+      let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast());
+
+      while x + 64 <= width {
+        // lo group: pixels x..x+31
+        let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast());
+        let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast());
+        let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1);
+
+        // hi group: pixels x+32..x+63
+        let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast());
+        let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast());
+        let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3);
+
+        // Direct store — full 16-bit Y values, no shift.
+        _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_lo);
+        _mm512_storeu_si512(out.as_mut_ptr().add(x + 32).cast(), y_hi);
+
+        x += 64;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_avx512/y2xx.rs b/src/row/arch/x86_avx512/y2xx.rs
index 4944cc6d..1d2b1dcd 100644
--- a/src/row/arch/x86_avx512/y2xx.rs
+++ b/src/row/arch/x86_avx512/y2xx.rs
@@ -177,7 +177,11 @@ unsafe fn unpack_y2xx_32px_avx512(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -205,132 +209,135 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
   // adds are bounded by the `while x + 32 <= width` loop and the
   // caller-promised slice lengths checked above.
   unsafe {
-    let rnd_v = _mm512_set1_epi32(RND);
-    let y_off_v = _mm512_set1_epi16(y_off as i16);
-    let y_scale_v = _mm512_set1_epi32(y_scale);
-    let c_scale_v = _mm512_set1_epi32(c_scale);
-    let bias_v = _mm512_set1_epi16(bias as i16);
-    // Loop-invariant runtime shift count for `_mm512_srl_epi16` — see
-    // module-level note.
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let cru = _mm512_set1_epi32(coeffs.r_u());
-    let crv = _mm512_set1_epi32(coeffs.r_v());
-    let cgu = _mm512_set1_epi32(coeffs.g_u());
-    let cgv = _mm512_set1_epi32(coeffs.g_v());
-    let cbu = _mm512_set1_epi32(coeffs.b_u());
-    let cbv = _mm512_set1_epi32(coeffs.b_v());
-
-    let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-    let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
-    let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);
-
     let mut x = 0usize;
-    while x + 32 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_32px_avx512(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = y_vec;
-
-      // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since
-      // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. Only lanes 0..16
-      // carry valid samples; the bias subtraction on don't-care lanes
-      // is harmless since they're discarded by `chroma_dup`'s `hi32`.
-      let u_i16 = _mm512_sub_epi16(u_vec, bias_v);
-      let v_i16 = _mm512_sub_epi16(v_vec, bias_v);
-
-      // Widen 16-valid-lane i16 chroma to two i32x16 halves so the
-      // Q15 multiplies don't overflow. Only lanes 0..16 of `_lo` are
-      // valid; `_hi` is entirely don't-care. We feed both halves
-      // through `chroma_i16x32` to recycle the helper exactly; the
-      // don't-care output lanes 16..32 are discarded by `chroma_dup`'s
-      // `hi32` return below (which only consumes lanes 0..16 in its
-      // `lo32` return).
-      let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16));
-      let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16));
-      let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16));
-      let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16));
-
-      let u_d_lo = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_lo_i32, c_scale_v),
-        rnd_v,
-      ));
-      let u_d_hi = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_hi_i32, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_lo = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_lo_i32, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_hi = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_hi_i32, c_scale_v),
-        rnd_v,
-      ));
-
-      // i16x32 chroma vectors with valid data in lanes 0..16.
-      let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
-      let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
-      let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
-
-      // Each chroma sample covers 2 Y lanes (4:2:2). `chroma_dup`
-      // duplicates each of 32 chroma lanes into its pair slot,
-      // splitting across two i16x32 vectors. With 16 valid chroma in
-      // lanes 0..16, `lo32` lanes 0..32 are valid (= [c0,c0, c1,c1,
-      // ..., c15,c15]); `hi32` is don't-care.
-      let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx);
-      let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx);
-      let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx);
-
-      // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x32.
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);
-
-      // Per-channel saturating add (i16x32). All 32 lanes valid.
-      let r_sum = _mm512_adds_epi16(y_scaled, r_dup_lo);
-      let g_sum = _mm512_adds_epi16(y_scaled, g_dup_lo);
-      let b_sum = _mm512_adds_epi16(y_scaled, b_dup_lo);
-
-      // u8 narrow with saturation. `narrow_u8x64(lo, zero, pack_fixup)`
-      // packs 32 i16 lanes of `lo` to u8 in the result's first 32
-      // bytes (next 32 zero, after the lane-fixup permute).
-      let zero = _mm512_setzero_si512();
-      let r_u8 = narrow_u8x64(r_sum, zero, pack_fixup);
-      let g_u8 = narrow_u8x64(g_sum, zero, pack_fixup);
-      let b_u8 = narrow_u8x64(b_sum, zero, pack_fixup);
-
-      // 32-pixel store via two `write_rgb_16` / `write_rgba_16` calls
-      // (each writes 16 px = 48 / 64 bytes). `_mm512_extracti32x4_epi32`
-      // pulls the two valid 128-bit halves out of the u8x64 result.
-      if ALPHA {
-        let alpha = _mm_set1_epi8(-1);
-        let r0 = _mm512_castsi512_si128(r_u8);
-        let r1 = _mm512_extracti32x4_epi32::<1>(r_u8);
-        let g0 = _mm512_castsi512_si128(g_u8);
-        let g1 = _mm512_extracti32x4_epi32::<1>(g_u8);
-        let b0 = _mm512_castsi512_si128(b_u8);
-        let b1 = _mm512_extracti32x4_epi32::<1>(b_u8);
-        let dst = out.as_mut_ptr().add(x * 4);
-        write_rgba_16(r0, g0, b0, alpha, dst);
-        write_rgba_16(r1, g1, b1, alpha, dst.add(64));
-      } else {
-        let r0 = _mm512_castsi512_si128(r_u8);
-        let r1 = _mm512_extracti32x4_epi32::<1>(r_u8);
-        let g0 = _mm512_castsi512_si128(g_u8);
-        let g1 = _mm512_extracti32x4_epi32::<1>(g_u8);
-        let b0 = _mm512_castsi512_si128(b_u8);
-        let b1 = _mm512_extracti32x4_epi32::<1>(b_u8);
-        let dst = out.as_mut_ptr().add(x * 3);
-        write_rgb_16(r0, g0, b0, dst);
-        write_rgb_16(r1, g1, b1, dst.add(48));
+    if !BE {
+      let rnd_v = _mm512_set1_epi32(RND);
+      let y_off_v = _mm512_set1_epi16(y_off as i16);
+      let y_scale_v = _mm512_set1_epi32(y_scale);
+      let c_scale_v = _mm512_set1_epi32(c_scale);
+      let bias_v = _mm512_set1_epi16(bias as i16);
+      // Loop-invariant runtime shift count for `_mm512_srl_epi16` — see
+      // module-level note.
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let cru = _mm512_set1_epi32(coeffs.r_u());
+      let crv = _mm512_set1_epi32(coeffs.r_v());
+      let cgu = _mm512_set1_epi32(coeffs.g_u());
+      let cgv = _mm512_set1_epi32(coeffs.g_v());
+      let cbu = _mm512_set1_epi32(coeffs.b_u());
+      let cbv = _mm512_set1_epi32(coeffs.b_v());
+
+      let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
+      let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);
+
+      while x + 32 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_32px_avx512(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = y_vec;
+
+        // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since
+        // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. Only lanes 0..16
+        // carry valid samples; the bias subtraction on don't-care lanes
+        // is harmless since they're discarded by `chroma_dup`'s `hi32`.
+        let u_i16 = _mm512_sub_epi16(u_vec, bias_v);
+        let v_i16 = _mm512_sub_epi16(v_vec, bias_v);
+
+        // Widen the i16 chroma (16 valid lanes) to two i32x16 halves so
+        // the Q15 multiplies don't overflow. Only `_lo` (lanes 0..16) is
+        // valid; `_hi` is entirely don't-care. Both halves still go
+        // through `chroma_i16x32` so the helper is reused unchanged; its
+        // don't-care output lanes 16..32 only reach the `hi32` return of
+        // `chroma_dup`, which is ignored below (the `lo32` return
+        // consumes lanes 0..16 only).
+        let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16));
+        let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16));
+        let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16));
+        let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16));
+
+        let u_d_lo = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_lo_i32, c_scale_v),
+          rnd_v,
+        ));
+        let u_d_hi = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_hi_i32, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_lo = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_lo_i32, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_hi = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_hi_i32, c_scale_v),
+          rnd_v,
+        ));
+
+        // i16x32 chroma vectors with valid data in lanes 0..16.
+        let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
+        let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
+        let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
+
+        // Each chroma sample covers 2 Y lanes (4:2:2). `chroma_dup`
+        // duplicates each of 32 chroma lanes into its pair slot,
+        // splitting across two i16x32 vectors. With 16 valid chroma in
+        // lanes 0..16, `lo32` lanes 0..32 are valid (= [c0,c0, c1,c1,
+        // ..., c15,c15]); `hi32` is don't-care.
+        let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx);
+        let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx);
+        let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx);
+
+        // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x32.
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);
+
+        // Per-channel saturating add (i16x32). All 32 lanes valid.
+        let r_sum = _mm512_adds_epi16(y_scaled, r_dup_lo);
+        let g_sum = _mm512_adds_epi16(y_scaled, g_dup_lo);
+        let b_sum = _mm512_adds_epi16(y_scaled, b_dup_lo);
+
+        // u8 narrow with saturation. `narrow_u8x64(lo, zero, pack_fixup)`
+        // packs 32 i16 lanes of `lo` to u8 in the result's first 32
+        // bytes (next 32 zero, after the lane-fixup permute).
+        let zero = _mm512_setzero_si512();
+        let r_u8 = narrow_u8x64(r_sum, zero, pack_fixup);
+        let g_u8 = narrow_u8x64(g_sum, zero, pack_fixup);
+        let b_u8 = narrow_u8x64(b_sum, zero, pack_fixup);
+
+        // 32-pixel store via two `write_rgb_16` / `write_rgba_16` calls
+        // (each writes 16 px = 48 / 64 bytes). `_mm512_castsi512_si128` and
+        // `_mm512_extracti32x4_epi32::<1>` pull out the two valid 128-bit lanes.
+        if ALPHA {
+          let alpha = _mm_set1_epi8(-1);
+          let r0 = _mm512_castsi512_si128(r_u8);
+          let r1 = _mm512_extracti32x4_epi32::<1>(r_u8);
+          let g0 = _mm512_castsi512_si128(g_u8);
+          let g1 = _mm512_extracti32x4_epi32::<1>(g_u8);
+          let b0 = _mm512_castsi512_si128(b_u8);
+          let b1 = _mm512_extracti32x4_epi32::<1>(b_u8);
+          let dst = out.as_mut_ptr().add(x * 4);
+          write_rgba_16(r0, g0, b0, alpha, dst);
+          write_rgba_16(r1, g1, b1, alpha, dst.add(64));
+        } else {
+          let r0 = _mm512_castsi512_si128(r_u8);
+          let r1 = _mm512_extracti32x4_epi32::<1>(r_u8);
+          let g0 = _mm512_castsi512_si128(g_u8);
+          let g1 = _mm512_extracti32x4_epi32::<1>(g_u8);
+          let b0 = _mm512_castsi512_si128(b_u8);
+          let b1 = _mm512_extracti32x4_epi32::<1>(b_u8);
+          let dst = out.as_mut_ptr().add(x * 3);
+          write_rgb_16(r0, g0, b0, dst);
+          write_rgb_16(r1, g1, b1, dst.add(48));
+        }
+
+        x += 32;
       }
-
-      x += 32;
     }
 
     // Scalar tail — remaining < 32 pixels (always even per 4:2:2).
+    // When BE=true the SIMD loop above is skipped (x stays 0), so the full row is covered here.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -357,7 +364,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -383,86 +394,88 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let rnd_v = _mm512_set1_epi32(RND);
-    let y_off_v = _mm512_set1_epi16(y_off as i16);
-    let y_scale_v = _mm512_set1_epi32(y_scale);
-    let c_scale_v = _mm512_set1_epi32(c_scale);
-    let bias_v = _mm512_set1_epi16(bias as i16);
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let max_v = _mm512_set1_epi16(out_max);
-    let zero_v = _mm512_set1_epi16(0);
-    let cru = _mm512_set1_epi32(coeffs.r_u());
-    let crv = _mm512_set1_epi32(coeffs.r_v());
-    let cgu = _mm512_set1_epi32(coeffs.g_u());
-    let cgv = _mm512_set1_epi32(coeffs.g_v());
-    let cbu = _mm512_set1_epi32(coeffs.b_u());
-    let cbv = _mm512_set1_epi32(coeffs.b_v());
-
-    let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-    let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
-    let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);
-
     let mut x = 0usize;
-    while x + 32 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_32px_avx512(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = y_vec;
-      let u_i16 = _mm512_sub_epi16(u_vec, bias_v);
-      let v_i16 = _mm512_sub_epi16(v_vec, bias_v);
-
-      let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16));
-      let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16));
-      let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16));
-      let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16));
-
-      let u_d_lo = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_lo_i32, c_scale_v),
-        rnd_v,
-      ));
-      let u_d_hi = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(u_hi_i32, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_lo = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_lo_i32, c_scale_v),
-        rnd_v,
-      ));
-      let v_d_hi = q15_shift(_mm512_add_epi32(
-        _mm512_mullo_epi32(v_hi_i32, c_scale_v),
-        rnd_v,
-      ));
-
-      let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
-      let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
-      let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
-
-      let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx);
-      let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx);
-      let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx);
-
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);
-
-      // Native-depth output: clamp to [0, (1 << BITS) - 1].
-      let r = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v);
-      let g = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v);
-      let b = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v);
-
-      // 32-pixel u16 store via the shared 32-pixel writers.
-      if ALPHA {
-        let alpha_u16 = _mm_set1_epi16(out_max);
-        write_rgba_u16_32(r, g, b, alpha_u16, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_u16_32(r, g, b, out.as_mut_ptr().add(x * 3));
+    if !BE {
+      let rnd_v = _mm512_set1_epi32(RND);
+      let y_off_v = _mm512_set1_epi16(y_off as i16);
+      let y_scale_v = _mm512_set1_epi32(y_scale);
+      let c_scale_v = _mm512_set1_epi32(c_scale);
+      let bias_v = _mm512_set1_epi16(bias as i16);
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let max_v = _mm512_set1_epi16(out_max);
+      let zero_v = _mm512_set1_epi16(0);
+      let cru = _mm512_set1_epi32(coeffs.r_u());
+      let crv = _mm512_set1_epi32(coeffs.r_v());
+      let cgu = _mm512_set1_epi32(coeffs.g_u());
+      let cgv = _mm512_set1_epi32(coeffs.g_v());
+      let cbu = _mm512_set1_epi32(coeffs.b_u());
+      let cbv = _mm512_set1_epi32(coeffs.b_v());
+
+      let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
+      let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);
+
+      while x + 32 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_32px_avx512(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = y_vec;
+        let u_i16 = _mm512_sub_epi16(u_vec, bias_v);
+        let v_i16 = _mm512_sub_epi16(v_vec, bias_v);
+
+        let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16));
+        let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16));
+        let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16));
+        let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16));
+
+        let u_d_lo = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_lo_i32, c_scale_v),
+          rnd_v,
+        ));
+        let u_d_hi = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(u_hi_i32, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_lo = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_lo_i32, c_scale_v),
+          rnd_v,
+        ));
+        let v_d_hi = q15_shift(_mm512_add_epi32(
+          _mm512_mullo_epi32(v_hi_i32, c_scale_v),
+          rnd_v,
+        ));
+
+        let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
+        let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
+        let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup);
+
+        let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx);
+        let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx);
+        let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx);
+
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);
+
+        // Native-depth output: clamp to [0, (1 << BITS) - 1].
+        let r = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v);
+        let g = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v);
+        let b = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v);
+
+        // 32-pixel u16 store via the shared 32-pixel writers.
+        if ALPHA {
+          let alpha_u16 = _mm_set1_epi16(out_max);
+          write_rgba_u16_32(r, g, b, alpha_u16, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_u16_32(r, g, b, out.as_mut_ptr().add(x * 3));
+        }
+
+        x += 32;
       }
-
-      x += 32;
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -488,7 +501,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u8],
   width: usize,
@@ -505,38 +518,40 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-    let zero = _mm512_setzero_si512();
-    let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast());
-
     let mut x = 0usize;
-    while x + 32 <= width {
-      // Load 64 u16 = 32 pixels and pull just the Y lanes via the
-      // cross-vector u16 permute. We don't need chroma here.
-      let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast());
-      let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast());
-      let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1);
-      // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
-      // any BITS ∈ {10, 12} — same single-shift simplification used
-      // by NEON / AVX2. `_mm512_srli_epi16::<8>` has a literal const
-      // count, so it works without runtime-count helper.
-      let y_shr = _mm512_srli_epi16::<8>(y_raw);
-      // Pack 32 i16 lanes to u8 — first 32 bytes valid (after pack
-      // fixup); next 32 zero from the zero-hi pack source.
-      let y_u8 = narrow_u8x64(y_shr, zero, pack_fixup);
-      // Store first 32 bytes via the low 256-bit half.
-      _mm256_storeu_si256(
-        luma_out.as_mut_ptr().add(x).cast(),
-        _mm512_castsi512_si256(y_u8),
-      );
-      x += 32;
+    if !BE {
+      let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      let zero = _mm512_setzero_si512();
+      let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast());
+
+      while x + 32 <= width {
+        // Load 64 u16 = 32 pixels and pull just the Y lanes via the
+        // cross-vector u16 permute. We don't need chroma here.
+        let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast());
+        let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast());
+        let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1);
+        // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
+        // any BITS ∈ {10, 12} — same single-shift simplification used
+        // by NEON / AVX2. `_mm512_srli_epi16::<8>` has a literal const
+        // count, so it works without a runtime-count helper.
+        let y_shr = _mm512_srli_epi16::<8>(y_raw);
+        // Pack 32 i16 lanes to u8 — first 32 bytes valid (after pack
+        // fixup); next 32 zero from the zero-hi pack source.
+        let y_u8 = narrow_u8x64(y_shr, zero, pack_fixup);
+        // Store first 32 bytes via the low 256-bit half.
+        _mm256_storeu_si256(
+          luma_out.as_mut_ptr().add(x).cast(),
+          _mm512_castsi512_si256(y_u8),
+        );
+        x += 32;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -554,7 +569,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u16],
   width: usize,
@@ -571,26 +586,28 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast());
-
     let mut x = 0usize;
-    while x + 32 <= width {
-      let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast());
-      let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast());
-      let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1);
-      // Right-shift by `(16 - BITS)` to bring MSB-aligned samples into
-      // low-bit-packed form for the native-depth u16 output.
-      let y_low = _mm512_srl_epi16(y_raw, shr_count);
-      _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_low);
-      x += 32;
+    if !BE {
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast());
+
+      while x + 32 <= width {
+        let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast());
+        let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast());
+        let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1);
+        // Right-shift by `(16 - BITS)` to bring MSB-aligned samples into
+        // low-bit-packed form for the native-depth u16 output.
+        let y_low = _mm512_srl_epi16(y_raw, shr_count);
+        _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_low);
+        x += 32;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_u16_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_u16_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_sse41/tests/v210.rs b/src/row/arch/x86_sse41/tests/v210.rs
index 6f1b9480..dea42837 100644
--- a/src/row/arch/x86_sse41/tests/v210.rs
+++ b/src/row/arch/x86_sse41/tests/v210.rs
@@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::v210_to_rgb_or_rgba_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::v210_to_rgb_or_rgba_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_or_rgba_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_or_rgba_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_or_rgba_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut s, width, matrix, full_range);
+  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut k, width, matrix, full_range);
+    v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -82,9 +82,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::v210_to_luma_row(&p, &mut s, width);
+  scalar::v210_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_row(&p, &mut k, width);
+    v210_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "SSE4.1 v210→luma diverges (width={width})");
 }
@@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::v210_to_luma_u16_row(&p, &mut s, width);
+  scalar::v210_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    v210_to_luma_u16_row(&p, &mut k, width);
+    v210_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "SSE4.1 v210→luma u16 diverges (width={width})");
 }
@@ -234,7 +234,7 @@ fn sse41_v210_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order (u16, no shift loss)
   let mut luma = std::vec![0u16; W];
   unsafe {
-    v210_to_luma_u16_row(&packed, &mut luma, W);
+    v210_to_luma_u16_row::<false>(&packed, &mut luma, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(luma, expected_luma, "sse4.1 v210 luma reorder bug");
@@ -243,9 +243,15 @@ fn sse41_v210_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u8; W * 3];
   let mut scalar_rgb = std::vec![0u8; W * 3];
   unsafe {
-    v210_to_rgb_or_rgba_row::<false>(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false);
+    v210_to_rgb_or_rgba_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      crate::ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::v210_to_rgb_or_rgba_row::<false>(
+  scalar::v210_to_rgb_or_rgba_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/x86_sse41/tests/y216.rs b/src/row/arch/x86_sse41/tests/y216.rs
index ebe59115..48e7acf8 100644
--- a/src/row/arch/x86_sse41/tests/y216.rs
+++ b/src/row/arch/x86_sse41/tests/y216.rs
@@ -15,9 +15,9 @@ fn check_rgb<const ALPHA: bool>(width: usize, matrix: ColorMatrix, full_range: b
   let bpp = if ALPHA { 4 } else { 3 };
   let mut s = std::vec![0u8; width * bpp];
   let mut k = std::vec![0u8; width * bpp];
-  scalar::y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s,
@@ -32,9 +32,9 @@ fn check_rgb_u16<const ALPHA: bool>(width: usize, matrix: ColorMatrix, full_rang
   let bpp = if ALPHA { 4 } else { 3 };
   let mut s = std::vec![0u16; width * bpp];
   let mut k = std::vec![0u16; width * bpp];
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(&p, &mut k, width, matrix, full_range);
+    y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s,
@@ -48,9 +48,9 @@ fn check_luma(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y216_to_luma_row(&p, &mut s, width);
+  scalar::y216_to_luma_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_row(&p, &mut k, width);
+    y216_to_luma_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "SSE4.1 y216→luma diverges (width={width})");
 }
@@ -59,9 +59,9 @@ fn check_luma_u16(width: usize) {
   let p = pseudo_random_y216(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y216_to_luma_u16_row(&p, &mut s, width);
+  scalar::y216_to_luma_u16_row::<false>(&p, &mut s, width);
   unsafe {
-    y216_to_luma_u16_row(&p, &mut k, width);
+    y216_to_luma_u16_row::<false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "SSE4.1 y216→luma u16 diverges (width={width})");
 }
@@ -166,7 +166,7 @@ fn sse41_y216_lane_order_per_pixel_y_and_u() {
   // Part 1: Luma natural-order at u16
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y216_to_luma_u16_row(&packed, &mut luma_u16, W);
+    y216_to_luma_u16_row::<false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(luma_u16, expected_luma, "SSE4.1 y216 luma_u16 reorder bug");
@@ -175,9 +175,15 @@ fn sse41_y216_lane_order_per_pixel_y_and_u() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y216_to_rgb_u16_or_rgba_u16_row::<false>(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false);
+    y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      &packed,
+      &mut simd_rgb,
+      W,
+      ColorMatrix::Bt709,
+      false,
+    );
   }
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false>(
+  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
     &packed,
     &mut scalar_rgb,
     W,
diff --git a/src/row/arch/x86_sse41/tests/y2xx.rs b/src/row/arch/x86_sse41/tests/y2xx.rs
index 1c97b77c..fe0e5cf7 100644
--- a/src/row/arch/x86_sse41/tests/y2xx.rs
+++ b/src/row/arch/x86_sse41/tests/y2xx.rs
@@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits).
   let mut luma_u16 = std::vec![0u16; W];
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&packed, &mut luma_u16, W);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&packed, &mut luma_u16, W);
   }
   let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
   assert_eq!(
@@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
   let mut simd_rgb = std::vec![0u16; W * 3];
   let mut scalar_rgb = std::vec![0u16; W * 3];
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
       &packed,
       &mut simd_rgb,
       W,
@@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
       false,
     );
   }
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
     &packed,
     &mut scalar_rgb,
     W,
@@ -107,9 +107,9 @@ fn check_rgb<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: boo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 3];
   let mut k = std::vec![0u8; width * 3];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -121,9 +121,9 @@ fn check_rgba<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bo
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u8; width * 4];
   let mut k = std::vec![0u8; width * 4];
-  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut s, width, matrix, full_range);
   unsafe {
-    y2xx_n_to_rgb_or_rgba_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -135,9 +135,11 @@ fn check_rgb_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range:
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 3];
   let mut k = std::vec![0u16; width * 3];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -149,9 +151,11 @@ fn check_rgba_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range
   let p = pseudo_random_y210(width, 0xAA55);
   let mut s = std::vec![0u16; width * 4];
   let mut k = std::vec![0u16; width * 4];
-  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut s, width, matrix, full_range);
+  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(
+    &p, &mut s, width, matrix, full_range,
+  );
   unsafe {
-    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true>(&p, &mut k, width, matrix, full_range);
+    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(&p, &mut k, width, matrix, full_range);
   }
   assert_eq!(
     s, k,
@@ -163,9 +167,9 @@ fn check_luma<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u8; width];
   let mut k = std::vec![0u8; width];
-  scalar::y2xx_n_to_luma_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(s, k, "SSE4.1 y2xx<{BITS}>→luma diverges (width={width})");
 }
@@ -174,9 +178,9 @@ fn check_luma_u16<const BITS: u32>(width: usize) {
   let p = pseudo_random_y210(width, 0xC001);
   let mut s = std::vec![0u16; width];
   let mut k = std::vec![0u16; width];
-  scalar::y2xx_n_to_luma_u16_row::<BITS>(&p, &mut s, width);
+  scalar::y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut s, width);
   unsafe {
-    y2xx_n_to_luma_u16_row::<BITS>(&p, &mut k, width);
+    y2xx_n_to_luma_u16_row::<BITS, false>(&p, &mut k, width);
   }
   assert_eq!(
     s, k,
@@ -264,15 +268,15 @@ fn sse41_y212_matches_scalar_widths() {
     let p = pseudo_random_y212(w, 0xAA55);
     let mut s = std::vec![0u8; w * 3];
     let mut k = std::vec![0u8; w * 3];
-    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
+    scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false);
     unsafe {
-      y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
+      y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false);
     }
     assert_eq!(s, k, "SSE4.1 y2xx<12>→RGB diverges (width={w})");
 
     let mut s_u16 = std::vec![0u16; w * 4];
     let mut k_u16 = std::vec![0u16; w * 4];
-    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+    scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
       &p,
       &mut s_u16,
       w,
@@ -280,7 +284,7 @@ fn sse41_y212_matches_scalar_widths() {
       true,
     );
     unsafe {
-      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(
+      y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
         &p,
         &mut k_u16,
         w,
@@ -295,17 +299,17 @@ fn sse41_y212_matches_scalar_widths() {
 
     let mut sl = std::vec![0u8; w];
     let mut kl = std::vec![0u8; w];
-    scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w);
+    scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w);
     unsafe {
-      y2xx_n_to_luma_row::<12>(&p, &mut kl, w);
+      y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w);
     }
     assert_eq!(sl, kl, "SSE4.1 y2xx<12>→luma diverges (width={w})");
 
     let mut slu = std::vec![0u16; w];
     let mut klu = std::vec![0u16; w];
-    scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w);
+    scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w);
     unsafe {
-      y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w);
+      y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w);
     }
     assert_eq!(slu, klu, "SSE4.1 y2xx<12>→luma u16 diverges (width={w})");
   }
diff --git a/src/row/arch/x86_sse41/v210.rs b/src/row/arch/x86_sse41/v210.rs
index cc11438d..eb37f5b8 100644
--- a/src/row/arch/x86_sse41/v210.rs
+++ b/src/row/arch/x86_sse41/v210.rs
@@ -14,7 +14,7 @@
 
 use core::arch::x86_64::*;
 
-use super::*;
+use super::{endian::load_endian_u32x4, *};
 use crate::{ColorMatrix, row::scalar};
 
 /// Unpacks one 16-byte v210 word into three `__m128i` vectors holding
@@ -42,11 +42,11 @@ use crate::{ColorMatrix, row::scalar};
 /// `_mm_shuffle_epi8`).
 #[inline]
 #[target_feature(enable = "sse4.1")]
-unsafe fn unpack_v210_word_sse41(ptr: *const u8) -> (__m128i, __m128i, __m128i) {
+unsafe fn unpack_v210_word_sse41<const BE: bool>(ptr: *const u8) -> (__m128i, __m128i, __m128i) {
   // SAFETY: caller obligation — `ptr` has 16 bytes readable; SSE4.1
   // (and thus SSSE3) is available.
   unsafe {
-    let words = _mm_loadu_si128(ptr.cast());
+    let words = load_endian_u32x4::<BE>(ptr);
     let mask10 = _mm_set1_epi32(0x3FF);
     let low10 = _mm_and_si128(words, mask10);
     let mid10 = _mm_and_si128(_mm_srli_epi32::<10>(words), mask10);
@@ -143,7 +143,7 @@ unsafe fn unpack_v210_word_sse41(ptr: *const u8) -> (__m128i, __m128i, __m128i)
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u8],
   width: usize,
@@ -180,7 +180,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
     let cbv = _mm_set1_epi32(coeffs.b_v());
 
     for w in 0..words {
-      let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16));
+      let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41::<BE>(packed.as_ptr().add(w * 16));
 
       let y_i16 = y_vec;
 
@@ -263,7 +263,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
       let tail_packed = &packed[words * 16..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::v210_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -280,7 +286,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u16],
   width: usize,
@@ -317,7 +323,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
     let cbv = _mm_set1_epi32(coeffs.b_v());
 
     for w in 0..words {
-      let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16));
+      let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41::<BE>(packed.as_ptr().add(w * 16));
 
       let y_i16 = y_vec;
       let u_i16 = _mm_sub_epi16(u_vec, bias_v);
@@ -383,7 +389,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
       let tail_packed = &packed[words * 16..total_words * 16];
       let tail_out = &mut out[tail_start_px * bpp..width * bpp];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -406,7 +412,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn v210_to_luma_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let words = width / 6;
@@ -416,7 +426,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
     for w in 0..words {
-      let (y_vec, _, _) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16));
+      let (y_vec, _, _) = unpack_v210_word_sse41::<BE>(packed.as_ptr().add(w * 16));
       // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x8 via packus.
       let y_shr = _mm_srli_epi16::<2>(y_vec);
       let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128());
@@ -430,7 +440,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
       let tail_packed = &packed[words * 16..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -447,7 +457,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width:
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn v210_to_luma_u16_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   let words = width / 6;
@@ -457,7 +471,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
     for w in 0..words {
-      let (y_vec, _, _) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16));
+      let (y_vec, _, _) = unpack_v210_word_sse41::<BE>(packed.as_ptr().add(w * 16));
       // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice.
       let mut tmp = [0u16; 8];
       _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_vec);
@@ -468,7 +482,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w
       let tail_packed = &packed[words * 16..total_words * 16];
       let tail_out = &mut luma_out[tail_start_px..width];
       let tail_w = width - tail_start_px;
-      scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::v210_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_sse41/y216.rs b/src/row/arch/x86_sse41/y216.rs
index a98cdc45..e799caee 100644
--- a/src/row/arch/x86_sse41/y216.rs
+++ b/src/row/arch/x86_sse41/y216.rs
@@ -48,7 +48,7 @@ use crate::{ColorMatrix, row::scalar};
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -65,160 +65,168 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
   const RND: i32 = 1 << 14;
 
   unsafe {
-    let rnd_v = _mm_set1_epi32(RND);
-    // Y216 samples are full u16 [0..65535]; use i32 y_off and
-    // scale_y_u16 (unsigned widening) to avoid sign-bit corruption for Y > 32767.
-    let y_off_v = _mm_set1_epi32(y_off);
-    let y_scale_v = _mm_set1_epi32(y_scale);
-    let c_scale_v = _mm_set1_epi32(c_scale);
-    // Subtract chroma bias (32768) via wrapping: -32768i16 bits = 0x8000.
-    let bias16_v = _mm_set1_epi16(-32768i16);
-    let cru = _mm_set1_epi32(coeffs.r_u());
-    let crv = _mm_set1_epi32(coeffs.r_v());
-    let cgu = _mm_set1_epi32(coeffs.g_u());
-    let cgv = _mm_set1_epi32(coeffs.g_v());
-    let cbu = _mm_set1_epi32(coeffs.b_u());
-    let cbv = _mm_set1_epi32(coeffs.b_v());
-    let alpha_u8 = _mm_set1_epi8(-1);
-
-    // Byte-level shuffle masks for one 8-pixel group (2 loads of 8 u16 each).
-    // Each load holds 4 YUYV quadruples = 8 u16 = 16 bytes.
-    // Byte layout of one load `[Y0,U0,Y1,V0,Y2,U1,Y3,V1]` (bytes):
-    //   0,1 = Y0  2,3 = U0  4,5 = Y1  6,7 = V0
-    //   8,9 = Y2  10,11 = U1  12,13 = Y3  14,15 = V1
-    // Y (even u16 lanes): bytes [0,1,4,5,8,9,12,13] → low 8 bytes, high zeroed.
-    let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-    // Chroma (odd u16 lanes): bytes [2,3,6,7,10,11,14,15] → low 8 bytes.
-    let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
-    // U lanes from interleaved [U,V,U,V,...]: even u16 lanes.
-    let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-    // V lanes: odd u16 lanes.
-    let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    while x + 16 <= width {
-      // --- lo group: pixels x..x+7 (8 pixels, 16 u16 = 2 loads) ------
-      // packed[x*2 .. x*2+8] = quadruples 0,1 = pixels x..x+3
-      // packed[x*2+8 .. x*2+16] = quadruples 2,3 = pixels x+4..x+7
-      let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
-      let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
-
-      // Y extraction: [Y0,Y1,Y2,Y3] from lo and [Y4,Y5,Y6,Y7] from hi.
-      let y_lo_half = _mm_shuffle_epi8(lo, y_idx); // [Y0,Y1,Y2,Y3, 0,0,0,0] in u16x8
-      let y_hi_half = _mm_shuffle_epi8(hi, y_idx); // [Y4,Y5,Y6,Y7, 0,0,0,0]
-      let y_lo_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] u16x8
-
-      // Chroma extraction: interleaved [U,V,U,V,...] per 4-pair group.
-      let c_lo_half = _mm_shuffle_epi8(lo, c_idx); // [U0,V0,U1,V1, 0,0,0,0]
-      let c_hi_half = _mm_shuffle_epi8(hi, c_idx); // [U2,V2,U3,V3, 0,0,0,0]
-      let chroma_lo = _mm_unpacklo_epi64(c_lo_half, c_hi_half); // [U0,V0,U1,V1,U2,V2,U3,V3]
-
-      // Split U and V (4 valid low-half lanes each).
-      let u_lo = _mm_shuffle_epi8(chroma_lo, u_idx); // [U0,U1,U2,U3, 0,0,0,0] u16x8
-      let v_lo = _mm_shuffle_epi8(chroma_lo, v_idx); // [V0,V1,V2,V3, 0,0,0,0] u16x8
-
-      // Center UV: subtract 32768 wrapping.
-      let u_lo_i16 = _mm_sub_epi16(u_lo, bias16_v);
-      let v_lo_i16 = _mm_sub_epi16(v_lo, bias16_v);
-
-      // Widen 4 valid i16 chroma lanes to i32x4 for Q15 scale.
-      let u_lo_i32 = _mm_cvtepi16_epi32(u_lo_i16); // [U0,U1,U2,U3]
-      let v_lo_i32 = _mm_cvtepi16_epi32(v_lo_i16); // [V0,V1,V2,V3]
-      // `_mm_cvtepi16_epi32` uses the low 4 lanes; high 4 of u_lo_i16 are
-      // 0x8080 garbage from the -1-byte shuffles, but we don't use them.
-      // Widen the high half too for `chroma_i16x8` (don't-care input).
-      let u_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_lo_i16));
-      let v_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_lo_i16));
-
-      let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v));
-      let u_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_hi, c_scale_v), rnd_v));
-      let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v));
-      let v_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_hi, c_scale_v), rnd_v));
-
-      // chroma_i16x8 takes two i32x4 halves (lo=valid lanes 0..3,
-      // hi=don't-care lanes 4..7) → produces i16x8 with only lanes 0..3 valid.
-      let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
-      let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
-      let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
-
-      // Duplicate each chroma sample into its Y-pair slot (4:2:2):
-      // unpacklo_epi16([c0,c1,c2,c3,...], same) → [c0,c0,c1,c1,c2,c2,c3,c3]
-      let r_dup_lo = _mm_unpacklo_epi16(r_chroma_lo, r_chroma_lo);
-      let g_dup_lo = _mm_unpacklo_epi16(g_chroma_lo, g_chroma_lo);
-      let b_dup_lo = _mm_unpacklo_epi16(b_chroma_lo, b_chroma_lo);
-
-      // Scale Y: unsigned-widening avoids i16 overflow for Y > 32767.
-      let y_lo_scaled = scale_y_u16(y_lo_vec, y_off_v, y_scale_v, rnd_v);
-
-      // Saturating add and narrow to u8.
-      let r_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, r_dup_lo), _mm_setzero_si128());
-      let g_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, g_dup_lo), _mm_setzero_si128());
-      let b_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, b_dup_lo), _mm_setzero_si128());
-
-      // --- hi group: pixels x+8..x+15 ---------------------------------
-      let lo2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast());
-      let hi2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast());
-
-      let y_lo2_half = _mm_shuffle_epi8(lo2, y_idx);
-      let y_hi2_half = _mm_shuffle_epi8(hi2, y_idx);
-      let y_hi_vec = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15]
-
-      let c_lo2_half = _mm_shuffle_epi8(lo2, c_idx);
-      let c_hi2_half = _mm_shuffle_epi8(hi2, c_idx);
-      let chroma_hi = _mm_unpacklo_epi64(c_lo2_half, c_hi2_half);
-
-      let u_hi = _mm_shuffle_epi8(chroma_hi, u_idx);
-      let v_hi = _mm_shuffle_epi8(chroma_hi, v_idx);
-
-      let u_hi_i16 = _mm_sub_epi16(u_hi, bias16_v);
-      let v_hi_i16 = _mm_sub_epi16(v_hi, bias16_v);
-
-      let u_hi_i32 = _mm_cvtepi16_epi32(u_hi_i16);
-      let v_hi_i32 = _mm_cvtepi16_epi32(v_hi_i16);
-      let u_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_hi_i16));
-      let v_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_hi_i16));
-
-      let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v));
-      let u_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_hi, c_scale_v), rnd_v));
-      let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v));
-      let v_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_hi, c_scale_v), rnd_v));
-
-      let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v);
-      let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v);
-      let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v);
-
-      let r_dup_hi = _mm_unpacklo_epi16(r_chroma_hi, r_chroma_hi);
-      let g_dup_hi = _mm_unpacklo_epi16(g_chroma_hi, g_chroma_hi);
-      let b_dup_hi = _mm_unpacklo_epi16(b_chroma_hi, b_chroma_hi);
-
-      let y_hi_scaled = scale_y_u16(y_hi_vec, y_off_v, y_scale_v, rnd_v);
-
-      let r_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, r_dup_hi), _mm_setzero_si128());
-      let g_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, g_dup_hi), _mm_setzero_si128());
-      let b_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, b_dup_hi), _mm_setzero_si128());
-
-      // Combine two 8-pixel groups into 16-pixel output.
-      // Each *_lo_u8 / *_hi_u8 holds 8 valid u8 in its low 8 bytes.
-      // `_mm_unpacklo_epi64` joins the two low halves → 16 valid u8.
-      let r_u8 = _mm_unpacklo_epi64(r_lo_u8, r_hi_u8);
-      let g_u8 = _mm_unpacklo_epi64(g_lo_u8, g_hi_u8);
-      let b_u8 = _mm_unpacklo_epi64(b_lo_u8, b_hi_u8);
-
-      if ALPHA {
-        write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+    if !BE {
+      let rnd_v = _mm_set1_epi32(RND);
+      // Y216 samples are full u16 [0..65535]; use i32 y_off and
+      // scale_y_u16 (unsigned widening) to avoid sign-bit corruption for Y > 32767.
+      let y_off_v = _mm_set1_epi32(y_off);
+      let y_scale_v = _mm_set1_epi32(y_scale);
+      let c_scale_v = _mm_set1_epi32(c_scale);
+      // Subtract chroma bias (32768) via wrapping: -32768i16 bits = 0x8000.
+      let bias16_v = _mm_set1_epi16(-32768i16);
+      let cru = _mm_set1_epi32(coeffs.r_u());
+      let crv = _mm_set1_epi32(coeffs.r_v());
+      let cgu = _mm_set1_epi32(coeffs.g_u());
+      let cgv = _mm_set1_epi32(coeffs.g_v());
+      let cbu = _mm_set1_epi32(coeffs.b_u());
+      let cbv = _mm_set1_epi32(coeffs.b_v());
+      let alpha_u8 = _mm_set1_epi8(-1);
+
+      // Byte-level shuffle masks for one 8-pixel group (2 loads of 8 u16 each).
+      // Each load holds 4 YUYV quadruples = 8 u16 = 16 bytes.
+      // Byte layout of one load `[Y0,U0,Y1,V0,Y2,U1,Y3,V1]` (bytes):
+      //   0,1 = Y0  2,3 = U0  4,5 = Y1  6,7 = V0
+      //   8,9 = Y2  10,11 = U1  12,13 = Y3  14,15 = V1
+      // Y (even u16 lanes): bytes [0,1,4,5,8,9,12,13] → low 8 bytes, high zeroed.
+      let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+      // Chroma (odd u16 lanes): bytes [2,3,6,7,10,11,14,15] → low 8 bytes.
+      let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
+      // U lanes from interleaved [U,V,U,V,...]: even u16 lanes.
+      let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+      // V lanes: odd u16 lanes.
+      let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      while x + 16 <= width {
+        // --- lo group: pixels x..x+7 (8 pixels, 16 u16 = 2 loads) ------
+        // packed[x*2 .. x*2+8] = quadruples 0,1 = pixels x..x+3
+        // packed[x*2+8 .. x*2+16] = quadruples 2,3 = pixels x+4..x+7
+        let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
+        let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
+
+        // Y extraction: [Y0,Y1,Y2,Y3] from lo and [Y4,Y5,Y6,Y7] from hi.
+        let y_lo_half = _mm_shuffle_epi8(lo, y_idx); // [Y0,Y1,Y2,Y3, 0,0,0,0] in u16x8
+        let y_hi_half = _mm_shuffle_epi8(hi, y_idx); // [Y4,Y5,Y6,Y7, 0,0,0,0]
+        let y_lo_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] u16x8
+
+        // Chroma extraction: interleaved [U,V,U,V,...] per 4-pair group.
+        let c_lo_half = _mm_shuffle_epi8(lo, c_idx); // [U0,V0,U1,V1, 0,0,0,0]
+        let c_hi_half = _mm_shuffle_epi8(hi, c_idx); // [U2,V2,U3,V3, 0,0,0,0]
+        let chroma_lo = _mm_unpacklo_epi64(c_lo_half, c_hi_half); // [U0,V0,U1,V1,U2,V2,U3,V3]
+
+        // Split U and V (4 valid low-half lanes each).
+        let u_lo = _mm_shuffle_epi8(chroma_lo, u_idx); // [U0,U1,U2,U3, 0,0,0,0] u16x8
+        let v_lo = _mm_shuffle_epi8(chroma_lo, v_idx); // [V0,V1,V2,V3, 0,0,0,0] u16x8
+
+        // Center UV: subtract 32768 wrapping.
+        let u_lo_i16 = _mm_sub_epi16(u_lo, bias16_v);
+        let v_lo_i16 = _mm_sub_epi16(v_lo, bias16_v);
+
+        // Widen 4 valid i16 chroma lanes to i32x4 for Q15 scale.
+        let u_lo_i32 = _mm_cvtepi16_epi32(u_lo_i16); // [U0,U1,U2,U3]
+        let v_lo_i32 = _mm_cvtepi16_epi32(v_lo_i16); // [V0,V1,V2,V3]
+        // `_mm_cvtepi16_epi32` uses the low 4 lanes; high 4 of u_lo_i16 are
+        // 0x8000 (zeroed by the -1-byte shuffles, then bias-subtracted), but we don't use them.
+        // Widen the high half too for `chroma_i16x8` (don't-care input).
+        let u_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_lo_i16));
+        let v_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_lo_i16));
+
+        let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v));
+        let u_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_hi, c_scale_v), rnd_v));
+        let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v));
+        let v_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_hi, c_scale_v), rnd_v));
+
+        // chroma_i16x8 takes two i32x4 halves (lo=valid lanes 0..3,
+        // hi=don't-care lanes 4..7) → produces i16x8 with only lanes 0..3 valid.
+        let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
+        let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
+        let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v);
+
+        // Duplicate each chroma sample into its Y-pair slot (4:2:2):
+        // unpacklo_epi16([c0,c1,c2,c3,...], same) → [c0,c0,c1,c1,c2,c2,c3,c3]
+        let r_dup_lo = _mm_unpacklo_epi16(r_chroma_lo, r_chroma_lo);
+        let g_dup_lo = _mm_unpacklo_epi16(g_chroma_lo, g_chroma_lo);
+        let b_dup_lo = _mm_unpacklo_epi16(b_chroma_lo, b_chroma_lo);
+
+        // Scale Y: unsigned-widening avoids i16 overflow for Y > 32767.
+        let y_lo_scaled = scale_y_u16(y_lo_vec, y_off_v, y_scale_v, rnd_v);
+
+        // Saturating add and narrow to u8.
+        let r_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, r_dup_lo), _mm_setzero_si128());
+        let g_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, g_dup_lo), _mm_setzero_si128());
+        let b_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, b_dup_lo), _mm_setzero_si128());
+
+        // --- hi group: pixels x+8..x+15 ---------------------------------
+        let lo2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast());
+        let hi2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast());
+
+        let y_lo2_half = _mm_shuffle_epi8(lo2, y_idx);
+        let y_hi2_half = _mm_shuffle_epi8(hi2, y_idx);
+        let y_hi_vec = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15]
+
+        let c_lo2_half = _mm_shuffle_epi8(lo2, c_idx);
+        let c_hi2_half = _mm_shuffle_epi8(hi2, c_idx);
+        let chroma_hi = _mm_unpacklo_epi64(c_lo2_half, c_hi2_half);
+
+        let u_hi = _mm_shuffle_epi8(chroma_hi, u_idx);
+        let v_hi = _mm_shuffle_epi8(chroma_hi, v_idx);
+
+        let u_hi_i16 = _mm_sub_epi16(u_hi, bias16_v);
+        let v_hi_i16 = _mm_sub_epi16(v_hi, bias16_v);
+
+        let u_hi_i32 = _mm_cvtepi16_epi32(u_hi_i16);
+        let v_hi_i32 = _mm_cvtepi16_epi32(v_hi_i16);
+        let u_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_hi_i16));
+        let v_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_hi_i16));
+
+        let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v));
+        let u_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_hi, c_scale_v), rnd_v));
+        let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v));
+        let v_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_hi, c_scale_v), rnd_v));
+
+        let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v);
+        let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v);
+        let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v);
+
+        let r_dup_hi = _mm_unpacklo_epi16(r_chroma_hi, r_chroma_hi);
+        let g_dup_hi = _mm_unpacklo_epi16(g_chroma_hi, g_chroma_hi);
+        let b_dup_hi = _mm_unpacklo_epi16(b_chroma_hi, b_chroma_hi);
+
+        let y_hi_scaled = scale_y_u16(y_hi_vec, y_off_v, y_scale_v, rnd_v);
+
+        let r_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, r_dup_hi), _mm_setzero_si128());
+        let g_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, g_dup_hi), _mm_setzero_si128());
+        let b_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, b_dup_hi), _mm_setzero_si128());
+
+        // Combine two 8-pixel groups into 16-pixel output.
+        // Each *_lo_u8 / *_hi_u8 holds 8 valid u8 in its low 8 bytes.
+        // `_mm_unpacklo_epi64` joins the two low halves → 16 valid u8.
+        let r_u8 = _mm_unpacklo_epi64(r_lo_u8, r_hi_u8);
+        let g_u8 = _mm_unpacklo_epi64(g_lo_u8, g_hi_u8);
+        let b_u8 = _mm_unpacklo_epi64(b_lo_u8, b_hi_u8);
+
+        if ALPHA {
+          write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+        }
+
+        x += 16;
       }
+    } // end if !BE
 
-      x += 16;
-    }
-
-    // Scalar tail — remaining < 16 pixels.
+    // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+      scalar::y216_to_rgb_or_rgba_row::<ALPHA, BE>(
+        tail_packed,
+        tail_out,
+        tail_w,
+        matrix,
+        full_range,
+      );
     }
   }
 }
@@ -241,7 +249,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements).
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -258,147 +266,149 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
   const RND: i64 = 1 << 14;
 
   unsafe {
-    let alpha_u16 = _mm_set1_epi16(-1i16);
-    let rnd_v = _mm_set1_epi64x(RND);
-    let rnd32_v = _mm_set1_epi32(1 << 14);
-    let y_off_v = _mm_set1_epi32(y_off);
-    let y_scale_v = _mm_set1_epi32(y_scale);
-    let c_scale_v = _mm_set1_epi32(c_scale);
-    // bias 32768 via wrapping i16 trick
-    let bias16_v = _mm_set1_epi16(-32768i16);
-    let cru = _mm_set1_epi32(coeffs.r_u());
-    let crv = _mm_set1_epi32(coeffs.r_v());
-    let cgu = _mm_set1_epi32(coeffs.g_u());
-    let cgv = _mm_set1_epi32(coeffs.g_v());
-    let cbu = _mm_set1_epi32(coeffs.b_u());
-    let cbv = _mm_set1_epi32(coeffs.b_v());
-
-    // Byte-level shuffle masks (same as u8 path).
-    let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-    let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
-    let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-    let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      // Two 128-bit loads: each covers 8 u16 = 4 pixels.
-      // packed[x*2 .. x*2+8] = [Y0,U0,Y1,V0,Y2,U1,Y3,V1]
-      // packed[x*2+8 .. x*2+16] = [Y4,U2,Y5,V2,Y6,U3,Y7,V3]
-      let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
-      let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
-
-      // Y: [Y0..Y7] u16x8
-      let y_lo_half = _mm_shuffle_epi8(lo, y_idx);
-      let y_hi_half = _mm_shuffle_epi8(hi, y_idx);
-      let y_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half);
-
-      // UV interleaved: [U0,V0,U1,V1,U2,V2,U3,V3]
-      let c_lo_half = _mm_shuffle_epi8(lo, c_idx);
-      let c_hi_half = _mm_shuffle_epi8(hi, c_idx);
-      let chroma = _mm_unpacklo_epi64(c_lo_half, c_hi_half);
-
-      // U and V (4 valid low-half lanes each)
-      let u_vec4 = _mm_shuffle_epi8(chroma, u_idx); // [U0,U1,U2,U3, 0,0,0,0]
-      let v_vec4 = _mm_shuffle_epi8(chroma, v_idx); // [V0,V1,V2,V3, 0,0,0,0]
-
-      // Center UV via wrapping i16 subtraction.
-      let u_i16 = _mm_sub_epi16(u_vec4, bias16_v);
-      let v_i16 = _mm_sub_epi16(v_vec4, bias16_v);
-
-      // Scale UV in i32 (4 valid lanes from low half of u_i16/v_i16).
-      let u_i32 = _mm_cvtepi16_epi32(u_i16);
-      let v_i32 = _mm_cvtepi16_epi32(v_i16);
-      let u_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_i32, c_scale_v), rnd32_v));
-      let v_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_i32, c_scale_v), rnd32_v));
-
-      // i64 chroma: _mm_mul_epi32 uses even-indexed i32 lanes.
-      let u_d_even = u_d;
-      let v_d_even = v_d;
-      let u_d_odd = _mm_shuffle_epi32::<0xF5>(u_d); // [1,1,3,3] → odd to even
-      let v_d_odd = _mm_shuffle_epi32::<0xF5>(v_d);
-
-      let r_ch_even = chroma_i64x2(cru, crv, u_d_even, v_d_even, rnd_v);
-      let r_ch_odd = chroma_i64x2(cru, crv, u_d_odd, v_d_odd, rnd_v);
-      let g_ch_even = chroma_i64x2(cgu, cgv, u_d_even, v_d_even, rnd_v);
-      let g_ch_odd = chroma_i64x2(cgu, cgv, u_d_odd, v_d_odd, rnd_v);
-      let b_ch_even = chroma_i64x2(cbu, cbv, u_d_even, v_d_even, rnd_v);
-      let b_ch_odd = chroma_i64x2(cbu, cbv, u_d_odd, v_d_odd, rnd_v);
-
-      // Reassemble i64x2 pairs (even + odd) → i32x4.
-      let r_ch_i32 = _mm_unpacklo_epi64(
-        _mm_unpacklo_epi32(r_ch_even, r_ch_odd),
-        _mm_unpackhi_epi32(r_ch_even, r_ch_odd),
-      );
-      let g_ch_i32 = _mm_unpacklo_epi64(
-        _mm_unpacklo_epi32(g_ch_even, g_ch_odd),
-        _mm_unpackhi_epi32(g_ch_even, g_ch_odd),
-      );
-      let b_ch_i32 = _mm_unpacklo_epi64(
-        _mm_unpacklo_epi32(b_ch_even, b_ch_odd),
-        _mm_unpackhi_epi32(b_ch_even, b_ch_odd),
-      );
-
-      // Duplicate each chroma value for 2 Y pixels per chroma pair (4:2:2).
-      // unpacklo_epi32([r0,r1,r2,r3], same) → [r0,r0,r1,r1] (pixels 0,1,2,3)
-      // unpackhi_epi32([r0,r1,r2,r3], same) → [r2,r2,r3,r3] (pixels 4,5,6,7)
-      let r_dup_lo = _mm_unpacklo_epi32(r_ch_i32, r_ch_i32);
-      let r_dup_hi = _mm_unpackhi_epi32(r_ch_i32, r_ch_i32);
-      let g_dup_lo = _mm_unpacklo_epi32(g_ch_i32, g_ch_i32);
-      let g_dup_hi = _mm_unpackhi_epi32(g_ch_i32, g_ch_i32);
-      let b_dup_lo = _mm_unpacklo_epi32(b_ch_i32, b_ch_i32);
-      let b_dup_hi = _mm_unpackhi_epi32(b_ch_i32, b_ch_i32);
-
-      // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64.
-      let y_lo_pair = _mm_cvtepu16_epi32(y_vec); // [y0,y1,y2,y3] as i32
-      let y_hi_pair = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(y_vec)); // [y4,y5,y6,y7]
-      let y_lo_sub = _mm_sub_epi32(y_lo_pair, y_off_v);
-      let y_hi_sub = _mm_sub_epi32(y_hi_pair, y_off_v);
-
-      // Even/odd split for _mm_mul_epi32.
-      let y_lo_even = scale_y16_i64(y_lo_sub, y_scale_v, rnd_v);
-      let y_lo_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_lo_sub), y_scale_v, rnd_v);
-      let y_hi_even = scale_y16_i64(y_hi_sub, y_scale_v, rnd_v);
-      let y_hi_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_hi_sub), y_scale_v, rnd_v);
-
-      // Reassemble Y i64x2 pairs to i32x4.
-      let y_lo_i32 = _mm_unpacklo_epi64(
-        _mm_unpacklo_epi32(y_lo_even, y_lo_odd),
-        _mm_unpackhi_epi32(y_lo_even, y_lo_odd),
-      );
-      let y_hi_i32 = _mm_unpacklo_epi64(
-        _mm_unpacklo_epi32(y_hi_even, y_hi_odd),
-        _mm_unpackhi_epi32(y_hi_even, y_hi_odd),
-      );
-
-      // Add Y + chroma, saturate i32 → u16 via _mm_packus_epi32.
-      let r_u16 = _mm_packus_epi32(
-        _mm_add_epi32(y_lo_i32, r_dup_lo),
-        _mm_add_epi32(y_hi_i32, r_dup_hi),
-      );
-      let g_u16 = _mm_packus_epi32(
-        _mm_add_epi32(y_lo_i32, g_dup_lo),
-        _mm_add_epi32(y_hi_i32, g_dup_hi),
-      );
-      let b_u16 = _mm_packus_epi32(
-        _mm_add_epi32(y_lo_i32, b_dup_lo),
-        _mm_add_epi32(y_hi_i32, b_dup_hi),
-      );
-
-      if ALPHA {
-        write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3));
+    if !BE {
+      let alpha_u16 = _mm_set1_epi16(-1i16);
+      let rnd_v = _mm_set1_epi64x(RND);
+      let rnd32_v = _mm_set1_epi32(1 << 14);
+      let y_off_v = _mm_set1_epi32(y_off);
+      let y_scale_v = _mm_set1_epi32(y_scale);
+      let c_scale_v = _mm_set1_epi32(c_scale);
+      // bias 32768 via wrapping i16 trick
+      let bias16_v = _mm_set1_epi16(-32768i16);
+      let cru = _mm_set1_epi32(coeffs.r_u());
+      let crv = _mm_set1_epi32(coeffs.r_v());
+      let cgu = _mm_set1_epi32(coeffs.g_u());
+      let cgv = _mm_set1_epi32(coeffs.g_v());
+      let cbu = _mm_set1_epi32(coeffs.b_u());
+      let cbv = _mm_set1_epi32(coeffs.b_v());
+
+      // Byte-level shuffle masks (same as u8 path).
+      let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+      let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
+      let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+      let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      while x + 8 <= width {
+        // Two 128-bit loads: each covers 8 u16 = 4 pixels.
+        // packed[x*2 .. x*2+8] = [Y0,U0,Y1,V0,Y2,U1,Y3,V1]
+        // packed[x*2+8 .. x*2+16] = [Y4,U2,Y5,V2,Y6,U3,Y7,V3]
+        let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
+        let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
+
+        // Y: [Y0..Y7] u16x8
+        let y_lo_half = _mm_shuffle_epi8(lo, y_idx);
+        let y_hi_half = _mm_shuffle_epi8(hi, y_idx);
+        let y_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half);
+
+        // UV interleaved: [U0,V0,U1,V1,U2,V2,U3,V3]
+        let c_lo_half = _mm_shuffle_epi8(lo, c_idx);
+        let c_hi_half = _mm_shuffle_epi8(hi, c_idx);
+        let chroma = _mm_unpacklo_epi64(c_lo_half, c_hi_half);
+
+        // U and V (4 valid low-half lanes each)
+        let u_vec4 = _mm_shuffle_epi8(chroma, u_idx); // [U0,U1,U2,U3, 0,0,0,0]
+        let v_vec4 = _mm_shuffle_epi8(chroma, v_idx); // [V0,V1,V2,V3, 0,0,0,0]
+
+        // Center UV via wrapping i16 subtraction.
+        let u_i16 = _mm_sub_epi16(u_vec4, bias16_v);
+        let v_i16 = _mm_sub_epi16(v_vec4, bias16_v);
+
+        // Scale UV in i32 (4 valid lanes from low half of u_i16/v_i16).
+        let u_i32 = _mm_cvtepi16_epi32(u_i16);
+        let v_i32 = _mm_cvtepi16_epi32(v_i16);
+        let u_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_i32, c_scale_v), rnd32_v));
+        let v_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_i32, c_scale_v), rnd32_v));
+
+        // i64 chroma: _mm_mul_epi32 uses even-indexed i32 lanes.
+        let u_d_even = u_d;
+        let v_d_even = v_d;
+        let u_d_odd = _mm_shuffle_epi32::<0xF5>(u_d); // [1,1,3,3] → odd to even
+        let v_d_odd = _mm_shuffle_epi32::<0xF5>(v_d);
+
+        let r_ch_even = chroma_i64x2(cru, crv, u_d_even, v_d_even, rnd_v);
+        let r_ch_odd = chroma_i64x2(cru, crv, u_d_odd, v_d_odd, rnd_v);
+        let g_ch_even = chroma_i64x2(cgu, cgv, u_d_even, v_d_even, rnd_v);
+        let g_ch_odd = chroma_i64x2(cgu, cgv, u_d_odd, v_d_odd, rnd_v);
+        let b_ch_even = chroma_i64x2(cbu, cbv, u_d_even, v_d_even, rnd_v);
+        let b_ch_odd = chroma_i64x2(cbu, cbv, u_d_odd, v_d_odd, rnd_v);
+
+        // Reassemble i64x2 pairs (even + odd) → i32x4.
+        let r_ch_i32 = _mm_unpacklo_epi64(
+          _mm_unpacklo_epi32(r_ch_even, r_ch_odd),
+          _mm_unpackhi_epi32(r_ch_even, r_ch_odd),
+        );
+        let g_ch_i32 = _mm_unpacklo_epi64(
+          _mm_unpacklo_epi32(g_ch_even, g_ch_odd),
+          _mm_unpackhi_epi32(g_ch_even, g_ch_odd),
+        );
+        let b_ch_i32 = _mm_unpacklo_epi64(
+          _mm_unpacklo_epi32(b_ch_even, b_ch_odd),
+          _mm_unpackhi_epi32(b_ch_even, b_ch_odd),
+        );
+
+        // Duplicate each chroma value for 2 Y pixels per chroma pair (4:2:2).
+        // unpacklo_epi32([r0,r1,r2,r3], same) → [r0,r0,r1,r1] (pixels 0,1,2,3)
+        // unpackhi_epi32([r0,r1,r2,r3], same) → [r2,r2,r3,r3] (pixels 4,5,6,7)
+        let r_dup_lo = _mm_unpacklo_epi32(r_ch_i32, r_ch_i32);
+        let r_dup_hi = _mm_unpackhi_epi32(r_ch_i32, r_ch_i32);
+        let g_dup_lo = _mm_unpacklo_epi32(g_ch_i32, g_ch_i32);
+        let g_dup_hi = _mm_unpackhi_epi32(g_ch_i32, g_ch_i32);
+        let b_dup_lo = _mm_unpacklo_epi32(b_ch_i32, b_ch_i32);
+        let b_dup_hi = _mm_unpackhi_epi32(b_ch_i32, b_ch_i32);
+
+        // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64.
+        let y_lo_pair = _mm_cvtepu16_epi32(y_vec); // [y0,y1,y2,y3] as i32
+        let y_hi_pair = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(y_vec)); // [y4,y5,y6,y7]
+        let y_lo_sub = _mm_sub_epi32(y_lo_pair, y_off_v);
+        let y_hi_sub = _mm_sub_epi32(y_hi_pair, y_off_v);
+
+        // Even/odd split for _mm_mul_epi32.
+        let y_lo_even = scale_y16_i64(y_lo_sub, y_scale_v, rnd_v);
+        let y_lo_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_lo_sub), y_scale_v, rnd_v);
+        let y_hi_even = scale_y16_i64(y_hi_sub, y_scale_v, rnd_v);
+        let y_hi_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_hi_sub), y_scale_v, rnd_v);
+
+        // Reassemble Y i64x2 pairs to i32x4.
+        let y_lo_i32 = _mm_unpacklo_epi64(
+          _mm_unpacklo_epi32(y_lo_even, y_lo_odd),
+          _mm_unpackhi_epi32(y_lo_even, y_lo_odd),
+        );
+        let y_hi_i32 = _mm_unpacklo_epi64(
+          _mm_unpacklo_epi32(y_hi_even, y_hi_odd),
+          _mm_unpackhi_epi32(y_hi_even, y_hi_odd),
+        );
+
+        // Add Y + chroma, saturate i32 → u16 via _mm_packus_epi32.
+        let r_u16 = _mm_packus_epi32(
+          _mm_add_epi32(y_lo_i32, r_dup_lo),
+          _mm_add_epi32(y_hi_i32, r_dup_hi),
+        );
+        let g_u16 = _mm_packus_epi32(
+          _mm_add_epi32(y_lo_i32, g_dup_lo),
+          _mm_add_epi32(y_hi_i32, g_dup_hi),
+        );
+        let b_u16 = _mm_packus_epi32(
+          _mm_add_epi32(y_lo_i32, b_dup_lo),
+          _mm_add_epi32(y_hi_i32, b_dup_hi),
+        );
+
+        if ALPHA {
+          write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3));
+        }
+
+        x += 8;
       }
+    } // end if !BE
 
-      x += 8;
-    }
-
-    // Scalar tail — remaining < 8 pixels.
+    // Scalar tail — remaining < 8 pixels, or full-row fallback when BE=true.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+      scalar::y216_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -423,49 +433,55 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `out.len() >= width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) {
+pub(crate) unsafe fn y216_to_luma_row<const BE: bool>(
+  packed: &[u16],
+  out: &mut [u8],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
 
   unsafe {
-    // Pick even u16 lanes (Y samples) into low 8 bytes, zero high bytes.
-    let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    while x + 16 <= width {
-      // Four loads covering 16 pixels (16 u16 per load pair).
-      // packed offset x*2 = quadruple-base for pixel x.
-      // lo0/hi0 cover pixels x..x+7, lo1/hi1 cover x+8..x+15.
-      let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
-      let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
-      let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast());
-      let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast());
-
-      // Extract Y lanes into u16x8.
-      let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); // [Y0..Y3, 0..]
-      let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); // [Y4..Y7, 0..]
-      let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7]
-
-      let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); // [Y8..Y11, 0..]
-      let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); // [Y12..Y15, 0..]
-      let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15]
-
-      // `>> 8` to get u8 luma (high byte of each Y sample).
-      let y_lo_shr = _mm_srli_epi16::<8>(y_vec_lo);
-      let y_hi_shr = _mm_srli_epi16::<8>(y_vec_hi);
-      // Pack 16 × i16 → 16 × u8.
-      let y_u8 = _mm_packus_epi16(y_lo_shr, y_hi_shr);
-      _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_u8);
-
-      x += 16;
+    if !BE {
+      // Pick even u16 lanes (Y samples) into low 8 bytes, zero high bytes.
+      let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      while x + 16 <= width {
+        // Four loads covering 16 pixels (16 u16 per load pair).
+        // packed offset x*2 = quadruple-base for pixel x.
+        // lo0/hi0 cover pixels x..x+7, lo1/hi1 cover x+8..x+15.
+        let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
+        let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
+        let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast());
+        let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast());
+
+        // Extract Y lanes into u16x8.
+        let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); // [Y0..Y3, 0..]
+        let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); // [Y4..Y7, 0..]
+        let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7]
+
+        let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); // [Y8..Y11, 0..]
+        let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); // [Y12..Y15, 0..]
+        let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15]
+
+        // `>> 8` to get u8 luma (high byte of each Y sample).
+        let y_lo_shr = _mm_srli_epi16::<8>(y_vec_lo);
+        let y_hi_shr = _mm_srli_epi16::<8>(y_vec_hi);
+        // Pack 16 × i16 → 16 × u8.
+        let y_u8 = _mm_packus_epi16(y_lo_shr, y_hi_shr);
+        _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_u8);
+
+        x += 16;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -484,41 +500,47 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi
 /// 4. `out.len() >= width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) {
+pub(crate) unsafe fn y216_to_luma_u16_row<const BE: bool>(
+  packed: &[u16],
+  out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
 
   unsafe {
-    let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    while x + 16 <= width {
-      let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
-      let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
-      let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast());
-      let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast());
+    if !BE {
+      let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
 
-      let y_lo_half = _mm_shuffle_epi8(lo0, y_idx);
-      let y_hi_half = _mm_shuffle_epi8(hi0, y_idx);
-      let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7]
+      while x + 16 <= width {
+        let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
+        let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
+        let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast());
+        let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast());
 
-      let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx);
-      let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx);
-      let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15]
+        let y_lo_half = _mm_shuffle_epi8(lo0, y_idx);
+        let y_hi_half = _mm_shuffle_epi8(hi0, y_idx);
+        let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7]
 
-      // Direct copy — full 16-bit Y values, no shift.
-      _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_vec_lo);
-      _mm_storeu_si128(out.as_mut_ptr().add(x + 8).cast(), y_vec_hi);
+        let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx);
+        let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx);
+        let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15]
 
-      x += 16;
+        // Direct copy — full 16-bit Y values, no shift.
+        _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_vec_lo);
+        _mm_storeu_si128(out.as_mut_ptr().add(x + 8).cast(), y_vec_hi);
+
+        x += 16;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x..width];
       let tail_w = width - x;
-      scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w);
+      scalar::y216_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/arch/x86_sse41/y2xx.rs b/src/row/arch/x86_sse41/y2xx.rs
index eaa88f7e..e8e18aff 100644
--- a/src/row/arch/x86_sse41/y2xx.rs
+++ b/src/row/arch/x86_sse41/y2xx.rs
@@ -130,7 +130,11 @@ unsafe fn unpack_y2xx_8px_sse41(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -158,111 +162,114 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
   // by the `while x + 8 <= width` loop and the caller-promised slice
   // lengths checked above.
   unsafe {
-    let rnd_v = _mm_set1_epi32(RND);
-    let y_off_v = _mm_set1_epi16(y_off as i16);
-    let y_scale_v = _mm_set1_epi32(y_scale);
-    let c_scale_v = _mm_set1_epi32(c_scale);
-    let bias_v = _mm_set1_epi16(bias as i16);
-    // Loop-invariant runtime shift count for `_mm_srl_epi16`, see
-    // module-level note.
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let cru = _mm_set1_epi32(coeffs.r_u());
-    let crv = _mm_set1_epi32(coeffs.r_v());
-    let cgu = _mm_set1_epi32(coeffs.g_u());
-    let cgv = _mm_set1_epi32(coeffs.g_v());
-    let cbu = _mm_set1_epi32(coeffs.b_u());
-    let cbv = _mm_set1_epi32(coeffs.b_v());
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_sse41(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = y_vec;
-
-      // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since
-      // each chroma sample is ≤ 2^BITS - 1 ≤ 4095.
-      let u_i16 = _mm_sub_epi16(u_vec, bias_v);
-      let v_i16 = _mm_sub_epi16(v_vec, bias_v);
-
-      // Widen 8-lane i16 chroma to two i32x4 halves so the Q15
-      // multiplies don't overflow. Only lanes 0..3 of `_lo` are
-      // valid; `_hi` is entirely don't-care. We feed both halves
-      // through `chroma_i16x8` to recycle the helper exactly; the
-      // don't-care output lanes are discarded by the
-      // `_mm_unpacklo_epi16` duplicate step below (which only consumes
-      // lanes 0..3).
-      let u_lo_i32 = _mm_cvtepi16_epi32(u_i16);
-      let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16));
-      let v_lo_i32 = _mm_cvtepi16_epi32(v_i16);
-      let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16));
-
-      let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v));
-      let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v));
-      let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v));
-      let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v));
-
-      // 8-lane chroma vectors with valid data in lanes 0..3.
-      let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-
-      // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via
-      // `_mm_unpacklo_epi16` so lanes 0..7 of `r_dup` align with
-      // Y0..Y7. Lane order: [c0, c0, c1, c1, c2, c2, c3, c3].
-      let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma);
-      let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma);
-      let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma);
-
-      // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8.
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
-
-      // u8 narrow with saturation. `_mm_packus_epi16(lo, hi)` emits
-      // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` (or zero
-      // for hi) so the low 8 bytes of the result hold the saturated
-      // u8 of the input i16x8. Only the first 8 bytes per channel
-      // matter.
-      let zero = _mm_setzero_si128();
-      let r_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, r_dup), zero);
-      let g_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, g_dup), zero);
-      let b_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, b_dup), zero);
-
-      // 8-pixel partial store: SSE4.1's `write_rgb_16` / `write_rgba_16`
-      // emit 16-pixel output (48 / 64 bytes), so for the 8-px-iter
-      // body we use the v210-style stack-buffer + scalar interleave
-      // pattern. (8 px × 3 = 24 bytes RGB, 8 px × 4 = 32 bytes RGBA.)
-      let mut r_tmp = [0u8; 16];
-      let mut g_tmp = [0u8; 16];
-      let mut b_tmp = [0u8; 16];
-      _mm_storeu_si128(r_tmp.as_mut_ptr().cast(), r_u8);
-      _mm_storeu_si128(g_tmp.as_mut_ptr().cast(), g_u8);
-      _mm_storeu_si128(b_tmp.as_mut_ptr().cast(), b_u8);
-
-      if ALPHA {
-        let dst = &mut out[x * 4..x * 4 + 8 * 4];
-        for i in 0..8 {
-          dst[i * 4] = r_tmp[i];
-          dst[i * 4 + 1] = g_tmp[i];
-          dst[i * 4 + 2] = b_tmp[i];
-          dst[i * 4 + 3] = 0xFF;
-        }
-      } else {
-        let dst = &mut out[x * 3..x * 3 + 8 * 3];
-        for i in 0..8 {
-          dst[i * 3] = r_tmp[i];
-          dst[i * 3 + 1] = g_tmp[i];
-          dst[i * 3 + 2] = b_tmp[i];
+    if !BE {
+      let rnd_v = _mm_set1_epi32(RND);
+      let y_off_v = _mm_set1_epi16(y_off as i16);
+      let y_scale_v = _mm_set1_epi32(y_scale);
+      let c_scale_v = _mm_set1_epi32(c_scale);
+      let bias_v = _mm_set1_epi16(bias as i16);
+      // Loop-invariant runtime shift count for `_mm_srl_epi16`, see
+      // module-level note.
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let cru = _mm_set1_epi32(coeffs.r_u());
+      let crv = _mm_set1_epi32(coeffs.r_v());
+      let cgu = _mm_set1_epi32(coeffs.g_u());
+      let cgv = _mm_set1_epi32(coeffs.g_v());
+      let cbu = _mm_set1_epi32(coeffs.b_u());
+      let cbv = _mm_set1_epi32(coeffs.b_v());
+
+      while x + 8 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_sse41(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = y_vec;
+
+        // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since
+        // each chroma sample is ≤ 2^BITS - 1 ≤ 4095.
+        let u_i16 = _mm_sub_epi16(u_vec, bias_v);
+        let v_i16 = _mm_sub_epi16(v_vec, bias_v);
+
+        // Widen 8-lane i16 chroma to two i32x4 halves so the Q15
+        // multiplies don't overflow. Only lanes 0..3 of `_lo` are
+        // valid; `_hi` is entirely don't-care. We feed both halves
+        // through `chroma_i16x8` to recycle the helper exactly; the
+        // don't-care output lanes are discarded by the
+        // `_mm_unpacklo_epi16` duplicate step below (which only consumes
+        // lanes 0..3).
+        let u_lo_i32 = _mm_cvtepi16_epi32(u_i16);
+        let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16));
+        let v_lo_i32 = _mm_cvtepi16_epi32(v_i16);
+        let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16));
+
+        let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v));
+        let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v));
+        let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v));
+        let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v));
+
+        // 8-lane chroma vectors with valid data in lanes 0..3.
+        let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+
+        // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via
+        // `_mm_unpacklo_epi16` so lanes 0..7 of `r_dup` align with
+        // Y0..Y7. Lane order: [c0, c0, c1, c1, c2, c2, c3, c3].
+        let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma);
+        let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma);
+        let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma);
+
+        // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8.
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
+
+        // u8 narrow with saturation. `_mm_packus_epi16(lo, hi)` emits
+        // 16 u8 lanes from 16 i16 lanes; we pass zero for `hi`, so the
+        // low 8 bytes of the result hold the saturated u8 of the input
+        // i16x8 and the high 8 bytes are zero. Only the first 8 bytes
+        // per channel matter.
+        let zero = _mm_setzero_si128();
+        let r_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, r_dup), zero);
+        let g_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, g_dup), zero);
+        let b_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, b_dup), zero);
+
+        // 8-pixel partial store: SSE4.1's `write_rgb_16` / `write_rgba_16`
+        // emit 16-pixel output (48 / 64 bytes), so for the 8-px-iter
+        // body we use the v210-style stack-buffer + scalar interleave
+        // pattern. (8 px × 3 = 24 bytes RGB, 8 px × 4 = 32 bytes RGBA.)
+        let mut r_tmp = [0u8; 16];
+        let mut g_tmp = [0u8; 16];
+        let mut b_tmp = [0u8; 16];
+        _mm_storeu_si128(r_tmp.as_mut_ptr().cast(), r_u8);
+        _mm_storeu_si128(g_tmp.as_mut_ptr().cast(), g_u8);
+        _mm_storeu_si128(b_tmp.as_mut_ptr().cast(), b_u8);
+
+        if ALPHA {
+          let dst = &mut out[x * 4..x * 4 + 8 * 4];
+          for i in 0..8 {
+            dst[i * 4] = r_tmp[i];
+            dst[i * 4 + 1] = g_tmp[i];
+            dst[i * 4 + 2] = b_tmp[i];
+            dst[i * 4 + 3] = 0xFF;
+          }
+        } else {
+          let dst = &mut out[x * 3..x * 3 + 8 * 3];
+          for i in 0..8 {
+            dst[i * 3] = r_tmp[i];
+            dst[i * 3 + 1] = g_tmp[i];
+            dst[i * 3 + 2] = b_tmp[i];
+          }
         }
-      }
 
-      x += 8;
-    }
+        x += 8;
+      }
+    } // end if !BE
 
-    // Scalar tail — remaining < 8 pixels (always even per 4:2:2).
+    // Scalar tail — remaining < 8 pixels (always even per 4:2:2),
+    // or full-row fallback when BE=true.
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -288,7 +295,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: boo
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -314,72 +325,74 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let rnd_v = _mm_set1_epi32(RND);
-    let y_off_v = _mm_set1_epi16(y_off as i16);
-    let y_scale_v = _mm_set1_epi32(y_scale);
-    let c_scale_v = _mm_set1_epi32(c_scale);
-    let bias_v = _mm_set1_epi16(bias as i16);
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let max_v = _mm_set1_epi16(out_max);
-    let zero_v = _mm_set1_epi16(0);
-    let cru = _mm_set1_epi32(coeffs.r_u());
-    let crv = _mm_set1_epi32(coeffs.r_v());
-    let cgu = _mm_set1_epi32(coeffs.g_u());
-    let cgv = _mm_set1_epi32(coeffs.g_v());
-    let cbu = _mm_set1_epi32(coeffs.b_u());
-    let cbv = _mm_set1_epi32(coeffs.b_v());
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_sse41(packed.as_ptr().add(x * 2), shr_count);
-
-      let y_i16 = y_vec;
-      let u_i16 = _mm_sub_epi16(u_vec, bias_v);
-      let v_i16 = _mm_sub_epi16(v_vec, bias_v);
-
-      let u_lo_i32 = _mm_cvtepi16_epi32(u_i16);
-      let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16));
-      let v_lo_i32 = _mm_cvtepi16_epi32(v_i16);
-      let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16));
-
-      let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v));
-      let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v));
-      let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v));
-      let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v));
-
-      let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-      let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
-
-      let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma);
-      let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma);
-      let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma);
-
-      let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
-
-      // Native-depth output: clamp to [0, (1 << BITS) - 1].
-      // `_mm_adds_epi16` saturates at i16 bounds (no-op here since
-      // |sum| stays well inside i16 for BITS ≤ 12), then min/max
-      // clamps to the BITS range.
-      let r = clamp_u16_max(_mm_adds_epi16(y_scaled, r_dup), zero_v, max_v);
-      let g = clamp_u16_max(_mm_adds_epi16(y_scaled, g_dup), zero_v, max_v);
-      let b = clamp_u16_max(_mm_adds_epi16(y_scaled, b_dup), zero_v, max_v);
-
-      if ALPHA {
-        let alpha = _mm_set1_epi16(out_max);
-        write_rgba_u16_8(r, g, b, alpha, out.as_mut_ptr().add(x * 4));
-      } else {
-        write_rgb_u16_8(r, g, b, out.as_mut_ptr().add(x * 3));
-      }
+    if !BE {
+      let rnd_v = _mm_set1_epi32(RND);
+      let y_off_v = _mm_set1_epi16(y_off as i16);
+      let y_scale_v = _mm_set1_epi32(y_scale);
+      let c_scale_v = _mm_set1_epi32(c_scale);
+      let bias_v = _mm_set1_epi16(bias as i16);
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let max_v = _mm_set1_epi16(out_max);
+      let zero_v = _mm_set1_epi16(0);
+      let cru = _mm_set1_epi32(coeffs.r_u());
+      let crv = _mm_set1_epi32(coeffs.r_v());
+      let cgu = _mm_set1_epi32(coeffs.g_u());
+      let cgv = _mm_set1_epi32(coeffs.g_v());
+      let cbu = _mm_set1_epi32(coeffs.b_u());
+      let cbv = _mm_set1_epi32(coeffs.b_v());
+
+      while x + 8 <= width {
+        let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_sse41(packed.as_ptr().add(x * 2), shr_count);
+
+        let y_i16 = y_vec;
+        let u_i16 = _mm_sub_epi16(u_vec, bias_v);
+        let v_i16 = _mm_sub_epi16(v_vec, bias_v);
+
+        let u_lo_i32 = _mm_cvtepi16_epi32(u_i16);
+        let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16));
+        let v_lo_i32 = _mm_cvtepi16_epi32(v_i16);
+        let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16));
+
+        let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v));
+        let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v));
+        let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v));
+        let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v));
+
+        let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+        let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v);
+
+        let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma);
+        let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma);
+        let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma);
+
+        let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v);
+
+        // Native-depth output: clamp to [0, (1 << BITS) - 1].
+        // `_mm_adds_epi16` saturates at i16 bounds (no-op here since
+        // |sum| stays well inside i16 for BITS ≤ 12), then min/max
+        // clamps to the BITS range.
+        let r = clamp_u16_max(_mm_adds_epi16(y_scaled, r_dup), zero_v, max_v);
+        let g = clamp_u16_max(_mm_adds_epi16(y_scaled, g_dup), zero_v, max_v);
+        let b = clamp_u16_max(_mm_adds_epi16(y_scaled, b_dup), zero_v, max_v);
+
+        if ALPHA {
+          let alpha = _mm_set1_epi16(out_max);
+          write_rgba_u16_8(r, g, b, alpha, out.as_mut_ptr().add(x * 4));
+        } else {
+          write_rgb_u16_8(r, g, b, out.as_mut_ptr().add(x * 3));
+        }
 
-      x += 8;
-    }
+        x += 8;
+      }
+    } // end if !BE
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut out[x * bpp..width * bpp];
       let tail_w = width - x;
-      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA>(
+      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA, BE>(
         tail_packed,
         tail_out,
         tail_w,
@@ -405,7 +418,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const AL
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u8],
   width: usize,
@@ -422,39 +435,41 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    // Y permute mask: pick even u16 lanes (low byte at [0], high byte
-    // at [1]) into the low 8 bytes; high 8 bytes zeroed.
-    let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
-      let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
-      let y_lo = _mm_shuffle_epi8(lo, y_idx); // [Y0..Y3, _, _, _, _]
-      let y_hi = _mm_shuffle_epi8(hi, y_idx); // [Y4..Y7, _, _, _, _]
-      let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); // [Y0..Y7] MSB-aligned
-
-      // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
-      // any BITS ∈ {10, 12} — same single-shift simplification used
-      // by NEON's `vshrn_n_u16::<8>`.
-      // `_mm_srli_epi16::<8>` has a literal const count, so it works
-      // here without the runtime-count helper.
-      let y_shr = _mm_srli_epi16::<8>(y_vec);
-      // Pack 8 i16 lanes to u8 — only low 8 bytes used.
-      let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128());
-      // Store low 8 bytes via stack buffer + copy_from_slice.
-      let mut tmp = [0u8; 16];
-      _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_u8);
-      luma_out[x..x + 8].copy_from_slice(&tmp[..8]);
-
-      x += 8;
+    if !BE {
+      // Y permute mask: pick even u16 lanes (low byte at [0], high byte
+      // at [1]) into the low 8 bytes; high 8 bytes zeroed.
+      let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      while x + 8 <= width {
+        let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
+        let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
+        let y_lo = _mm_shuffle_epi8(lo, y_idx); // [Y0..Y3, _, _, _, _]
+        let y_hi = _mm_shuffle_epi8(hi, y_idx); // [Y4..Y7, _, _, _, _]
+        let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); // [Y0..Y7] MSB-aligned
+
+        // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
+        // any BITS ∈ {10, 12} — same single-shift simplification used
+        // by NEON's `vshrn_n_u16::<8>`.
+        // `_mm_srli_epi16::<8>` has a literal const count, so it works
+        // here without the runtime-count helper.
+        let y_shr = _mm_srli_epi16::<8>(y_vec);
+        // Pack 8 i16 lanes to u8 — only low 8 bytes used.
+        let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128());
+        // Store low 8 bytes via stack buffer + copy_from_slice.
+        let mut tmp = [0u8; 16];
+        _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_u8);
+        luma_out[x..x + 8].copy_from_slice(&tmp[..8]);
+
+        x += 8;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
@@ -471,7 +486,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u16],
   width: usize,
@@ -488,28 +503,30 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
 
   // SAFETY: caller's obligation per the safety contract above.
   unsafe {
-    let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-    let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
-
     let mut x = 0usize;
-    while x + 8 <= width {
-      let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
-      let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
-      let y_lo = _mm_shuffle_epi8(lo, y_idx);
-      let y_hi = _mm_shuffle_epi8(hi, y_idx);
-      let y_vec = _mm_unpacklo_epi64(y_lo, y_hi);
-      // Right-shift by `(16 - BITS)` to bring MSB-aligned samples
-      // into low-bit-packed form for the native-depth u16 output.
-      let y_low = _mm_srl_epi16(y_vec, shr_count);
-      _mm_storeu_si128(luma_out.as_mut_ptr().add(x).cast(), y_low);
-      x += 8;
+    if !BE {
+      let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
+      let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
+
+      while x + 8 <= width {
+        let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
+        let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast());
+        let y_lo = _mm_shuffle_epi8(lo, y_idx);
+        let y_hi = _mm_shuffle_epi8(hi, y_idx);
+        let y_vec = _mm_unpacklo_epi64(y_lo, y_hi);
+        // Right-shift by `(16 - BITS)` to bring MSB-aligned samples
+        // into low-bit-packed form for the native-depth u16 output.
+        let y_low = _mm_srl_epi16(y_vec, shr_count);
+        _mm_storeu_si128(luma_out.as_mut_ptr().add(x).cast(), y_low);
+        x += 8;
+      }
     }
 
     if x < width {
       let tail_packed = &packed[x * 2..width * 2];
       let tail_out = &mut luma_out[x..width];
       let tail_w = width - x;
-      scalar::y2xx_n_to_luma_u16_row::<BITS>(tail_packed, tail_out, tail_w);
+      scalar::y2xx_n_to_luma_u16_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
   }
 }
diff --git a/src/row/dispatch/v210.rs b/src/row/dispatch/v210.rs
index 2760c4b2..7d1c14b1 100644
--- a/src/row/dispatch/v210.rs
+++ b/src/row/dispatch/v210.rs
@@ -7,8 +7,8 @@
 //! block; `use_simd = false` forces scalar.
 //!
 //! The per-format SIMD kernels are const-generic on `ALPHA`
-//! (`v210_to_rgb_or_rgba_row::<ALPHA>` /
-//! `v210_to_rgb_u16_or_rgba_u16_row::<ALPHA>`) — the public
+//! (`v210_to_rgb_or_rgba_row::<ALPHA, BE>` /
+//! `v210_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>`) — the public
 //! dispatchers split them into RGB vs. RGBA entries by hard-wiring
 //! `ALPHA = false` / `true`.
 
@@ -31,7 +31,8 @@ use crate::{
 
 /// Converts one row of v210 to packed RGB (u8). See
 /// [`scalar::v210_to_rgb_or_rgba_row`] for byte layout / numerical
-/// contract. `use_simd = false` forces scalar.
+/// contract. `use_simd = false` forces scalar. `big_endian = true` selects
+/// the big-endian wire encoding (32-bit words stored MSB-first).
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub fn v210_to_rgb_row(
   packed: &[u8],
@@ -40,6 +41,7 @@ pub fn v210_to_rgb_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -54,36 +56,57 @@ pub fn v210_to_rgb_row(
     "rgb_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified at runtime.
-          unsafe { arch::neon::v210_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::v210_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::neon::v210_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -91,7 +114,10 @@ pub fn v210_to_rgb_row(
     }
   }
 
-  scalar::v210_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::v210_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range),
+    scalar::v210_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range)
+  );
 }
 
 /// Converts one row of v210 to packed RGBA (u8) with `α = 0xFF`.
@@ -103,6 +129,7 @@ pub fn v210_to_rgba_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -117,36 +144,57 @@ pub fn v210_to_rgba_row(
     "rgba_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::v210_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::v210_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::neon::v210_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -154,7 +202,10 @@ pub fn v210_to_rgba_row(
     }
   }
 
-  scalar::v210_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::v210_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range),
+    scalar::v210_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range)
+  );
 }
 
 /// Converts one row of v210 to packed `u16` RGB at native 10-bit
@@ -167,6 +218,7 @@ pub fn v210_to_rgb_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -181,36 +233,57 @@ pub fn v210_to_rgb_u16_row(
     "rgb_out row too short"
   );
 
+  macro_rules! dispatch_be { // Function-local helper: choose between an LE and a BE call at runtime.
+    ($call_le:expr, $call_be:expr) => { // First argument is the little-endian call, second the big-endian call.
+      if big_endian { $call_be } else { $call_le } // `big_endian` is the enclosing fn's parameter; only the selected call runs.
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -218,7 +291,14 @@ pub fn v210_to_rgb_u16_row(
     }
   }
 
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      packed, rgb_out, width, matrix, full_range
+    ),
+    scalar::v210_to_rgb_u16_or_rgba_u16_row::<false, true>(
+      packed, rgb_out, width, matrix, full_range
+    )
+  );
 }
 
 /// Converts one row of v210 to packed `u16` RGBA at native 10-bit
@@ -231,6 +311,7 @@ pub fn v210_to_rgba_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -245,36 +326,57 @@ pub fn v210_to_rgba_u16_row(
     "rgba_out row too short"
   );
 
+  macro_rules! dispatch_be { // Function-local helper: choose between an LE and a BE call at runtime.
+    ($call_le:expr, $call_be:expr) => { // First argument is the little-endian call, second the big-endian call.
+      if big_endian { $call_be } else { $call_le } // `big_endian` is the enclosing fn's parameter; only the selected call runs.
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -282,13 +384,26 @@ pub fn v210_to_rgba_u16_row(
     }
   }
 
-  scalar::v210_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(
+      packed, rgba_out, width, matrix, full_range
+    ),
+    scalar::v210_to_rgb_u16_or_rgba_u16_row::<true, true>(
+      packed, rgba_out, width, matrix, full_range
+    )
+  );
 }
 
 /// Extracts one row of 8-bit luma from a packed v210 buffer.
 /// Y values are downshifted from 10-bit to 8-bit via `>> 2`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn v210_to_luma_row(
+  packed: &[u8],
+  luma_out: &mut [u8],
+  width: usize,
+  use_simd: bool,
+  big_endian: bool,
+) {
   assert!(
     width.is_multiple_of(2),
     "v210 requires even width (4:2:2 chroma pair)"
@@ -299,36 +414,57 @@ pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_si
   );
   assert!(luma_out.len() >= width, "luma_out row too short");
 
+  macro_rules! dispatch_be { // Function-local helper: choose between an LE and a BE call at runtime.
+    ($call_le:expr, $call_be:expr) => { // First argument is the little-endian call, second the big-endian call.
+      if big_endian { $call_be } else { $call_le } // `big_endian` is the enclosing fn's parameter; only the selected call runs.
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::v210_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::neon::v210_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::neon::v210_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::v210_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::v210_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx512::v210_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::v210_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::v210_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx2::v210_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::v210_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::v210_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_sse41::v210_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::v210_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::v210_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::wasm_simd128::v210_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
@@ -336,14 +472,23 @@ pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_si
     }
   }
 
-  scalar::v210_to_luma_row(packed, luma_out, width);
+  dispatch_be!(
+    scalar::v210_to_luma_row::<false>(packed, luma_out, width),
+    scalar::v210_to_luma_row::<true>(packed, luma_out, width)
+  );
 }
 
 /// Extracts one row of native-depth `u16` luma from a packed v210
 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in
 /// its low 10 bits).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn v210_to_luma_u16_row(
+  packed: &[u8],
+  luma_out: &mut [u16],
+  width: usize,
+  use_simd: bool,
+  big_endian: bool,
+) {
   assert!(
     width.is_multiple_of(2),
     "v210 requires even width (4:2:2 chroma pair)"
@@ -354,36 +499,57 @@ pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, u
   );
   assert!(luma_out.len() >= width, "luma_out row too short");
 
+  macro_rules! dispatch_be { // Function-local helper: choose between an LE and a BE call at runtime.
+    ($call_le:expr, $call_be:expr) => { // First argument is the little-endian call, second the big-endian call.
+      if big_endian { $call_be } else { $call_le } // `big_endian` is the enclosing fn's parameter; only the selected call runs.
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::v210_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::neon::v210_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::neon::v210_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::v210_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::v210_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx512::v210_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::v210_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::v210_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx2::v210_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::v210_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::v210_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_sse41::v210_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::v210_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::v210_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::wasm_simd128::v210_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
@@ -391,7 +557,10 @@ pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, u
     }
   }
 
-  scalar::v210_to_luma_u16_row(packed, luma_out, width);
+  dispatch_be!(
+    scalar::v210_to_luma_u16_row::<false>(packed, luma_out, width),
+    scalar::v210_to_luma_u16_row::<true>(packed, luma_out, width)
+  );
 }
 
 #[cfg(all(test, feature = "std"))]
@@ -435,7 +604,7 @@ mod tests {
 
     // u8 RGB
     let mut rgb = [0u8; 6 * 3];
-    v210_to_rgb_row(&word, &mut rgb, 6, ColorMatrix::Bt709, true, false);
+    v210_to_rgb_row(&word, &mut rgb, 6, ColorMatrix::Bt709, true, false, false);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[0], px[1]);
@@ -444,7 +613,7 @@ mod tests {
 
     // u8 RGBA — alpha = 0xFF
     let mut rgba = [0u8; 6 * 4];
-    v210_to_rgba_row(&word, &mut rgba, 6, ColorMatrix::Bt709, true, false);
+    v210_to_rgba_row(&word, &mut rgba, 6, ColorMatrix::Bt709, true, false, false);
     for px in rgba.chunks(4) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[3], 0xFF);
@@ -452,7 +621,15 @@ mod tests {
 
     // u16 RGB at native 10-bit depth.
     let mut rgb_u16 = [0u16; 6 * 3];
-    v210_to_rgb_u16_row(&word, &mut rgb_u16, 6, ColorMatrix::Bt709, true, false);
+    v210_to_rgb_u16_row(
+      &word,
+      &mut rgb_u16,
+      6,
+      ColorMatrix::Bt709,
+      true,
+      false,
+      false,
+    );
     for px in rgb_u16.chunks(3) {
       assert!(px[0].abs_diff(512) <= 2);
       assert_eq!(px[0], px[1]);
@@ -461,21 +638,29 @@ mod tests {
 
     // u16 RGBA — alpha = 1023.
     let mut rgba_u16 = [0u16; 6 * 4];
-    v210_to_rgba_u16_row(&word, &mut rgba_u16, 6, ColorMatrix::Bt709, true, false);
+    v210_to_rgba_u16_row(
+      &word,
+      &mut rgba_u16,
+      6,
+      ColorMatrix::Bt709,
+      true,
+      false,
+      false,
+    );
     for px in rgba_u16.chunks(4) {
       assert_eq!(px[3], 1023);
     }
 
     // u8 luma — Y=512 → 128 after `>> 2`.
     let mut luma = [0u8; 6];
-    v210_to_luma_row(&word, &mut luma, 6, false);
+    v210_to_luma_row(&word, &mut luma, 6, false, false);
     for &y in &luma {
       assert_eq!(y, (512u16 >> 2) as u8);
     }
 
     // u16 luma — low-packed 10-bit Y.
     let mut luma_u16 = [0u16; 6];
-    v210_to_luma_u16_row(&word, &mut luma_u16, 6, false);
+    v210_to_luma_u16_row(&word, &mut luma_u16, 6, false, false);
     for &y in &luma_u16 {
       assert_eq!(y, 512);
     }
diff --git a/src/row/dispatch/y210.rs b/src/row/dispatch/y210.rs
index e9ab9eca..97bd0766 100644
--- a/src/row/dispatch/y210.rs
+++ b/src/row/dispatch/y210.rs
@@ -31,7 +31,8 @@ use crate::{
 
 /// Converts one row of Y210 to packed RGB (u8). See
 /// [`scalar::y2xx_n_to_rgb_or_rgba_row`] for sample layout / numerical
-/// contract. `use_simd = false` forces scalar.
+/// contract. `use_simd = false` forces scalar. `big_endian = true` selects
+/// the big-endian wire encoding (u16 samples stored MSB-first).
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub fn y210_to_rgb_row(
   packed: &[u16],
@@ -40,6 +41,7 @@ pub fn y210_to_rgb_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -54,36 +56,57 @@ pub fn y210_to_rgb_row(
     "rgb_out row too short"
   );
 
+  macro_rules! dispatch_be { // Function-local helper: choose between an LE and a BE call at runtime.
+    ($call_le:expr, $call_be:expr) => { // First argument is the little-endian call, second the big-endian call.
+      if big_endian { $call_be } else { $call_le } // `big_endian` is the enclosing fn's parameter; only the selected call runs.
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified at runtime.
-          unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -91,7 +114,10 @@ pub fn y210_to_rgb_row(
     }
   }
 
-  scalar::y210_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y210_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range),
+    scalar::y210_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range)
+  );
 }
 
 /// Converts one row of Y210 to packed RGBA (u8) with `α = 0xFF`.
@@ -103,6 +129,7 @@ pub fn y210_to_rgba_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -117,36 +144,57 @@ pub fn y210_to_rgba_row(
     "rgba_out row too short"
   );
 
+  macro_rules! dispatch_be { // Function-local helper: choose between an LE and a BE call at runtime.
+    ($call_le:expr, $call_be:expr) => { // First argument is the little-endian call, second the big-endian call.
+      if big_endian { $call_be } else { $call_le } // `big_endian` is the enclosing fn's parameter; only the selected call runs.
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -154,7 +202,10 @@ pub fn y210_to_rgba_row(
     }
   }
 
-  scalar::y210_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y210_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range),
+    scalar::y210_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range)
+  );
 }
 
 /// Converts one row of Y210 to packed `u16` RGB at native 10-bit
@@ -167,6 +218,7 @@ pub fn y210_to_rgb_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -181,36 +233,57 @@ pub fn y210_to_rgb_u16_row(
     "rgb_out row too short"
   );
 
+  macro_rules! dispatch_be { // Function-local helper: choose between an LE and a BE call at runtime.
+    ($call_le:expr, $call_be:expr) => { // First argument is the little-endian call, second the big-endian call.
+      if big_endian { $call_be } else { $call_le } // `big_endian` is the enclosing fn's parameter; only the selected call runs.
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -218,7 +291,14 @@ pub fn y210_to_rgb_u16_row(
     }
   }
 
-  scalar::y210_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y210_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      packed, rgb_out, width, matrix, full_range
+    ),
+    scalar::y210_to_rgb_u16_or_rgba_u16_row::<false, true>(
+      packed, rgb_out, width, matrix, full_range
+    )
+  );
 }
 
 /// Converts one row of Y210 to packed `u16` RGBA at native 10-bit
@@ -231,6 +311,7 @@ pub fn y210_to_rgba_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -245,36 +326,57 @@ pub fn y210_to_rgba_u16_row(
     "rgba_out row too short"
   );
 
+  macro_rules! dispatch_be { // Function-local helper: choose between an LE and a BE call at runtime.
+    ($call_le:expr, $call_be:expr) => { // First argument is the little-endian call, second the big-endian call.
+      if big_endian { $call_be } else { $call_le } // `big_endian` is the enclosing fn's parameter; only the selected call runs.
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -282,13 +384,26 @@ pub fn y210_to_rgba_u16_row(
     }
   }
 
-  scalar::y210_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y210_to_rgb_u16_or_rgba_u16_row::<true, false>(
+      packed, rgba_out, width, matrix, full_range
+    ),
+    scalar::y210_to_rgb_u16_or_rgba_u16_row::<true, true>(
+      packed, rgba_out, width, matrix, full_range
+    )
+  );
 }
 
 /// Extracts one row of 8-bit luma from a packed Y210 buffer.
 /// Y values are downshifted from 10-bit to 8-bit via `>> 2`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn y210_to_luma_row(
+  packed: &[u16],
+  luma_out: &mut [u8],
+  width: usize,
+  use_simd: bool,
+  big_endian: bool,
+) {
   assert!(
     width.is_multiple_of(2),
     "Y210 requires even width (4:2:2 chroma pair)"
@@ -299,36 +414,57 @@ pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s
   );
   assert!(luma_out.len() >= width, "luma_out row too short");
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_luma_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::neon::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
@@ -336,14 +472,23 @@ pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s
     }
   }
 
-  scalar::y210_to_luma_row(packed, luma_out, width);
+  dispatch_be!(
+    scalar::y210_to_luma_row::<false>(packed, luma_out, width),
+    scalar::y210_to_luma_row::<true>(packed, luma_out, width)
+  );
 }
 
 /// Extracts one row of native-depth `u16` luma from a packed Y210
 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in
 /// its low 10 bits).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn y210_to_luma_u16_row(
+  packed: &[u16],
+  luma_out: &mut [u16],
+  width: usize,
+  use_simd: bool,
+  big_endian: bool,
+) {
   assert!(
     width.is_multiple_of(2),
     "Y210 requires even width (4:2:2 chroma pair)"
@@ -354,36 +499,57 @@ pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize,
   );
   assert!(luma_out.len() >= width, "luma_out row too short");
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
@@ -391,7 +557,10 @@ pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize,
     }
   }
 
-  scalar::y210_to_luma_u16_row(packed, luma_out, width);
+  dispatch_be!(
+    scalar::y210_to_luma_u16_row::<false>(packed, luma_out, width),
+    scalar::y210_to_luma_u16_row::<true>(packed, luma_out, width)
+  );
 }
 
 #[cfg(all(test, feature = "std"))]
@@ -433,7 +602,7 @@ mod tests {
 
     // u8 RGB
     let mut rgb = [0u8; 8 * 3];
-    y210_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false);
+    y210_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[0], px[1]);
@@ -442,7 +611,7 @@ mod tests {
 
     // u8 RGBA — alpha = 0xFF
     let mut rgba = [0u8; 8 * 4];
-    y210_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false);
+    y210_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false);
     for px in rgba.chunks(4) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[3], 0xFF);
@@ -450,7 +619,15 @@ mod tests {
 
     // u16 RGB at native 10-bit depth.
     let mut rgb_u16 = [0u16; 8 * 3];
-    y210_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false);
+    y210_to_rgb_u16_row(
+      &buf,
+      &mut rgb_u16,
+      8,
+      ColorMatrix::Bt709,
+      true,
+      false,
+      false,
+    );
     for px in rgb_u16.chunks(3) {
       assert!(px[0].abs_diff(512) <= 2);
       assert_eq!(px[0], px[1]);
@@ -459,21 +636,29 @@ mod tests {
 
     // u16 RGBA — alpha = 1023.
     let mut rgba_u16 = [0u16; 8 * 4];
-    y210_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false);
+    y210_to_rgba_u16_row(
+      &buf,
+      &mut rgba_u16,
+      8,
+      ColorMatrix::Bt709,
+      true,
+      false,
+      false,
+    );
     for px in rgba_u16.chunks(4) {
       assert_eq!(px[3], 1023);
     }
 
     // u8 luma — Y=512 → 128 after `>> 2`.
     let mut luma = [0u8; 8];
-    y210_to_luma_row(&buf, &mut luma, 8, false);
+    y210_to_luma_row(&buf, &mut luma, 8, false, false);
     for &y in &luma {
       assert_eq!(y, (512u16 >> 2) as u8);
     }
 
     // u16 luma — low-packed 10-bit Y.
     let mut luma_u16 = [0u16; 8];
-    y210_to_luma_u16_row(&buf, &mut luma_u16, 8, false);
+    y210_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false);
     for &y in &luma_u16 {
       assert_eq!(y, 512);
     }
diff --git a/src/row/dispatch/y212.rs b/src/row/dispatch/y212.rs
index aa253721..2245c50e 100644
--- a/src/row/dispatch/y212.rs
+++ b/src/row/dispatch/y212.rs
@@ -31,7 +31,8 @@ use crate::{
 
 /// Converts one row of Y212 to packed RGB (u8). See
 /// [`scalar::y2xx_n_to_rgb_or_rgba_row`] for sample layout / numerical
-/// contract. `use_simd = false` forces scalar.
+/// contract. `use_simd = false` forces scalar. `big_endian = true` selects
+/// the big-endian wire encoding (u16 samples stored MSB-first).
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub fn y212_to_rgb_row(
   packed: &[u16],
@@ -40,6 +41,7 @@ pub fn y212_to_rgb_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -54,36 +56,57 @@ pub fn y212_to_rgb_row(
     "rgb_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified at runtime.
-          unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -91,7 +114,10 @@ pub fn y212_to_rgb_row(
     }
   }
 
-  scalar::y212_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y212_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range),
+    scalar::y212_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range)
+  );
 }
 
 /// Converts one row of Y212 to packed RGBA (u8) with `α = 0xFF`.
@@ -103,6 +129,7 @@ pub fn y212_to_rgba_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -117,36 +144,57 @@ pub fn y212_to_rgba_row(
     "rgba_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -154,7 +202,10 @@ pub fn y212_to_rgba_row(
     }
   }
 
-  scalar::y212_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y212_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range),
+    scalar::y212_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range)
+  );
 }
 
 /// Converts one row of Y212 to packed `u16` RGB at native 12-bit
@@ -167,6 +218,7 @@ pub fn y212_to_rgb_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -181,36 +233,57 @@ pub fn y212_to_rgb_u16_row(
     "rgb_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -218,7 +291,14 @@ pub fn y212_to_rgb_u16_row(
     }
   }
 
-  scalar::y212_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y212_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      packed, rgb_out, width, matrix, full_range
+    ),
+    scalar::y212_to_rgb_u16_or_rgba_u16_row::<false, true>(
+      packed, rgb_out, width, matrix, full_range
+    )
+  );
 }
 
 /// Converts one row of Y212 to packed `u16` RGBA at native 12-bit
@@ -231,6 +311,7 @@ pub fn y212_to_rgba_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -245,36 +326,57 @@ pub fn y212_to_rgba_u16_row(
     "rgba_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -282,13 +384,26 @@ pub fn y212_to_rgba_u16_row(
     }
   }
 
-  scalar::y212_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y212_to_rgb_u16_or_rgba_u16_row::<true, false>(
+      packed, rgba_out, width, matrix, full_range
+    ),
+    scalar::y212_to_rgb_u16_or_rgba_u16_row::<true, true>(
+      packed, rgba_out, width, matrix, full_range
+    )
+  );
 }
 
 /// Extracts one row of 8-bit luma from a packed Y212 buffer.
 /// Y values are downshifted from 12-bit to 8-bit via `>> 4`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn y212_to_luma_row(
+  packed: &[u16],
+  luma_out: &mut [u8],
+  width: usize,
+  use_simd: bool,
+  big_endian: bool,
+) {
   assert!(
     width.is_multiple_of(2),
     "Y212 requires even width (4:2:2 chroma pair)"
@@ -299,36 +414,57 @@ pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s
   );
   assert!(luma_out.len() >= width, "luma_out row too short");
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_luma_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::neon::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
@@ -336,14 +472,23 @@ pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s
     }
   }
 
-  scalar::y212_to_luma_row(packed, luma_out, width);
+  dispatch_be!(
+    scalar::y212_to_luma_row::<false>(packed, luma_out, width),
+    scalar::y212_to_luma_row::<true>(packed, luma_out, width)
+  );
 }
 
 /// Extracts one row of native-depth `u16` luma from a packed Y212
 /// buffer (low-bit-packed: each `u16` carries the 12-bit Y value in
 /// its low 12 bits).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn y212_to_luma_u16_row(
+  packed: &[u16],
+  luma_out: &mut [u16],
+  width: usize,
+  use_simd: bool,
+  big_endian: bool,
+) {
   assert!(
     width.is_multiple_of(2),
     "Y212 requires even width (4:2:2 chroma pair)"
@@ -354,36 +499,57 @@ pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize,
   );
   assert!(luma_out.len() >= width, "luma_out row too short");
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); },
+            unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); }
+          );
           return;
         }
       },
@@ -391,7 +557,10 @@ pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize,
     }
   }
 
-  scalar::y212_to_luma_u16_row(packed, luma_out, width);
+  dispatch_be!(
+    scalar::y212_to_luma_u16_row::<false>(packed, luma_out, width),
+    scalar::y212_to_luma_u16_row::<true>(packed, luma_out, width)
+  );
 }
 
 #[cfg(all(test, feature = "std"))]
@@ -433,7 +602,7 @@ mod tests {
 
     // u8 RGB
     let mut rgb = [0u8; 8 * 3];
-    y212_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false);
+    y212_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[0], px[1]);
@@ -442,7 +611,7 @@ mod tests {
 
     // u8 RGBA — alpha = 0xFF
     let mut rgba = [0u8; 8 * 4];
-    y212_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false);
+    y212_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false);
     for px in rgba.chunks(4) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[3], 0xFF);
@@ -450,7 +619,15 @@ mod tests {
 
     // u16 RGB at native 12-bit depth.
     let mut rgb_u16 = [0u16; 8 * 3];
-    y212_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false);
+    y212_to_rgb_u16_row(
+      &buf,
+      &mut rgb_u16,
+      8,
+      ColorMatrix::Bt709,
+      true,
+      false,
+      false,
+    );
     for px in rgb_u16.chunks(3) {
       assert!(px[0].abs_diff(2048) <= 2);
       assert_eq!(px[0], px[1]);
@@ -459,21 +636,29 @@ mod tests {
 
     // u16 RGBA — alpha = 4095.
     let mut rgba_u16 = [0u16; 8 * 4];
-    y212_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false);
+    y212_to_rgba_u16_row(
+      &buf,
+      &mut rgba_u16,
+      8,
+      ColorMatrix::Bt709,
+      true,
+      false,
+      false,
+    );
     for px in rgba_u16.chunks(4) {
       assert_eq!(px[3], 4095);
     }
 
     // u8 luma — Y=2048 → 128 after `>> 4`.
     let mut luma = [0u8; 8];
-    y212_to_luma_row(&buf, &mut luma, 8, false);
+    y212_to_luma_row(&buf, &mut luma, 8, false, false);
     for &y in &luma {
       assert_eq!(y, (2048u16 >> 4) as u8);
     }
 
     // u16 luma — low-packed 12-bit Y.
     let mut luma_u16 = [0u16; 8];
-    y212_to_luma_u16_row(&buf, &mut luma_u16, 8, false);
+    y212_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false);
     for &y in &luma_u16 {
       assert_eq!(y, 2048);
     }
diff --git a/src/row/dispatch/y216.rs b/src/row/dispatch/y216.rs
index 9f0fc6de..541022c7 100644
--- a/src/row/dispatch/y216.rs
+++ b/src/row/dispatch/y216.rs
@@ -30,7 +30,8 @@ use crate::{
 
 /// Converts one row of Y216 to packed RGB (u8). See
 /// [`scalar::y216_to_rgb_or_rgba_row`] for sample layout / numerical
-/// contract. `use_simd = false` forces scalar.
+/// contract. `use_simd = false` forces scalar. `big_endian = true` selects
+/// the big-endian wire encoding (u16 samples stored MSB-first).
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub fn y216_to_rgb_row(
   packed: &[u16],
@@ -39,6 +40,7 @@ pub fn y216_to_rgb_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -53,36 +55,57 @@ pub fn y216_to_rgb_row(
     "rgb_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified at runtime.
-          unsafe { arch::neon::y216_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y216_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::neon::y216_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -90,7 +113,10 @@ pub fn y216_to_rgb_row(
     }
   }
 
-  scalar::y216_to_rgb_or_rgba_row::<false>(packed, rgb_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y216_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range),
+    scalar::y216_to_rgb_or_rgba_row::<false, true>(packed, rgb_out, width, matrix, full_range)
+  );
 }
 
 /// Converts one row of Y216 to packed RGBA (u8) with `α = 0xFF`.
@@ -102,6 +128,7 @@ pub fn y216_to_rgba_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -116,36 +143,57 @@ pub fn y216_to_rgba_row(
     "rgba_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y216_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y216_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::neon::y216_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -153,7 +201,10 @@ pub fn y216_to_rgba_row(
     }
   }
 
-  scalar::y216_to_rgb_or_rgba_row::<true>(packed, rgba_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y216_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range),
+    scalar::y216_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range)
+  );
 }
 
 /// Converts one row of Y216 to packed `u16` RGB at native 16-bit
@@ -166,6 +217,7 @@ pub fn y216_to_rgb_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -180,36 +232,57 @@ pub fn y216_to_rgb_u16_row(
     "rgb_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(packed, rgb_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::<false, true>(packed, rgb_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -217,7 +290,14 @@ pub fn y216_to_rgb_u16_row(
     }
   }
 
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<false>(packed, rgb_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      packed, rgb_out, width, matrix, full_range
+    ),
+    scalar::y216_to_rgb_u16_or_rgba_u16_row::<false, true>(
+      packed, rgb_out, width, matrix, full_range
+    )
+  );
 }
 
 /// Converts one row of Y216 to packed `u16` RGBA at native 16-bit
@@ -230,6 +310,7 @@ pub fn y216_to_rgba_u16_row(
   matrix: ColorMatrix,
   full_range: bool,
   use_simd: bool,
+  big_endian: bool,
 ) {
   assert!(
     width.is_multiple_of(2),
@@ -244,36 +325,57 @@ pub fn y216_to_rgba_u16_row(
     "rgba_out row too short"
   );
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::<true, false>(packed, rgba_out, width, matrix, full_range); },
+            unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::<true, true>(packed, rgba_out, width, matrix, full_range); }
+          );
           return;
         }
       },
@@ -281,13 +383,26 @@ pub fn y216_to_rgba_u16_row(
     }
   }
 
-  scalar::y216_to_rgb_u16_or_rgba_u16_row::<true>(packed, rgba_out, width, matrix, full_range);
+  dispatch_be!(
+    scalar::y216_to_rgb_u16_or_rgba_u16_row::<true, false>(
+      packed, rgba_out, width, matrix, full_range
+    ),
+    scalar::y216_to_rgb_u16_or_rgba_u16_row::<true, true>(
+      packed, rgba_out, width, matrix, full_range
+    )
+  );
 }
 
 /// Extracts one row of 8-bit luma from a packed Y216 buffer.
 /// Y values are downshifted from 16-bit to 8-bit via `>> 8`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn y216_to_luma_row(
+  packed: &[u16],
+  luma_out: &mut [u8],
+  width: usize,
+  use_simd: bool,
+  big_endian: bool,
+) {
   assert!(
     width.is_multiple_of(2),
     "Y216 requires even width (4:2:2 chroma pair)"
@@ -298,36 +413,57 @@ pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s
   );
   assert!(luma_out.len() >= width, "luma_out row too short");
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y216_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::neon::y216_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::neon::y216_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y216_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y216_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx512::y216_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y216_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y216_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx2::y216_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y216_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y216_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_sse41::y216_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y216_to_luma_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y216_to_luma_row::<false>(packed, luma_out, width); },
+            unsafe { arch::wasm_simd128::y216_to_luma_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
@@ -335,13 +471,22 @@ pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s
     }
   }
 
-  scalar::y216_to_luma_row(packed, luma_out, width);
+  dispatch_be!(
+    scalar::y216_to_luma_row::<false>(packed, luma_out, width),
+    scalar::y216_to_luma_row::<true>(packed, luma_out, width)
+  );
 }
 
 /// Extracts one row of native-depth `u16` luma from a packed Y216
 /// buffer (full-range: each `u16` carries the 16-bit Y value directly).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn y216_to_luma_u16_row(
+  packed: &[u16],
+  luma_out: &mut [u16],
+  width: usize,
+  use_simd: bool,
+  big_endian: bool,
+) {
   assert!(
     width.is_multiple_of(2),
     "Y216 requires even width (4:2:2 chroma pair)"
@@ -352,36 +497,57 @@ pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize,
   );
   assert!(luma_out.len() >= width, "luma_out row too short");
 
+  macro_rules! dispatch_be {
+    ($call_le:expr, $call_be:expr) => {
+      if big_endian { $call_be } else { $call_le }
+    };
+  }
+
   if use_simd {
     cfg_select! {
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified.
-          unsafe { arch::neon::y216_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::neon::y216_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::neon::y216_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified.
-          unsafe { arch::x86_avx512::y216_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx512::y216_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx512::y216_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified.
-          unsafe { arch::x86_avx2::y216_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_avx2::y216_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_avx2::y216_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified.
-          unsafe { arch::x86_sse41::y216_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::x86_sse41::y216_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::x86_sse41::y216_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time verified.
-          unsafe { arch::wasm_simd128::y216_to_luma_u16_row(packed, luma_out, width); }
+          dispatch_be!(
+            unsafe { arch::wasm_simd128::y216_to_luma_u16_row::<false>(packed, luma_out, width); },
+            unsafe { arch::wasm_simd128::y216_to_luma_u16_row::<true>(packed, luma_out, width); }
+          );
           return;
         }
       },
@@ -389,7 +555,10 @@ pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize,
     }
   }
 
-  scalar::y216_to_luma_u16_row(packed, luma_out, width);
+  dispatch_be!(
+    scalar::y216_to_luma_u16_row::<false>(packed, luma_out, width),
+    scalar::y216_to_luma_u16_row::<true>(packed, luma_out, width)
+  );
 }
 
 #[cfg(all(test, feature = "std"))]
@@ -431,7 +600,7 @@ mod tests {
 
     // u8 RGB
     let mut rgb = [0u8; 8 * 3];
-    y216_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false);
+    y216_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[0], px[1]);
@@ -440,7 +609,7 @@ mod tests {
 
     // u8 RGBA — alpha = 0xFF
     let mut rgba = [0u8; 8 * 4];
-    y216_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false);
+    y216_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false);
     for px in rgba.chunks(4) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[3], 0xFF);
@@ -448,7 +617,15 @@ mod tests {
 
     // u16 RGB at native 16-bit depth.
     let mut rgb_u16 = [0u16; 8 * 3];
-    y216_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false);
+    y216_to_rgb_u16_row(
+      &buf,
+      &mut rgb_u16,
+      8,
+      ColorMatrix::Bt709,
+      true,
+      false,
+      false,
+    );
     for px in rgb_u16.chunks(3) {
       assert!(px[0].abs_diff(32768) <= 4);
       assert_eq!(px[0], px[1]);
@@ -457,21 +634,29 @@ mod tests {
 
     // u16 RGBA — alpha = 0xFFFF.
     let mut rgba_u16 = [0u16; 8 * 4];
-    y216_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false);
+    y216_to_rgba_u16_row(
+      &buf,
+      &mut rgba_u16,
+      8,
+      ColorMatrix::Bt709,
+      true,
+      false,
+      false,
+    );
     for px in rgba_u16.chunks(4) {
       assert_eq!(px[3], 0xFFFF);
     }
 
     // u8 luma — Y=32768 → 128 after `>> 8`.
     let mut luma = [0u8; 8];
-    y216_to_luma_row(&buf, &mut luma, 8, false);
+    y216_to_luma_row(&buf, &mut luma, 8, false, false);
     for &y in &luma {
       assert_eq!(y, (32768u16 >> 8) as u8);
     }
 
     // u16 luma — full 16-bit Y value.
     let mut luma_u16 = [0u16; 8];
-    y216_to_luma_u16_row(&buf, &mut luma_u16, 8, false);
+    y216_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false);
     for &y in &luma_u16 {
       assert_eq!(y, 32768);
     }
@@ -483,7 +668,7 @@ mod tests {
     // packed buffer has only 2 elements for width=4 (needs 8).
     let packed = [0u16; 2];
     let mut rgb = [0u8; 4 * 3];
-    y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false);
+    y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false);
   }
 
   #[test]
@@ -492,7 +677,7 @@ mod tests {
     // output buffer has only 2 bytes for width=4 (needs 12).
     let packed = [0u16; 8];
     let mut rgb = [0u8; 2];
-    y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false);
+    y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false);
   }
 
   #[test]
@@ -500,7 +685,7 @@ mod tests {
   fn y216_dispatcher_rejects_odd_width() {
     let packed = [0u16; 6];
     let mut rgb = [0u8; 9];
-    y216_to_rgb_row(&packed, &mut rgb, 3, ColorMatrix::Bt709, true, false);
+    y216_to_rgb_row(&packed, &mut rgb, 3, ColorMatrix::Bt709, true, false, false);
   }
 
   #[test]
@@ -521,6 +706,7 @@ mod tests {
       ColorMatrix::Bt709,
       true,
       false,
+      false,
     );
   }
 }
diff --git a/src/row/mod.rs b/src/row/mod.rs
index 297f1c3c..93feab94 100644
--- a/src/row/mod.rs
+++ b/src/row/mod.rs
@@ -928,7 +928,15 @@ mod overflow_tests {
     let candidate = ((usize::MAX / 16) + 1) * 6;
     let p: [u8; 0] = [];
     let mut rgb: [u8; 0] = [];
-    v210_to_rgb_row(&p, &mut rgb, candidate, ColorMatrix::Bt601, true, false);
+    v210_to_rgb_row(
+      &p,
+      &mut rgb,
+      candidate,
+      ColorMatrix::Bt601,
+      true,
+      false,
+      false,
+    );
   }
 
   // ---- Y2xx dispatcher — `width × 2` overflow ----
@@ -958,6 +966,7 @@ mod overflow_tests {
       ColorMatrix::Bt601,
       true,
       false,
+      false,
     );
   }
 }
diff --git a/src/row/scalar/mod.rs b/src/row/scalar/mod.rs
index b0e390ee..5c8cb66c 100644
--- a/src/row/scalar/mod.rs
+++ b/src/row/scalar/mod.rs
@@ -123,6 +123,59 @@ pub(crate) use yuv_planar_high_bit::*;
 
 // ---- Shared scalar helpers (used across all conversion families) -------
 
+/// Reads one `u16` from the byte address `ptr`, decoding it with the
+/// byte order selected by `BE`: `BE = false` → little-endian (native
+/// v210/Y2xx on-wire format); `BE = true` → big-endian. The unused
+/// branch is eliminated by the compiler when the caller is monomorphized.
+///
+/// **Target-endian aware** — `u16::from_be_bytes` / `u16::from_le_bytes`
+/// interpret the two bytes by their stated order on every host, so a
+/// byte swap happens only when the source order differs from the host
+/// CPU's native order. This matches the SIMD `load_endian_u16x*`
+/// helpers and deliberately avoids a naive `if BE { x.swap_bytes() }`
+/// on a native load, which assumes an LE host and would corrupt rows
+/// on s390x / other BE hosts. See
+/// `fix(be-tier10b): make scalar BE conversion target-endian aware`
+/// for the codex finding that motivated this contract crate-wide.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 2 readable bytes. No alignment is
+/// required: the bytes are read one at a time through `*const u8`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(super) unsafe fn load_endian_u16<const BE: bool>(ptr: *const u8) -> u16 {
+  let bytes = unsafe { [*ptr, *ptr.add(1)] };
+  if BE {
+    u16::from_be_bytes(bytes)
+  } else {
+    u16::from_le_bytes(bytes)
+  }
+}
+
+/// Reads one `u32` from the byte address `ptr`, decoding it with the
+/// byte order selected by `BE`: `BE = false` → little-endian;
+/// `BE = true` → big-endian. The unused branch is eliminated by the
+/// compiler when the caller is monomorphized.
+///
+/// **Target-endian aware** — `u32::from_be_bytes` / `u32::from_le_bytes`
+/// swap bytes only when the source order differs from the host CPU's
+/// native order, matching the SIMD `load_endian_u32x*` helpers. See
+/// [`load_endian_u16`] for the full target-endian contract.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 4 readable bytes. No alignment is
+/// required: the bytes are read one at a time through `*const u8`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(super) unsafe fn load_endian_u32<const BE: bool>(ptr: *const u8) -> u32 {
+  let bytes = unsafe { [*ptr, *ptr.add(1), *ptr.add(2), *ptr.add(3)] };
+  if BE {
+    u32::from_be_bytes(bytes)
+  } else {
+    u32::from_le_bytes(bytes)
+  }
+}
+
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(super) fn clamp_u8(v: i32) -> u8 {
   v.clamp(0, 255) as u8
diff --git a/src/row/scalar/v210.rs b/src/row/scalar/v210.rs
index 00a4e029..1b9db248 100644
--- a/src/row/scalar/v210.rs
+++ b/src/row/scalar/v210.rs
@@ -10,6 +10,15 @@
 //!   word 2: `[Cr1, Y3, Cb2]`
 //!   word 3: `[Y4,  Cr2, Y5]`
 //!
+//! ## Big-endian wire format (`BE = true`)
+//!
+//! When `BE = true`, each 32-bit word in the packed stream is
+//! stored in big-endian byte order. `load_endian_u32::<BE>` handles
+//! the conditional byte-swap at each u32 load site inside
+//! `unpack_v210_word`; the `BE = false` path is identical to the
+//! previous `u32::from_le_bytes` decode. The unused branch is
+//! eliminated at monomorphization.
+//!
 //! ## Partial-word support
 //!
 //! Real captures (e.g. 720p = 1280 wide) commonly end on a partial
@@ -32,14 +41,16 @@ use super::*;
 
 /// Extracts 6 Y + 3 U + 3 V 10-bit samples from one 16-byte v210
 /// word. Output samples are 10-bit values in the low 10 bits of
-/// each `u16`.
+/// each `u16`. `BE = true` reads each 32-bit word in big-endian
+/// byte order.
 #[cfg_attr(not(tarpaulin), inline(always))]
-fn unpack_v210_word(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) {
+fn unpack_v210_word<const BE: bool>(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) {
   debug_assert_eq!(word.len(), 16);
-  let w0 = u32::from_le_bytes([word[0], word[1], word[2], word[3]]);
-  let w1 = u32::from_le_bytes([word[4], word[5], word[6], word[7]]);
-  let w2 = u32::from_le_bytes([word[8], word[9], word[10], word[11]]);
-  let w3 = u32::from_le_bytes([word[12], word[13], word[14], word[15]]);
+  // SAFETY: relies on callers passing 16 bytes (only debug-asserted above); each 4-byte load starts at offset ≤ 12.
+  let w0 = unsafe { load_endian_u32::<BE>(word.as_ptr()) };
+  let w1 = unsafe { load_endian_u32::<BE>(word.as_ptr().add(4)) };
+  let w2 = unsafe { load_endian_u32::<BE>(word.as_ptr().add(8)) };
+  let w3 = unsafe { load_endian_u32::<BE>(word.as_ptr().add(12)) };
 
   // Word 0: [Cb0, Y0, Cr0]
   let cb0 = (w0 & 0x3FF) as u16;
@@ -70,14 +81,14 @@ fn unpack_v210_word(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) {
 ///
 /// Supports any **even** `width`: complete 6-px words run the full
 /// loop; a final partial word emits 2 or 4 pixels from its valid
-/// chroma-pair prefix.
+/// chroma-pair prefix. `BE = true` selects big-endian u32 word decoding.
 ///
 /// # Panics (debug builds)
 /// - `width` must be even.
 /// - `packed.len() >= ceil(width / 6) * 16`.
 /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) fn v210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u8],
   width: usize,
@@ -101,7 +112,7 @@ pub(crate) fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
 
   for w in 0..full_words {
     let word = &packed[w * 16..w * 16 + 16];
-    let (ys, us, vs) = unpack_v210_word(word);
+    let (ys, us, vs) = unpack_v210_word::<BE>(word);
 
     // 6 pixels per word; each chroma pair (U[i], V[i]) covers
     // Y[2i] and Y[2i+1].
@@ -135,7 +146,7 @@ pub(crate) fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
     // pairs are valid (1 pair for 2 px; 2 pairs for 4 px).
     let w = full_words;
     let word = &packed[w * 16..w * 16 + 16];
-    let (ys, us, vs) = unpack_v210_word(word);
+    let (ys, us, vs) = unpack_v210_word::<BE>(word);
     let pairs = tail_pixels / 2;
     for i in 0..pairs {
       let u_d = q15_scale(us[i] as i32 - bias, c_scale);
@@ -172,14 +183,15 @@ pub(crate) fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// `(1 << 10) - 1 = 1023` (opaque maximum at 10-bit).
 ///
 /// Supports any **even** `width`: see [`v210_to_rgb_or_rgba_row`]
-/// for partial-word semantics.
+/// for partial-word semantics. `BE = true` selects big-endian u32 word
+/// decoding.
 ///
 /// # Panics (debug builds)
 /// - `width` must be even.
 /// - `packed.len() >= ceil(width / 6) * 16`.
 /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u8],
   out: &mut [u16],
   width: usize,
@@ -204,7 +216,7 @@ pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
   for w in 0..full_words {
     let word = &packed[w * 16..w * 16 + 16];
-    let (ys, us, vs) = unpack_v210_word(word);
+    let (ys, us, vs) = unpack_v210_word::<BE>(word);
 
     for i in 0..3 {
       let u_d = q15_scale(us[i] as i32 - bias, c_scale);
@@ -232,7 +244,7 @@ pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
   if tail_pixels > 0 {
     let w = full_words;
     let word = &packed[w * 16..w * 16 + 16];
-    let (ys, us, vs) = unpack_v210_word(word);
+    let (ys, us, vs) = unpack_v210_word::<BE>(word);
     let pairs = tail_pixels / 2;
     for i in 0..pairs {
       let u_d = q15_scale(us[i] as i32 - bias, c_scale);
@@ -262,13 +274,14 @@ pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
 /// Scalar v210 → 8-bit luma. Y values are downshifted from 10-bit
 /// to 8-bit via `>> 2`. Bypasses the YUV → RGB pipeline entirely.
+/// `BE = true` selects big-endian u32 word decoding.
 ///
 /// # Panics (debug builds)
 /// - `width` must be even.
 /// - `packed.len() >= ceil(width / 6) * 16`.
 /// - `luma_out.len() >= width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
+pub(crate) fn v210_to_luma_row<const BE: bool>(packed: &[u8], luma_out: &mut [u8], width: usize) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   debug_assert!(packed.len() >= total_words * 16, "packed row too short");
@@ -279,7 +292,7 @@ pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize)
 
   for w in 0..full_words {
     let word = &packed[w * 16..w * 16 + 16];
-    let (ys, _, _) = unpack_v210_word(word);
+    let (ys, _, _) = unpack_v210_word::<BE>(word);
     for k in 0..6 {
       luma_out[w * 6 + k] = (ys[k] >> 2) as u8;
     }
@@ -287,7 +300,7 @@ pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize)
   if tail_pixels > 0 {
     let w = full_words;
     let word = &packed[w * 16..w * 16 + 16];
-    let (ys, _, _) = unpack_v210_word(word);
+    let (ys, _, _) = unpack_v210_word::<BE>(word);
     for k in 0..tail_pixels {
       luma_out[w * 6 + k] = (ys[k] >> 2) as u8;
     }
@@ -296,14 +309,19 @@ pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize)
 
 /// Scalar v210 → native-depth `u16` luma (low-bit-packed). Each
 /// output `u16` carries the source's 10-bit Y value in its low 10
-/// bits (upper 6 bits zero).
+/// bits (upper 6 bits zero). `BE = true` selects big-endian u32 word
+/// decoding.
 ///
 /// # Panics (debug builds)
 /// - `width` must be even.
 /// - `packed.len() >= ceil(width / 6) * 16`.
 /// - `luma_out.len() >= width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
+pub(crate) fn v210_to_luma_u16_row<const BE: bool>(
+  packed: &[u8],
+  luma_out: &mut [u16],
+  width: usize,
+) {
   debug_assert!(width.is_multiple_of(2), "v210 requires even width");
   let total_words = width.div_ceil(6);
   debug_assert!(packed.len() >= total_words * 16, "packed row too short");
@@ -314,13 +332,13 @@ pub(crate) fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: u
 
   for w in 0..full_words {
     let word = &packed[w * 16..w * 16 + 16];
-    let (ys, _, _) = unpack_v210_word(word);
+    let (ys, _, _) = unpack_v210_word::<BE>(word);
     luma_out[w * 6..w * 6 + 6].copy_from_slice(&ys);
   }
   if tail_pixels > 0 {
     let w = full_words;
     let word = &packed[w * 16..w * 16 + 16];
-    let (ys, _, _) = unpack_v210_word(word);
+    let (ys, _, _) = unpack_v210_word::<BE>(word);
     luma_out[w * 6..w * 6 + tail_pixels].copy_from_slice(&ys[..tail_pixels]);
   }
 }
@@ -358,12 +376,34 @@ mod tests {
     out
   }
 
+  /// Pack a v210 word using big-endian u32 encoding (each 32-bit word stored BE).
+  fn pack_v210_word_be(samples: [u16; 12]) -> [u8; 16] {
+    let mut out = [0u8; 16];
+    let w0 = (samples[0] as u32 & 0x3FF)
+      | ((samples[1] as u32 & 0x3FF) << 10)
+      | ((samples[2] as u32 & 0x3FF) << 20);
+    let w1 = (samples[3] as u32 & 0x3FF)
+      | ((samples[4] as u32 & 0x3FF) << 10)
+      | ((samples[5] as u32 & 0x3FF) << 20);
+    let w2 = (samples[6] as u32 & 0x3FF)
+      | ((samples[7] as u32 & 0x3FF) << 10)
+      | ((samples[8] as u32 & 0x3FF) << 20);
+    let w3 = (samples[9] as u32 & 0x3FF)
+      | ((samples[10] as u32 & 0x3FF) << 10)
+      | ((samples[11] as u32 & 0x3FF) << 20);
+    out[0..4].copy_from_slice(&w0.to_be_bytes());
+    out[4..8].copy_from_slice(&w1.to_be_bytes());
+    out[8..12].copy_from_slice(&w2.to_be_bytes());
+    out[12..16].copy_from_slice(&w3.to_be_bytes());
+    out
+  }
+
   #[test]
   fn scalar_v210_to_rgb_gray_is_gray() {
     // Full-range gray: Y=512, U=V=512 (10-bit center).
     let word = pack_v210_word([512; 12]);
     let mut rgb = [0u8; 6 * 3];
-    v210_to_rgb_or_rgba_row::<false>(&word, &mut rgb, 6, ColorMatrix::Bt709, true);
+    v210_to_rgb_or_rgba_row::<false, false>(&word, &mut rgb, 6, ColorMatrix::Bt709, true);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[0], px[1]);
@@ -375,7 +415,7 @@ mod tests {
   fn scalar_v210_to_rgba_gray_is_gray_with_opaque_alpha() {
     let word = pack_v210_word([512; 12]);
     let mut rgba = [0u8; 6 * 4];
-    v210_to_rgb_or_rgba_row::<true>(&word, &mut rgba, 6, ColorMatrix::Bt709, true);
+    v210_to_rgb_or_rgba_row::<true, false>(&word, &mut rgba, 6, ColorMatrix::Bt709, true);
     for px in rgba.chunks(4) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[3], 0xFF);
@@ -387,7 +427,13 @@ mod tests {
     // Full-range gray Y=512 → ~512 in 10-bit RGB out (out_max = 1023).
     let word = pack_v210_word([512; 12]);
     let mut rgb_u16 = [0u16; 6 * 3];
-    v210_to_rgb_u16_or_rgba_u16_row::<false>(&word, &mut rgb_u16, 6, ColorMatrix::Bt709, true);
+    v210_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      &word,
+      &mut rgb_u16,
+      6,
+      ColorMatrix::Bt709,
+      true,
+    );
     for px in rgb_u16.chunks(3) {
       // Gray luma at 512 / full-range produces RGB ~512 in 10-bit.
       assert!(px[0].abs_diff(512) <= 2);
@@ -400,7 +446,13 @@ mod tests {
   fn scalar_v210_to_rgba_u16_alpha_is_max() {
     let word = pack_v210_word([512; 12]);
     let mut rgba_u16 = [0u16; 6 * 4];
-    v210_to_rgb_u16_or_rgba_u16_row::<true>(&word, &mut rgba_u16, 6, ColorMatrix::Bt709, true);
+    v210_to_rgb_u16_or_rgba_u16_row::<true, false>(
+      &word,
+      &mut rgba_u16,
+      6,
+      ColorMatrix::Bt709,
+      true,
+    );
     for px in rgba_u16.chunks(4) {
       assert_eq!(px[3], 1023, "alpha must be (1 << 10) - 1");
     }
@@ -413,7 +465,7 @@ mod tests {
     ];
     let word = pack_v210_word(samples);
     let mut luma = [0u8; 6];
-    v210_to_luma_row(&word, &mut luma, 6);
+    v210_to_luma_row::<false>(&word, &mut luma, 6);
     // Y values: 200, 300, 400, 500, 600, 700 → 10-bit, downshift >> 2.
     assert_eq!(luma[0], (200u16 >> 2) as u8);
     assert_eq!(luma[1], (300u16 >> 2) as u8);
@@ -428,7 +480,7 @@ mod tests {
     let samples = [100, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700];
     let word = pack_v210_word(samples);
     let mut luma = [0u16; 6];
-    v210_to_luma_u16_row(&word, &mut luma, 6);
+    v210_to_luma_u16_row::<false>(&word, &mut luma, 6);
     assert_eq!(luma[0], 200);
     assert_eq!(luma[1], 300);
     assert_eq!(luma[2], 400);
@@ -445,7 +497,7 @@ mod tests {
     packed.extend_from_slice(&pack_v210_word(samples));
     packed.extend_from_slice(&pack_v210_word(samples));
     let mut rgb = std::vec![0u8; 12 * 3];
-    v210_to_rgb_or_rgba_row::<false>(&packed, &mut rgb, 12, ColorMatrix::Bt709, true);
+    v210_to_rgb_or_rgba_row::<false, false>(&packed, &mut rgb, 12, ColorMatrix::Bt709, true);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(128) <= 1);
     }
@@ -468,19 +520,19 @@ mod tests {
       packed.extend_from_slice(&pack_v210_word([512; 12]));
     }
     let mut rgb = std::vec![0u8; width * 3];
-    v210_to_rgb_or_rgba_row::<false>(&packed, &mut rgb, width, ColorMatrix::Bt709, true);
+    v210_to_rgb_or_rgba_row::<false, false>(&packed, &mut rgb, width, ColorMatrix::Bt709, true);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(128) <= 1, "width={width}: gray RGB diverged");
       assert_eq!(px[0], px[1]);
     }
     let mut rgba = std::vec![0u8; width * 4];
-    v210_to_rgb_or_rgba_row::<true>(&packed, &mut rgba, width, ColorMatrix::Bt709, true);
+    v210_to_rgb_or_rgba_row::<true, false>(&packed, &mut rgba, width, ColorMatrix::Bt709, true);
     for px in rgba.chunks(4) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[3], 0xFF);
     }
     let mut rgb_u16 = std::vec![0u16; width * 3];
-    v210_to_rgb_u16_or_rgba_u16_row::<false>(
+    v210_to_rgb_u16_or_rgba_u16_row::<false, false>(
       &packed,
       &mut rgb_u16,
       width,
@@ -491,12 +543,12 @@ mod tests {
       assert!(px[0].abs_diff(512) <= 2);
     }
     let mut luma = std::vec![0u8; width];
-    v210_to_luma_row(&packed, &mut luma, width);
+    v210_to_luma_row::<false>(&packed, &mut luma, width);
     for &y in &luma {
       assert_eq!(y, 128);
     }
     let mut luma_u16 = std::vec![0u16; width];
-    v210_to_luma_u16_row(&packed, &mut luma_u16, width);
+    v210_to_luma_u16_row::<false>(&packed, &mut luma_u16, width);
     for &y in &luma_u16 {
       assert_eq!(y, 512);
     }
@@ -558,8 +610,81 @@ mod tests {
     ];
     let word = pack_v210_word(samples);
     let mut luma = [0u8; 2];
-    v210_to_luma_row(&word, &mut luma, 2);
+    v210_to_luma_row::<false>(&word, &mut luma, 2);
     assert_eq!(luma[0], (600u16 >> 2) as u8);
     assert_eq!(luma[1], (700u16 >> 2) as u8);
   }
+
+  // ---- BE parity tests -----------------------------------------------
+  //
+  // For each output type: pack the same samples in BE word encoding,
+  // run the BE=true path, assert identical output to the LE (BE=false) path.
+
+  #[test]
+  fn scalar_v210_be_rgb_matches_le() {
+    let samples = [
+      100u16, 512, 400, 600, 200, 300, 500, 700, 150, 450, 350, 800,
+    ];
+    let le_word = pack_v210_word(samples);
+    let be_word = pack_v210_word_be(samples);
+    let mut le_rgb = [0u8; 6 * 3];
+    let mut be_rgb = [0u8; 6 * 3];
+    v210_to_rgb_or_rgba_row::<false, false>(&le_word, &mut le_rgb, 6, ColorMatrix::Bt709, true);
+    v210_to_rgb_or_rgba_row::<false, true>(&be_word, &mut be_rgb, 6, ColorMatrix::Bt709, true);
+    assert_eq!(le_rgb, be_rgb, "BE rgb output must match LE");
+  }
+
+  #[test]
+  fn scalar_v210_be_rgb_u16_matches_le() {
+    let samples = [
+      100u16, 512, 400, 600, 200, 300, 500, 700, 150, 450, 350, 800,
+    ];
+    let le_word = pack_v210_word(samples);
+    let be_word = pack_v210_word_be(samples);
+    let mut le_rgb = [0u16; 6 * 3];
+    let mut be_rgb = [0u16; 6 * 3];
+    v210_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      &le_word,
+      &mut le_rgb,
+      6,
+      ColorMatrix::Bt709,
+      true,
+    );
+    v210_to_rgb_u16_or_rgba_u16_row::<false, true>(
+      &be_word,
+      &mut be_rgb,
+      6,
+      ColorMatrix::Bt709,
+      true,
+    );
+    assert_eq!(le_rgb, be_rgb, "BE rgb_u16 output must match LE");
+  }
+
+  #[test]
+  fn scalar_v210_be_luma_matches_le() {
+    let samples = [
+      100u16, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700,
+    ];
+    let le_word = pack_v210_word(samples);
+    let be_word = pack_v210_word_be(samples);
+    let mut le_luma = [0u8; 6];
+    let mut be_luma = [0u8; 6];
+    v210_to_luma_row::<false>(&le_word, &mut le_luma, 6);
+    v210_to_luma_row::<true>(&be_word, &mut be_luma, 6);
+    assert_eq!(le_luma, be_luma, "BE luma output must match LE");
+  }
+
+  #[test]
+  fn scalar_v210_be_luma_u16_matches_le() {
+    let samples = [
+      100u16, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700,
+    ];
+    let le_word = pack_v210_word(samples);
+    let be_word = pack_v210_word_be(samples);
+    let mut le_luma = [0u16; 6];
+    let mut be_luma = [0u16; 6];
+    v210_to_luma_u16_row::<false>(&le_word, &mut le_luma, 6);
+    v210_to_luma_u16_row::<true>(&be_word, &mut be_luma, 6);
+    assert_eq!(le_luma, be_luma, "BE luma_u16 output must match LE");
+  }
 }
diff --git a/src/row/scalar/y216.rs b/src/row/scalar/y216.rs
index 088ec22e..291e8914 100644
--- a/src/row/scalar/y216.rs
+++ b/src/row/scalar/y216.rs
@@ -6,13 +6,21 @@
 //! `src/row/scalar/yuv_planar_16bit.rs`'s i64 chroma scalar
 //! pattern but sourced from YUYV-shaped u16 quadruples rather
 //! than separate Y/U/V planes.
+//!
+//! ## Big-endian wire format (`BE = true`)
+//!
+//! When `BE = true`, each `u16` element in `packed` is stored in
+//! big-endian byte order. `load_endian_u16::<BE>` handles the
+//! conditional byte-swap at each sample site; the unused branch is
+//! eliminated at monomorphization.
 
 use super::*;
 
 // ---- u8 RGB / RGBA output (i32 chroma — same as Y210/Y212) -------
 
+/// `BE = true` selects big-endian wire decoding for each u16 sample.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) fn y216_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -29,13 +37,15 @@ pub(crate) fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
   let bias = chroma_bias::<16>();
 
   let pairs = width / 2;
+  // SAFETY: the slice below is bounds-checked, so off4 + 8 <= pairs * 8 bytes for p < pairs.
+  let base = packed[..pairs * 4].as_ptr().cast::<u8>();
   for p in 0..pairs {
-    let q = &packed[p * 4..p * 4 + 4];
+    let off4 = p * 4 * 2;
     // No right-shift: BITS=16 means samples are already full-width.
-    let y0 = q[0] as i32;
-    let u = q[1] as i32;
-    let y1 = q[2] as i32;
-    let v = q[3] as i32;
+    let y0 = unsafe { load_endian_u16::<BE>(base.add(off4)) } as i32;
+    let u = unsafe { load_endian_u16::<BE>(base.add(off4 + 2)) } as i32;
+    let y1 = unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) } as i32;
+    let v = unsafe { load_endian_u16::<BE>(base.add(off4 + 6)) } as i32;
 
     let u_d = q15_scale(u - bias, c_scale);
     let v_d = q15_scale(v - bias, c_scale);
@@ -59,8 +69,9 @@ pub(crate) fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 
 // ---- u16 RGB / RGBA native-depth output (i64 chroma) ----------------
 
+/// `BE = true` selects big-endian wire decoding for each u16 sample.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -78,12 +89,13 @@ pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
   let out_max: i32 = 0xFFFF;
 
   let pairs = width / 2;
+  let base = packed[..pairs * 4].as_ptr().cast::<u8>(); // bounds check covers every load below
   for p in 0..pairs {
-    let q = &packed[p * 4..p * 4 + 4];
-    let y0 = q[0] as i32;
-    let u = q[1] as i32;
-    let y1 = q[2] as i32;
-    let v = q[3] as i32;
+    let off4 = p * 4 * 2;
+    let y0 = unsafe { load_endian_u16::<BE>(base.add(off4)) } as i32;
+    let u = unsafe { load_endian_u16::<BE>(base.add(off4 + 2)) } as i32;
+    let y1 = unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) } as i32;
+    let v = unsafe { load_endian_u16::<BE>(base.add(off4 + 6)) } as i32;
 
     let u_d = q15_scale(u - bias, c_scale);
     let v_d = q15_scale(v - bias, c_scale);
@@ -107,31 +119,38 @@ pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
 // ---- Luma (u8) — `>> 8` ----------------------------------------------
 
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) {
+pub(crate) fn y216_to_luma_row<const BE: bool>(packed: &[u16], out: &mut [u8], width: usize) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
   let pairs = width / 2;
+  let base = packed[..pairs * 4].as_ptr().cast::<u8>(); // bounds check covers every load below
   for p in 0..pairs {
-    let q = &packed[p * 4..p * 4 + 4];
-    out[p * 2] = (q[0] >> 8) as u8;
-    out[p * 2 + 1] = (q[2] >> 8) as u8;
+    let off4 = p * 4 * 2;
+    let y0 = unsafe { load_endian_u16::<BE>(base.add(off4)) };
+    let y1 = unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) };
+    out[p * 2] = (y0 >> 8) as u8;
+    out[p * 2 + 1] = (y1 >> 8) as u8;
   }
 }
 
 // ---- Luma (u16, direct extract) ---------------------------------------
 
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) {
+pub(crate) fn y216_to_luma_u16_row<const BE: bool>(packed: &[u16], out: &mut [u16], width: usize) {
   debug_assert!(width.is_multiple_of(2));
   debug_assert!(packed.len() >= width * 2);
   debug_assert!(out.len() >= width);
   let pairs = width / 2;
+  let base = packed[..pairs * 4].as_ptr().cast::<u8>(); // bounds check covers every load below
   for p in 0..pairs {
-    let q = &packed[p * 4..p * 4 + 4];
-    out[p * 2] = q[0]; // direct extract — full 16 bits, no shift
-    out[p * 2 + 1] = q[2];
+    let off4 = p * 4 * 2;
+    // Direct extract — full 16 bits, no shift; byte-swap if BE.
+    out[p * 2] = unsafe { load_endian_u16::<BE>(base.add(off4)) };
+    out[p * 2 + 1] = unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) };
   }
 }
 
@@ -147,6 +166,11 @@ mod tests {
     [4096, 32768, 32000, 32768, 0, 16384, 65535, 49152]
   }
 
+  /// Byte-swap every u16 to produce the BE-encoded form.
+  fn to_be_u16(le: &[u16]) -> std::vec::Vec<u16> {
+    le.iter().map(|&v| v.swap_bytes()).collect()
+  }
+
   /// u8 RGB output — hand-derived expected values for Bt709 limited range.
   ///
   /// Pair 0 (neutral chroma, U=V=32768=bias → u_d=v_d=0 → chroma=0):
@@ -159,7 +183,7 @@ mod tests {
   fn y216_known_pattern_rgb() {
     let packed = test_input();
     let mut out = [0u8; 4 * 3];
-    y216_to_rgb_or_rgba_row::<false>(&packed, &mut out, 4, ColorMatrix::Bt709, false);
+    y216_to_rgb_or_rgba_row::<false, false>(&packed, &mut out, 4, ColorMatrix::Bt709, false);
 
     // Pixel 0: Y=4096 (limited-range black), neutral chroma → (0, 0, 0)
     assert_eq!(&out[0..3], &[0, 0, 0], "pixel 0 (Y=4096, neutral chroma)");
@@ -184,7 +208,7 @@ mod tests {
   fn y216_known_pattern_rgba() {
     let packed = test_input();
     let mut out = [0u8; 4 * 4];
-    y216_to_rgb_or_rgba_row::<true>(&packed, &mut out, 4, ColorMatrix::Bt709, false);
+    y216_to_rgb_or_rgba_row::<true, false>(&packed, &mut out, 4, ColorMatrix::Bt709, false);
 
     assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]);
     assert_eq!(&out[4..8], &[127, 127, 127, 0xFF]);
@@ -201,7 +225,13 @@ mod tests {
   fn y216_known_pattern_rgb_u16() {
     let packed = test_input();
     let mut out = [0u16; 4 * 3];
-    y216_to_rgb_u16_or_rgba_u16_row::<false>(&packed, &mut out, 4, ColorMatrix::Bt709, false);
+    y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
+      &packed,
+      &mut out,
+      4,
+      ColorMatrix::Bt709,
+      false,
+    );
 
     // Pixel 0: Y=4096 = limited-range floor → all channels 0
     assert_eq!(
@@ -226,7 +256,7 @@ mod tests {
   fn y216_known_pattern_rgba_u16() {
     let packed = test_input();
     let mut out = [0u16; 4 * 4];
-    y216_to_rgb_u16_or_rgba_u16_row::<true>(&packed, &mut out, 4, ColorMatrix::Bt709, false);
+    y216_to_rgb_u16_or_rgba_u16_row::<true, false>(&packed, &mut out, 4, ColorMatrix::Bt709, false);
 
     assert_eq!(&out[0..4], &[0, 0, 0, 0xFFFF]);
     assert_eq!(&out[4..8], &[32618, 32618, 32618, 0xFFFF]);
@@ -242,7 +272,7 @@ mod tests {
   fn y216_luma_extract() {
     let packed = [0xAB12u16, 0x4444, 0xCD34, 0x5555];
     let mut out = [0u8; 2];
-    y216_to_luma_row(&packed, &mut out, 2);
+    y216_to_luma_row::<false>(&packed, &mut out, 2);
     assert_eq!(out[0], 0xAB, "Y0 luma u8");
     assert_eq!(out[1], 0xCD, "Y1 luma u8");
   }
@@ -253,8 +283,55 @@ mod tests {
   fn y216_luma_u16_extract() {
     let packed = [0xAB12u16, 0x4444, 0xCD34, 0x5555];
     let mut out = [0u16; 2];
-    y216_to_luma_u16_row(&packed, &mut out, 2);
+    y216_to_luma_u16_row::<false>(&packed, &mut out, 2);
     assert_eq!(out[0], 0xAB12, "Y0 luma u16");
     assert_eq!(out[1], 0xCD34, "Y1 luma u16");
   }
+
+  // ---- BE=true parity tests -------------------------------------------
+
+  /// Verify byte-swapped Y216 input + BE=true matches LE+BE=false for RGB u8.
+  #[test]
+  fn y216_be_rgb_matches_le() {
+    let le = test_input();
+    let be = to_be_u16(&le);
+    let mut out_le = [0u8; 4 * 3];
+    let mut out_be = [0u8; 4 * 3];
+    y216_to_rgb_or_rgba_row::<false, false>(&le, &mut out_le, 4, ColorMatrix::Bt709, false);
+    y216_to_rgb_or_rgba_row::<false, true>(&be, &mut out_be, 4, ColorMatrix::Bt709, false);
+    assert_eq!(out_le, out_be, "BE and LE RGB paths must match");
+  }
+
+  #[test]
+  fn y216_be_rgb_u16_matches_le() {
+    let le = test_input();
+    let be = to_be_u16(&le);
+    let mut out_le = [0u16; 4 * 3];
+    let mut out_be = [0u16; 4 * 3];
+    y216_to_rgb_u16_or_rgba_u16_row::<false, false>(&le, &mut out_le, 4, ColorMatrix::Bt709, false);
+    y216_to_rgb_u16_or_rgba_u16_row::<false, true>(&be, &mut out_be, 4, ColorMatrix::Bt709, false);
+    assert_eq!(out_le, out_be, "BE and LE RGB u16 paths must match");
+  }
+
+  #[test]
+  fn y216_be_luma_matches_le() {
+    let le = test_input();
+    let be = to_be_u16(&le);
+    let mut luma_le = [0u8; 4];
+    let mut luma_be = [0u8; 4];
+    y216_to_luma_row::<false>(&le, &mut luma_le, 4);
+    y216_to_luma_row::<true>(&be, &mut luma_be, 4);
+    assert_eq!(luma_le, luma_be, "BE and LE luma paths must match");
+  }
+
+  #[test]
+  fn y216_be_luma_u16_matches_le() {
+    let le = test_input();
+    let be = to_be_u16(&le);
+    let mut luma_le = [0u16; 4];
+    let mut luma_be = [0u16; 4];
+    y216_to_luma_u16_row::<false>(&le, &mut luma_le, 4);
+    y216_to_luma_u16_row::<true>(&be, &mut luma_be, 4);
+    assert_eq!(luma_le, luma_be, "BE and LE luma_u16 paths must match");
+  }
 }
diff --git a/src/row/scalar/y2xx.rs b/src/row/scalar/y2xx.rs
index 51aa7ba0..d3c7a8b3 100644
--- a/src/row/scalar/y2xx.rs
+++ b/src/row/scalar/y2xx.rs
@@ -10,6 +10,15 @@
 //! `BITS` (mirrors `v210.rs`'s use of `range_params_n` /
 //! `chroma_bias` / `q15_scale` / `q15_chroma`, just sourced from
 //! Y2xx's u16 packed quadruples rather than v210's 16-byte words).
+//!
+//! ## Big-endian wire format (`BE = true`)
+//!
+//! When `BE = true`, each `u16` element in `packed` is stored in
+//! big-endian byte order (high byte first). The `<const BE: bool>`
+//! const generic selects the decode in `load_endian_u16::<BE>` at each
+//! sample read site. On little-endian hosts the `BE = false` path is
+//! equivalent to the previous plain slice index, and the compiler
+//! eliminates the unused branch entirely.
 
 use super::*;
 
@@ -31,12 +40,14 @@ const fn rshift_bits<const BITS: u32>(sample: u16) -> u16 {
 /// (downshifted from the native BITS Q15 pipeline via
 /// `range_params_n::<BITS, 8>`).
 ///
+/// `BE = true` selects big-endian wire decoding for each u16 sample.
+///
 /// # Panics (debug builds)
 /// - `width` must be even.
 /// - `packed.len() >= width * 2` (one u16 quadruple per chroma pair).
 /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
@@ -60,12 +71,15 @@ pub(crate) fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
 
   // One chroma pair (= 2 pixels) per iter.
   let pairs = width / 2;
+  // SAFETY: slicing to `pairs * 4` elements is bounds-checked in every build
+  // profile, and p < pairs implies p * 4 + 4 <= pairs * 4, so each load is in range.
+  let base = packed[..pairs * 4].as_ptr().cast::<u8>();
   for p in 0..pairs {
-    let q = &packed[p * 4..p * 4 + 4];
-    let y0 = rshift_bits::<BITS>(q[0]) as i32;
-    let u = rshift_bits::<BITS>(q[1]) as i32;
-    let y1 = rshift_bits::<BITS>(q[2]) as i32;
-    let v = rshift_bits::<BITS>(q[3]) as i32;
+    let off4 = p * 4 * 2; // byte offset to quadruple p (4 u16 = 8 bytes)
+    let y0 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4)) }) as i32;
+    let u = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 2)) }) as i32;
+    let y1 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) }) as i32;
+    let v = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 6)) }) as i32;
 
     let u_d = q15_scale(u - bias, c_scale);
     let v_d = q15_scale(v - bias, c_scale);
@@ -96,13 +110,18 @@ pub(crate) fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
 ///
 /// `ALPHA = true` writes a 4-element-per-pixel output with α =
 /// `(1 << BITS) - 1` (opaque maximum at the native depth).
+/// `BE = true` selects big-endian wire decoding.
 ///
 /// # Panics (debug builds)
 /// - `width` must be even.
 /// - `packed.len() >= width * 2`.
 /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
+pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row<
+  const BITS: u32,
+  const ALPHA: bool,
+  const BE: bool,
+>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
@@ -127,12 +146,13 @@ pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bo
   let alpha_max: u16 = out_max as u16;
 
   let pairs = width / 2;
+  let base = packed[..pairs * 4].as_ptr().cast::<u8>(); // bounds check covers every load below
   for p in 0..pairs {
-    let q = &packed[p * 4..p * 4 + 4];
-    let y0 = rshift_bits::<BITS>(q[0]) as i32;
-    let u = rshift_bits::<BITS>(q[1]) as i32;
-    let y1 = rshift_bits::<BITS>(q[2]) as i32;
-    let v = rshift_bits::<BITS>(q[3]) as i32;
+    let off4 = p * 4 * 2;
+    let y0 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4)) }) as i32;
+    let u = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 2)) }) as i32;
+    let y1 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) }) as i32;
+    let v = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 6)) }) as i32;
 
     let u_d = q15_scale(u - bias, c_scale);
     let v_d = q15_scale(v - bias, c_scale);
@@ -158,13 +178,14 @@ pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bo
 
 /// Y2xx → 8-bit luma. Y values are downshifted from BITS to 8 via
 /// `>> (BITS - 8)`. Bypasses the YUV → RGB pipeline entirely.
+/// `BE = true` selects big-endian wire decoding.
 ///
 /// # Panics (debug builds)
 /// - `width` must be even.
 /// - `packed.len() >= width * 2`.
 /// - `luma_out.len() >= width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y2xx_n_to_luma_row<const BITS: u32>(
+pub(crate) fn y2xx_n_to_luma_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u8],
   width: usize,
@@ -180,10 +201,11 @@ pub(crate) fn y2xx_n_to_luma_row<const BITS: u32>(
   debug_assert!(luma_out.len() >= width, "luma row too short");
 
   let pairs = width / 2;
+  let base = packed[..pairs * 4].as_ptr().cast::<u8>(); // bounds check covers every load below
   for p in 0..pairs {
-    let q = &packed[p * 4..p * 4 + 4];
-    let y0 = rshift_bits::<BITS>(q[0]);
-    let y1 = rshift_bits::<BITS>(q[2]);
+    let off4 = p * 4 * 2;
+    let y0 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4)) });
+    let y1 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) });
     luma_out[p * 2] = (y0 >> (BITS - 8)) as u8;
     luma_out[p * 2 + 1] = (y1 >> (BITS - 8)) as u8;
   }
@@ -191,14 +213,15 @@ pub(crate) fn y2xx_n_to_luma_row<const BITS: u32>(
 
 /// Y2xx → native-depth `u16` luma (low-bit-packed). Each output
 /// `u16` carries the source's BITS-bit Y value in its low BITS bits
-/// (upper `(16 - BITS)` bits zero).
+/// (upper `(16 - BITS)` bits zero). `BE = true` selects big-endian
+/// wire decoding.
 ///
 /// # Panics (debug builds)
 /// - `width` must be even.
 /// - `packed.len() >= width * 2`.
 /// - `luma_out.len() >= width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y2xx_n_to_luma_u16_row<const BITS: u32>(
+pub(crate) fn y2xx_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
   packed: &[u16],
   luma_out: &mut [u16],
   width: usize,
@@ -214,10 +237,11 @@ pub(crate) fn y2xx_n_to_luma_u16_row<const BITS: u32>(
   debug_assert!(luma_out.len() >= width, "luma row too short");
 
   let pairs = width / 2;
+  let base = packed[..pairs * 4].as_ptr().cast::<u8>(); // bounds check covers every load below
   for p in 0..pairs {
-    let q = &packed[p * 4..p * 4 + 4];
-    luma_out[p * 2] = rshift_bits::<BITS>(q[0]);
-    luma_out[p * 2 + 1] = rshift_bits::<BITS>(q[2]);
+    let off4 = p * 4 * 2;
+    luma_out[p * 2] = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4)) });
+    luma_out[p * 2 + 1] = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) });
   }
 }
 
@@ -227,39 +251,47 @@ pub(crate) fn y2xx_n_to_luma_u16_row<const BITS: u32>(
 // BITS=12 wrappers (`y212_to_*_row`) without further kernel changes.
 
 /// Public Y210 (BITS=10) → packed RGB / RGBA u8 wrapper.
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y210_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) fn y210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
   matrix: ColorMatrix,
   full_range: bool,
 ) {
-  y2xx_n_to_rgb_or_rgba_row::<10, ALPHA>(packed, out, width, matrix, full_range);
+  y2xx_n_to_rgb_or_rgba_row::<10, ALPHA, BE>(packed, out, width, matrix, full_range);
 }
 
 /// Public Y210 → packed `u16` RGB / RGBA wrapper.
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) fn y210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
   matrix: ColorMatrix,
   full_range: bool,
 ) {
-  y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, ALPHA>(packed, out, width, matrix, full_range);
+  y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, ALPHA, BE>(packed, out, width, matrix, full_range);
 }
 
 /// Public Y210 → 8-bit luma wrapper.
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) {
-  y2xx_n_to_luma_row::<10>(packed, luma_out, width);
+pub(crate) fn y210_to_luma_row<const BE: bool>(packed: &[u16], luma_out: &mut [u8], width: usize) {
+  y2xx_n_to_luma_row::<10, BE>(packed, luma_out, width);
 }
 
 /// Public Y210 → native-depth `u16` luma wrapper.
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) {
-  y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width);
+pub(crate) fn y210_to_luma_u16_row<const BE: bool>(
+  packed: &[u16],
+  luma_out: &mut [u16],
+  width: usize,
+) {
+  y2xx_n_to_luma_u16_row::<10, BE>(packed, luma_out, width);
 }
 
 // ---- Public Y212 (BITS=12) wrappers ------------------------------------
@@ -268,39 +300,47 @@ pub(crate) fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width:
 // SIMD code — the per-arch backends already accept BITS ∈ {10, 12}.
 
 /// Public Y212 (BITS=12) → packed RGB / RGBA u8 wrapper.
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y212_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) fn y212_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u8],
   width: usize,
   matrix: ColorMatrix,
   full_range: bool,
 ) {
-  y2xx_n_to_rgb_or_rgba_row::<12, ALPHA>(packed, out, width, matrix, full_range);
+  y2xx_n_to_rgb_or_rgba_row::<12, ALPHA, BE>(packed, out, width, matrix, full_range);
 }
 
 /// Public Y212 → packed `u16` RGB / RGBA wrapper.
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y212_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) fn y212_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
   packed: &[u16],
   out: &mut [u16],
   width: usize,
   matrix: ColorMatrix,
   full_range: bool,
 ) {
-  y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, ALPHA>(packed, out, width, matrix, full_range);
+  y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, ALPHA, BE>(packed, out, width, matrix, full_range);
 }
 
 /// Public Y212 → 8-bit luma wrapper.
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) {
-  y2xx_n_to_luma_row::<12>(packed, luma_out, width);
+pub(crate) fn y212_to_luma_row<const BE: bool>(packed: &[u16], luma_out: &mut [u8], width: usize) {
+  y2xx_n_to_luma_row::<12, BE>(packed, luma_out, width);
 }
 
 /// Public Y212 → native-depth `u16` luma wrapper.
+/// `BE = true` selects big-endian wire decoding.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) {
-  y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width);
+pub(crate) fn y212_to_luma_u16_row<const BE: bool>(
+  packed: &[u16],
+  luma_out: &mut [u16],
+  width: usize,
+) {
+  y2xx_n_to_luma_u16_row::<12, BE>(packed, luma_out, width);
 }
 
 #[cfg(all(test, feature = "std"))]
@@ -329,12 +369,17 @@ mod tests {
     buf
   }
 
+  /// Returns a copy of `le` with each `u16` byte-swapped, i.e. its big-endian encoding.
+  fn to_be_u16(le: &[u16]) -> std::vec::Vec<u16> {
+    le.iter().copied().map(u16::swap_bytes).collect()
+  }
+
   #[test]
   fn scalar_y210_to_rgb_gray_is_gray() {
     // Full-range gray: Y=512, U=V=512 (10-bit center) → RGB ~128.
     let buf = solid_y210(8, 512, 512, 512);
     let mut rgb = [0u8; 8 * 3];
-    y210_to_rgb_or_rgba_row::<false>(&buf, &mut rgb, 8, ColorMatrix::Bt709, true);
+    y210_to_rgb_or_rgba_row::<false, false>(&buf, &mut rgb, 8, ColorMatrix::Bt709, true);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(128) <= 1);
       assert_eq!(px[0], px[1]);
@@ -346,7 +391,7 @@ mod tests {
   fn scalar_y210_to_rgba_alpha_is_opaque() {
     let buf = solid_y210(8, 512, 512, 512);
     let mut rgba = [0u8; 8 * 4];
-    y210_to_rgb_or_rgba_row::<true>(&buf, &mut rgba, 8, ColorMatrix::Bt709, true);
+    y210_to_rgb_or_rgba_row::<true, false>(&buf, &mut rgba, 8, ColorMatrix::Bt709, true);
     for px in rgba.chunks(4) {
       assert_eq!(px[3], 0xFF);
     }
@@ -357,7 +402,7 @@ mod tests {
     // Full-range gray Y=512 → ~512 in 10-bit RGB out (out_max = 1023).
     let buf = solid_y210(8, 512, 512, 512);
     let mut rgb = [0u16; 8 * 3];
-    y210_to_rgb_u16_or_rgba_u16_row::<false>(&buf, &mut rgb, 8, ColorMatrix::Bt709, true);
+    y210_to_rgb_u16_or_rgba_u16_row::<false, false>(&buf, &mut rgb, 8, ColorMatrix::Bt709, true);
     for px in rgb.chunks(3) {
       assert!(px[0].abs_diff(512) <= 2, "px expected ~512, got {}", px[0]);
       assert_eq!(px[0], px[1]);
@@ -369,7 +414,7 @@ mod tests {
   fn scalar_y210_to_rgba_u16_alpha_is_max() {
     let buf = solid_y210(8, 512, 512, 512);
     let mut rgba = [0u16; 8 * 4];
-    y210_to_rgb_u16_or_rgba_u16_row::<true>(&buf, &mut rgba, 8, ColorMatrix::Bt709, true);
+    y210_to_rgb_u16_or_rgba_u16_row::<true, false>(&buf, &mut rgba, 8, ColorMatrix::Bt709, true);
     for px in rgba.chunks(4) {
       assert_eq!(px[3], 1023, "alpha must be (1 << 10) - 1");
     }
@@ -388,7 +433,7 @@ mod tests {
       buf[i * 4 + 3] = 128u16 << 6; // V
     }
     let mut luma = [0u8; 6];
-    y210_to_luma_row(&buf, &mut luma, 6);
+    y210_to_luma_row::<false>(&buf, &mut luma, 6);
     assert_eq!(luma[0], (100u16 >> 2) as u8);
     assert_eq!(luma[1], (200u16 >> 2) as u8);
     assert_eq!(luma[2], (300u16 >> 2) as u8);
@@ -408,7 +453,7 @@ mod tests {
       buf[i * 4 + 3] = 128u16 << 6;
     }
     let mut luma = [0u16; 6];
-    y210_to_luma_u16_row(&buf, &mut luma, 6);
+    y210_to_luma_u16_row::<false>(&buf, &mut luma, 6);
     assert_eq!(luma[0], 100);
     assert_eq!(luma[1], 200);
     assert_eq!(luma[2], 300);
@@ -416,4 +461,64 @@ mod tests {
     assert_eq!(luma[4], 500);
     assert_eq!(luma[5], 600);
   }
+
+  // ---- BE=true parity tests -------------------------------------------
+
+  /// Decoding a byte-swapped Y210 row with `BE = true` must give the same
+  /// RGB bytes as decoding the unswapped row with `BE = false`.
+  #[test]
+  fn scalar_y210_be_rgb_matches_le() {
+    let native = solid_y210(8, 512, 512, 512);
+    let swapped = to_be_u16(&native);
+    let (mut expected, mut actual) = ([0u8; 8 * 3], [0u8; 8 * 3]);
+    y210_to_rgb_or_rgba_row::<false, false>(&native, &mut expected, 8, ColorMatrix::Bt709, true);
+    y210_to_rgb_or_rgba_row::<false, true>(&swapped, &mut actual, 8, ColorMatrix::Bt709, true);
+    // `expected` comes from the little-endian path; `actual` from the big-endian path.
+    assert_eq!(
+      expected, actual,
+      "BE and LE paths must produce identical output"
+    );
+  }
+
+  #[test]
+  fn scalar_y210_be_rgb_u16_matches_le() {
+    // Swapped row + `BE = true` must decode exactly like the unswapped row + `BE = false`.
+    let native = solid_y210(8, 512, 512, 512);
+    let swapped = to_be_u16(&native);
+    let (mut expected, mut actual) = ([0u16; 8 * 3], [0u16; 8 * 3]);
+    y210_to_rgb_u16_or_rgba_u16_row::<false, false>(&native, &mut expected, 8, ColorMatrix::Bt709, true);
+    y210_to_rgb_u16_or_rgba_u16_row::<false, true>(&swapped, &mut actual, 8, ColorMatrix::Bt709, true);
+    assert_eq!(
+      expected, actual,
+      "BE and LE u16 paths must produce identical output"
+    );
+  }
+
+  #[test]
+  fn scalar_y210_be_luma_matches_le() {
+    // Swapped row + `BE = true` must decode exactly like the unswapped row + `BE = false`.
+    let native = solid_y210(8, 512, 512, 512);
+    let swapped = to_be_u16(&native);
+    let (mut expected, mut actual) = ([0u8; 8], [0u8; 8]);
+    y210_to_luma_row::<false>(&native, &mut expected, 8);
+    y210_to_luma_row::<true>(&swapped, &mut actual, 8);
+    assert_eq!(
+      expected, actual,
+      "BE and LE luma paths must produce identical output"
+    );
+  }
+
+  #[test]
+  fn scalar_y210_be_luma_u16_matches_le() {
+    // Swapped row + `BE = true` must decode exactly like the unswapped row + `BE = false`.
+    let native = solid_y210(8, 512, 512, 512);
+    let swapped = to_be_u16(&native);
+    let (mut expected, mut actual) = ([0u16; 8], [0u16; 8]);
+    y210_to_luma_u16_row::<false>(&native, &mut expected, 8);
+    y210_to_luma_u16_row::<true>(&swapped, &mut actual, 8);
+    assert_eq!(
+      expected, actual,
+      "BE and LE luma_u16 paths must produce identical output"
+    );
+  }
 }
diff --git a/src/sinker/mixed/v210.rs b/src/sinker/mixed/v210.rs
index 42da55d3..e59a624a 100644
--- a/src/sinker/mixed/v210.rs
+++ b/src/sinker/mixed/v210.rs
@@ -212,6 +212,7 @@ impl PixelSink for MixedSinker<'_, V210> {
         &mut buf[one_plane_start..one_plane_end],
         w,
         use_simd,
+        false,
       );
     }
     // Luma u16 — extract 10-bit Y values at native depth.
@@ -221,6 +222,7 @@ impl PixelSink for MixedSinker<'_, V210> {
         &mut buf[one_plane_start..one_plane_end],
         w,
         use_simd,
+        false,
       );
     }
 
@@ -241,6 +243,7 @@ impl PixelSink for MixedSinker<'_, V210> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
     } else if want_rgb_u16 {
       let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
@@ -261,6 +264,7 @@ impl PixelSink for MixedSinker<'_, V210> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
       if want_rgba_u16 {
         // Strategy A u16 fan-out — derive RGBA from the just-computed
@@ -291,6 +295,7 @@ impl PixelSink for MixedSinker<'_, V210> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
       return Ok(());
     }
@@ -307,7 +312,15 @@ impl PixelSink for MixedSinker<'_, V210> {
       w,
       h,
     )?;
-    v210_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd);
+    v210_to_rgb_row(
+      packed,
+      rgb_row,
+      w,
+      row.matrix(),
+      row.full_range(),
+      use_simd,
+      false,
+    );
 
     if let Some(hsv) = hsv.as_mut() {
       rgb_to_hsv_row(
diff --git a/src/sinker/mixed/y210.rs b/src/sinker/mixed/y210.rs
index cf9caaa5..430b2955 100644
--- a/src/sinker/mixed/y210.rs
+++ b/src/sinker/mixed/y210.rs
@@ -213,6 +213,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
         &mut buf[one_plane_start..one_plane_end],
         w,
         use_simd,
+        false,
       );
     }
     // Luma u16 — extract 10-bit Y values at native depth.
@@ -222,6 +223,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
         &mut buf[one_plane_start..one_plane_end],
         w,
         use_simd,
+        false,
       );
     }
 
@@ -242,6 +244,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
     } else if want_rgb_u16 {
       let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
@@ -262,6 +265,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
       if want_rgba_u16 {
         // Strategy A u16 fan-out — derive RGBA from the just-computed
@@ -292,6 +296,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
       return Ok(());
     }
@@ -308,7 +313,15 @@ impl PixelSink for MixedSinker<'_, Y210> {
       w,
       h,
     )?;
-    y210_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd);
+    y210_to_rgb_row(
+      packed,
+      rgb_row,
+      w,
+      row.matrix(),
+      row.full_range(),
+      use_simd,
+      false,
+    );
 
     if let Some(hsv) = hsv.as_mut() {
       rgb_to_hsv_row(
diff --git a/src/sinker/mixed/y212.rs b/src/sinker/mixed/y212.rs
index e7c1c959..1582e61e 100644
--- a/src/sinker/mixed/y212.rs
+++ b/src/sinker/mixed/y212.rs
@@ -211,6 +211,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
         &mut buf[one_plane_start..one_plane_end],
         w,
         use_simd,
+        false,
       );
     }
     // Luma u16 — extract 12-bit Y values at native depth.
@@ -220,6 +221,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
         &mut buf[one_plane_start..one_plane_end],
         w,
         use_simd,
+        false,
       );
     }
 
@@ -240,6 +242,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
     } else if want_rgb_u16 {
       let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
@@ -260,6 +263,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
       if want_rgba_u16 {
         // Strategy A u16 fan-out — derive RGBA from the just-computed
@@ -290,6 +294,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
       return Ok(());
     }
@@ -306,7 +311,15 @@ impl PixelSink for MixedSinker<'_, Y212> {
       w,
       h,
     )?;
-    y212_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd);
+    y212_to_rgb_row(
+      packed,
+      rgb_row,
+      w,
+      row.matrix(),
+      row.full_range(),
+      use_simd,
+      false,
+    );
 
     if let Some(hsv) = hsv.as_mut() {
       rgb_to_hsv_row(
diff --git a/src/sinker/mixed/y216.rs b/src/sinker/mixed/y216.rs
index a8ce416d..4fdbb951 100644
--- a/src/sinker/mixed/y216.rs
+++ b/src/sinker/mixed/y216.rs
@@ -213,6 +213,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
         &mut buf[one_plane_start..one_plane_end],
         w,
         use_simd,
+        false,
       );
     }
     // Luma u16 — extract 16-bit Y values at native depth (direct
@@ -223,6 +224,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
         &mut buf[one_plane_start..one_plane_end],
         w,
         use_simd,
+        false,
       );
     }
 
@@ -243,6 +245,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
     } else if want_rgb_u16 {
       let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
@@ -263,6 +266,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
       if want_rgba_u16 {
         // Strategy A u16 fan-out — derive RGBA from the just-computed
@@ -293,6 +297,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
         row.matrix(),
         row.full_range(),
         use_simd,
+        false,
       );
       return Ok(());
     }
@@ -309,7 +314,15 @@ impl PixelSink for MixedSinker<'_, Y216> {
       w,
       h,
     )?;
-    y216_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd);
+    y216_to_rgb_row(
+      packed,
+      rgb_row,
+      w,
+      row.matrix(),
+      row.full_range(),
+      use_simd,
+      false,
+    );
 
     if let Some(hsv) = hsv.as_mut() {
       rgb_to_hsv_row(