diff --git a/benches/ayuv64_a_plus_combo.rs b/benches/ayuv64_a_plus_combo.rs index a2f2c307..68d27faa 100644 --- a/benches/ayuv64_a_plus_combo.rs +++ b/benches/ayuv64_a_plus_combo.rs @@ -116,6 +116,7 @@ fn bench_u8(c: &mut Criterion) { MATRIX, FULL_RANGE, use_simd, + false, ); ayuv64_to_rgba_row( black_box(&packed[p_off..p_off + row_elems]), @@ -124,6 +125,7 @@ fn bench_u8(c: &mut Criterion) { MATRIX, FULL_RANGE, use_simd, + false, ); } black_box((&rgb, &rgba)); @@ -201,6 +203,7 @@ fn bench_u16(c: &mut Criterion) { MATRIX, FULL_RANGE, use_simd, + false, ); ayuv64_to_rgba_u16_row( black_box(&packed[p_off..p_off + row_elems]), @@ -209,6 +212,7 @@ fn bench_u16(c: &mut Criterion) { MATRIX, FULL_RANGE, use_simd, + false, ); } black_box((&rgb, &rgba)); diff --git a/src/row/arch/neon/ayuv64.rs b/src/row/arch/neon/ayuv64.rs index e7fe910b..6ee9791c 100644 --- a/src/row/arch/neon/ayuv64.rs +++ b/src/row/arch/neon/ayuv64.rs @@ -14,6 +14,10 @@ //! producing `uint16x8_t` halves for each of the four channels: //! `a_lo/a_hi`, `y_lo/y_hi`, `u_lo/u_hi`, `v_lo/v_hi`. //! +//! For BE wire format (`BE = true`), each deinterleaved `uint16x8_t` +//! channel is byte-swapped via `bswap_u16x8_if_be::` after the +//! `vld4q_u16` call. +//! //! - u8 output: Y values are full 16-bit (0..65535), so //! `scale_y_u16_to_i16` is used (not `scale_y`, which would corrupt //! values > 32767). i32 chroma via `chroma_i16x8`. @@ -26,7 +30,7 @@ //! ## Tail //! //! `width % 16` remaining pixels fall through to the scalar -//! `ayuv64_to_rgb_or_rgba_row::` (or u16 version). +//! `ayuv64_to_rgb_or_rgba_row::` (or u16 version). use core::arch::aarch64::*; @@ -37,13 +41,13 @@ use crate::{ColorMatrix, row::scalar}; /// NEON AYUV64 → packed u8 RGB or RGBA. /// -/// Byte-identical to `scalar::ayuv64_to_rgb_or_rgba_row::`. +/// Byte-identical to `scalar::ayuv64_to_rgb_or_rgba_row::`. /// /// Valid monomorphizations: -/// - `` — RGB (α dropped) -/// - `` — RGBA, source α depth-converted u16 → u8 (`>> 8`) +/// - `` — RGB (α dropped) +/// - `` — RGBA, source α depth-converted u16 → u8 (`>> 8`) /// -/// `` is rejected at monomorphization via `const { assert! }`. +/// `` is rejected at monomorphization via `const { assert! }`. /// /// # Safety /// @@ -52,7 +56,11 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -91,16 +99,16 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(q_lo.0); + let y_lo_u16 = bswap_u16x8_if_be::(q_lo.1); + let u_lo_u16 = bswap_u16x8_if_be::(q_lo.2); + let v_lo_u16 = bswap_u16x8_if_be::(q_lo.3); - let a_hi_u16 = q_hi.0; // uint16x8_t — A for pixels 8..15 - let y_hi_u16 = q_hi.1; // uint16x8_t — Y for pixels 8..15 - let u_hi_u16 = q_hi.2; // uint16x8_t — U for pixels 8..15 - let v_hi_u16 = q_hi.3; // uint16x8_t — V for pixels 8..15 + let a_hi_u16 = bswap_u16x8_if_be::(q_hi.0); + let y_hi_u16 = bswap_u16x8_if_be::(q_hi.1); + let u_hi_u16 = bswap_u16x8_if_be::(q_hi.2); + let v_hi_u16 = bswap_u16x8_if_be::(q_hi.3); // Reinterpret chroma as signed i16 (bias subtraction fits i16: // chroma ∈ [0,65535], bias=32768, so (chroma-bias) ∈ [-32768,32767]). 
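// A worked illustration of the module doc's `scale_y_u16_to_i16` vs `scale_y`
// point (example values are illustrative, not from the source): unlike the
// bias-subtracted chroma above, a raw full-range Y sample does not fit in i16.
//
//     let y: u16 = 0xFF00;            // 65280 — a bright full-range luma sample
//     assert_eq!(y as i16, -256i16);  // naive u16→i16 reinterpret flips the sign,
//                                     // which is why the u8 path must widen instead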
@@ -194,7 +202,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -210,13 +218,13 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row`. +/// Byte-identical to `scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::`. /// /// Valid monomorphizations: -/// - `` — RGB u16 (α dropped) -/// - `` — RGBA u16, source α written direct (no conversion) +/// - `` — RGB u16 (α dropped) +/// - `` — RGBA u16, source α written direct (no conversion) /// -/// `` is rejected at monomorphization via `const { assert! }`. +/// `` is rejected at monomorphization via `const { assert! }`. /// /// # Safety /// @@ -225,7 +233,11 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -265,15 +277,16 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(q_lo.0); + let y_lo_u16 = bswap_u16x8_if_be::(q_lo.1); + let u_lo_u16 = bswap_u16x8_if_be::(q_lo.2); + let v_lo_u16 = bswap_u16x8_if_be::(q_lo.3); - let a_hi_u16 = q_hi.0; - let y_hi_u16 = q_hi.1; - let u_hi_u16 = q_hi.2; - let v_hi_u16 = q_hi.3; + let a_hi_u16 = bswap_u16x8_if_be::(q_hi.0); + let y_hi_u16 = bswap_u16x8_if_be::(q_hi.1); + let u_hi_u16 = bswap_u16x8_if_be::(q_hi.2); + let v_hi_u16 = bswap_u16x8_if_be::(q_hi.3); // Chroma: widen u16 → i32, subtract bias, apply c_scale (Q15). // 4:4:4 — 8 per-pixel chroma values per half, split into 2 × i32x4. @@ -411,7 +424,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -427,7 +440,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -435,7 +448,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -443,7 +456,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// to u8 via `>> 8`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -451,14 +464,14 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } /// NEON AYUV64 → packed **RGB u16** (3 × u16 per pixel). Source α discarded. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -466,7 +479,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -474,7 +489,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// is written direct (no conversion). 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -482,7 +497,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -491,7 +508,7 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// NEON AYUV64 → u8 luma. Y is the second u16 (slot 1) of each pixel /// quadruple; `vshrn_n_u16::<8>` narrows u16 → u8 (high byte = `>> 8`). /// -/// Byte-identical to `scalar::ayuv64_to_luma_row`. +/// Byte-identical to `scalar::ayuv64_to_luma_row::`. /// /// # Safety /// @@ -500,7 +517,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -510,16 +531,19 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Two vld4q_u16 loads: channel 1 (.1) = Y for each group of 8 pixels. let q_lo = vld4q_u16(packed.as_ptr().add(x * 4)); let q_hi = vld4q_u16(packed.as_ptr().add(x * 4 + 32)); + // Apply BE byte-swap to Y channel if needed. + let y_lo = bswap_u16x8_if_be::(q_lo.1); + let y_hi = bswap_u16x8_if_be::(q_hi.1); // vshrn_n_u16::<8>: narrows 8 u16 → 8 u8 by taking high byte (>> 8). - let y_lo_u8 = vshrn_n_u16::<8>(q_lo.1); - let y_hi_u8 = vshrn_n_u16::<8>(q_hi.1); + let y_lo_u8 = vshrn_n_u16::<8>(y_lo); + let y_hi_u8 = vshrn_n_u16::<8>(y_hi); vst1_u8(luma_out.as_mut_ptr().add(x), y_lo_u8); vst1_u8(luma_out.as_mut_ptr().add(x + 8), y_hi_u8); x += 16; } // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -533,7 +557,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// NEON AYUV64 → u16 luma. Direct copy of Y samples (slot 1, no shift — /// 16-bit native). /// -/// Byte-identical to `scalar::ayuv64_to_luma_u16_row`. +/// Byte-identical to `scalar::ayuv64_to_luma_u16_row::`. /// /// # Safety /// @@ -542,7 +566,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -552,14 +580,16 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Two vld4q_u16 loads: channel 1 (.1) = Y. let q_lo = vld4q_u16(packed.as_ptr().add(x * 4)); let q_hi = vld4q_u16(packed.as_ptr().add(x * 4 + 32)); - // Direct copy — Y samples are 16-bit native (no shift needed). - vst1q_u16(luma_out.as_mut_ptr().add(x), q_lo.1); - vst1q_u16(luma_out.as_mut_ptr().add(x + 8), q_hi.1); + // Apply BE byte-swap to Y channel if needed, then direct copy. 
+ let y_lo = bswap_u16x8_if_be::(q_lo.1); + let y_hi = bswap_u16x8_if_be::(q_hi.1); + vst1q_u16(luma_out.as_mut_ptr().add(x), y_lo); + vst1q_u16(luma_out.as_mut_ptr().add(x + 8), y_hi); x += 16; } // Scalar tail. if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/neon/mod.rs b/src/row/arch/neon/mod.rs index 86b8d00a..91144808 100644 --- a/src/row/arch/neon/mod.rs +++ b/src/row/arch/neon/mod.rs @@ -251,5 +251,65 @@ pub(super) fn scale_y_u16_i64( } } +// ---- BE helpers ---------------------------------------------------------- + +/// Compile-time host endianness. `true` on BE targets (e.g. `s390x`, +/// `powerpc`-BE), `false` on LE targets (e.g. `aarch64-apple-darwin`, +/// `x86_64`). +/// +/// Used by the conditional byte-swap helpers below to decide whether a raw +/// NEON load already matches the wire endian. Without this, the helpers +/// would only correctly handle two of the four `host × wire` quadrants. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Conditionally byte-swap 8 u16 lanes in a NEON register so that the +/// returned value is in **host-native** byte order, regardless of the +/// host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE`: +/// +/// | wire `BE` | host | gate | action | +/// |-----------|------------|---------|-------------------| +/// | `false` | LE | `false` | no swap (LE→LE) | +/// | `false` | BE | `true` | swap (LE→BE) | +/// | `true` | LE | `true` | swap (BE→LE) | +/// | `true` | BE | `false` | no swap (BE→BE) | +/// +/// The unused branch is eliminated by the compiler — `BE` and +/// `HOST_NATIVE_BE` are both compile-time constants, so the gate folds. +/// +/// Used by the packed YUV 4:4:4 kernels (XV36, AYUV64) after `vld4q_u16` +/// to correct samples loaded from a wire-encoded buffer. +/// +/// Mirrors PR #82's `9c7d533` dispatcher routing fix and PR #85's +/// `9e678b0` Ya16 SIMD gate — both addressed the same bug class +/// (only swapping on `BE = true` rather than `BE != HOST_NATIVE_BE`). +#[inline(always)] +pub(super) unsafe fn bswap_u16x8_if_be(v: uint16x8_t) -> uint16x8_t { + if BE != HOST_NATIVE_BE { + unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } + } else { + v + } +} + +/// Conditionally byte-swap 4 u32 lanes in a NEON register so that the +/// returned value is in **host-native** byte order, regardless of the +/// host endianness. +/// +/// Same `BE != HOST_NATIVE_BE` gate as [`bswap_u16x8_if_be`] — see that +/// helper for the truth table. +/// +/// Used by the V410 kernel after `vld1q_u32` to correct u32 words loaded +/// from a wire-encoded buffer. 
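// A minimal scalar sketch of the same `BE != HOST_NATIVE_BE` gate (hypothetical
// helper, not part of this patch) — both operands are compile-time constants,
// so one arm folds away per monomorphization:
//
//     #[inline(always)]
//     fn swap_u16_if_wire_be<const BE: bool>(v: u16) -> u16 {
//         if BE != cfg!(target_endian = "big") {
//             v.swap_bytes() // wire order differs from host order
//         } else {
//             v              // load already matched host order
//         }
//     }
//
//     // On an LE host, the BE wire bytes [0x12, 0x34] load natively as 0x3412;
//     // swap_u16_if_wire_be::<true>(0x3412) == 0x1234.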
+#[inline(always)] +pub(super) unsafe fn bswap_u32x4_if_be(v: uint32x4_t) -> uint32x4_t { + if BE != HOST_NATIVE_BE { + unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v))) } + } else { + v + } +} + #[cfg(all(test, feature = "std"))] mod tests; diff --git a/src/row/arch/neon/tests/ayuv64.rs b/src/row/arch/neon/tests/ayuv64.rs index 84c02944..6f5938fe 100644 --- a/src/row/arch/neon/tests/ayuv64.rs +++ b/src/row/arch/neon/tests/ayuv64.rs @@ -22,9 +22,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -43,11 +45,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -61,9 +65,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON ayuv64→luma diverges (width={width})"); } @@ -72,9 +76,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON ayuv64→luma u16 diverges (width={width})"); } @@ -154,7 +158,7 @@ fn neon_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no conversion). --- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=16).map(|n| n as u16).collect(); assert_eq!( @@ -167,7 +171,7 @@ fn neon_ayuv64_lane_order_per_pixel_y_and_a() { // produces a well-defined Y output. Matrix doesn't matter for neutral chroma. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, @@ -183,3 +187,169 @@ fn neon_ayuv64_lane_order_per_pixel_y_and_a() { "rgba_u16: A lane order incorrect — expected A[n]=2n+1, got {alpha_out:?}" ); } + +/// SIMD-level BE-vs-LE parity test: probes the `bswap_u16x8_if_be` gate +/// added in `b7fb9d3` (PR #86) at the SIMD layer for AYUV64. +/// +/// Covers all four valid `(ALPHA, ALPHA_SRC)` quadrant subsets used by the +/// public API: (false,false) and (true,true). Source-α paths route the α +/// channel directly through the SIMD endian gate, so this also covers the +/// source-α-specific code path. 
+/// +/// On an LE host: +/// - SIMD `<…BE=false>` on LE input → no-swap path (gate doesn't fire). +/// - SIMD `<…BE=true>` on BE input → swap path (gate fires). +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_ayuv64_be_le_simd_parity() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_ayuv64(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + // u8 RGB (ALPHA=false, ALPHA_SRC=false) + { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (rgb, w={w}) — endian gate broken" + ); + } + + // u8 RGBA + source α (ALPHA=true, ALPHA_SRC=true) — exercises the + // source-α path through the endian gate. + { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (rgba+srcα, w={w}) — endian gate broken" + ); + } + + // u16 RGB + { + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (rgb u16, w={w}) — endian gate broken" + ); + } + + // u16 RGBA + source α + { + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (rgba u16+srcα, w={w}) — endian gate broken" + ); + } + + // luma u8 + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + ayuv64_to_luma_row::(&le, &mut out_le, w); + ayuv64_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (luma u8, w={w}) — endian gate broken" + ); + } + + // luma u16 + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + ayuv64_to_luma_u16_row::(&le, &mut out_le, w); + ayuv64_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity 
failed (luma u16, w={w}) — endian gate broken" + ); + } + } +} diff --git a/src/row/arch/neon/tests/v410.rs b/src/row/arch/neon/tests/v410.rs index 1da5f45d..1d5fbb1c 100644 --- a/src/row/arch/neon/tests/v410.rs +++ b/src/row/arch/neon/tests/v410.rs @@ -27,9 +27,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -44,9 +44,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -60,9 +60,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { - v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON v410→luma diverges (width={width})"); } @@ -71,9 +71,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON v410→luma u16 diverges (width={width})"); } @@ -158,7 +158,7 @@ fn neon_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma, W); + v410_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "neon v410 luma reorder bug"); @@ -167,9 +167,15 @@ fn neon_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, @@ -181,3 +187,129 @@ fn neon_v410_lane_order_per_pixel_y_and_u() { "neon v410 SIMD vs scalar diverges — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test: probes the `bswap_u32x4_if_be` gate +/// added in `b7fb9d3` (PR #86) at the SIMD layer, which existing tests miss +/// (per-backend tests use `BE=false`; dispatcher BE-vs-LE comparisons use +/// `use_simd=false`). +/// +/// On an LE host: +/// - SIMD `` on LE input → gate doesn't fire → exercises no-swap path. +/// - SIMD `` on BE input → gate fires → exercises swap path. 
+/// +/// On a BE host (s390x QEMU when Phase 3 lands), the same test exercises the +/// opposite quadrant. +/// +/// Widths chosen to cover the SIMD main loop (>=8 multiples) + scalar tail +/// (e.g. 17 = 16-lane main + 1 tail; 33 = 32-lane main + 1 tail). +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_v410_be_le_simd_parity() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_v410(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + + // u8 RGB / RGBA + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "neon v410 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w}) — endian gate broken" + ); + } + + // u16 RGB / RGBA + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "neon v410 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w}) — endian gate broken" + ); + } + + // luma u8 + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + v410_to_luma_row::(&le, &mut out_le, w); + v410_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon v410 BE-vs-LE SIMD parity failed (luma u8, w={w}) — endian gate broken" + ); + } + + // luma u16 + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + v410_to_luma_u16_row::(&le, &mut out_le, w); + v410_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon v410 BE-vs-LE SIMD parity failed (luma u16, w={w}) — endian gate broken" + ); + } + } +} diff --git a/src/row/arch/neon/tests/xv36.rs b/src/row/arch/neon/tests/xv36.rs index b254da48..b1e216bc 100644 --- a/src/row/arch/neon/tests/xv36.rs +++ b/src/row/arch/neon/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut 
k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON xv36→luma u16 diverges (width={width})"); } @@ -154,7 +154,7 @@ fn neon_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "neon xv36 luma_u16 reorder bug"); @@ -163,9 +163,15 @@ fn neon_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -177,3 +183,124 @@ fn neon_xv36_lane_order_per_pixel_y_and_u() { "neon xv36 SIMD vs scalar diverges (u16 RGB) — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test: probes the `bswap_u16x8_if_be` gate +/// added in `b7fb9d3` (PR #86) at the SIMD layer. Existing per-backend tests +/// use `BE=false`; existing dispatcher BE-vs-LE tests use `use_simd=false`, +/// so the SIMD endian gate is otherwise untested. +/// +/// Builds an LE pseudo-random buffer, byte-swaps every u16 to obtain the +/// equivalent BE-encoded buffer, then asserts that: +/// SIMD(LE) == SIMD(BE) +/// for every output variant (u8 RGB/RGBA, u16 RGB/RGBA, luma u8/u16). +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_xv36_be_le_simd_parity() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. 
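// Concretely (worked example, not part of the test): for the sample 0x1234,
//     u16::to_le_bytes(0x1234) == [0x34, 0x12]   // LE wire bytes
//     u16::to_be_bytes(0x1234) == [0x12, 0x34]   // BE wire bytes
// and `from_ne_bytes` then reinterprets those bytes in host order — exactly the
// bytes the SIMD loads will see in memory, on either kind of host.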
The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_xv36(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + // u8 RGB / RGBA + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "neon xv36 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w}) — endian gate broken" + ); + } + + // u16 RGB / RGBA + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "neon xv36 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w}) — endian gate broken" + ); + } + + // luma u8 + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + xv36_to_luma_row::(&le, &mut out_le, w); + xv36_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon xv36 BE-vs-LE SIMD parity failed (luma u8, w={w}) — endian gate broken" + ); + } + + // luma u16 + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + xv36_to_luma_u16_row::(&le, &mut out_le, w); + xv36_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon xv36 BE-vs-LE SIMD parity failed (luma u16, w={w}) — endian gate broken" + ); + } + } +} diff --git a/src/row/arch/neon/v410.rs b/src/row/arch/neon/v410.rs index ec06d822..9d266750 100644 --- a/src/row/arch/neon/v410.rs +++ b/src/row/arch/neon/v410.rs @@ -16,6 +16,12 @@ //! for `chroma_i16x8` / `scale_y`. Only the low 4 lanes carry valid //! data; the high 4 are don't-care. //! +//! ## BE support (``) +//! +//! When `BE = true`, each loaded `uint32x4_t` is byte-swapped via +//! `bswap_u32x4_if_be::` before field extraction. The scalar tail +//! also forwards `BE`. +//! //! ## Tail //! //! `width % 4` remaining pixels fall through to `scalar::v410_*`. @@ -29,7 +35,7 @@ use crate::{ColorMatrix, row::scalar}; /// NEON V410 → packed u8 RGB or RGBA. /// -/// Byte-identical to `scalar::v410_to_rgb_or_rgba_row::`. +/// Byte-identical to `scalar::v410_to_rgb_or_rgba_row::`. 
/// /// # Safety /// @@ -38,7 +44,7 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -71,8 +77,8 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 4 <= width { - // Load 4 V410 words. - let words = vld1q_u32(packed.as_ptr().add(x)); + // Load 4 V410 words; byte-swap each u32 for BE wire format. + let words = bswap_u32x4_if_be::(vld1q_u32(packed.as_ptr().add(x))); // Extract U (bits 9:0), Y (bits 19:10), V (bits 29:20). let u_u32 = vandq_u32(words, mask); @@ -140,7 +146,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -150,7 +162,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// NEON V410 → packed native-depth u16 RGB or RGBA (low-bit-packed at /// 10-bit). /// -/// Byte-identical to `scalar::v410_to_rgb_u16_or_rgba_u16_row::`. +/// Byte-identical to `scalar::v410_to_rgb_u16_or_rgba_u16_row::`. /// /// # Safety /// @@ -159,7 +171,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -196,7 +208,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let words = vld1q_u32(packed.as_ptr().add(x)); + let words = bswap_u32x4_if_be::(vld1q_u32(packed.as_ptr().add(x))); let u_u32 = vandq_u32(words, mask); let y_u32 = vandq_u32(vshrq_n_u32::<10>(words), mask); @@ -253,7 +265,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -268,7 +280,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// NEON V410 → u8 luma. Y is `(word >> 10) & 0x3FF`, then `>> 2`. /// -/// Byte-identical to `scalar::v410_to_luma_row`. +/// Byte-identical to `scalar::v410_to_luma_row::`. /// /// # Safety /// @@ -277,7 +289,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -285,7 +301,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mask = vdupq_n_u32(0x3FF); let mut x = 0usize; while x + 4 <= width { - let words = vld1q_u32(packed.as_ptr().add(x)); + let words = bswap_u32x4_if_be::(vld1q_u32(packed.as_ptr().add(x))); // Y field: bits 19:10 → shift right 10, mask to 10-bit. let y_u32 = vandq_u32(vshrq_n_u32::<10>(words), mask); // Narrow u32→u16, then >> 2, then narrow u16→u8. 
@@ -301,7 +317,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi x += 4; } if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -310,7 +326,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// NEON V410 → u16 luma (low-bit-packed at 10-bit). /// -/// Byte-identical to `scalar::v410_to_luma_u16_row`. +/// Byte-identical to `scalar::v410_to_luma_u16_row::`. /// /// # Safety /// @@ -319,7 +335,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -327,7 +347,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mask = vdupq_n_u32(0x3FF); let mut x = 0usize; while x + 4 <= width { - let words = vld1q_u32(packed.as_ptr().add(x)); + let words = bswap_u32x4_if_be::(vld1q_u32(packed.as_ptr().add(x))); let y_u32 = vandq_u32(vshrq_n_u32::<10>(words), mask); // Narrow u32→u16 (values ≤ 1023, no saturation needed). let y_u16 = vmovn_u32(y_u32); @@ -338,7 +358,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width x += 4; } if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/neon/xv36.rs b/src/row/arch/neon/xv36.rs index 7ab03379..6e7b31ed 100644 --- a/src/row/arch/neon/xv36.rs +++ b/src/row/arch/neon/xv36.rs @@ -18,6 +18,10 @@ //! fit in i16, so `scale_y` is used (not `scale_y_u16_to_i16`). //! The Q15 pipeline uses i32 chroma (`chroma_i16x8`) at BITS=12. //! +//! For BE wire format (`BE = true`), each deinterleaved `uint16x8_t` +//! channel is byte-swapped via `bswap_u16x8_if_be::` after the +//! `vld4q_u16` call. +//! //! ## Tail //! //! `width % 8` remaining pixels fall through to `scalar::xv36_*`. @@ -31,7 +35,7 @@ use crate::{ColorMatrix, row::scalar}; /// NEON XV36 → packed u8 RGB or RGBA. /// -/// Byte-identical to `scalar::xv36_to_rgb_or_rgba_row::`. +/// Byte-identical to `scalar::xv36_to_rgb_or_rgba_row::`. /// /// # Safety /// @@ -40,7 +44,7 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -74,11 +78,16 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( // Load 8 XV36 quadruples (8 × 4 × u16 = 64 bytes). // vld4q_u16 deinterleaves: .0=U8, .1=Y8, .2=V8, .3=A8 (padding). let q = vld4q_u16(packed.as_ptr().add(x * 4)); + // Apply BE byte-swap per-channel if needed. + let u_raw = bswap_u16x8_if_be::(q.0); + let y_raw = bswap_u16x8_if_be::(q.1); + let v_raw = bswap_u16x8_if_be::(q.2); + // q.3 (A) is padding — discarded (no swap needed). + // Right-shift by 4 to drop the 4 padding LSBs → 12-bit range [0, 4095]. 
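// Worked example (illustrative values): an XV36 sample holding the 12-bit value
// 0xABC is stored MSB-aligned as 0xABC0; `>> 4` recovers 0x0ABC for the 12-bit
// paths, and the u8 luma path's `>> 8` yields 0xAB.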
- let u_u16 = vshrq_n_u16::<4>(q.0); // 8 lanes of U - let y_u16 = vshrq_n_u16::<4>(q.1); // 8 lanes of Y - let v_u16 = vshrq_n_u16::<4>(q.2); // 8 lanes of V - // q.3 (A) is padding — discarded. + let u_u16 = vshrq_n_u16::<4>(u_raw); // 8 lanes of U + let y_u16 = vshrq_n_u16::<4>(y_raw); // 8 lanes of Y + let v_u16 = vshrq_n_u16::<4>(v_raw); // 8 lanes of V // Reinterpret as signed i16 (values ≤ 4095 < 32767, safe). let u_i16 = vreinterpretq_s16_u16(u_u16); @@ -133,7 +142,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -143,7 +158,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// NEON XV36 → packed native-depth u16 RGB or RGBA (low-bit-packed at /// 12-bit). /// -/// Byte-identical to `scalar::xv36_to_rgb_u16_or_rgba_u16_row::`. +/// Byte-identical to `scalar::xv36_to_rgb_u16_or_rgba_u16_row::`. /// /// # Safety /// @@ -152,7 +167,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -190,11 +205,15 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let q = vld4q_u16(packed.as_ptr().add(x * 4)); - let u_u16 = vshrq_n_u16::<4>(q.0); - let y_u16 = vshrq_n_u16::<4>(q.1); - let v_u16 = vshrq_n_u16::<4>(q.2); + let u_raw = bswap_u16x8_if_be::(q.0); + let y_raw = bswap_u16x8_if_be::(q.1); + let v_raw = bswap_u16x8_if_be::(q.2); // q.3 (A) is padding — discarded. + let u_u16 = vshrq_n_u16::<4>(u_raw); + let y_u16 = vshrq_n_u16::<4>(y_raw); + let v_u16 = vshrq_n_u16::<4>(v_raw); + let u_i16 = vreinterpretq_s16_u16(u_u16); let y_i16 = vreinterpretq_s16_u16(y_u16); let v_i16 = vreinterpretq_s16_u16(v_u16); @@ -239,7 +258,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -255,7 +274,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// NEON XV36 → u8 luma. Y is quadruple element 1; `>> 8` brings the /// 12-bit MSB-aligned sample to 8-bit (drops 4 padding LSBs + 4 more). /// -/// Byte-identical to `scalar::xv36_to_luma_row`. +/// Byte-identical to `scalar::xv36_to_luma_row::`. /// /// # Safety /// @@ -264,7 +283,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -272,15 +295,17 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { let q = vld4q_u16(packed.as_ptr().add(x * 4)); - // Y is q.1. Scalar does `packed[x*4+1] >> 8`; apply the same shift. + // Y is q.1. 
Apply BE byte-swap if needed before the shift. + let y_raw = bswap_u16x8_if_be::(q.1); + // Scalar does `packed[x*4+1] >> 8`; apply the same shift. // vshrn_n_u16::<8> narrows (u16 >> 8) → u8x8, handling 8 lanes. - let y_u8 = vshrn_n_u16::<8>(q.1); + let y_u8 = vshrn_n_u16::<8>(y_raw); vst1_u8(out.as_mut_ptr().add(x), y_u8); x += 8; } // Scalar tail. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -291,7 +316,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// element 1; `>> 4` drops the 4 padding LSBs to give a 12-bit value /// in `[0, 4095]`. /// -/// Byte-identical to `scalar::xv36_to_luma_u16_row`. +/// Byte-identical to `scalar::xv36_to_luma_u16_row::`. /// /// # Safety /// @@ -300,7 +325,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -308,14 +337,15 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { let q = vld4q_u16(packed.as_ptr().add(x * 4)); - // Y is q.1. Scalar does `packed[x*4+1] >> 4`. - let y_u16 = vshrq_n_u16::<4>(q.1); + // Y is q.1. Apply BE byte-swap if needed, then `>> 4`. + let y_raw = bswap_u16x8_if_be::(q.1); + let y_u16 = vshrq_n_u16::<4>(y_raw); vst1q_u16(out.as_mut_ptr().add(x), y_u16); x += 8; } // Scalar tail. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/wasm_simd128/ayuv64.rs b/src/row/arch/wasm_simd128/ayuv64.rs index 04465dfc..e10f01c1 100644 --- a/src/row/arch/wasm_simd128/ayuv64.rs +++ b/src/row/arch/wasm_simd128/ayuv64.rs @@ -44,7 +44,7 @@ use core::arch::wasm32::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper ------------------------------------------------ @@ -67,13 +67,14 @@ use crate::{ColorMatrix, row::scalar}; /// `simd128` must be enabled at compile time. #[inline] #[target_feature(enable = "simd128")] -unsafe fn deinterleave_ayuv64_8px(ptr: *const u16) -> (v128, v128, v128, v128) { +unsafe fn deinterleave_ayuv64_8px(ptr: *const u16) -> (v128, v128, v128, v128) { unsafe { // Load 4 × v128, each covering 2 pixels (8 × u16 = 16 bytes). - let raw0 = v128_load(ptr.cast()); // [A0,Y0,U0,V0, A1,Y1,U1,V1] - let raw1 = v128_load(ptr.add(8).cast()); // [A2,Y2,U2,V2, A3,Y3,U3,V3] - let raw2 = v128_load(ptr.add(16).cast()); // [A4,Y4,U4,V4, A5,Y5,U5,V5] - let raw3 = v128_load(ptr.add(24).cast()); // [A6,Y6,U6,V6, A7,Y7,U7,V7] + // For BE wire format, `load_endian_u16x8` byte-swaps each u16 lane. 
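// `endian::load_endian_u16x8` is defined elsewhere in the crate (not shown in
// this diff); one plausible shape, sketched here only to make the call sites
// readable (wasm32 is always little-endian, so BE wire data always needs the
// byte swap):
//
//     #[inline]
//     #[target_feature(enable = "simd128")]
//     unsafe fn load_endian_u16x8<const BE: bool>(p: *const u8) -> v128 {
//         let raw = unsafe { v128_load(p.cast()) };
//         if BE {
//             // swap the two bytes of every u16 lane
//             v128_or(u16x8_shl(raw, 8), u16x8_shr(raw, 8))
//         } else {
//             raw
//         }
//     }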
+ let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // [A0,Y0,U0,V0, A1,Y1,U1,V1] + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // [A2,Y2,U2,V2, A3,Y3,U3,V3] + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // [A4,Y4,U4,V4, A5,Y5,U5,V5] + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // [A6,Y6,U6,V6, A7,Y7,U7,V7] // Per-channel byte positions within a 2-pixel v128 (16 bytes): // A → bytes 0,1 (pixel n) and 8,9 (pixel n+1) @@ -148,7 +149,11 @@ unsafe fn deinterleave_ayuv64_8px(ptr: *const u16) -> (v128, v128, v128, v128) { /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -184,7 +189,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_lo_i16 = i16x8_sub(u_lo_u16, bias16_v); @@ -216,7 +221,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4 + 32)); let u_hi_i16 = i16x8_sub(u_hi_u16, bias16_v); let v_hi_i16 = i16x8_sub(v_hi_u16, bias16_v); @@ -278,7 +283,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -300,7 +305,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * 3`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgb_row( +pub(crate) unsafe fn ayuv64_to_rgb_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -308,7 +313,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -322,7 +327,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// 3. `out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -330,7 +335,7 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } @@ -355,7 +360,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -393,7 +402,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_i16 = i16x8_sub(u_u16, bias16); @@ -483,7 +492,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -505,7 +514,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row= width * 3` (u16 elements). 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -513,7 +522,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -527,7 +538,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// 3. `out.len() >= width * 4` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -535,7 +546,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -555,7 +568,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -563,8 +580,9 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid let mut x = 0usize; while x + 16 <= width { // Two deinterleaves for 8 pixels each. - let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64_8px(packed.as_ptr().add(x * 4)); - let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64_8px(packed.as_ptr().add(x * 4 + 32)); + let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64_8px::(packed.as_ptr().add(x * 4)); + let (_a_hi, y_hi, _u_hi, _v_hi) = + deinterleave_ayuv64_8px::(packed.as_ptr().add(x * 4 + 32)); // >> 8 to get u8 luma (high byte of each Y u16 sample). // Logical shift (u16x8_shr) — arithmetic shift (i16x8_shr) would @@ -581,7 +599,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -604,7 +622,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -612,8 +634,9 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] let mut x = 0usize; while x + 16 <= width { // Two deinterleaves for 8 pixels each. 
- let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64_8px(packed.as_ptr().add(x * 4)); - let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64_8px(packed.as_ptr().add(x * 4 + 32)); + let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64_8px::(packed.as_ptr().add(x * 4)); + let (_a_hi, y_hi, _u_hi, _v_hi) = + deinterleave_ayuv64_8px::(packed.as_ptr().add(x * 4 + 32)); // Direct copy — Y samples are 16-bit native (no shift needed). v128_store(luma_out.as_mut_ptr().add(x).cast(), y_lo); @@ -624,7 +647,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Scalar tail. if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/wasm_simd128/tests/ayuv64.rs b/src/row/arch/wasm_simd128/tests/ayuv64.rs index cb42ec13..3521aafc 100644 --- a/src/row/arch/wasm_simd128/tests/ayuv64.rs +++ b/src/row/arch/wasm_simd128/tests/ayuv64.rs @@ -21,9 +21,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -42,11 +44,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -60,9 +64,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "wasm ayuv64→luma diverges (width={width})"); } @@ -71,9 +75,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "wasm ayuv64→luma u16 diverges (width={width})"); } @@ -150,7 +154,7 @@ fn wasm_ayuv64_lane_order_high_bit_set_values() { // luma u8 high-byte extraction: 0x8001 >> 8 = 0x80 for every pixel let mut luma_u8 = std::vec![0u8; W]; unsafe { - ayuv64_to_luma_row(&packed, &mut luma_u8, W); + ayuv64_to_luma_row::(&packed, &mut luma_u8, W); } let expected_luma: std::vec::Vec = std::vec![0x80; W]; assert_eq!( @@ -161,7 +165,13 @@ fn wasm_ayuv64_lane_order_high_bit_set_values() { // u8 RGBA α depth-convert: 0x8000+n >> 8 = 0x80 for n in 0..16 (since n < 256) let mut rgba_u8 = std::vec![0u8; W * 4]; unsafe { - ayuv64_to_rgb_or_rgba_row::(&packed, &mut rgba_u8, W, ColorMatrix::Bt709, true); + ayuv64_to_rgb_or_rgba_row::( + &packed, + &mut rgba_u8, + W, + ColorMatrix::Bt709, + true, + ); } 
let alpha_out: std::vec::Vec = (0..W).map(|n| rgba_u8[n * 4 + 3]).collect(); let expected_alpha: std::vec::Vec = std::vec![0x80; W]; @@ -206,7 +216,7 @@ fn wasm_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no shift, no conversion). --- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=16u16).collect(); assert_eq!(luma_out, expected_luma, "wasm ayuv64→luma_u16 reorder bug"); @@ -215,7 +225,7 @@ fn wasm_ayuv64_lane_order_per_pixel_y_and_a() { // Use full_range=true so neutral chroma gives a well-defined Y output. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, diff --git a/src/row/arch/wasm_simd128/tests/v410.rs b/src/row/arch/wasm_simd128/tests/v410.rs index 0c5310a4..30f2272e 100644 --- a/src/row/arch/wasm_simd128/tests/v410.rs +++ b/src/row/arch/wasm_simd128/tests/v410.rs @@ -24,9 +24,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -38,9 +38,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -52,9 +52,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -66,9 +66,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -80,9 +80,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410_words(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { 
- v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 v410→luma diverges (width={width})"); } @@ -91,9 +91,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410_words(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 v410→luma u16 diverges (width={width})"); } @@ -197,7 +197,7 @@ fn wasm_simd128_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma_out = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma_out, W); + v410_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -209,9 +209,15 @@ fn wasm_simd128_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/wasm_simd128/tests/xv36.rs b/src/row/arch/wasm_simd128/tests/xv36.rs index dd8375ca..06f53ea4 100644 --- a/src/row/arch/wasm_simd128/tests/xv36.rs +++ b/src/row/arch/wasm_simd128/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "wasm xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "wasm xv36→luma u16 diverges (width={width})"); } @@ -171,7 
+171,7 @@ fn wasm_simd128_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -183,9 +183,15 @@ fn wasm_simd128_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/wasm_simd128/v410.rs b/src/row/arch/wasm_simd128/v410.rs index 24dead51..aa135e78 100644 --- a/src/row/arch/wasm_simd128/v410.rs +++ b/src/row/arch/wasm_simd128/v410.rs @@ -22,7 +22,7 @@ use core::arch::wasm32::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- u8 RGB / RGBA output ----------------------------------------------- @@ -38,7 +38,7 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -71,8 +71,8 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 4 <= width { - // Load 4 V410 words. - let words = v128_load(packed.as_ptr().add(x).cast()); + // Load 4 V410 words (with BE byte-swap if required). + let words = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); // Extract U (bits 9:0), Y (bits 19:10), V (bits 29:20). let u_i32 = v128_and(words, mask); @@ -152,7 +152,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -171,7 +177,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -208,7 +214,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let words = v128_load(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); let u_i32 = v128_and(words, mask); let y_i32 = v128_and(u32x4_shr(words, 10), mask); @@ -276,7 +282,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -300,7 +306,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. 
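The `endian::load_endian_u32x4` helper imported by the v410 kernels in this file is defined in the backend's `endian` module and does not appear in this hunk. A minimal sketch of the intended behaviour, assuming it is a plain `v128_load` plus a per-u32 byte reversal when the wire format is BE (wasm32 itself is always little-endian, so BE wire data always needs the swap); the name and signature below are illustrative only:

    use core::arch::wasm32::*;

    #[target_feature(enable = "simd128")]
    unsafe fn load_endian_u32x4_sketch<const BE: bool>(ptr: *const u8) -> v128 {
        let raw = v128_load(ptr.cast());
        if BE {
            // Reverse the four bytes inside each 32-bit lane.
            u8x16_swizzle(
                raw,
                u8x16(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12),
            )
        } else {
            raw
        }
    }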
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -310,7 +320,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mut x = 0usize; while x + 4 <= width { - let words = v128_load(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); // Y field: bits 19:10 → shift right 10, mask to 10-bit. let y_i32 = v128_and(u32x4_shr(words, 10), mask); // Narrow i32x4 → i16x8 (4 valid lo lanes + 4 zero hi lanes). @@ -327,7 +337,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi // Scalar tail — remaining < 4 pixels. if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -345,7 +355,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -355,7 +369,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mut x = 0usize; while x + 4 <= width { - let words = v128_load(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); let y_i32 = v128_and(u32x4_shr(words, 10), mask); // Narrow i32x4 → i16x8: 4 valid lo lanes (values ≤ 1023, no saturation). let y_i16 = i16x8_narrow_i32x4(y_i32, zero4); @@ -368,7 +382,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width // Scalar tail. if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/wasm_simd128/xv36.rs b/src/row/arch/wasm_simd128/xv36.rs index f34bde20..5e4770e1 100644 --- a/src/row/arch/wasm_simd128/xv36.rs +++ b/src/row/arch/wasm_simd128/xv36.rs @@ -39,7 +39,7 @@ use core::arch::wasm32::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; /// Deinterleave 8 XV36 pixels (4 × v128 = 64 bytes) into separate @@ -55,13 +55,14 @@ use crate::{ColorMatrix, row::scalar}; /// caller must `u16x8_shr(v, 4)` to drop the 4 padding LSBs. #[inline] #[target_feature(enable = "simd128")] -unsafe fn deinterleave_xv36_8px(ptr: *const u16) -> (v128, v128, v128) { +unsafe fn deinterleave_xv36_8px(ptr: *const u16) -> (v128, v128, v128) { unsafe { // Load 4 × v128, each covering 2 pixels. - let raw0 = v128_load(ptr.cast()); // [U0,Y0,V0,A0, U1,Y1,V1,A1] - let raw1 = v128_load(ptr.add(8).cast()); // [U2,Y2,V2,A2, U3,Y3,V3,A3] - let raw2 = v128_load(ptr.add(16).cast()); // [U4,Y4,V4,A4, U5,Y5,V5,A5] - let raw3 = v128_load(ptr.add(24).cast()); // [U6,Y6,V6,A6, U7,Y7,V7,A7] + // For BE wire format, `load_endian_u16x8` byte-swaps each u16 lane. 
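            // Presumably this reduces to a plain `v128_load` when `BE` is false and adds a
            // per-u16 byte swap (for example `u8x16_swizzle` with indices 1,0,3,2,…,15,14)
            // when `BE` is true; the helper comes from the backend's `endian` module.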
+ let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // [U0,Y0,V0,A0, U1,Y1,V1,A1] + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // [U2,Y2,V2,A2, U3,Y3,V3,A3] + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // [U4,Y4,V4,A4, U5,Y5,V5,A5] + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // [U6,Y6,V6,A6, U7,Y7,V7,A7] // Per-channel byte positions inside a 2-pixel v128: // U → bytes 0,1 (pixel n) and 8,9 (pixel n+1) @@ -137,7 +138,7 @@ unsafe fn deinterleave_xv36_8px(ptr: *const u16) -> (v128, v128, v128) { /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -169,7 +170,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let mut x = 0usize; while x + 8 <= width { // Deinterleave 8 XV36 pixels (64 bytes) into U/Y/V channels. - let (u_raw, y_raw, v_raw) = deinterleave_xv36_8px(packed.as_ptr().add(x * 4)); + let (u_raw, y_raw, v_raw) = deinterleave_xv36_8px::(packed.as_ptr().add(x * 4)); // Right-shift by 4 to drop the 4 padding LSBs → 12-bit [0, 4095]. // Values ≤ 4095 fit safely in i16. @@ -241,7 +242,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -261,7 +268,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -296,7 +303,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let (u_raw, y_raw, v_raw) = deinterleave_xv36_8px(packed.as_ptr().add(x * 4)); + let (u_raw, y_raw, v_raw) = deinterleave_xv36_8px::(packed.as_ptr().add(x * 4)); let u_i16 = u16x8_shr(u_raw, 4); let y_i16 = u16x8_shr(y_raw, 4); @@ -359,7 +366,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -384,7 +391,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -396,10 +407,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { let ptr = packed.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); // pixels 0,1 - let raw1 = v128_load(ptr.add(8).cast()); // pixels 2,3 - let raw2 = v128_load(ptr.add(16).cast()); // pixels 4,5 - let raw3 = v128_load(ptr.add(24).cast()); // pixels 6,7 + // For BE wire format, byte-swap each u16 before extracting Y via swizzle. 
+ let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // pixels 0,1 + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // pixels 2,3 + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // pixels 4,5 + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // pixels 6,7 // Extract Y from each pair → 2 u16 in low 4 bytes. let y0 = u8x16_swizzle(raw0, y_idx); // [Y0,Y1, 0..12] @@ -425,7 +437,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // Scalar tail. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -445,7 +457,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -455,10 +471,10 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { let ptr = packed.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = endian::load_endian_u16x8::(ptr as *const u8); + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); let y0 = u8x16_swizzle(raw0, y_idx); let y1 = u8x16_swizzle(raw1, y_idx); @@ -477,7 +493,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width // Scalar tail. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_avx2/ayuv64.rs b/src/row/arch/x86_avx2/ayuv64.rs index c052f9c1..d3616d10 100644 --- a/src/row/arch/x86_avx2/ayuv64.rs +++ b/src/row/arch/x86_avx2/ayuv64.rs @@ -73,7 +73,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper (16 pixels / 64 u16 / 128 bytes) --------------- @@ -112,7 +112,9 @@ use crate::{ColorMatrix, row::scalar}; /// elements). Caller's `target_feature` must include AVX2. #[inline] #[target_feature(enable = "avx2")] -unsafe fn deinterleave_ayuv64_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i, __m256i) { +unsafe fn deinterleave_ayuv64_16px_avx2( + ptr: *const u16, +) -> (__m256i, __m256i, __m256i, __m256i) { // SAFETY: caller obligation — `ptr` has 128 bytes readable; AVX2 is // available. 
unsafe { @@ -123,10 +125,12 @@ unsafe fn deinterleave_ayuv64_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, _ // raw_c1 lo=A4..V4,A5..V5 hi=A6..V6,A7..V7 (pixels 4..7) // raw_c2 lo=A8..V8,A9..V9 hi=A10..V10,A11..V11 (pixels 8..11) // raw_c3 lo=A12..V12,A13..V13 hi=A14..V14,A15..V15 (pixels 12..15) - let raw_c0 = _mm256_loadu_si256(ptr.cast()); - let raw_c1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw_c2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw_c3 = _mm256_loadu_si256(ptr.add(48).cast()); + // + // For BE wire format, `load_endian_u16x16` byte-swaps each u16 lane. + let raw_c0 = endian::load_endian_u16x16::(ptr as *const u8); + let raw_c1 = endian::load_endian_u16x16::(ptr.add(16) as *const u8); + let raw_c2 = endian::load_endian_u16x16::(ptr.add(32) as *const u8); + let raw_c3 = endian::load_endian_u16x16::(ptr.add(48) as *const u8); // Reshape via cross-lane permute so each register holds the layout // the per-128-bit-lane cascade below expects: @@ -206,7 +210,11 @@ unsafe fn deinterleave_ayuv64_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, _ /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -246,7 +254,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4)); // Center chroma: subtract 32768 via wrapping i16 (-32768i16 == 0x8000). let u_lo_i16 = _mm256_sub_epi16(u_lo_u16, bias16_v); @@ -286,7 +294,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4 + 64)); let u_hi_i16 = _mm256_sub_epi16(u_hi_u16, bias16_v); let v_hi_i16 = _mm256_sub_epi16(v_hi_u16, bias16_v); @@ -362,7 +370,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -395,7 +403,11 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -433,7 +445,8 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_i16 = _mm256_sub_epi16(u_u16, bias16_v); @@ -569,7 +582,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -585,7 +598,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -593,7 +606,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -601,7 +614,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// to u8 via `>> 8`. 
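The four 256-bit loads above go through `endian::load_endian_u16x16::<BE>`. A hedged sketch of such a helper, assuming the host-aware gate means "swap only when the wire endianness differs from the host's"; the name and signature are illustrative, not this patch's API:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx2")]
    unsafe fn load_endian_u16x16_sketch<const BE: bool>(ptr: *const u8) -> __m256i {
        let raw = _mm256_loadu_si256(ptr.cast());
        if BE != cfg!(target_endian = "big") {
            // Swap the two bytes of every 16-bit lane; `_mm256_shuffle_epi8` works
            // within each 128-bit half, so the index pattern repeats in both halves.
            let idx = _mm256_set_epi8(
                14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, //
                14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
            );
            _mm256_shuffle_epi8(raw, idx)
        } else {
            raw
        }
    }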
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -609,14 +622,14 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } /// AVX2 AYUV64 → packed **RGB u16** (3 × u16 per pixel). Source α discarded. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -624,7 +637,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -632,7 +647,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// is written direct (no conversion). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -640,7 +655,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -664,7 +681,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -673,7 +694,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid let mut x = 0usize; while x + 16 <= width { // Deinterleave 16 pixels and discard A/U/V. - let (_a, y_vec, _u, _v) = deinterleave_ayuv64_16px_avx2(packed.as_ptr().add(x * 4)); + let (_a, y_vec, _u, _v) = deinterleave_ayuv64_16px_avx2::(packed.as_ptr().add(x * 4)); // y_vec lo lane = [Y0..Y7], hi lane = [Y8..Y15] (16 u16 in natural order). // `>> 8` → high byte of each Y u16. Then narrow to u8. @@ -693,7 +714,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -719,7 +740,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -727,7 +752,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] unsafe { let mut x = 0usize; while x + 16 <= width { - let (_a, y_vec, _u, _v) = deinterleave_ayuv64_16px_avx2(packed.as_ptr().add(x * 4)); + let (_a, y_vec, _u, _v) = deinterleave_ayuv64_16px_avx2::(packed.as_ptr().add(x * 4)); // Direct store — Y samples are 16-bit native, in natural pixel order. _mm256_storeu_si256(luma_out.as_mut_ptr().add(x).cast(), y_vec); x += 16; @@ -735,7 +760,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Scalar tail. if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/x86_avx2/tests/ayuv64.rs b/src/row/arch/x86_avx2/tests/ayuv64.rs index c9f89115..7344ddd9 100644 --- a/src/row/arch/x86_avx2/tests/ayuv64.rs +++ b/src/row/arch/x86_avx2/tests/ayuv64.rs @@ -22,9 +22,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -43,11 +45,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -61,9 +65,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 ayuv64→luma diverges (width={width})"); } @@ -72,9 +76,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 ayuv64→luma u16 diverges (width={width})"); } @@ -179,7 +183,7 @@ fn avx2_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no conversion). 
--- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -192,7 +196,7 @@ fn avx2_ayuv64_lane_order_per_pixel_y_and_a() { // a well-defined Y output. Matrix choice does not affect neutral chroma. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, @@ -208,3 +212,160 @@ fn avx2_ayuv64_lane_order_per_pixel_y_and_a() { "rgba_u16: A lane order incorrect — expected A[n]=2n+1, got {alpha_out:?}" ); } + +/// SIMD-level BE-vs-LE parity test for AYUV64 — exercises the host-aware +/// endian gate via `endian::load_endian_u16x*::` and covers the +/// source-α path explicitly via `(ALPHA=true, ALPHA_SRC=true)`. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_ayuv64_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_ayuv64(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (rgb, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (rgba+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (rgb u16, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (rgba u16+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut 
out_be = std::vec![0u8; w]; + unsafe { + ayuv64_to_luma_row::(&le, &mut out_le, w); + ayuv64_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + ayuv64_to_luma_u16_row::(&le, &mut out_le, w); + ayuv64_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx2/tests/v410.rs b/src/row/arch/x86_avx2/tests/v410.rs index 3acec1aa..a57e0d70 100644 --- a/src/row/arch/x86_avx2/tests/v410.rs +++ b/src/row/arch/x86_avx2/tests/v410.rs @@ -27,9 +27,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -44,9 +44,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -60,9 +60,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { - v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 v410→luma diverges (width={width})"); } @@ -71,9 +71,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 v410→luma u16 diverges (width={width})"); } @@ -183,7 +183,7 @@ fn avx2_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma, W); + v410_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "avx2 v410 luma reorder bug"); @@ -192,9 +192,15 @@ fn avx2_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, @@ -206,3 +212,121 @@ fn 
avx2_v410_lane_order_per_pixel_y_and_u() { "avx2 v410 SIMD vs scalar diverges — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u32x*::`. Existing per-backend tests use +/// `BE=false` only; existing dispatcher BE-vs-LE tests use `use_simd=false`, +/// so the SIMD endian gate is otherwise untested. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_v410_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_v410(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "avx2 v410 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "avx2 v410 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + v410_to_luma_row::(&le, &mut out_le, w); + v410_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 v410 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + v410_to_luma_u16_row::(&le, &mut out_le, w); + v410_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 v410 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx2/tests/xv36.rs b/src/row/arch/x86_avx2/tests/xv36.rs index be5cc011..7b703951 100644 --- a/src/row/arch/x86_avx2/tests/xv36.rs +++ b/src/row/arch/x86_avx2/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let 
bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 xv36→luma u16 diverges (width={width})"); } @@ -178,7 +178,7 @@ fn avx2_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "avx2 xv36 luma_u16 reorder bug"); @@ -187,9 +187,15 @@ fn avx2_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -201,3 +207,120 @@ fn avx2_xv36_lane_order_per_pixel_y_and_u() { "avx2 xv36 SIMD vs scalar diverges (u16 RGB) — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u16x*::` for AVX2. See sibling v410 test for +/// rationale. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_xv36_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). 
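    // Concretely: `0x1234u16.to_be_bytes()` is `[0x12, 0x34]` and `to_le_bytes()` is
    // `[0x34, 0x12]` on every host, whereas the in-memory bytes behind
    // `0x1234u16.swap_bytes()` (= 0x3412) depend on the host's own byte order.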
+ for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_xv36(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "avx2 xv36 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "avx2 xv36 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + xv36_to_luma_row::(&le, &mut out_le, w); + xv36_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 xv36 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + xv36_to_luma_u16_row::(&le, &mut out_le, w); + xv36_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 xv36 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx2/v410.rs b/src/row/arch/x86_avx2/v410.rs index 90144637..934cad98 100644 --- a/src/row/arch/x86_avx2/v410.rs +++ b/src/row/arch/x86_avx2/v410.rs @@ -28,7 +28,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Bit-extraction helper ----------------------------------------------- @@ -47,11 +47,11 @@ use crate::{ColorMatrix, row::scalar}; /// that `target_feature` includes AVX2. #[inline] #[target_feature(enable = "avx2")] -unsafe fn unpack_v410_8px_avx2(ptr: *const u32) -> (__m256i, __m256i, __m256i) { +unsafe fn unpack_v410_8px_avx2(ptr: *const u32) -> (__m256i, __m256i, __m256i) { // SAFETY: caller obligation — `ptr` has 32 bytes readable; AVX2 is // available. unsafe { - let words = _mm256_loadu_si256(ptr.cast()); + let words = endian::load_endian_u32x8::(ptr as *const u8); let mask = _mm256_set1_epi32(0x3FF); // Extract 10-bit fields in i32x8 (values ≤ 1023 — no overflow risk). @@ -88,7 +88,7 @@ unsafe fn unpack_v410_8px_avx2(ptr: *const u32) -> (__m256i, __m256i, __m256i) { /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -121,7 +121,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 8 <= width { // Unpack 8 V410 words → three i16x16 with valid data in lanes 0..7. - let (u_i16, y_i16, v_i16) = unpack_v410_8px_avx2(packed.as_ptr().add(x)); + let (u_i16, y_i16, v_i16) = unpack_v410_8px_avx2::(packed.as_ptr().add(x)); // Subtract chroma bias (512 for 10-bit). let u_sub = _mm256_sub_epi16(u_i16, bias_v); @@ -201,7 +201,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -222,7 +228,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -257,7 +263,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let (u_i16, y_i16, v_i16) = unpack_v410_8px_avx2(packed.as_ptr().add(x)); + let (u_i16, y_i16, v_i16) = unpack_v410_8px_avx2::(packed.as_ptr().add(x)); let u_sub = _mm256_sub_epi16(u_i16, bias_v); let v_sub = _mm256_sub_epi16(v_i16, bias_v); @@ -331,7 +337,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -357,7 +363,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -367,7 +377,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { - let words = _mm256_loadu_si256(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x8::(packed.as_ptr().add(x) as *const u8); // Y = (word >> 10) & 0x3FF for each i32 lane. let y_i32 = _mm256_and_si256(_mm256_srli_epi32::<10>(words), mask); @@ -390,7 +400,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi // Scalar tail — remaining < 8 pixels. if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -411,7 +421,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. 
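Each SIMD kernel in this patch hands its sub-vector-width remainder to the matching `scalar::*_row::<…, BE>` function, so the scalar kernels carry the same `BE` parameter. A minimal sketch of the per-word step a BE-aware scalar v410 kernel presumably performs; the helper name is invented for illustration:

    #[inline]
    fn v410_wire_word<const BE: bool>(word: u32) -> u32 {
        // `word` holds the raw wire bytes reinterpreted as a native-endian u32;
        // `from_be` / `from_le` recover the intended value on any host.
        if BE { u32::from_be(word) } else { u32::from_le(word) }
    }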
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -421,7 +435,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { - let words = _mm256_loadu_si256(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x8::(packed.as_ptr().add(x) as *const u8); // Y = (word >> 10) & 0x3FF for each i32 lane. let y_i32 = _mm256_and_si256(_mm256_srli_epi32::<10>(words), mask); @@ -440,7 +454,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width // Scalar tail — remaining < 8 pixels. if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_avx2/xv36.rs b/src/row/arch/x86_avx2/xv36.rs index a2eb686f..eb5b6440 100644 --- a/src/row/arch/x86_avx2/xv36.rs +++ b/src/row/arch/x86_avx2/xv36.rs @@ -49,7 +49,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper ------------------------------------------------ @@ -77,7 +77,7 @@ use crate::{ColorMatrix, row::scalar}; /// Caller's `target_feature` must include AVX2. #[inline] #[target_feature(enable = "avx2")] -unsafe fn unpack_xv36_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) { +unsafe fn unpack_xv36_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) { // SAFETY: caller obligation — `ptr` has 128 bytes readable; AVX2 is // available. unsafe { @@ -88,10 +88,12 @@ unsafe fn unpack_xv36_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) // raw_c1 = pixels 4..7 (lo=P4,P5; hi=P6,P7) // raw_c2 = pixels 8..11 (lo=P8,P9; hi=P10,P11) // raw_c3 = pixels 12..15 (lo=P12,P13; hi=P14,P15) - let raw_c0 = _mm256_loadu_si256(ptr.cast()); - let raw_c1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw_c2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw_c3 = _mm256_loadu_si256(ptr.add(48).cast()); + // + // For BE wire format, `load_endian_u16x16` byte-swaps each u16 lane. + let raw_c0 = endian::load_endian_u16x16::(ptr as *const u8); + let raw_c1 = endian::load_endian_u16x16::(ptr.add(16) as *const u8); + let raw_c2 = endian::load_endian_u16x16::(ptr.add(32) as *const u8); + let raw_c3 = endian::load_endian_u16x16::(ptr.add(48) as *const u8); // Reshape via cross-lane permute so each register holds the layout the // per-128-bit-lane cascade below expects: @@ -173,7 +175,7 @@ unsafe fn unpack_xv36_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -206,7 +208,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { // Deinterleave 16 XV36 quadruples → U, Y, V as i16x16 in [0, 4095]. - let (u_u16, y_u16, v_u16) = unpack_xv36_16px_avx2(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = unpack_xv36_16px_avx2::(packed.as_ptr().add(x * 4)); // Values ≤ 4095 < 32767 — safe to treat as signed i16. 
let u_i16 = u_u16; @@ -288,7 +290,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -309,7 +317,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -346,7 +354,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let (u_u16, y_u16, v_u16) = unpack_xv36_16px_avx2(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = unpack_xv36_16px_avx2::(packed.as_ptr().add(x * 4)); let u_i16 = u_u16; let y_i16 = y_u16; @@ -423,7 +431,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -452,7 +460,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -460,7 +472,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi unsafe { let mut x = 0usize; while x + 16 <= width { - let (_u_vec, y_vec, _v_vec) = unpack_xv36_16px_avx2(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = unpack_xv36_16px_avx2::(packed.as_ptr().add(x * 4)); // y_vec is already >> 4 (values in [0, 4095]). // Scalar does `packed[x*4+1] >> 8` — that is the MSB-aligned value >> 4 @@ -482,7 +494,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // Scalar tail — remaining < 16 pixels. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -504,7 +516,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -512,7 +528,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width unsafe { let mut x = 0usize; while x + 16 <= width { - let (_u_vec, y_vec, _v_vec) = unpack_xv36_16px_avx2(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = unpack_xv36_16px_avx2::(packed.as_ptr().add(x * 4)); // y_vec already has >> 4 applied (= 12-bit value in [0, 4095]). // Direct store of 16 × u16. 
@@ -523,7 +539,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width // Scalar tail — remaining < 16 pixels. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_avx512/ayuv64.rs b/src/row/arch/x86_avx512/ayuv64.rs index 3ad3bdee..e0b767a4 100644 --- a/src/row/arch/x86_avx512/ayuv64.rs +++ b/src/row/arch/x86_avx512/ayuv64.rs @@ -74,7 +74,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Static permute index tables ---------------------------------------- @@ -192,7 +192,9 @@ static COMBINE_IDX: [i16; 32] = [ /// AVX-512BW (BW provides `vpermt2w`). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn deinterleave_ayuv64_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i, __m512i) { +unsafe fn deinterleave_ayuv64_32px_avx512( + ptr: *const u16, +) -> (__m512i, __m512i, __m512i, __m512i) { // SAFETY: caller obligation — `ptr` has 256 bytes readable; AVX-512F + // AVX-512BW are available. unsafe { @@ -203,10 +205,12 @@ unsafe fn deinterleave_ayuv64_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, // v1 lanes: A8..V8,...,A15..V15 (pixels 8..15) // v2 lanes: A16..V16,...,A23..V23 (pixels 16..23) // v3 lanes: A24..V24,...,A31..V31 (pixels 24..31) - let v0 = _mm512_loadu_si512(ptr.cast()); - let v1 = _mm512_loadu_si512(ptr.add(32).cast()); - let v2 = _mm512_loadu_si512(ptr.add(64).cast()); - let v3 = _mm512_loadu_si512(ptr.add(96).cast()); + // + // For BE wire format, `load_endian_u16x32` byte-swaps each u16 lane. + let v0 = endian::load_endian_u16x32::(ptr as *const u8); + let v1 = endian::load_endian_u16x32::(ptr.add(32) as *const u8); + let v2 = endian::load_endian_u16x32::(ptr.add(64) as *const u8); + let v3 = endian::load_endian_u16x32::(ptr.add(96) as *const u8); // Load permute index tables. let a_idx = _mm512_loadu_si512(A_FROM_PAIR_IDX.as_ptr().cast()); @@ -259,7 +263,11 @@ unsafe fn deinterleave_ayuv64_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -300,7 +308,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4)); // Center chroma: subtract 32768 via wrapping i16 (-32768i16 == 0x8000). let u_lo_i16 = _mm512_sub_epi16(u_lo_u16, bias16_v); @@ -346,7 +354,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4 + 128)); let u_hi_i16 = _mm512_sub_epi16(u_hi_u16, bias16_v); let v_hi_i16 = _mm512_sub_epi16(v_hi_u16, bias16_v); @@ -430,7 +438,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -463,7 +471,11 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (u16 elements). 
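// Illustrative sketch (not part of this patch): scalar equivalent of the
// "center chroma via wrapping i16 subtraction" step used by the AYUV64
// kernels above. Subtracting i16::MIN (bit pattern 0x8000) from the chroma
// sample reinterpreted as i16 is exactly `chroma - 32768`, which is what the
// SIMD `_mm*_sub_epi16` against a -32768 bias lane computes.
fn center_chroma_i16(chroma: u16) -> i16 {
    (chroma as i16).wrapping_sub(i16::MIN)
}

#[test]
fn center_chroma_matches_wide_math() {
    for c in [0u16, 1, 32767, 32768, 40000, 65535] {
        assert_eq!(i32::from(center_chroma_i16(c)), i32::from(c) - 32768);
    }
}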
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -508,7 +520,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_i16 = _mm512_sub_epi16(u_u16, bias16_v); @@ -651,7 +663,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -667,7 +679,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -675,7 +687,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -683,7 +695,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// to u8 via `>> 8`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -691,14 +703,14 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } /// AVX-512 AYUV64 → packed **RGB u16** (3 × u16 per pixel). Source α discarded. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -706,7 +718,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -714,7 +728,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// is written direct (no conversion). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -722,7 +736,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -746,7 +762,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -758,7 +778,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid let mut x = 0usize; while x + 32 <= width { // Deinterleave 32 pixels and discard A/U/V. 
- let (_a, y_vec, _u, _v) = deinterleave_ayuv64_32px_avx512(packed.as_ptr().add(x * 4)); + let (_a, y_vec, _u, _v) = deinterleave_ayuv64_32px_avx512::(packed.as_ptr().add(x * 4)); // y_vec is i16x32 with Y0..Y31 (16-bit native). // `>> 8` → high byte of each Y u16. Then narrow to u8. @@ -778,7 +798,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -804,7 +824,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -812,7 +836,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] unsafe { let mut x = 0usize; while x + 32 <= width { - let (_a, y_vec, _u, _v) = deinterleave_ayuv64_32px_avx512(packed.as_ptr().add(x * 4)); + let (_a, y_vec, _u, _v) = deinterleave_ayuv64_32px_avx512::(packed.as_ptr().add(x * 4)); // Direct store — Y samples are 16-bit native, in natural pixel order. _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_vec); x += 32; @@ -820,7 +844,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Scalar tail. if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/x86_avx512/tests/ayuv64.rs b/src/row/arch/x86_avx512/tests/ayuv64.rs index a2b2872c..be84a70b 100644 --- a/src/row/arch/x86_avx512/tests/ayuv64.rs +++ b/src/row/arch/x86_avx512/tests/ayuv64.rs @@ -22,9 +22,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -43,11 +45,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -61,9 +65,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 ayuv64→luma diverges (width={width})"); } @@ -72,9 +76,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; 
width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 ayuv64→luma u16 diverges (width={width})"); } @@ -195,7 +199,7 @@ fn avx512_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no conversion). --- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -208,7 +212,7 @@ fn avx512_ayuv64_lane_order_per_pixel_y_and_a() { // a well-defined Y output. Matrix choice does not affect neutral chroma. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, @@ -224,3 +228,162 @@ fn avx512_ayuv64_lane_order_per_pixel_y_and_a() { "rgba_u16: A lane order incorrect — expected A[n]=2n+1, got {alpha_out:?}" ); } + +/// SIMD-level BE-vs-LE parity test for AYUV64 — exercises the host-aware +/// endian gate via `endian::load_endian_u16x*::` and covers the +/// source-α path explicitly via `(ALPHA=true, ALPHA_SRC=true)`. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_ayuv64_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). 
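// Illustrative sketch (not part of this patch): the buffer-construction
// pattern the parity tests rely on, shown for a single u16 sample.
// `to_le_bytes`/`to_be_bytes` fix the wire byte order regardless of host, and
// `from_ne_bytes` reinterprets those wire bytes as the in-memory lane the
// kernel actually loads — so on any host, the LE buffer driven through
// `BE = false` and the BE buffer driven through `BE = true` describe the same
// intended sample.
#[test]
fn wire_bytes_roundtrip_is_host_independent() {
    let intended: u16 = 0x1234;
    let le_wire = intended.to_le_bytes(); // [0x34, 0x12] on every host
    let be_wire = intended.to_be_bytes(); // [0x12, 0x34] on every host
    // In-memory lanes as the SIMD/scalar kernels see them.
    let le_lane = u16::from_ne_bytes(le_wire);
    let be_lane = u16::from_ne_bytes(be_wire);
    // Decoding each lane with the matching wire endianness recovers the
    // intended sample on both LE and BE hosts.
    assert_eq!(u16::from_le_bytes(le_lane.to_ne_bytes()), intended);
    assert_eq!(u16::from_be_bytes(be_lane.to_ne_bytes()), intended);
}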
+ for w in [15usize, 16, 33, 65] { + let intended = pseudo_random_ayuv64(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (rgb, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (rgba+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (rgb u16, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (rgba u16+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + ayuv64_to_luma_row::(&le, &mut out_le, w); + ayuv64_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + ayuv64_to_luma_u16_row::(&le, &mut out_le, w); + ayuv64_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx512/tests/v410.rs b/src/row/arch/x86_avx512/tests/v410.rs index d1b1e17c..928eb178 100644 --- a/src/row/arch/x86_avx512/tests/v410.rs +++ b/src/row/arch/x86_avx512/tests/v410.rs @@ -24,9 +24,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -38,9 +38,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = 
pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -52,9 +52,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -66,9 +66,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -80,9 +80,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410_words(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { - v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 v410→luma diverges (width={width})"); } @@ -91,9 +91,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410_words(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 v410→luma u16 diverges (width={width})"); } @@ -215,7 +215,7 @@ fn avx512_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma, W); + v410_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "avx512 v410 luma reorder bug"); @@ -224,9 +224,15 @@ fn avx512_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, @@ -238,3 +244,124 @@ fn avx512_v410_lane_order_per_pixel_y_and_u() { "avx512 v410 SIMD vs scalar diverges — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u32x*::` for AVX-512. 
Existing per-backend +/// tests use `BE=false` only; existing dispatcher BE-vs-LE tests use +/// `use_simd=false`, so the SIMD endian gate is otherwise untested. Widths +/// 33 and 65 cover ≥1 main-loop iteration plus a scalar tail. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_v410_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [15usize, 16, 33, 65] { + let intended = pseudo_random_v410_words(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "avx512 v410 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "avx512 v410 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + v410_to_luma_row::(&le, &mut out_le, w); + v410_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 v410 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + v410_to_luma_u16_row::(&le, &mut out_le, w); + v410_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 v410 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx512/tests/xv36.rs b/src/row/arch/x86_avx512/tests/xv36.rs index c73a4d1a..fd202408 100644 --- a/src/row/arch/x86_avx512/tests/xv36.rs +++ b/src/row/arch/x86_avx512/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; 
width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 xv36→luma u16 diverges (width={width})"); } @@ -190,7 +190,7 @@ fn avx512_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "avx512 xv36 luma_u16 reorder bug"); @@ -199,9 +199,15 @@ fn avx512_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -213,3 +219,122 @@ fn avx512_xv36_lane_order_per_pixel_y_and_u() { "avx512 xv36 SIMD vs scalar diverges (u16 RGB) — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u16x*::` for AVX-512. See sibling v410 test +/// for rationale. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_xv36_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. 
The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [15usize, 16, 33, 65] { + let intended = pseudo_random_xv36(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "avx512 xv36 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "avx512 xv36 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + xv36_to_luma_row::(&le, &mut out_le, w); + xv36_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 xv36 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + xv36_to_luma_u16_row::(&le, &mut out_le, w); + xv36_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 xv36 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx512/v410.rs b/src/row/arch/x86_avx512/v410.rs index 0862aaf8..79cddd11 100644 --- a/src/row/arch/x86_avx512/v410.rs +++ b/src/row/arch/x86_avx512/v410.rs @@ -30,7 +30,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Bit-extraction helper ----------------------------------------------- @@ -49,11 +49,11 @@ use crate::{ColorMatrix, row::scalar}; /// that `target_feature` includes AVX-512F + AVX-512BW. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn unpack_v410_16px_avx512(ptr: *const u32) -> (__m512i, __m512i, __m512i) { +unsafe fn unpack_v410_16px_avx512(ptr: *const u32) -> (__m512i, __m512i, __m512i) { // SAFETY: caller obligation — `ptr` has 64 bytes readable; AVX-512F // + AVX-512BW are available. 
unsafe { - let words = _mm512_loadu_si512(ptr.cast()); + let words = endian::load_endian_u32x16::(ptr as *const u8); let mask = _mm512_set1_epi32(0x3FF); // Extract 10-bit fields in i32x16 (values ≤ 1023 — no overflow risk). @@ -87,7 +87,7 @@ unsafe fn unpack_v410_16px_avx512(ptr: *const u32) -> (__m512i, __m512i, __m512i /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -121,7 +121,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { // Unpack 16 V410 words → three i16x32 with valid data in lanes 0..16. - let (u_i16, y_i16, v_i16) = unpack_v410_16px_avx512(packed.as_ptr().add(x)); + let (u_i16, y_i16, v_i16) = unpack_v410_16px_avx512::(packed.as_ptr().add(x)); // Subtract chroma bias (512 for 10-bit). let u_sub = _mm512_sub_epi16(u_i16, bias_v); @@ -201,7 +201,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -222,7 +228,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -258,7 +264,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let (u_i16, y_i16, v_i16) = unpack_v410_16px_avx512(packed.as_ptr().add(x)); + let (u_i16, y_i16, v_i16) = unpack_v410_16px_avx512::(packed.as_ptr().add(x)); let u_sub = _mm512_sub_epi16(u_i16, bias_v); let v_sub = _mm512_sub_epi16(v_i16, bias_v); @@ -332,7 +338,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -358,7 +364,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -369,7 +379,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mut x = 0usize; while x + 16 <= width { - let words = _mm512_loadu_si512(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x16::(packed.as_ptr().add(x) as *const u8); // Y = (word >> 10) & 0x3FF for each i32 lane. let y_i32 = _mm512_and_si512(_mm512_srli_epi32::<10>(words), mask); @@ -393,7 +403,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi // Scalar tail — remaining < 16 pixels. 
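// Illustrative sketch (not part of this patch): scalar view of the 10-bit
// field extraction the v410 kernels perform on each 32-bit word. Only the Y
// position (bits 10..19, i.e. `(word >> 10) & 0x3FF`) is stated in this
// patch; placing U in bits 0..9 and V in bits 20..29 (top two bits unused)
// is an assumption about the packing.
fn unpack_v410_word(word: u32) -> (u16, u16, u16) {
    let u = (word & 0x3FF) as u16; // assumed: bits 0..9
    let y = ((word >> 10) & 0x3FF) as u16; // bits 10..19, as in the kernels
    let v = ((word >> 20) & 0x3FF) as u16; // assumed: bits 20..29
    (u, y, v)
}

#[test]
fn unpack_v410_word_example() {
    let word = (300u32 << 20) | (200 << 10) | 100;
    assert_eq!(unpack_v410_word(word), (100, 200, 300));
}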
if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -414,7 +424,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -424,7 +438,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mut x = 0usize; while x + 16 <= width { - let words = _mm512_loadu_si512(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x16::(packed.as_ptr().add(x) as *const u8); // Y = (word >> 10) & 0x3FF for each i32 lane. let y_i32 = _mm512_and_si512(_mm512_srli_epi32::<10>(words), mask); @@ -442,7 +456,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width // Scalar tail — remaining < 16 pixels. if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_avx512/xv36.rs b/src/row/arch/x86_avx512/xv36.rs index e0833228..fa50208d 100644 --- a/src/row/arch/x86_avx512/xv36.rs +++ b/src/row/arch/x86_avx512/xv36.rs @@ -40,7 +40,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Static permute index tables ----------------------------------------- @@ -145,15 +145,16 @@ static COMBINE_IDX: [i16; 32] = [ /// `vpermt2w` — the u16 cross-vector permute). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn unpack_xv36_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i) { +unsafe fn unpack_xv36_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i) { // SAFETY: caller obligation — `ptr` has 256 bytes readable; AVX-512F + // AVX-512BW are available. unsafe { // Load 4 × __m512i (32 pixels × 4 u16 channels = 128 u16 = 256 bytes). - let v0 = _mm512_loadu_si512(ptr.cast()); // pixels 0.. 7 - let v1 = _mm512_loadu_si512(ptr.add(32).cast()); // pixels 8..15 - let v2 = _mm512_loadu_si512(ptr.add(64).cast()); // pixels 16..23 - let v3 = _mm512_loadu_si512(ptr.add(96).cast()); // pixels 24..31 + // For BE wire format, `load_endian_u16x32` byte-swaps each u16 lane. + let v0 = endian::load_endian_u16x32::(ptr as *const u8); // pixels 0.. 7 + let v1 = endian::load_endian_u16x32::(ptr.add(32) as *const u8); // pixels 8..15 + let v2 = endian::load_endian_u16x32::(ptr.add(64) as *const u8); // pixels 16..23 + let v3 = endian::load_endian_u16x32::(ptr.add(96) as *const u8); // pixels 24..31 // Load permute index tables. let uv_idx = _mm512_loadu_si512(UV_FROM_PAIR_IDX.as_ptr().cast()); @@ -201,7 +202,7 @@ unsafe fn unpack_xv36_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -235,7 +236,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let mut x = 0usize; while x + 32 <= width { // Deinterleave 32 XV36 quadruples → U, Y, V as i16x32 in [0, 4095]. - let (u_u16, y_u16, v_u16) = unpack_xv36_32px_avx512(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = unpack_xv36_32px_avx512::(packed.as_ptr().add(x * 4)); // Values ≤ 4095 < 32767 — safe to treat as signed i16. let u_i16 = u_u16; @@ -317,7 +318,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -338,7 +345,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -376,7 +383,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { - let (u_u16, y_u16, v_u16) = unpack_xv36_32px_avx512(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = unpack_xv36_32px_avx512::(packed.as_ptr().add(x * 4)); let u_i16 = u_u16; let y_i16 = y_u16; @@ -437,7 +444,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -466,7 +473,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -477,7 +488,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi let mut x = 0usize; while x + 32 <= width { - let (_u_vec, y_vec, _v_vec) = unpack_xv36_32px_avx512(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = unpack_xv36_32px_avx512::(packed.as_ptr().add(x * 4)); // y_vec is already >> 4 (values in [0, 4095]). // Scalar does `packed[x*4+1] >> 8` — that is MSB-aligned >> 4 to get @@ -495,7 +506,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // Scalar tail — remaining < 32 pixels. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -517,7 +528,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -525,7 +540,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width unsafe { let mut x = 0usize; while x + 32 <= width { - let (_u_vec, y_vec, _v_vec) = unpack_xv36_32px_avx512(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = unpack_xv36_32px_avx512::(packed.as_ptr().add(x * 4)); // y_vec already has >> 4 applied (= 12-bit value in [0, 4095]). // Direct store of 32 × u16. @@ -536,7 +551,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width // Scalar tail — remaining < 32 pixels. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_sse41/ayuv64.rs b/src/row/arch/x86_sse41/ayuv64.rs index 561a3207..f6b11298 100644 --- a/src/row/arch/x86_sse41/ayuv64.rs +++ b/src/row/arch/x86_sse41/ayuv64.rs @@ -55,7 +55,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper ------------------------------------------------ @@ -74,13 +74,16 @@ use crate::{ColorMatrix, row::scalar}; /// Caller's `target_feature` must include SSE4.1. #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn deinterleave_ayuv64(ptr: *const u16) -> (__m128i, __m128i, __m128i, __m128i) { +unsafe fn deinterleave_ayuv64( + ptr: *const u16, +) -> (__m128i, __m128i, __m128i, __m128i) { unsafe { // Load 4 × __m128i (8 pixels × 4 channels × u16 = 64 bytes). - let raw0 = _mm_loadu_si128(ptr.cast()); // A0,Y0,U0,V0, A1,Y1,U1,V1 - let raw1 = _mm_loadu_si128(ptr.add(8).cast()); // A2,Y2,U2,V2, A3,Y3,U3,V3 - let raw2 = _mm_loadu_si128(ptr.add(16).cast()); // A4,Y4,U4,V4, A5,Y5,U5,V5 - let raw3 = _mm_loadu_si128(ptr.add(24).cast()); // A6,Y6,U6,V6, A7,Y7,U7,V7 + // BE=true: byte-swap within each u16 lane to correct wire endianness. + let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // A0,Y0,U0,V0, A1,Y1,U1,V1 + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // A2,Y2,U2,V2, A3,Y3,U3,V3 + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // A4,Y4,U4,V4, A5,Y5,U5,V5 + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // A6,Y6,U6,V6, A7,Y7,U7,V7 // Level 1 unpack (pairs 0-1, pairs 2-3). let s1_lo = _mm_unpacklo_epi16(raw0, raw1); // A0,A2,Y0,Y2,U0,U2,V0,V2 @@ -123,7 +126,11 @@ unsafe fn deinterleave_ayuv64(ptr: *const u16) -> (__m128i, __m128i, __m128i, __ /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -159,7 +166,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4)); // Center chroma: subtract 32768 via wrapping i16. 
let u_lo_i16 = _mm_sub_epi16(u_lo_u16, bias16_v); @@ -200,7 +207,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4 + 32)); let u_hi_i16 = _mm_sub_epi16(u_hi_u16, bias16_v); let v_hi_i16 = _mm_sub_epi16(v_hi_u16, bias16_v); @@ -266,7 +273,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -297,7 +304,11 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -333,7 +344,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_i16 = _mm_sub_epi16(u_u16, bias16_v); @@ -460,7 +471,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -476,7 +487,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -484,7 +495,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -492,7 +503,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// to u8 via `>> 8`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -500,14 +511,14 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } /// SSE4.1 AYUV64 → packed **RGB u16** (3 × u16 per pixel). Source α discarded. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -515,7 +526,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -523,7 +536,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// is written direct (no conversion). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -531,7 +544,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -551,7 +566,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. 
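// Illustrative sketch (not part of this patch): the per-sample gate the
// scalar tails that now take a `BE` const parameter plausibly reduce to. The
// helper name is hypothetical; `u16::from_be`/`from_le` are identity when the
// host already matches the wire order and a byte swap otherwise — the same
// host-aware behaviour the SIMD paths get from their shuffle-based loads.
#[inline]
fn decode_wire_u16<const BE: bool>(lane: u16) -> u16 {
    if BE { u16::from_be(lane) } else { u16::from_le(lane) }
}

#[test]
fn decode_wire_u16_recovers_intended_sample() {
    let intended: u16 = 0x1234;
    let le_lane = u16::from_ne_bytes(intended.to_le_bytes());
    let be_lane = u16::from_ne_bytes(intended.to_be_bytes());
    assert_eq!(decode_wire_u16::<false>(le_lane), intended);
    assert_eq!(decode_wire_u16::<true>(be_lane), intended);
}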
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -559,8 +578,8 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid let mut x = 0usize; while x + 16 <= width { // Two deinterleaves for 8 pixels each. - let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64(packed.as_ptr().add(x * 4)); - let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64(packed.as_ptr().add(x * 4 + 32)); + let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64::(packed.as_ptr().add(x * 4)); + let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64::(packed.as_ptr().add(x * 4 + 32)); // >> 8 to get u8 luma (high byte of each Y u16 sample). let y_lo_shr = _mm_srli_epi16::<8>(y_lo); @@ -574,7 +593,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -597,7 +616,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -605,8 +628,8 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] let mut x = 0usize; while x + 16 <= width { // Two deinterleaves for 8 pixels each. - let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64(packed.as_ptr().add(x * 4)); - let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64(packed.as_ptr().add(x * 4 + 32)); + let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64::(packed.as_ptr().add(x * 4)); + let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64::(packed.as_ptr().add(x * 4 + 32)); // Direct copy — Y samples are 16-bit native (no shift needed). _mm_storeu_si128(luma_out.as_mut_ptr().add(x).cast(), y_lo); @@ -617,7 +640,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Scalar tail. 
if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/x86_sse41/tests/ayuv64.rs b/src/row/arch/x86_sse41/tests/ayuv64.rs index 1fad101f..50440a01 100644 --- a/src/row/arch/x86_sse41/tests/ayuv64.rs +++ b/src/row/arch/x86_sse41/tests/ayuv64.rs @@ -22,9 +22,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -43,11 +45,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -61,9 +65,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 ayuv64→luma diverges (width={width})"); } @@ -72,9 +76,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 ayuv64→luma u16 diverges (width={width})"); } @@ -167,7 +171,7 @@ fn sse41_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no conversion). --- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=16).map(|n| n as u16).collect(); assert_eq!( @@ -180,7 +184,7 @@ fn sse41_ayuv64_lane_order_per_pixel_y_and_a() { // a well-defined Y output. Matrix choice does not affect neutral chroma. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, @@ -196,3 +200,160 @@ fn sse41_ayuv64_lane_order_per_pixel_y_and_a() { "rgba_u16: A lane order incorrect — expected A[n]=2n+1, got {alpha_out:?}" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u16x8::` for AYUV64. Covers the source-α +/// path explicitly via `(ALPHA=true, ALPHA_SRC=true)`. 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_ayuv64_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_ayuv64(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (rgb, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (rgba+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (rgb u16, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (rgba u16+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + ayuv64_to_luma_row::(&le, &mut out_le, w); + ayuv64_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + ayuv64_to_luma_u16_row::(&le, &mut out_le, w); + ayuv64_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_sse41/tests/v410.rs b/src/row/arch/x86_sse41/tests/v410.rs index ee908301..07b9d911 100644 --- a/src/row/arch/x86_sse41/tests/v410.rs +++ b/src/row/arch/x86_sse41/tests/v410.rs @@ -27,9 +27,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = 
std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -44,9 +44,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -60,9 +60,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { - v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 v410→luma diverges (width={width})"); } @@ -71,9 +71,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 v410→luma u16 diverges (width={width})"); } @@ -181,7 +181,7 @@ fn sse41_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma, W); + v410_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "sse4.1 v410 luma reorder bug"); @@ -190,9 +190,15 @@ fn sse41_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, @@ -204,3 +210,125 @@ fn sse41_v410_lane_order_per_pixel_y_and_u() { "sse4.1 v410 SIMD vs scalar diverges — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u32x4::`. Existing per-backend tests use +/// `BE=false` only; existing dispatcher BE-vs-LE tests use `use_simd=false`, +/// so the SIMD endian gate is otherwise untested. +/// +/// On an LE host: +/// - SIMD `<…BE=false>` on LE input → no-swap path. +/// - SIMD `<…BE=true>` on BE input → swap path. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_v410_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. 
The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_v410(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "sse4.1 v410 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "sse4.1 v410 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + v410_to_luma_row::(&le, &mut out_le, w); + v410_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 v410 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + v410_to_luma_u16_row::(&le, &mut out_le, w); + v410_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 v410 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_sse41/tests/xv36.rs b/src/row/arch/x86_sse41/tests/xv36.rs index c6d18c9a..b4f1dd0d 100644 --- a/src/row/arch/x86_sse41/tests/xv36.rs +++ b/src/row/arch/x86_sse41/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, 
matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 xv36→luma u16 diverges (width={width})"); } @@ -176,7 +176,7 @@ fn sse41_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "sse4.1 xv36 luma_u16 reorder bug"); @@ -185,9 +185,15 @@ fn sse41_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -199,3 +205,119 @@ fn sse41_xv36_lane_order_per_pixel_y_and_u() { "sse4.1 xv36 SIMD vs scalar diverges (u16 RGB) — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u16x8::`. See sibling v410 test for rationale. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_xv36_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). 
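+    // Worked example (hypothetical sample 0x1234): `to_le_bytes` yields
+    // [0x34, 0x12] and `to_be_bytes` yields [0x12, 0x34]; re-packing each
+    // pair with `from_ne_bytes` gives whatever value the host reads from
+    // those bytes, so `le` always carries LE wire bytes and `be` always
+    // carries BE wire bytes, on both LE and BE hosts.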
+ for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_xv36(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "sse4.1 xv36 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "sse4.1 xv36 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + xv36_to_luma_row::(&le, &mut out_le, w); + xv36_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 xv36 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + xv36_to_luma_u16_row::(&le, &mut out_le, w); + xv36_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 xv36 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_sse41/v410.rs b/src/row/arch/x86_sse41/v410.rs index 0c38d996..adb0ce98 100644 --- a/src/row/arch/x86_sse41/v410.rs +++ b/src/row/arch/x86_sse41/v410.rs @@ -24,7 +24,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- u8 RGB / RGBA output (8 px/iter) ----------------------------------- @@ -40,7 +40,7 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -73,8 +73,8 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 8 <= width { // Load 8 V410 words = 8 pixels (32 bytes = 2 × __m128i). 
- let words_lo = _mm_loadu_si128(packed.as_ptr().add(x).cast()); - let words_hi = _mm_loadu_si128(packed.as_ptr().add(x + 4).cast()); + let words_lo = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); + let words_hi = endian::load_endian_u32x4::(packed.as_ptr().add(x + 4) as *const u8); // Extract U (bits 9:0), Y (bits 19:10), V (bits 29:20) for each // 4-pixel batch as i32x4. Values ≤ 1023 — safe for i16. @@ -154,7 +154,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -173,7 +179,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -208,8 +214,8 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let words_lo = _mm_loadu_si128(packed.as_ptr().add(x).cast()); - let words_hi = _mm_loadu_si128(packed.as_ptr().add(x + 4).cast()); + let words_lo = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); + let words_hi = endian::load_endian_u32x4::(packed.as_ptr().add(x + 4) as *const u8); let u_lo_i32 = _mm_and_si128(words_lo, mask); let y_lo_i32 = _mm_and_si128(_mm_srli_epi32::<10>(words_lo), mask); @@ -283,7 +289,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -307,7 +313,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -316,8 +326,8 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { - let words_lo = _mm_loadu_si128(packed.as_ptr().add(x).cast()); - let words_hi = _mm_loadu_si128(packed.as_ptr().add(x + 4).cast()); + let words_lo = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); + let words_hi = endian::load_endian_u32x4::(packed.as_ptr().add(x + 4) as *const u8); // Y = (word >> 10) & 0x3FF for each lane. let y_lo_i32 = _mm_and_si128(_mm_srli_epi32::<10>(words_lo), mask); @@ -340,7 +350,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi // Scalar tail. if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -359,7 +369,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -368,8 +382,8 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { - let words_lo = _mm_loadu_si128(packed.as_ptr().add(x).cast()); - let words_hi = _mm_loadu_si128(packed.as_ptr().add(x + 4).cast()); + let words_lo = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); + let words_hi = endian::load_endian_u32x4::(packed.as_ptr().add(x + 4) as *const u8); // Y = (word >> 10) & 0x3FF for each lane. let y_lo_i32 = _mm_and_si128(_mm_srli_epi32::<10>(words_lo), mask); @@ -386,7 +400,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width // Scalar tail. if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_sse41/xv36.rs b/src/row/arch/x86_sse41/xv36.rs index 7beaf02c..cda808ab 100644 --- a/src/row/arch/x86_sse41/xv36.rs +++ b/src/row/arch/x86_sse41/xv36.rs @@ -53,7 +53,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper ------------------------------------------------ @@ -63,6 +63,9 @@ use crate::{ColorMatrix, row::scalar}; /// `u16` samples **after** the 4-bit right-shift to drop padding LSBs. /// The A channel is computed but returned separately (caller discards it). /// +/// When `BE = true`, each 128-bit load is byte-swapped within every 2-byte +/// lane via `endian::load_endian_u16x8::`. +/// /// See module-level doc for the 3-level unpack cascade. /// /// # Safety @@ -71,13 +74,14 @@ use crate::{ColorMatrix, row::scalar}; /// Caller's `target_feature` must include SSE4.1. #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn deinterleave_xv36(ptr: *const u16) -> (__m128i, __m128i, __m128i) { +unsafe fn deinterleave_xv36(ptr: *const u16) -> (__m128i, __m128i, __m128i) { unsafe { // Load 4 × __m128i (8 pixels × 4 channels × u16 = 64 bytes). - let raw0 = _mm_loadu_si128(ptr.cast()); // U0,Y0,V0,A0,U1,Y1,V1,A1 - let raw1 = _mm_loadu_si128(ptr.add(8).cast()); // U2,Y2,V2,A2,U3,Y3,V3,A3 - let raw2 = _mm_loadu_si128(ptr.add(16).cast()); // U4,Y4,V4,A4,U5,Y5,V5,A5 - let raw3 = _mm_loadu_si128(ptr.add(24).cast()); // U6,Y6,V6,A6,U7,Y7,V7,A7 + // BE=true: byte-swap within each u16 lane to correct wire endianness. + let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // U0,Y0,V0,A0,U1,Y1,V1,A1 + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // U2,Y2,V2,A2,U3,Y3,V3,A3 + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // U4,Y4,V4,A4,U5,Y5,V5,A5 + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // U6,Y6,V6,A6,U7,Y7,V7,A7 // Level 1 unpack (pairs 0-1, pairs 2-3). let s1_lo = _mm_unpacklo_epi16(raw0, raw1); // U0,U2,Y0,Y2,V0,V2,A0,A2 @@ -119,7 +123,7 @@ unsafe fn deinterleave_xv36(ptr: *const u16) -> (__m128i, __m128i, __m128i) { /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -151,7 +155,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let mut x = 0usize; while x + 8 <= width { // Deinterleave 8 XV36 quadruples → U, Y, V as i16x8 in [0, 4095]. - let (u_u16, y_u16, v_u16) = deinterleave_xv36(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = deinterleave_xv36::(packed.as_ptr().add(x * 4)); // Reinterpret as signed i16 (values ≤ 4095 < 32767, safe). let u_i16 = u_u16; // u16 values fit in i16 range @@ -221,7 +225,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -240,7 +250,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -276,7 +286,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let (u_u16, y_u16, v_u16) = deinterleave_xv36(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = deinterleave_xv36::(packed.as_ptr().add(x * 4)); let u_i16 = u_u16; let y_i16 = y_u16; @@ -341,7 +351,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -367,7 +377,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -375,7 +389,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { // Deinterleave to get Y channel, then shift >> 8 for u8 luma. - let (_u_vec, y_vec, _v_vec) = deinterleave_xv36(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = deinterleave_xv36::(packed.as_ptr().add(x * 4)); // y_vec already has >> 4 applied (values in [0, 4095]). // Scalar does `packed[x*4+1] >> 8` — that's (MSB-aligned >> 4) >> 4 @@ -395,7 +409,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // Scalar tail. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -415,7 +429,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -423,7 +441,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { // Deinterleave — y_vec already has >> 4 applied (= 12-bit value). - let (_u_vec, y_vec, _v_vec) = deinterleave_xv36(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = deinterleave_xv36::(packed.as_ptr().add(x * 4)); // Direct store of 8 × u16 (12-bit values in low bits). _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_vec); @@ -433,7 +451,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width // Scalar tail. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/dispatch/ayuv64.rs b/src/row/dispatch/ayuv64.rs index 4c1442ac..0d757ee8 100644 --- a/src/row/dispatch/ayuv64.rs +++ b/src/row/dispatch/ayuv64.rs @@ -14,6 +14,9 @@ //! //! Source α is real (depth-converted u16 → u8 via `>> 8` for u8 RGBA; //! written direct as u16 for u16 RGBA). +//! +//! `be_input = true` selects the big-endian wire variant: each u16 +//! element is byte-swapped before unpacking, matching BE AYUV64 streams. #[cfg(any( target_arch = "aarch64", @@ -46,7 +49,8 @@ fn ayuv64_packed_elems(width: usize) -> usize { /// Converts one row of AYUV64 to packed RGB (u8). Source α is discarded. /// See [`scalar::ayuv64_to_rgb_or_rgba_row`] for pixel layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `be_input = true` selects +/// the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn ayuv64_to_rgb_row( packed: &[u16], @@ -55,6 +59,7 @@ pub fn ayuv64_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= ayuv64_packed_elems(width), @@ -70,31 +75,51 @@ pub fn ayuv64_to_rgb_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
- unsafe { arch::x86_sse41::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -102,12 +127,16 @@ pub fn ayuv64_to_rgb_row( } } - scalar::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); + } else { + scalar::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); + } } /// Converts one row of AYUV64 to packed RGBA (u8). The source A u16 at slot 0 /// of each pixel quadruple is depth-converted to u8 via `>> 8`. `use_simd = -/// false` forces scalar. +/// false` forces scalar. `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn ayuv64_to_rgba_row( packed: &[u16], @@ -116,6 +145,7 @@ pub fn ayuv64_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= ayuv64_packed_elems(width), @@ -131,31 +161,51 @@ pub fn ayuv64_to_rgba_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -163,11 +213,16 @@ pub fn ayuv64_to_rgba_row( } } - scalar::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); + } else { + scalar::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); + } } /// Converts one row of AYUV64 to packed `u16` RGB at native 16-bit /// depth. Source α is discarded. `use_simd = false` forces scalar. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn ayuv64_to_rgb_u16_row( packed: &[u16], @@ -176,6 +231,7 @@ pub fn ayuv64_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= ayuv64_packed_elems(width), @@ -191,31 +247,51 @@ pub fn ayuv64_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -223,12 +299,17 @@ pub fn ayuv64_to_rgb_u16_row( } } - scalar::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); + } else { + scalar::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); + } } /// Converts one row of AYUV64 to packed `u16` RGBA at native 16-bit /// depth. The source A u16 at slot 0 of each pixel quadruple is written /// direct (no conversion). `use_simd = false` forces scalar. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn ayuv64_to_rgba_u16_row( packed: &[u16], @@ -237,6 +318,7 @@ pub fn ayuv64_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= ayuv64_packed_elems(width), @@ -252,31 +334,51 @@ pub fn ayuv64_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -284,14 +386,24 @@ pub fn ayuv64_to_rgba_u16_row( } } - scalar::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + } else { + scalar::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + } } /// Extracts one row of 8-bit luma from a packed AYUV64 buffer. Y is at slot 1 /// of each pixel quadruple; extracted via `>> 8` (high byte). `use_simd = -/// false` forces scalar. +/// false` forces scalar. `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!( packed.len() >= ayuv64_packed_elems(width), "packed row too short" @@ -303,31 +415,51 @@ pub fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } }, @@ -335,14 +467,25 @@ pub fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use } } - scalar::ayuv64_to_luma_row(packed, luma_out, width); + if be_input { + scalar::ayuv64_to_luma_row::(packed, luma_out, width); + } else { + scalar::ayuv64_to_luma_row::(packed, luma_out, width); + } } /// Extracts one row of native-depth `u16` luma from a packed AYUV64 buffer. 
/// Y is at slot 1 of each pixel quadruple; written direct (no shift — 16-bit -/// native). `use_simd = false` forces scalar. +/// native). `use_simd = false` forces scalar. `be_input = true` selects the +/// big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!( packed.len() >= ayuv64_packed_elems(width), "packed row too short" @@ -354,31 +497,51 @@ pub fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, @@ -386,7 +549,11 @@ pub fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize } } - scalar::ayuv64_to_luma_u16_row(packed, luma_out, width); + if be_input { + scalar::ayuv64_to_luma_u16_row::(packed, luma_out, width); + } else { + scalar::ayuv64_to_luma_u16_row::(packed, luma_out, width); + } } #[cfg(all(test, feature = "std"))] @@ -404,6 +571,16 @@ mod tests { [a, y, u, v] } + /// Pack one AYUV64 pixel in big-endian wire format. + fn pack_ayuv64_be(a: u16, y: u16, u: u16, v: u16) -> [u16; 4] { + [ + a.swap_bytes(), + y.swap_bytes(), + u.swap_bytes(), + v.swap_bytes(), + ] + } + /// Build a `Vec` AYUV64 row of `width` pixels with neutral /// chroma (U=V=32768) and the given Y / alpha values. Any positive /// width is valid (4:4:4, no chroma subsampling). @@ -412,6 +589,12 @@ mod tests { (0..width).flat_map(|_| quad).collect() } + /// Build a `Vec` AYUV64 row in big-endian wire format. 
+ fn solid_ayuv64_be(width: usize, y: u16, a: u16) -> std::vec::Vec { + let quad = pack_ayuv64_be(a, y, 32768, 32768); + (0..width).flat_map(|_| quad).collect() + } + // ---- panic guards ------------------------------------------------------- #[test] @@ -420,7 +603,7 @@ mod tests { // packed buffer has only 2×4=8 u16 elements for width=4 (needs 4×4=16). let packed = [0u16; 8]; let mut rgb = [0u8; 4 * 3]; - ayuv64_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -428,7 +611,7 @@ mod tests { fn ayuv64_dispatcher_rejects_short_rgb_output() { let packed = [0u16; 4 * 4]; let mut rgb = [0u8; 2]; - ayuv64_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -436,7 +619,15 @@ mod tests { fn ayuv64_dispatcher_rejects_short_rgba_output() { let packed = [0u16; 4 * 4]; let mut rgba = [0u8; 2]; - ayuv64_to_rgba_row(&packed, &mut rgba, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgba_row( + &packed, + &mut rgba, + 4, + ColorMatrix::Bt709, + true, + false, + false, + ); } #[test] @@ -444,7 +635,7 @@ mod tests { fn ayuv64_dispatcher_rejects_short_rgb_u16_output() { let packed = [0u16; 4 * 4]; let mut rgb = [0u16; 2]; - ayuv64_to_rgb_u16_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgb_u16_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -452,7 +643,15 @@ mod tests { fn ayuv64_dispatcher_rejects_short_rgba_u16_output() { let packed = [0u16; 4 * 4]; let mut rgba = [0u16; 2]; - ayuv64_to_rgba_u16_row(&packed, &mut rgba, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgba_u16_row( + &packed, + &mut rgba, + 4, + ColorMatrix::Bt709, + true, + false, + false, + ); } #[test] @@ -460,7 +659,7 @@ mod tests { fn ayuv64_dispatcher_rejects_short_luma_output() { let packed = [0u16; 4 * 4]; let mut luma = [0u8; 2]; - ayuv64_to_luma_row(&packed, &mut luma, 4, false); + ayuv64_to_luma_row(&packed, &mut luma, 4, false, false); } #[test] @@ -468,7 +667,7 @@ mod tests { fn ayuv64_dispatcher_rejects_short_luma_u16_output() { let packed = [0u16; 4 * 4]; let mut luma = [0u16; 2]; - ayuv64_to_luma_u16_row(&packed, &mut luma, 4, false); + ayuv64_to_luma_u16_row(&packed, &mut luma, 4, false, false); } // ---- functional smoke --------------------------------------------------- @@ -482,7 +681,7 @@ mod tests { // u8 RGB — limited-range white → near 255 on every channel let mut rgb = [0u8; 8 * 3]; - ayuv64_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, false, false); + ayuv64_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, false, false, false); for px in rgb.chunks(3) { assert!( px[0].abs_diff(255) <= 2, @@ -495,7 +694,7 @@ mod tests { // u8 RGBA — source α 0xABCD >> 8 = 0xAB in output α channel let mut rgba = [0u8; 8 * 4]; - ayuv64_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, false, false); + ayuv64_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, false, false, false); for px in rgba.chunks(4) { assert!( px[0].abs_diff(255) <= 2, @@ -510,7 +709,15 @@ mod tests { // u16 RGB — near-white (65535 or close) let mut rgb_u16 = [0u16; 8 * 3]; - ayuv64_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, false, false); + ayuv64_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + false, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!( px[0].abs_diff(0xFFFF) <= 256, @@ -523,7 +730,15 @@ mod tests { // u16 RGBA 
— source α 0xABCD must appear direct in output α channel let mut rgba_u16 = [0u16; 8 * 4]; - ayuv64_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, false, false); + ayuv64_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + false, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!( px[3], 0xABCDu16, @@ -533,19 +748,90 @@ mod tests { // u8 luma — Y=60160; >> 8 = 234 (0xEA) let mut luma = [0u8; 8]; - ayuv64_to_luma_row(&buf, &mut luma, 8, false); + ayuv64_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (60160u16 >> 8) as u8, "luma u8 must be Y >> 8"); } // u16 luma — Y=60160 written direct let mut luma_u16 = [0u16; 8]; - ayuv64_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + ayuv64_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 60160u16, "luma u16 must be Y direct"); } } + #[test] + fn ayuv64_be_and_le_dispatchers_agree() { + // BE-encoded data decoded with be_input=true must produce the same + // output as LE-encoded data decoded with be_input=false. + // Use a distinctive Y/alpha so both output channels are exercised. + let le_buf = solid_ayuv64(8, 60160, 0xABCD); + let be_buf = solid_ayuv64_be(8, 60160, 0xABCD); + + // u8 RGB + let mut rgb_le = [0u8; 8 * 3]; + let mut rgb_be = [0u8; 8 * 3]; + ayuv64_to_rgb_row( + &le_buf, + &mut rgb_le, + 8, + ColorMatrix::Bt709, + false, + false, + false, + ); + ayuv64_to_rgb_row( + &be_buf, + &mut rgb_be, + 8, + ColorMatrix::Bt709, + false, + false, + true, + ); + assert_eq!( + rgb_le, rgb_be, + "LE and BE must produce identical RGB output" + ); + + // u8 luma + let mut luma_le = [0u8; 8]; + let mut luma_be = [0u8; 8]; + ayuv64_to_luma_row(&le_buf, &mut luma_le, 8, false, false); + ayuv64_to_luma_row(&be_buf, &mut luma_be, 8, false, true); + assert_eq!( + luma_le, luma_be, + "LE and BE must produce identical luma output" + ); + + // u16 RGBA — alpha pass-through must survive BE swap + let mut rgba_u16_le = [0u16; 8 * 4]; + let mut rgba_u16_be = [0u16; 8 * 4]; + ayuv64_to_rgba_u16_row( + &le_buf, + &mut rgba_u16_le, + 8, + ColorMatrix::Bt709, + false, + false, + false, + ); + ayuv64_to_rgba_u16_row( + &be_buf, + &mut rgba_u16_be, + 8, + ColorMatrix::Bt709, + false, + false, + true, + ); + assert_eq!( + rgba_u16_le, rgba_u16_be, + "LE and BE must produce identical u16 RGBA output" + ); + } + // ---- 32-bit width × 4 overflow guard ------------------------------------ // // AYUV64 packed rows consume `4 * width` u16 elements. Without the @@ -572,6 +858,7 @@ mod tests { ColorMatrix::Bt709, true, false, + false, ); } } diff --git a/src/row/dispatch/v410.rs b/src/row/dispatch/v410.rs index d51e50cc..436e695b 100644 --- a/src/row/dispatch/v410.rs +++ b/src/row/dispatch/v410.rs @@ -11,6 +11,10 @@ //! complete pixel as 10-bit U / Y / V packed into bits [9:0] / [19:10] //! / [29:20] with 2-bit padding at the top. Buffer length is `width` //! u32 elements — no even-width restriction, no width×2 scaling. +//! +//! `be_input = true` selects the big-endian wire variant: each u32 word +//! is byte-swapped before unpacking, matching QuickTime-style BE V410 +//! streams. #[cfg(any( target_arch = "aarch64", @@ -31,7 +35,8 @@ use crate::{ /// Converts one row of V410 to packed RGB (u8). See /// [`scalar::v410_to_rgb_or_rgba_row`] for word layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `be_input = true` selects +/// the big-endian wire variant. 
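+/// For example, decoding a big-endian wire row with SIMD enabled
+/// (illustrative values): `v410_to_rgb_row(&packed, &mut rgb, width,
+/// ColorMatrix::Bt709, false, true, true)` — limited range,
+/// `use_simd = true`, `be_input = true`.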
#[cfg_attr(not(tarpaulin), inline(always))] pub fn v410_to_rgb_row( packed: &[u32], @@ -40,6 +45,7 @@ pub fn v410_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!(packed.len() >= width, "packed row too short"); assert!( @@ -52,31 +58,51 @@ pub fn v410_to_rgb_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -84,10 +110,15 @@ pub fn v410_to_rgb_row( } } - scalar::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + } else { + scalar::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + } } /// Converts one row of V410 to packed RGBA (u8) with `α = 0xFF`. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn v410_to_rgba_row( packed: &[u32], @@ -96,6 +127,7 @@ pub fn v410_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!(packed.len() >= width, "packed row too short"); assert!( @@ -108,31 +140,51 @@ pub fn v410_to_rgba_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
- unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -140,11 +192,16 @@ pub fn v410_to_rgba_row( } } - scalar::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + } else { + scalar::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + } } /// Converts one row of V410 to packed `u16` RGB at native 10-bit -/// depth (low-bit-packed, `[0, 1023]`). +/// depth (low-bit-packed, `[0, 1023]`). `be_input = true` selects +/// the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn v410_to_rgb_u16_row( packed: &[u32], @@ -153,6 +210,7 @@ pub fn v410_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!(packed.len() >= width, "packed row too short"); assert!( @@ -165,31 +223,51 @@ pub fn v410_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. 
- unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -197,11 +275,20 @@ pub fn v410_to_rgb_u16_row( } } - scalar::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::v410_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); + } else { + scalar::v410_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); + } } /// Converts one row of V410 to packed `u16` RGBA at native 10-bit -/// depth with `α = 1023` (10-bit opaque maximum). +/// depth with `α = 1023` (10-bit opaque maximum). `be_input = true` +/// selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn v410_to_rgba_u16_row( packed: &[u32], @@ -210,6 +297,7 @@ pub fn v410_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!(packed.len() >= width, "packed row too short"); assert!( @@ -222,31 +310,51 @@ pub fn v410_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -254,13 +362,28 @@ pub fn v410_to_rgba_u16_row( } } - scalar::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::v410_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); + } else { + scalar::v410_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); + } } /// Extracts one row of 8-bit luma from a packed V410 buffer. /// Y values are downshifted from 10-bit to 8-bit via `>> 2`. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v410_to_luma_row(packed: &[u32], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn v410_to_luma_row( + packed: &[u32], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!(packed.len() >= width, "packed row too short"); assert!(luma_out.len() >= width, "luma_out row too short"); @@ -269,31 +392,51 @@ pub fn v410_to_luma_row(packed: &[u32], luma_out: &mut [u8], width: usize, use_s target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::v410_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::v410_to_luma_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::v410_to_luma_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
- unsafe { arch::x86_sse41::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::v410_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::v410_to_luma_row::(packed, luma_out, width); } + } return; } }, @@ -301,14 +444,25 @@ pub fn v410_to_luma_row(packed: &[u32], luma_out: &mut [u8], width: usize, use_s } } - scalar::v410_to_luma_row(packed, luma_out, width); + if be_input { + scalar::v410_to_luma_row::(packed, luma_out, width); + } else { + scalar::v410_to_luma_row::(packed, luma_out, width); + } } /// Extracts one row of native-depth `u16` luma from a packed V410 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in -/// its low 10 bits). +/// its low 10 bits). `be_input = true` selects the big-endian wire +/// variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v410_to_luma_u16_row(packed: &[u32], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn v410_to_luma_u16_row( + packed: &[u32], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!(packed.len() >= width, "packed row too short"); assert!(luma_out.len() >= width, "luma_out row too short"); @@ -317,31 +471,51 @@ pub fn v410_to_luma_u16_row(packed: &[u32], luma_out: &mut [u16], width: usize, target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
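                // Caller-side sketch with assumed values (not from the test suite):
                // extracting a native-depth luma row from a big-endian V410 buffer.
                // The signature mirrors the u8 variant above, plus the new trailing
                // `be_input` flag.
                //
                //     let packed_be = [0u32; 8];   // width u32 words in BE wire order
                //     let mut luma = [0u16; 8];    // width low-bit-packed 10-bit Y values
                //     // args: packed, luma_out, width, use_simd, be_input
                //     v410_to_luma_u16_row(&packed_be, &mut luma, 8, false, true);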
- unsafe { arch::wasm_simd128::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, @@ -349,7 +523,11 @@ pub fn v410_to_luma_u16_row(packed: &[u32], luma_out: &mut [u16], width: usize, } } - scalar::v410_to_luma_u16_row(packed, luma_out, width); + if be_input { + scalar::v410_to_luma_u16_row::(packed, luma_out, width); + } else { + scalar::v410_to_luma_u16_row::(packed, luma_out, width); + } } #[cfg(all(test, feature = "std"))] @@ -367,19 +545,29 @@ mod tests { (v << 20) | (y << 10) | u } + /// Pack one V410 word in big-endian wire format. + fn pack_v410_be(u: u32, y: u32, v: u32) -> u32 { + pack_v410(u, y, v).swap_bytes() + } + /// Build a `Vec` V410 row of `width` pixels with `(U, Y, V)` /// repeated. Any positive width is valid (4:4:4, no chroma subsampling). fn solid_v410(width: usize, u: u32, y: u32, v: u32) -> std::vec::Vec { (0..width).map(|_| pack_v410(u, y, v)).collect() } + /// Build a `Vec` V410 row in big-endian wire format. + fn solid_v410_be(width: usize, u: u32, y: u32, v: u32) -> std::vec::Vec { + (0..width).map(|_| pack_v410_be(u, y, v)).collect() + } + #[test] #[should_panic(expected = "packed row too short")] fn v410_dispatcher_rejects_short_packed() { // packed buffer has only 2 elements for width=4 (needs 4). let packed = [0u32; 2]; let mut rgb = [0u8; 4 * 3]; - v410_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + v410_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -388,7 +576,7 @@ mod tests { // output buffer has only 2 bytes for width=4 (needs 12). let packed = [0u32; 4]; let mut rgb = [0u8; 2]; - v410_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + v410_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -400,7 +588,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - v410_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + v410_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -409,7 +597,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - v410_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + v410_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -417,7 +605,15 @@ mod tests { // u16 RGB at native 10-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - v410_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + v410_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(512) <= 2); assert_eq!(px[0], px[1]); @@ -426,23 +622,75 @@ mod tests { // u16 RGBA — alpha = 1023 (10-bit opaque maximum). let mut rgba_u16 = [0u16; 8 * 4]; - v410_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + v410_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023); } // u8 luma — Y=512 → 128 after `>> 2`. 
let mut luma = [0u8; 8]; - v410_to_luma_row(&buf, &mut luma, 8, false); + v410_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (512u32 >> 2) as u8); } // u16 luma — low-packed 10-bit Y value. let mut luma_u16 = [0u16; 8]; - v410_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + v410_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 512); } } + + #[test] + fn v410_be_and_le_dispatchers_agree() { + // BE-encoded data decoded with be_input=true must produce the same + // output as LE-encoded data decoded with be_input=false. + let le_buf = solid_v410(8, 512, 512, 512); + let be_buf = solid_v410_be(8, 512, 512, 512); + + // u8 RGB + let mut rgb_le = [0u8; 8 * 3]; + let mut rgb_be = [0u8; 8 * 3]; + v410_to_rgb_row( + &le_buf, + &mut rgb_le, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); + v410_to_rgb_row( + &be_buf, + &mut rgb_be, + 8, + ColorMatrix::Bt709, + true, + false, + true, + ); + assert_eq!( + rgb_le, rgb_be, + "LE and BE must produce identical RGB output" + ); + + // u8 luma + let mut luma_le = [0u8; 8]; + let mut luma_be = [0u8; 8]; + v410_to_luma_row(&le_buf, &mut luma_le, 8, false, false); + v410_to_luma_row(&be_buf, &mut luma_be, 8, false, true); + assert_eq!( + luma_le, luma_be, + "LE and BE must produce identical luma output" + ); + } } diff --git a/src/row/dispatch/xv36.rs b/src/row/dispatch/xv36.rs index ec40d054..81d83b03 100644 --- a/src/row/dispatch/xv36.rs +++ b/src/row/dispatch/xv36.rs @@ -11,6 +11,9 @@ //! quadruple `[U, Y, V, A]` MSB-aligned at 12-bit (low 4 bits zero //! per sample). Buffer length is `width × 4` u16 elements — no //! even-width restriction. +//! +//! `be_input = true` selects the big-endian wire variant: each u16 +//! element is byte-swapped before unpacking, matching BE XV36 streams. #[cfg(any( target_arch = "aarch64", @@ -43,7 +46,8 @@ fn xv36_packed_elems(width: usize) -> usize { /// Converts one row of XV36 to packed RGB (u8). See /// [`scalar::xv36_to_rgb_or_rgba_row`] for pixel layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `be_input = true` selects +/// the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn xv36_to_rgb_row( packed: &[u16], @@ -52,6 +56,7 @@ pub fn xv36_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= xv36_packed_elems(width), @@ -67,31 +72,51 @@ pub fn xv36_to_rgb_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -99,10 +124,15 @@ pub fn xv36_to_rgb_row( } } - scalar::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + } else { + scalar::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + } } /// Converts one row of XV36 to packed RGBA (u8) with `α = 0xFF`. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn xv36_to_rgba_row( packed: &[u16], @@ -111,6 +141,7 @@ pub fn xv36_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= xv36_packed_elems(width), @@ -126,31 +157,51 @@ pub fn xv36_to_rgba_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
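                // Caller-side sketch with assumed values (not from the test suite):
                // decoding one 4-pixel row of big-endian XV36 to RGBA through this
                // dispatcher. The only change for BE streams is the trailing
                // `be_input = true`; buffer sizing (width * 4 u16 in, width * 4 u8 out)
                // is unchanged.
                //
                //     let packed_be = [0u16; 4 * 4];   // width * 4 BE-encoded u16 elements
                //     let mut rgba = [0u8; 4 * 4];     // width * 4 output bytes
                //     // args: packed, rgba_out, width, matrix, full_range, use_simd, be_input
                //     xv36_to_rgba_row(&packed_be, &mut rgba, 4, ColorMatrix::Bt709, true, true, true);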
- unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -158,11 +209,16 @@ pub fn xv36_to_rgba_row( } } - scalar::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + } else { + scalar::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + } } /// Converts one row of XV36 to packed `u16` RGB at native 12-bit -/// depth (low-bit-packed, `[0, 4095]`). +/// depth (low-bit-packed, `[0, 4095]`). `be_input = true` selects +/// the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn xv36_to_rgb_u16_row( packed: &[u16], @@ -171,6 +227,7 @@ pub fn xv36_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= xv36_packed_elems(width), @@ -186,31 +243,51 @@ pub fn xv36_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -218,11 +295,20 @@ pub fn xv36_to_rgb_u16_row( } } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); + } else { + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); + } } /// Converts one row of XV36 to packed `u16` RGBA at native 12-bit -/// depth with `α = 4095` (12-bit opaque maximum). +/// depth with `α = 4095` (12-bit opaque maximum). `be_input = true` +/// selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn xv36_to_rgba_u16_row( packed: &[u16], @@ -231,6 +317,7 @@ pub fn xv36_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= xv36_packed_elems(width), @@ -246,31 +333,51 @@ pub fn xv36_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -278,13 +385,28 @@ pub fn xv36_to_rgba_u16_row( } } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); + } else { + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); + } } /// Extracts one row of 8-bit luma from a packed XV36 buffer. /// Y values are downshifted from 12-bit MSB-aligned to 8-bit via `>> 8`. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn xv36_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn xv36_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!( packed.len() >= xv36_packed_elems(width), "packed row too short" @@ -296,31 +418,51 @@ pub fn xv36_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::xv36_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::xv36_to_luma_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::xv36_to_luma_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::xv36_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::xv36_to_luma_row::(packed, luma_out, width); } + } return; } }, @@ -328,14 +470,25 @@ pub fn xv36_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s } } - scalar::xv36_to_luma_row(packed, luma_out, width); + if be_input { + scalar::xv36_to_luma_row::(packed, luma_out, width); + } else { + scalar::xv36_to_luma_row::(packed, luma_out, width); + } } /// Extracts one row of native-depth `u16` luma from a packed XV36 /// buffer (low-bit-packed: each `u16` carries the 12-bit Y value in -/// its low 12 bits). +/// its low 12 bits). 
`be_input = true` selects the big-endian wire +/// variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn xv36_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn xv36_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!( packed.len() >= xv36_packed_elems(width), "packed row too short" @@ -347,31 +500,51 @@ pub fn xv36_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, @@ -379,7 +552,11 @@ pub fn xv36_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, } } - scalar::xv36_to_luma_u16_row(packed, luma_out, width); + if be_input { + scalar::xv36_to_luma_u16_row::(packed, luma_out, width); + } else { + scalar::xv36_to_luma_u16_row::(packed, luma_out, width); + } } #[cfg(all(test, feature = "std"))] @@ -398,6 +575,17 @@ mod tests { [u << 4, y << 4, v << 4, a << 4] } + /// Pack one XV36 pixel in big-endian wire format. + fn pack_xv36_be(u: u16, y: u16, v: u16, a: u16) -> [u16; 4] { + let le = pack_xv36(u, y, v, a); + [ + le[0].swap_bytes(), + le[1].swap_bytes(), + le[2].swap_bytes(), + le[3].swap_bytes(), + ] + } + /// Build a `Vec` XV36 row of `width` pixels with `(U, Y, V, A)` /// repeated. Any positive width is valid (4:4:4, no chroma subsampling). fn solid_xv36(width: usize, u: u16, y: u16, v: u16) -> std::vec::Vec { @@ -405,13 +593,19 @@ mod tests { (0..width).flat_map(|_| quad).collect() } + /// Build a `Vec` XV36 row in big-endian wire format. 
+ fn solid_xv36_be(width: usize, u: u16, y: u16, v: u16) -> std::vec::Vec { + let quad = pack_xv36_be(u, y, v, 0); + (0..width).flat_map(|_| quad).collect() + } + #[test] #[should_panic(expected = "packed row too short")] fn xv36_dispatcher_rejects_short_packed() { // packed buffer has only 2*4=8 u16 elements for width=4 (needs 4*4=16). let packed = [0u16; 8]; let mut rgb = [0u8; 4 * 3]; - xv36_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + xv36_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -420,7 +614,7 @@ mod tests { // output buffer has only 2 bytes for width=4 (needs 12). let packed = [0u16; 4 * 4]; let mut rgb = [0u8; 2]; - xv36_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + xv36_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -433,7 +627,7 @@ mod tests { // u8 RGB — full-range gray 0x800/0xFFF * 255 ≈ 128 let mut rgb = [0u8; 8 * 3]; - xv36_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + xv36_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 2); assert_eq!(px[0], px[1]); @@ -442,7 +636,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - xv36_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + xv36_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 2); assert_eq!(px[3], 0xFF); @@ -450,7 +644,15 @@ mod tests { // u16 RGB at native 12-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - xv36_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + xv36_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(0x800) <= 4); assert_eq!(px[0], px[1]); @@ -459,26 +661,78 @@ mod tests { // u16 RGBA — alpha = 4095 (12-bit opaque maximum). let mut rgba_u16 = [0u16; 8 * 4]; - xv36_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + xv36_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 0x0FFF); } // u8 luma — Y=0x800 MSB-aligned → u16 value 0x8000; >> 8 = 128. let mut luma = [0u8; 8]; - xv36_to_luma_row(&buf, &mut luma, 8, false); + xv36_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, 0x80u8); } // u16 luma — low-packed 12-bit Y value: 0x8000 >> 4 = 0x800. let mut luma_u16 = [0u16; 8]; - xv36_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + xv36_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 0x800); } } + #[test] + fn xv36_be_and_le_dispatchers_agree() { + // BE-encoded data decoded with be_input=true must produce the same + // output as LE-encoded data decoded with be_input=false. 
+ let le_buf = solid_xv36(8, 0x800, 0x800, 0x800); + let be_buf = solid_xv36_be(8, 0x800, 0x800, 0x800); + + // u8 RGB + let mut rgb_le = [0u8; 8 * 3]; + let mut rgb_be = [0u8; 8 * 3]; + xv36_to_rgb_row( + &le_buf, + &mut rgb_le, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); + xv36_to_rgb_row( + &be_buf, + &mut rgb_be, + 8, + ColorMatrix::Bt709, + true, + false, + true, + ); + assert_eq!( + rgb_le, rgb_be, + "LE and BE must produce identical RGB output" + ); + + // u8 luma + let mut luma_le = [0u8; 8]; + let mut luma_be = [0u8; 8]; + xv36_to_luma_row(&le_buf, &mut luma_le, 8, false, false); + xv36_to_luma_row(&be_buf, &mut luma_be, 8, false, true); + assert_eq!( + luma_le, luma_be, + "LE and BE must produce identical luma output" + ); + } + // ---- 32-bit width × 4 overflow guard ------------------------------------ // // XV36 packed rows consume `4 * width` u16 elements. Without the @@ -507,6 +761,7 @@ mod tests { ColorMatrix::Bt709, true, false, + false, ); } } diff --git a/src/row/scalar/ayuv64.rs b/src/row/scalar/ayuv64.rs index de7a9b7e..b5c4a12d 100644 --- a/src/row/scalar/ayuv64.rs +++ b/src/row/scalar/ayuv64.rs @@ -15,6 +15,10 @@ //! u8 output uses i32 chroma (output-range scaling keeps within i32); //! u16 output uses **i64 chroma** via `q15_chroma64` (Q15 sums //! overflow i32 at BITS=16/16, peak ~3.7e9 for BT.2020). +//! +//! `` — when `true`, each `u16` element of the input +//! slice is byte-swapped before use. This handles the `AYUV64BE` +//! big-endian wire format; `BE = false` is the standard LE path. use super::*; @@ -23,6 +27,8 @@ use super::*; /// Channel slot order: A at slot 0, Y at slot 1, U at slot 2, V at slot 3 /// (differs from VUYA which has A at slot 3). No right-shift needed — 16-bit /// native samples with no padding bits. +/// +/// Samples are passed already endian-corrected by the caller. #[cfg_attr(not(tarpaulin), inline(always))] const fn extract_ayuv64(quad: &[u16]) -> (i32, i32, i32, u16) { let a = quad[0]; // slot 0 = A (source α) @@ -32,6 +38,15 @@ const fn extract_ayuv64(quad: &[u16]) -> (i32, i32, i32, u16) { (u, y, v, a) // returned as (u, y, v, a) for consistency with chroma pipeline } +/// Load one AYUV64 u16 sample, applying a byte-swap for BE wire format +/// when `BE = true`. Uses target-endian aware `u16::from_be`/`u16::from_le` +/// — these are no-ops when the source byte order matches the host, so the +/// helper produces correct samples on both LE and BE hosts (e.g. s390x). +#[cfg_attr(not(tarpaulin), inline(always))] +fn load_ayuv64_u16(v: u16) -> u16 { + if BE { u16::from_be(v) } else { u16::from_le(v) } +} + // ---- u8 output (i32 chroma) -------------------------------------------- /// Shared scalar kernel for AYUV64 → packed **RGB** (`ALPHA = false, @@ -49,7 +64,11 @@ const fn extract_ayuv64(quad: &[u16]) -> (i32, i32, i32, u16) { /// - `packed.len() >= width * 4`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
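// Worked sizes for the contract above (illustration only, no new requirements):
// at `width = 2`, `packed` must hold at least 2 * 4 = 8 u16 samples (two
// A, Y, U, V quads) and `out` at least 2 * 3 = 6 bytes for RGB or
// 2 * 4 = 8 bytes for RGBA.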
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_rgb_or_rgba_row( +pub(crate) fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -70,7 +89,13 @@ pub(crate) fn ayuv64_to_rgb_or_rgba_row(packed[pix_off]), + load_ayuv64_u16::(packed[pix_off + 1]), + load_ayuv64_u16::(packed[pix_off + 2]), + load_ayuv64_u16::(packed[pix_off + 3]), + ]; + let (u, y, v, a) = extract_ayuv64(&quad); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); @@ -94,27 +119,27 @@ pub(crate) fn ayuv64_to_rgb_or_rgba_row( packed: &[u16], rgb_out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } /// Scalar AYUV64 → packed **RGBA** (4 bpp). The source A u16 at slot 0 /// of each pixel quadruple is depth-converted to u8 via `>> 8`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_rgba_row( +pub(crate) fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } // ---- u16 output (i64 chroma) ------------------------------------------- @@ -132,7 +157,11 @@ pub(crate) fn ayuv64_to_rgba_row( /// - `packed.len() >= width * 4`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -152,7 +181,13 @@ pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed[pix_off]), + load_ayuv64_u16::(packed[pix_off + 1]), + load_ayuv64_u16::(packed[pix_off + 2]), + load_ayuv64_u16::(packed[pix_off + 3]), + ]; + let (u, y, v, a) = extract_ayuv64(&quad); // q15_scale returns i32; q15_chroma64 handles the i32→i64 promotion // internally — pass i32 values directly (same API as q15_chroma). let u_d = q15_scale(u - bias, c_scale); @@ -180,27 +215,27 @@ pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } /// Scalar AYUV64 → packed **RGBA u16** (4 × u16 per pixel). The source A u16 /// at slot 0 of each pixel quadruple is written direct (no conversion). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_rgba_u16_row( +pub(crate) fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } // ---- Luma extraction --------------------------------------------------- @@ -208,22 +243,30 @@ pub(crate) fn ayuv64_to_rgba_u16_row( /// Copies only the Y u16 from each AYUV64 pixel into a u8 luma plane, /// extracting the high byte via `>> 8`. Y is at slot 1 of each quadruple. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); for x in 0..width { - luma_out[x] = (packed[x * 4 + 1] >> 8) as u8; + luma_out[x] = (load_ayuv64_u16::(packed[x * 4 + 1]) >> 8) as u8; } } /// Copies only the Y u16 from each AYUV64 pixel into a u16 luma plane, /// direct (no shift — 16-bit native). Y is at slot 1 of each quadruple. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); for x in 0..width { - luma_out[x] = packed[x * 4 + 1]; + luma_out[x] = load_ayuv64_u16::(packed[x * 4 + 1]); } } @@ -252,7 +295,7 @@ mod tests { .copied() .collect(); let mut out = vec![0u8; 4 * 3]; - ayuv64_to_rgb_row(&packed, &mut out, 4, ColorMatrix::Bt709, false); + ayuv64_to_rgb_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); // Black pixels → [0, 0, 0] assert_eq!(&out[0..3], &[0u8, 0, 0], "black pixel 0"); assert_eq!(&out[3..6], &[0u8, 0, 0], "black pixel 1"); @@ -269,7 +312,7 @@ mod tests { let p1 = pack_ayuv64(0x9999, 60160, 32768, 32768); let packed: Vec = [p0, p1].iter().flatten().copied().collect(); let mut out = vec![0u8; 2 * 4]; - ayuv64_to_rgba_row(&packed, &mut out, 2, ColorMatrix::Bt709, false); + ayuv64_to_rgba_row::(&packed, &mut out, 2, ColorMatrix::Bt709, false); assert_eq!(out[3], 0x42, "pixel 0 alpha (0x4242 >> 8 = 0x42)"); assert_eq!(out[7], 0x99, "pixel 1 alpha (0x9999 >> 8 = 0x99)"); } @@ -282,7 +325,7 @@ mod tests { let p1 = pack_ayuv64(0x9999, 60160, 32768, 32768); let packed: Vec = [p0, p1].iter().flatten().copied().collect(); let mut out = vec![0u16; 2 * 4]; - ayuv64_to_rgba_u16_row(&packed, &mut out, 2, ColorMatrix::Bt709, false); + ayuv64_to_rgba_u16_row::(&packed, &mut out, 2, ColorMatrix::Bt709, false); assert_eq!(out[3], 0x4242, "pixel 0 alpha u16 direct"); assert_eq!(out[7], 0x9999, "pixel 1 alpha u16 direct"); } @@ -295,7 +338,7 @@ mod tests { let p1 = pack_ayuv64(0, 0x4000, 0, 0); let packed: Vec = [p0, p1].iter().flatten().copied().collect(); let mut out = vec![0u8; 2]; - ayuv64_to_luma_row(&packed, &mut out, 2); + ayuv64_to_luma_row::(&packed, &mut out, 2); assert_eq!(&out[..], &[0xFFu8, 0x40], "luma u8 high-byte extract"); } @@ -307,7 +350,37 @@ mod tests { let p1 = pack_ayuv64(0, 0x1234, 0, 0); let packed: Vec = [p0, p1].iter().flatten().copied().collect(); let mut out = vec![0u16; 2]; - ayuv64_to_luma_u16_row(&packed, &mut out, 2); + ayuv64_to_luma_u16_row::(&packed, &mut out, 2); assert_eq!(&out[..], &[0xABCDu16, 0x1234], "luma u16 direct extract"); } + + #[test] + fn ayuv64_be_roundtrip_matches_byte_swapped_le() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent: on every host, `le` carries the + // intended values as LE-encoded bytes and `be` carries the same values as + // BE-encoded bytes. Both kernels should therefore decode to the same + // intended host-native values (and produce identical RGB output) on both + // LE and BE hosts. 
The earlier `swap_bytes` pattern only validated this + // on LE hosts and degenerated to equal-but-wrong on BE hosts. + let intended = pack_ayuv64(0xFFFF, 60160, 32768, 32768); + let le_bytes: Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le_buf: Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be_buf: Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let mut out_le = vec![0u8; 3]; + let mut out_be = vec![0u8; 3]; + ayuv64_to_rgb_row::(&le_buf, &mut out_le, 1, ColorMatrix::Bt709, false); + ayuv64_to_rgb_row::(&be_buf, &mut out_be, 1, ColorMatrix::Bt709, false); + assert_eq!( + out_le, out_be, + "AYUV64 BE scalar must match byte-swapped LE" + ); + } } diff --git a/src/row/scalar/v410.rs b/src/row/scalar/v410.rs index 26820811..51935d6c 100644 --- a/src/row/scalar/v410.rs +++ b/src/row/scalar/v410.rs @@ -3,6 +3,10 @@ //! 10-bit U / Y / V channels with 2-bit padding (see //! [`crate::frame::V410Frame`]). 4:4:4 means no chroma deinterleave //! step — each word yields a complete `(Y, U, V)` triple. +//! +//! `` — when `true`, each `u32` element of the input +//! slice is byte-swapped before field extraction. This handles the +//! `V410BE` big-endian wire format; `BE = false` is the standard LE path. use super::*; @@ -18,7 +22,7 @@ const fn extract_v410(word: u32) -> (i32, i32, i32) { // ---- u8 RGB / RGBA output ---------------------------------------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v410_to_rgb_or_rgba_row( +pub(crate) fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -33,7 +37,12 @@ pub(crate) fn v410_to_rgb_or_rgba_row( let (y_off, y_scale, c_scale) = range_params_n::<10, 8>(full_range); let bias = chroma_bias::<10>(); - for (x, &word) in packed[..width].iter().enumerate() { + for (x, &raw) in packed[..width].iter().enumerate() { + let word = if BE { + u32::from_be(raw) + } else { + u32::from_le(raw) + }; let (u, y, v) = extract_v410(word); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -55,7 +64,7 @@ pub(crate) fn v410_to_rgb_or_rgba_row( // ---- u16 RGB / RGBA native-depth output -------------------------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -72,7 +81,12 @@ pub(crate) fn v410_to_rgb_u16_or_rgba_u16_row( let alpha_max: u16 = 0x3FF; let out_max: i32 = 0x3FF; - for (x, &word) in packed[..width].iter().enumerate() { + for (x, &raw) in packed[..width].iter().enumerate() { + let word = if BE { + u32::from_be(raw) + } else { + u32::from_le(raw) + }; let (u, y, v) = extract_v410(word); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -94,11 +108,16 @@ pub(crate) fn v410_to_rgb_u16_or_rgba_u16_row( // ---- Luma (u8) — `>> 2` ------------------------------------------------ #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); for x in 0..width { - let y = (packed[x] >> 10) & 0x3FF; + let word = if BE { + u32::from_be(packed[x]) + } else { + u32::from_le(packed[x]) + }; + let y = (word >> 10) & 0x3FF; 
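        // Worked example: for pack_v410(200, 500, 800) the word is
        // (800 << 20) | (500 << 10) | 200, so `(word >> 10) & 0x3FF` recovers
        // Y = 500 and the `>> 2` below yields 125u8.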
out[x] = (y >> 2) as u8; } } @@ -106,11 +125,16 @@ pub(crate) fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { // ---- Luma (u16, low-bit-packed at 10-bit) ------------------------------ #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); for x in 0..width { - let y = (packed[x] >> 10) & 0x3FF; + let word = if BE { + u32::from_be(packed[x]) + } else { + u32::from_le(packed[x]) + }; + let y = (word >> 10) & 0x3FF; out[x] = y as u16; } } @@ -138,7 +162,7 @@ mod tests { pack_v410(512, 940, 512), ]; let mut out = vec![0u8; 4 * 3]; - v410_to_rgb_or_rgba_row::(&p, &mut out, 4, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&p, &mut out, 4, ColorMatrix::Bt709, false); // Two black pixels followed by two white pixels. assert_eq!(&out[0..3], &[0u8, 0, 0]); assert_eq!(&out[3..6], &[0u8, 0, 0]); @@ -150,7 +174,7 @@ mod tests { fn v410_known_pattern_rgba_alpha_max() { let p = vec![pack_v410(512, 940, 512)]; let mut out = vec![0u8; 4]; - v410_to_rgb_or_rgba_row::(&p, &mut out, 1, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&p, &mut out, 1, ColorMatrix::Bt709, false); assert_eq!(out[3], 0xFF); } @@ -161,7 +185,7 @@ mod tests { pack_v410(0, 0x100, 0), // Y = 0x100 ]; let mut out = vec![0u8; 2]; - v410_to_luma_row(&p, &mut out, 2); + v410_to_luma_row::(&p, &mut out, 2); // 0x3FF >> 2 = 0xFF; 0x100 >> 2 = 0x40. assert_eq!(&out[..], &[0xFFu8, 0x40]); } @@ -170,7 +194,7 @@ mod tests { fn v410_luma_extract_u16_low_bit_packed() { let p = vec![pack_v410(0, 0x3FF, 0), pack_v410(0, 0x123, 0)]; let mut out = vec![0u16; 2]; - v410_to_luma_u16_row(&p, &mut out, 2); + v410_to_luma_u16_row::(&p, &mut out, 2); assert_eq!(&out[..], &[0x3FFu16, 0x123]); } @@ -178,8 +202,35 @@ mod tests { fn v410_known_pattern_rgba_u16_alpha_max() { let p = vec![pack_v410(512, 940, 512)]; let mut out = vec![0u16; 4]; - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut out, 1, ColorMatrix::Bt709, false); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut out, 1, ColorMatrix::Bt709, false); // 10-bit alpha max is 0x3FF (low-bit-packed). assert_eq!(out[3], 0x3FF); } + + #[test] + fn v410_be_roundtrip_matches_byte_swapped_le() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent: on every host, `le` carries the + // intended value as LE-encoded bytes and `be` carries the same value as + // BE-encoded bytes. Both kernels should therefore decode to the same + // intended host-native value (and produce identical RGB output) on both + // LE and BE hosts. The earlier `swap_bytes` pattern only validated this + // on LE hosts and degenerated to equal-but-wrong on BE hosts. 
+ let intended = pack_v410(200, 500, 800); + let le_bytes: std::vec::Vec = intended.to_le_bytes().to_vec(); + let be_bytes: std::vec::Vec = intended.to_be_bytes().to_vec(); + let le_buf: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be_buf: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let mut out_le = vec![0u8; 3]; + let mut out_be = vec![0u8; 3]; + v410_to_rgb_or_rgba_row::(&le_buf, &mut out_le, 1, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be_buf, &mut out_be, 1, ColorMatrix::Bt709, false); + assert_eq!(out_le, out_be, "V410 BE scalar must match byte-swapped LE"); + } } diff --git a/src/row/scalar/xv36.rs b/src/row/scalar/xv36.rs index 61b82352..0c49e7ef 100644 --- a/src/row/scalar/xv36.rs +++ b/src/row/scalar/xv36.rs @@ -8,6 +8,10 @@ //! are independent. Bit extraction is `>> 4` to drop the 4 padding //! LSBs, then the standard Q15 chroma + Y pipeline at BITS=12 (i32 //! chroma — same depth as Y2xx at BITS=12). +//! +//! `` — when `true`, each `u16` element of the input +//! slice is byte-swapped before use. This handles the `XV36BE` +//! big-endian wire format; `BE = false` is the standard LE path. use super::*; @@ -15,6 +19,8 @@ use super::*; /// is padding and is not returned. Each channel is `>> 4` to drop /// the 4 padding LSBs, bringing the 12-bit MSB-aligned sample to /// the BITS=12 range `[0, 4095]`. +/// +/// Samples are passed already endian-corrected by the caller. #[cfg_attr(not(tarpaulin), inline(always))] const fn extract_xv36(quad: &[u16]) -> (i32, i32, i32) { let u = (quad[0] >> 4) as i32; @@ -24,10 +30,19 @@ const fn extract_xv36(quad: &[u16]) -> (i32, i32, i32) { (u, y, v) } +/// Load one XV36 u16 sample, applying a byte-swap for BE wire format +/// when `BE = true`. Uses target-endian aware `u16::from_be`/`u16::from_le` +/// — these are no-ops when the source byte order matches the host, so the +/// helper produces correct samples on both LE and BE hosts (e.g. s390x). 
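// The same property spelled out as a quick sketch (mirrors the helper below):
// `u16::from_be(v)` swaps bytes only on a little-endian host and
// `u16::from_le(v)` only on a big-endian host, so whichever wire format matches
// the host compiles down to a plain move.
//
//     let sample: u16 = 0x1234;
//     assert_eq!(u16::from_be(sample.to_be()), sample); // BE wire -> host, any host
//     assert_eq!(u16::from_le(sample.to_le()), sample); // LE wire -> host, any host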
+#[cfg_attr(not(tarpaulin), inline(always))] +fn load_xv36_u16(v: u16) -> u16 { + if BE { u16::from_be(v) } else { u16::from_le(v) } +} + // ---- u8 RGB / RGBA output ---------------------------------------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn xv36_to_rgb_or_rgba_row( +pub(crate) fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -43,8 +58,14 @@ pub(crate) fn xv36_to_rgb_or_rgba_row( let bias = chroma_bias::<12>(); for x in 0..width { - let quad = &packed[x * 4..x * 4 + 4]; - let (u, y, v) = extract_xv36(quad); + let base = x * 4; + let quad = [ + load_xv36_u16::(packed[base]), + load_xv36_u16::(packed[base + 1]), + load_xv36_u16::(packed[base + 2]), + load_xv36_u16::(packed[base + 3]), + ]; + let (u, y, v) = extract_xv36(&quad); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); @@ -65,7 +86,7 @@ pub(crate) fn xv36_to_rgb_or_rgba_row( // ---- u16 RGB / RGBA native-depth output -------------------------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -83,8 +104,14 @@ pub(crate) fn xv36_to_rgb_u16_or_rgba_u16_row( let out_max: i32 = 0x0FFF; for x in 0..width { - let quad = &packed[x * 4..x * 4 + 4]; - let (u, y, v) = extract_xv36(quad); + let base = x * 4; + let quad = [ + load_xv36_u16::(packed[base]), + load_xv36_u16::(packed[base + 1]), + load_xv36_u16::(packed[base + 2]), + load_xv36_u16::(packed[base + 3]), + ]; + let (u, y, v) = extract_xv36(&quad); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); @@ -105,11 +132,11 @@ pub(crate) fn xv36_to_rgb_u16_or_rgba_u16_row( // ---- Luma (u8) — `>> 8` (drops 4 padding bits + 4 LSBs) ---------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); for x in 0..width { - let y = packed[x * 4 + 1] >> 8; + let y = load_xv36_u16::(packed[x * 4 + 1]) >> 8; out[x] = y as u8; } } @@ -117,11 +144,11 @@ pub(crate) fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { // ---- Luma (u16, low-bit-packed at 12-bit) ------------------------------ #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); for x in 0..width { - let y = packed[x * 4 + 1] >> 4; + let y = load_xv36_u16::(packed[x * 4 + 1]) >> 4; out[x] = y; } } @@ -149,7 +176,7 @@ mod tests { let p3 = pack_xv36(2048, 3760, 2048, 0); let packed: Vec = [p0, p1, p2, p3].iter().flatten().copied().collect(); let mut out = vec![0u8; 4 * 3]; - xv36_to_rgb_or_rgba_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); assert_eq!(&out[0..3], &[0u8, 0, 0]); assert_eq!(&out[3..6], &[0u8, 0, 0]); assert_eq!(&out[6..9], &[255u8, 255, 255]); @@ -161,7 +188,7 @@ mod tests { let p = pack_xv36(2048, 3760, 2048, 0); let packed: Vec = p.into_iter().collect(); let mut out = vec![0u8; 
4]; - xv36_to_rgb_or_rgba_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); // X = padding; RGBA forces α=0xFF regardless of source A byte. assert_eq!(out[3], 0xFF); } @@ -172,7 +199,7 @@ mod tests { let p = pack_xv36(2048, 3760, 2048, 0xFFF); let packed: Vec = p.into_iter().collect(); let mut out = vec![0u8; 4]; - xv36_to_rgb_or_rgba_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); assert_eq!(out[3], 0xFF); } @@ -185,7 +212,7 @@ mod tests { .copied() .collect(); let mut out = vec![0u8; 2]; - xv36_to_luma_row(&packed, &mut out, 2); + xv36_to_luma_row::(&packed, &mut out, 2); assert_eq!(&out[..], &[0xFFu8, 0x10]); } @@ -197,7 +224,7 @@ mod tests { .copied() .collect(); let mut out = vec![0u16; 2]; - xv36_to_luma_u16_row(&packed, &mut out, 2); + xv36_to_luma_u16_row::(&packed, &mut out, 2); assert_eq!(&out[..], &[0xFFFu16, 0x123]); } @@ -206,8 +233,35 @@ mod tests { let p = pack_xv36(2048, 3760, 2048, 0xFFF); let packed: Vec = p.into_iter().collect(); let mut out = vec![0u16; 4]; - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); // 12-bit alpha max = 0x0FFF; X = padding so source A byte is ignored. assert_eq!(out[3], 0x0FFF); } + + #[test] + fn xv36_be_roundtrip_matches_byte_swapped_le() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent: on every host, `le` carries the + // intended values as LE-encoded bytes and `be` carries the same values as + // BE-encoded bytes. Both kernels should therefore decode to the same + // intended host-native values (and produce identical RGB output) on both + // LE and BE hosts. The earlier `swap_bytes` pattern only validated this + // on LE hosts and degenerated to equal-but-wrong on BE hosts. 
+ let intended = pack_xv36(1024, 2048, 512, 0); + let le_bytes: Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le_buf: Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be_buf: Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let mut out_le = vec![0u8; 3]; + let mut out_be = vec![0u8; 3]; + xv36_to_rgb_or_rgba_row::(&le_buf, &mut out_le, 1, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be_buf, &mut out_be, 1, ColorMatrix::Bt709, false); + assert_eq!(out_le, out_be, "XV36 BE scalar must match byte-swapped LE"); + } } diff --git a/src/sinker/mixed/ayuv64.rs b/src/sinker/mixed/ayuv64.rs index 6ab55456..afba3d5e 100644 --- a/src/sinker/mixed/ayuv64.rs +++ b/src/sinker/mixed/ayuv64.rs @@ -252,6 +252,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -263,6 +264,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -290,6 +292,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -308,6 +311,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -328,7 +332,15 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { w, h, )?; - ayuv64_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + ayuv64_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( @@ -370,6 +382,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); } @@ -393,6 +406,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); // Strategy A+ combo (u16): RGBA u16 also attached — derive from the @@ -429,6 +443,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); } diff --git a/src/sinker/mixed/tests/ayuv64.rs b/src/sinker/mixed/tests/ayuv64.rs index 12bcf366..3f0e4709 100644 --- a/src/sinker/mixed/tests/ayuv64.rs +++ b/src/sinker/mixed/tests/ayuv64.rs @@ -856,14 +856,14 @@ fn ayuv64_strategy_a_plus_matches_independent_kernel() { let row_off_packed = r * width * 4; let row_off_rgb = r * width * 3; let row_off_rgba = r * width * 4; - crate::row::scalar::ayuv64_to_rgb_row( + crate::row::scalar::ayuv64_to_rgb_row::( &packed[row_off_packed..row_off_packed + width * 4], &mut inline_rgb[row_off_rgb..row_off_rgb + width * 3], width, matrix, full_range, ); - crate::row::scalar::ayuv64_to_rgba_row( + crate::row::scalar::ayuv64_to_rgba_row::( &packed[row_off_packed..row_off_packed + width * 4], &mut inline_rgba[row_off_rgba..row_off_rgba + width * 4], width, @@ -931,14 +931,14 @@ fn ayuv64_strategy_a_plus_u16_matches_independent_kernel() { let row_off_packed = r * width * 4; let row_off_rgb = r * width * 3; let row_off_rgba = r * width * 4; - crate::row::scalar::ayuv64_to_rgb_u16_row( + crate::row::scalar::ayuv64_to_rgb_u16_row::( &packed[row_off_packed..row_off_packed + width * 4], &mut inline_rgb[row_off_rgb..row_off_rgb + width * 3], width, matrix, full_range, ); - crate::row::scalar::ayuv64_to_rgba_u16_row( + crate::row::scalar::ayuv64_to_rgba_u16_row::( &packed[row_off_packed..row_off_packed + width * 4], &mut 
inline_rgba[row_off_rgba..row_off_rgba + width * 4], width, diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 574e482c..de173084 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -330,7 +330,17 @@ fn rgbf16_simd_matches_scalar_with_random_input() { /// driven by miri on s390x / powerpc64; gating it out of miri (per the /// codex 4th-pass finding) would skip exactly the host where BE corruption /// would surface. +/// +/// Re-gated on miri because the fixture builder calls `half::f16::from_f32`, +/// which on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` that miri rejects. BE-host miri (s390x / +/// powerpc64) covers the byte-swap correctness via the f32 LE-encoded +/// regression tests in this module instead. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn rgbf16_sinker_le_encoded_frame_decodes_correctly() { let vals_f32 = [0.5f32, 1.5, -0.25, 100.0]; let intended: Vec = (0..16 * 4 * 3) diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index a8132480..7d1d9eb2 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -1024,10 +1024,17 @@ fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrpf16`]. /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1104,10 +1111,17 @@ fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrapf16`] (lossless RGBA /// pass-through, including the α plane). /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1178,10 +1192,17 @@ fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { /// `widen_f16_be_to_host_f32::` would interpret byte-swapped bits as /// host-native f16 and decode to wildly wrong f32 values. 
/// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1247,10 +1268,17 @@ fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { /// (`with_rgba_f32`, including the α plane). Exercises the four-plane f16 → /// f32 widen step — same bit-normalise-first contract as the no-α variant. /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1310,10 +1338,17 @@ fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { /// the kernel byte-swap a no-op on every host. Vacuous on LE; would catch /// the double-swap on BE. /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1595,10 +1630,17 @@ fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { /// `widen_and_scatter` helper, so this test guards against the /// post-widen routing flag being wrong). /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. 
+/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { let w = 15usize; let h = 3usize; diff --git a/src/sinker/mixed/v410.rs b/src/sinker/mixed/v410.rs index c1c865db..09adc483 100644 --- a/src/sinker/mixed/v410.rs +++ b/src/sinker/mixed/v410.rs @@ -201,6 +201,7 @@ impl PixelSink for MixedSinker<'_, V410> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 10-bit Y values at native depth (low-bit-packed @@ -211,6 +212,7 @@ impl PixelSink for MixedSinker<'_, V410> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -231,6 +233,7 @@ impl PixelSink for MixedSinker<'_, V410> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -251,6 +254,7 @@ impl PixelSink for MixedSinker<'_, V410> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -281,6 +285,7 @@ impl PixelSink for MixedSinker<'_, V410> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -297,7 +302,15 @@ impl PixelSink for MixedSinker<'_, V410> { w, h, )?; - v410_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + v410_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( diff --git a/src/sinker/mixed/xv36.rs b/src/sinker/mixed/xv36.rs index e773fab9..b7579d4e 100644 --- a/src/sinker/mixed/xv36.rs +++ b/src/sinker/mixed/xv36.rs @@ -209,6 +209,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 12-bit Y values at native depth (shift >> 4 @@ -219,6 +220,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -239,6 +241,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -259,6 +262,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -289,6 +293,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -305,7 +310,15 @@ impl PixelSink for MixedSinker<'_, Xv36> { w, h, )?; - xv36_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + xv36_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row(