diff --git a/src/row/arch/neon/alpha_extract.rs b/src/row/arch/neon/alpha_extract.rs
index 8b870dc2..ffb04e6a 100644
--- a/src/row/arch/neon/alpha_extract.rs
+++ b/src/row/arch/neon/alpha_extract.rs
@@ -241,7 +241,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
   }
 
   if x < width {
-    scalar::copy_alpha_plane_u16_to_u8::<BITS>(
+    // Scalar tail uses `BE = false`: this NEON helper does host-native u16
+    // loads (`vld1q_u16`), which match LE-on-disk only on LE hosts. The
+    // dispatcher routes the BE = true case directly to scalar (see
+    // `dispatch::alpha_extract`), so the SIMD path here is BE = false by
+    // construction.
+    scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
       &alpha[x..width],
       &mut rgba_out[x * 4..width * 4],
       width - x,
@@ -286,7 +291,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
   }
 
   if x < width {
-    scalar::copy_alpha_plane_u16::<BITS>(
+    // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above.
+    scalar::copy_alpha_plane_u16::<BITS, false>(
       &alpha[x..width],
       &mut rgba_out[x * 4..width * 4],
       width - x,
@@ -409,7 +415,8 @@ mod tests {
       pseudo_random_u8(&mut rgba_simd, 0xBABE);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -430,7 +437,8 @@ mod tests {
       pseudo_random_u8(&mut rgba_simd, 0x5EED);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -448,7 +456,8 @@ mod tests {
       pseudo_random_u16(&mut rgba_simd, 0xFADE);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
diff --git a/src/row/arch/neon/planar_gbr_high_bit.rs b/src/row/arch/neon/planar_gbr_high_bit.rs
index 44996068..0d839324 100644
--- a/src/row/arch/neon/planar_gbr_high_bit.rs
+++ b/src/row/arch/neon/planar_gbr_high_bit.rs
@@ -1,6 +1,7 @@
 //! NEON kernels for high-bit-depth planar GBR sources (Tier 10b).
 //!
-//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`.
+//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and
+//! `BE: bool` (endianness of the source u16 planes).
 //! Lane width: 8 pixels per iteration (`vld1q_u16` = 8 × u16).
 //! `vst3q_u16` / `vst4q_u16` do the 3-way / 4-way u16 interleave in a
 //! single hardware instruction. Scalar tails handle the remainder.
@@ -11,16 +12,27 @@
 //! using a negative-count vector shift (`vshlq_u16` with a negative
 //! shift), then narrowed with `vqmovn_u16` to u8x8. Two such halves are
 //! recombined with `vcombine_u8` before `vst3q_u8` / `vst4q_u8`.
+//!
+//! # Big-endian (`BE = true`) mode
+//!
+//! Every 8-pixel NEON load goes through `load_endian_u16x8::<BE>`
+//! (defined in `endian.rs`); when `BE = true` it applies a per-lane
+//! byte-swap via `vrev16q_u8`. The branch is resolved at
+//! monomorphisation — `BE = false` compiles to a plain `vld1q_u16`.
 
 use core::arch::aarch64::*;
 
 use crate::row::scalar;
 
+use super::endian::load_endian_u16x8;
+
 // ---- u8 output, 3-channel (RGB) -----------------------------------------
 
 /// NEON high-bit-depth G/B/R planar → packed `R, G, B` **bytes**.
 /// Downshifts each sample by `BITS - 8` and narrows to u8.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. NEON must be available (caller obligation).
@@ -28,7 +40,7 @@ use crate::row::scalar;
 /// 3. `rgb_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -48,9 +60,13 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-      let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
-      let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
+      let g_raw = load_endian_u16x8::<BE>(g.as_ptr().add(x).cast());
+      let b_raw = load_endian_u16x8::<BE>(b.as_ptr().add(x).cast());
+      let r_raw = load_endian_u16x8::<BE>(r.as_ptr().add(x).cast());
+
+      let g_v = vandq_u16(g_raw, mask_v);
+      let b_v = vandq_u16(b_raw, mask_v);
+      let r_v = vandq_u16(r_raw, mask_v);
 
       // Right-shift each 8-pixel vector by BITS-8, then narrow to u8x8.
       let r_sh = vqmovn_u16(vshlq_u16(r_v, shr));
@@ -70,7 +86,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -86,6 +102,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// NEON high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes**
 /// with constant opaque alpha (`0xFF`). Used by `Gbrp*` (no alpha plane).
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. NEON must be available (caller obligation).
@@ -93,7 +111,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -113,9 +131,13 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-      let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
-      let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
+      let g_raw = load_endian_u16x8::<BE>(g.as_ptr().add(x).cast());
+      let b_raw = load_endian_u16x8::<BE>(b.as_ptr().add(x).cast());
+      let r_raw = load_endian_u16x8::<BE>(r.as_ptr().add(x).cast());
+
+      let g_v = vandq_u16(g_raw, mask_v);
+      let b_v = vandq_u16(b_raw, mask_v);
+      let r_v = vandq_u16(r_raw, mask_v);
 
       let r_sh = vqmovn_u16(vshlq_u16(r_v, shr));
       let g_sh = vqmovn_u16(vshlq_u16(g_v, shr));
@@ -132,7 +154,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -148,6 +170,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// NEON high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**.
 /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. NEON must be available (caller obligation).
@@ -155,7 +179,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -176,10 +200,15 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-      let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
-      let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
-      let a_v = vandq_u16(vld1q_u16(a.as_ptr().add(x)), mask_v);
+      let g_raw = load_endian_u16x8::<BE>(g.as_ptr().add(x).cast());
+      let b_raw = load_endian_u16x8::<BE>(b.as_ptr().add(x).cast());
+      let r_raw = load_endian_u16x8::<BE>(r.as_ptr().add(x).cast());
+      let a_raw = load_endian_u16x8::<BE>(a.as_ptr().add(x).cast());
+
+      let g_v = vandq_u16(g_raw, mask_v);
+      let b_v = vandq_u16(b_raw, mask_v);
+      let r_v = vandq_u16(r_raw, mask_v);
+      let a_v = vandq_u16(a_raw, mask_v);
 
       let r_sh = vqmovn_u16(vshlq_u16(r_v, shr));
       let g_sh = vqmovn_u16(vshlq_u16(g_v, shr));
@@ -197,7 +226,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -214,6 +243,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// NEON high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples.
 /// Copies samples without shifting — output values in `[0, (1<<BITS)-1]`.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. NEON must be available (caller obligation).
@@ -221,7 +252,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// 3. `rgb_u16_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -238,16 +269,16 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
     let mask_v = vdupq_n_u16(((1u32 << BITS) - 1) as u16);
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
-      let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-      let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
+      let r_v = vandq_u16(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = vandq_u16(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = vandq_u16(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
       // vst3q_u16 stores 8×3 = 24 u16 interleaved as R,G,B per pixel.
       let triple = uint16x8x3_t(r_v, g_v, b_v);
       vst3q_u16(rgb_u16_out.as_mut_ptr().add(x * 3), triple);
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -263,6 +294,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// NEON high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples
 /// with constant opaque alpha `(1 << BITS) - 1`.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. NEON must be available (caller obligation).
@@ -270,7 +303,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -292,15 +325,15 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
-      let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-      let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
+      let r_v = vandq_u16(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = vandq_u16(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = vandq_u16(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
       let quad = uint16x8x4_t(r_v, g_v, b_v, opaque);
       vst4q_u16(rgba_u16_out.as_mut_ptr().add(x * 4), quad);
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -316,6 +349,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// NEON high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples.
 /// Alpha sourced from the `a` plane at native depth (no shift).
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. NEON must be available (caller obligation).
@@ -323,7 +358,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -345,16 +380,16 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
     let mask_v = vdupq_n_u16(((1u32 << BITS) - 1) as u16);
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
-      let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-      let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
-      let a_v = vandq_u16(vld1q_u16(a.as_ptr().add(x)), mask_v);
+      let r_v = vandq_u16(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = vandq_u16(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = vandq_u16(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
+      let a_v = vandq_u16(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v);
       let quad = uint16x8x4_t(r_v, g_v, b_v, a_v);
       vst4q_u16(rgba_u16_out.as_mut_ptr().add(x * 4), quad);
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_u16_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
diff --git a/src/row/arch/neon/tests/planar_gbr_high_bit.rs b/src/row/arch/neon/tests/planar_gbr_high_bit.rs
index 3f7762ba..0a9c3301 100644
--- a/src/row/arch/neon/tests/planar_gbr_high_bit.rs
+++ b/src/row/arch/neon/tests/planar_gbr_high_bit.rs
@@ -37,9 +37,9 @@ fn neon_gbr_to_rgb_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_neon = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -57,9 +57,9 @@ fn neon_gbr_to_rgb_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_neon = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -77,9 +77,9 @@ fn neon_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_neon = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -97,9 +97,9 @@ fn neon_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_neon = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -118,9 +118,9 @@ fn neon_gbra_to_rgba_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_neon = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -139,9 +139,9 @@ fn neon_gbra_to_rgba_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_neon = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_neon, w);
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -161,9 +161,9 @@ fn neon_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_neon = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -181,9 +181,9 @@ fn neon_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_neon = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -201,9 +201,9 @@ fn neon_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_neon = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -221,9 +221,9 @@ fn neon_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_neon = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -242,9 +242,9 @@ fn neon_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_neon = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -263,9 +263,9 @@ fn neon_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_neon = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_neon, w);
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -286,9 +286,9 @@ fn neon_gbr_to_rgb_high_bit_upper_bits_masked_bits10() {
     let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_neon = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -307,9 +307,9 @@ fn neon_gbra_to_rgba_high_bit_upper_bits_masked_bits10() {
     let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_neon = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -327,9 +327,9 @@ fn neon_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() {
     let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_neon = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -348,9 +348,9 @@ fn neon_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
     let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_neon = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w);
     }
     assert_eq!(
       out_scalar, out_neon,
@@ -358,3 +358,300 @@ fn neon_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
     );
   }
 }
+
+// ---- BE parity: NEON<BITS, true> output must match NEON<BITS, false> --------
+//
+// For each kernel:
+//  1. Generate plane data as host-native u16 (LE on a little-endian host).
+//  2. Byte-swap each element to produce BE-encoded plane data.
+//  3. Run the kernel with BE=true on the byte-swapped input.
+//  4. Run the kernel with BE=false on the original LE input.
+//  5. Assert outputs are byte-identical.
+
+fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> {
+  plane.iter().map(|v| v.swap_bytes()).collect()
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgb_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgb_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbra_to_rgba_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbra_to_rgba_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // NOTE(review): assumes the helper swaps each u16's bytes
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3]; // BE = false result, 3 u16 per pixel
+    let mut out_be = std::vec![0u16; w * 3]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    ); // BE = true on the swapped planes must reproduce the BE = false output
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // NOTE(review): assumes the helper swaps each u16's bytes
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3]; // BE = false result, 3 u16 per pixel
+    let mut out_be = std::vec![0u16; w * 3]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    ); // BE = true on the swapped planes must reproduce the BE = false output
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // NOTE(review): assumes the helper swaps each u16's bytes
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4]; // BE = false result, 4 u16 per pixel
+    let mut out_be = std::vec![0u16; w * 4]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    ); // BE = true on the swapped planes must reproduce the BE = false output
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // NOTE(review): assumes the helper swaps each u16's bytes
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4]; // BE = false result, 4 u16 per pixel
+    let mut out_be = std::vec![0u16; w * 4]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    ); // BE = true on the swapped planes must reproduce the BE = false output
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g); // NOTE(review): assumes the helper swaps each u16's bytes
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4]; // BE = false result, 4 u16 per pixel
+    let mut out_be = std::vec![0u16; w * 4]; // BE = true result, same size
+    unsafe {
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    ); // BE = true on the swapped planes must reproduce the BE = false output
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g); // NOTE(review): assumes the helper swaps each u16's bytes
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4]; // BE = false result, 4 u16 per pixel
+    let mut out_be = std::vec![0u16; w * 4]; // BE = true result, same size
+    unsafe {
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "NEON gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    ); // BE = true on the swapped planes must reproduce the BE = false output
+  }
+}
diff --git a/src/row/arch/wasm_simd128/alpha_extract.rs b/src/row/arch/wasm_simd128/alpha_extract.rs
index 105910be..b999b618 100644
--- a/src/row/arch/wasm_simd128/alpha_extract.rs
+++ b/src/row/arch/wasm_simd128/alpha_extract.rs
@@ -357,7 +357,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
     }
 
     if x < width {
-      scalar::copy_alpha_plane_u16_to_u8::<BITS>(
+      // Scalar tail uses `BE = false`: this wasm-simd128 helper does
+      // host-native u16 loads (`v128_load64_zero`), which match LE-on-disk
+      // only on LE hosts. The dispatcher routes BE = true directly to scalar
+      // (see `dispatch::alpha_extract`), so the SIMD path here is BE = false
+      // by construction.
+      scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
         &alpha[x..width],
         &mut rgba_out[x * 4..width * 4],
         width - x,
@@ -438,7 +443,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
     }
 
     if x < width {
-      scalar::copy_alpha_plane_u16::<BITS>(
+      // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above.
+      scalar::copy_alpha_plane_u16::<BITS, false>(
         &alpha[x..width],
         &mut rgba_out[x * 4..width * 4],
         width - x,
@@ -575,7 +581,8 @@ mod tests {
       unsafe {
         super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w);
       }
-      scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -598,7 +605,8 @@ mod tests {
       unsafe {
         super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w);
       }
-      scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -618,7 +626,8 @@ mod tests {
       unsafe {
         super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w);
       }
-      scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
diff --git a/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs b/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs
index 7102afa3..94dcfbd5 100644
--- a/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs
+++ b/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs
@@ -1,6 +1,7 @@
 //! wasm-simd128 kernels for high-bit-depth planar GBR sources (Tier 10b).
 //!
-//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`.
+//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and
+//! `BE` (big-endian input when `true`).
 //! Lane width: 8 pixels per iteration (8 × u16 per `v128`).
 //! Scalar tail handles the remainder.
 //!
@@ -21,12 +22,13 @@ use core::arch::wasm32::*;
 
 use crate::row::scalar;
 
-use super::*;
+use super::{endian::load_endian_u16x8, *};
 
 // ---- u8 output, 3-channel (RGB) -----------------------------------------
 
 /// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**.
 /// Downshifts each sample by `BITS - 8` and narrows to u8.
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
 ///
 /// # Safety
 ///
@@ -35,7 +37,7 @@ use super::*;
 /// 3. `rgb_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -55,9 +57,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
+      let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
 
       // Shift right by BITS-8, then narrow u16x8 → u8x8 (in low half).
       let r_sh = u16x8_shr(r_v, shift);
@@ -80,7 +82,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -95,6 +97,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 
 /// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes**
 /// with constant opaque alpha (`0xFF`).
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
 ///
 /// # Safety
 ///
@@ -103,7 +106,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -124,9 +127,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
+      let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
 
       let r_sh = u16x8_shr(r_v, shift);
       let g_sh = u16x8_shr(g_v, shift);
@@ -144,7 +147,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -159,6 +162,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 
 /// wasm-simd128 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**.
 /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`.
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
 ///
 /// # Safety
 ///
@@ -167,7 +171,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -189,10 +193,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
-      let a_v = v128_and(v128_load(a.as_ptr().add(x).cast()), mask_v);
+      let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
+      let a_v = v128_and(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v);
 
       let r_sh = u16x8_shr(r_v, shift);
       let g_sh = u16x8_shr(g_v, shift);
@@ -211,7 +215,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -227,6 +231,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 
 /// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples.
 /// No shift — values copied directly, reordered G/B/R → R/G/B.
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
 ///
 /// # Safety
 ///
@@ -235,7 +240,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// 3. `rgb_u16_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -252,14 +257,14 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
     let mask_v = u16x8_splat(((1u32 << BITS) - 1) as u16);
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
+      let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
       write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3));
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -274,6 +279,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 
 /// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples
 /// with constant opaque alpha `(1 << BITS) - 1`.
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
 ///
 /// # Safety
 ///
@@ -282,7 +288,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -304,14 +310,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
+      let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
       write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4));
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -326,6 +332,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 
 /// wasm-simd128 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples.
 /// Alpha sourced from the `a` plane at native depth (no shift).
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
 ///
 /// # Safety
 ///
@@ -334,7 +341,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -356,15 +363,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
     let mask_v = u16x8_splat(((1u32 << BITS) - 1) as u16);
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
-      let a_v = v128_and(v128_load(a.as_ptr().add(x).cast()), mask_v);
+      let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
+      let a_v = v128_and(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v);
       write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4));
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_u16_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
diff --git a/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs b/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs
index 8fb1faef..5b041673 100644
--- a/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs
+++ b/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs
@@ -31,9 +31,9 @@ fn simd128_gbr_to_rgb_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_wasm = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -50,9 +50,9 @@ fn simd128_gbr_to_rgb_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_wasm = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w);
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -69,9 +69,9 @@ fn simd128_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_wasm = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -88,9 +88,9 @@ fn simd128_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_wasm = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -108,9 +108,9 @@ fn simd128_gbra_to_rgba_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_wasm = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_wasm, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -128,9 +128,9 @@ fn simd128_gbra_to_rgba_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_wasm = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_wasm, w);
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -149,9 +149,9 @@ fn simd128_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_wasm = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -168,9 +168,9 @@ fn simd128_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_wasm = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w);
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -187,9 +187,9 @@ fn simd128_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_wasm = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -206,9 +206,9 @@ fn simd128_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_wasm = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -226,9 +226,9 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_wasm = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_wasm, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -246,9 +246,9 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_wasm = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_wasm, w);
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_wasm, w);
     }
     assert_eq!(
       out_scalar, out_wasm,
@@ -256,3 +256,281 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() {
     );
   }
 }
+
+// ---- BE parity: simd128<BITS, true> on byte-swapped planes must match simd128<BITS, false> ----
+
+fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> {
+  plane.iter().map(|v| v.swap_bytes()).collect() // new Vec: each sample with its two bytes swapped
+}
+
+#[test]
+fn simd128_gbr_to_rgb_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // each u16 byte-swapped: input for the BE = true run
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3]; // BE = false result, 3 bytes per pixel
+    let mut out_be = std::vec![0u8; w * 3]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    ); // parity must hold for tail-only (<8), whole-vector, and vector+tail widths
+  }
+}
+
+#[test]
+fn simd128_gbr_to_rgb_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // each u16 byte-swapped: input for the BE = true run
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3]; // BE = false result, 3 bytes per pixel
+    let mut out_be = std::vec![0u8; w * 3]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    ); // parity must hold for tail-only (<8), whole-vector, and vector+tail widths
+  }
+}
+
+#[test]
+fn simd128_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // each u16 byte-swapped: input for the BE = true run
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4]; // BE = false result, 4 bytes per pixel
+    let mut out_be = std::vec![0u8; w * 4]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    ); // parity must hold for tail-only (<8), whole-vector, and vector+tail widths
+  }
+}
+
+#[test]
+fn simd128_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // each u16 byte-swapped: input for the BE = true run
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4]; // BE = false result, 4 bytes per pixel
+    let mut out_be = std::vec![0u8; w * 4]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    ); // parity must hold for tail-only (<8), whole-vector, and vector+tail widths
+  }
+}
+
+#[test]
+fn simd128_gbra_to_rgba_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g); // each u16 byte-swapped: input for the BE = true run
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4]; // BE = false result, 4 bytes per pixel
+    let mut out_be = std::vec![0u8; w * 4]; // BE = true result, same size
+    unsafe {
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    ); // parity must hold for tail-only (<8), whole-vector, and vector+tail widths
+  }
+}
+
+#[test]
+fn simd128_gbra_to_rgba_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g); // each u16 byte-swapped: input for the BE = true run
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4]; // BE = false result, 4 bytes per pixel
+    let mut out_be = std::vec![0u8; w * 4]; // BE = true result, same size
+    unsafe {
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    ); // parity must hold for tail-only (<8), whole-vector, and vector+tail widths
+  }
+}
+
+#[test]
+fn simd128_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // each u16 byte-swapped: input for the BE = true run
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3]; // BE = false result, 3 u16 per pixel
+    let mut out_be = std::vec![0u16; w * 3]; // BE = true result, same size
+    unsafe {
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    ); // parity must hold for tail-only (<8), whole-vector, and vector+tail widths
+  }
+}
+
+#[test]
+fn simd128_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // multiples and non-multiples of 8
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // `*_be` copies feed the BE = true call below
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3]; // 3 output u16 samples per pixel
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe { // BITS = 16: BE = true on the `*_be` planes must reproduce the BE = false output
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+fn simd128_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // multiples and non-multiples of 8
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // `*_be` copies feed the BE = true call below
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4]; // 4 output u16 samples per pixel
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe { // BITS = 10: BE = true on the `*_be` planes must reproduce the BE = false output
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+fn simd128_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // multiples and non-multiples of 8
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // `*_be` copies feed the BE = true call below
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4]; // 4 output u16 samples per pixel
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe { // BITS = 16: BE = true on the `*_be` planes must reproduce the BE = false output
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+fn simd128_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // multiples and non-multiples of 8
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g); // `*_be` copies feed the BE = true call below
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4]; // 4 output u16 samples per pixel
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe { // BITS = 10: BE = true on the `*_be` planes must reproduce the BE = false output
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+fn simd128_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() {
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // multiples and non-multiples of 8
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g); // `*_be` copies feed the BE = true call below
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4]; // 4 output u16 samples per pixel
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe { // BITS = 16: BE = true on the `*_be` planes must reproduce the BE = false output
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "simd128 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
diff --git a/src/row/arch/x86_avx2/alpha_extract.rs b/src/row/arch/x86_avx2/alpha_extract.rs
index ba4ade4f..1ebe97c1 100644
--- a/src/row/arch/x86_avx2/alpha_extract.rs
+++ b/src/row/arch/x86_avx2/alpha_extract.rs
@@ -450,7 +450,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
     }
 
     if x < width {
-      scalar::copy_alpha_plane_u16_to_u8::<BITS>(
+      // Scalar tail uses `BE = false`: this AVX2 helper does host-native
+      // u16 loads (`_mm_loadu_si128`), which match LE-on-disk only on LE
+      // hosts. The dispatcher routes BE = true directly to scalar (see
+      // `dispatch::alpha_extract`), so the SIMD path here is BE = false by
+      // construction.
+      scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
         &alpha[x..width],
         &mut rgba_out[x * 4..width * 4],
         width - x,
@@ -554,7 +559,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
     }
 
     if x < width {
-      scalar::copy_alpha_plane_u16::<BITS>(
+      // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above.
+      scalar::copy_alpha_plane_u16::<BITS, false>(
         &alpha[x..width],
         &mut rgba_out[x * 4..width * 4],
         width - x,
@@ -696,7 +702,8 @@ mod tests {
       pseudo_random_u8(&mut rgba_simd, 0xBABE);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -720,7 +727,8 @@ mod tests {
       pseudo_random_u8(&mut rgba_simd, 0x5EED);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -741,7 +749,8 @@ mod tests {
       pseudo_random_u16(&mut rgba_simd, 0xFADE);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
diff --git a/src/row/arch/x86_avx2/planar_gbr_high_bit.rs b/src/row/arch/x86_avx2/planar_gbr_high_bit.rs
index 23c76e15..26d9e298 100644
--- a/src/row/arch/x86_avx2/planar_gbr_high_bit.rs
+++ b/src/row/arch/x86_avx2/planar_gbr_high_bit.rs
@@ -1,6 +1,7 @@
 //! AVX2 kernels for high-bit-depth planar GBR sources (Tier 10b).
 //!
-//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`.
+//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and
+//! `BE: bool` (endianness of the source u16 planes).
 //! Lane width: 16 pixels per iteration (16 × u16 per `__m256i`).
 //! Scalar tail handles the remainder.
 //!
@@ -18,16 +19,26 @@
 //!
 //! Process 16 u16 pixels per outer iteration via two calls to the 128-bit
 //! `write_rgb_u16_8` / `write_rgba_u16_8` helpers (8 pixels each).
+//!
+//! # Big-endian (`BE = true`) mode
+//!
+//! u8-output 16-pixel iterations use `load_endian_u16x16::<BE>` (this
+//! backend's `endian.rs`, 256-bit shuffle). u16-output iterations and all
+//! 8-pixel tails use `load_endian_u16x8::<BE>` (SSE4.1 `endian.rs`, 128-bit
+//! shuffle). Both branches are resolved at monomorphisation time.
 
 use core::arch::x86_64::*;
 
-use super::*;
+use super::{endian::load_endian_u16x16, *};
+use crate::row::arch::x86_sse41::endian::load_endian_u16x8;
 
 // ---- u8 output, 3-channel (RGB) -----------------------------------------
 
 /// AVX2 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**.
 /// Downshifts each sample by `BITS - 8` and packs to u8.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX2 must be available (caller obligation).
@@ -35,7 +46,7 @@ use super::*;
 /// 3. `rgb_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -58,9 +69,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 16 <= width {
-      let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256);
-      let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256);
-      let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256);
+      let r_v = _mm256_and_si256(load_endian_u16x16::<BE>(r.as_ptr().add(x).cast()), mask256);
+      let g_v = _mm256_and_si256(load_endian_u16x16::<BE>(g.as_ptr().add(x).cast()), mask256);
+      let b_v = _mm256_and_si256(load_endian_u16x16::<BE>(b.as_ptr().add(x).cast()), mask256);
 
       // Variable-count logical right-shift for all 16 u16 lanes.
       let r_sh = _mm256_srl_epi16(r_v, shr_count);
@@ -85,9 +96,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
     // Drain remaining 8-pixel blocks with the SSE-width path.
     if x + 8 <= width {
       let zero = _mm_setzero_si128();
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       let r_sh = _mm_srl_epi16(r_v, shr_count);
       let g_sh = _mm_srl_epi16(g_v, shr_count);
       let b_sh = _mm_srl_epi16(b_v, shr_count);
@@ -100,7 +111,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -116,6 +127,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// AVX2 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes**
 /// with constant opaque alpha (`0xFF`).
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX2 must be available (caller obligation).
@@ -123,7 +136,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -147,9 +160,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 16 <= width {
-      let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256);
-      let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256);
-      let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256);
+      let r_v = _mm256_and_si256(load_endian_u16x16::<BE>(r.as_ptr().add(x).cast()), mask256);
+      let g_v = _mm256_and_si256(load_endian_u16x16::<BE>(g.as_ptr().add(x).cast()), mask256);
+      let b_v = _mm256_and_si256(load_endian_u16x16::<BE>(b.as_ptr().add(x).cast()), mask256);
 
       let r_sh = _mm256_srl_epi16(r_v, shr_count);
       let g_sh = _mm256_srl_epi16(g_v, shr_count);
@@ -174,9 +187,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
       x += 16;
     }
     if x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       let r_sh = _mm_srl_epi16(r_v, shr_count);
       let g_sh = _mm_srl_epi16(g_v, shr_count);
       let b_sh = _mm_srl_epi16(b_v, shr_count);
@@ -189,7 +202,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -205,6 +218,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// AVX2 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**.
 /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX2 must be available (caller obligation).
@@ -212,7 +227,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -236,10 +251,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 16 <= width {
-      let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256);
-      let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256);
-      let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256);
-      let a_v = _mm256_and_si256(_mm256_loadu_si256(a.as_ptr().add(x).cast()), mask256);
+      let r_v = _mm256_and_si256(load_endian_u16x16::<BE>(r.as_ptr().add(x).cast()), mask256);
+      let g_v = _mm256_and_si256(load_endian_u16x16::<BE>(g.as_ptr().add(x).cast()), mask256);
+      let b_v = _mm256_and_si256(load_endian_u16x16::<BE>(b.as_ptr().add(x).cast()), mask256);
+      let a_v = _mm256_and_si256(load_endian_u16x16::<BE>(a.as_ptr().add(x).cast()), mask256);
 
       let r_sh = _mm256_srl_epi16(r_v, shr_count);
       let g_sh = _mm256_srl_epi16(g_v, shr_count);
@@ -261,10 +276,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
       x += 16;
     }
     if x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
-      let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
+      let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
       let r_sh = _mm_srl_epi16(r_v, shr_count);
       let g_sh = _mm_srl_epi16(g_v, shr_count);
       let b_sh = _mm_srl_epi16(b_v, shr_count);
@@ -279,7 +294,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -297,6 +312,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// No shift — values copied directly, reordered G/B/R → R/G/B.
 /// Processes 16 pixels per outer loop via two 8-pixel `write_rgb_u16_8` calls.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX2 must be available (caller obligation).
@@ -304,7 +321,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// 3. `rgb_u16_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -322,27 +339,36 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
     let mut x = 0usize;
     while x + 16 <= width {
       // Two 8-pixel halves using the SSE helper.
-      let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_lo = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_lo = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_lo = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_u16_out.as_mut_ptr().add(x * 3));
 
-      let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128);
-      let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128);
-      let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128);
+      let r_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
+      let g_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
+      let b_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
       write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_u16_out.as_mut_ptr().add((x + 8) * 3));
 
       x += 16;
     }
     if x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3));
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -358,6 +384,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// AVX2 high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples
 /// with constant opaque alpha `(1 << BITS) - 1`.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX2 must be available (caller obligation).
@@ -365,7 +393,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -387,9 +415,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 16 <= width {
-      let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_lo = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_lo = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_lo = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       write_rgba_u16_8(
         r_lo,
         g_lo,
@@ -398,9 +426,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
         rgba_u16_out.as_mut_ptr().add(x * 4),
       );
 
-      let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128);
-      let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128);
-      let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128);
+      let r_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
+      let g_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
+      let b_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
       write_rgba_u16_8(
         r_hi,
         g_hi,
@@ -412,14 +449,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
       x += 16;
     }
     if x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4));
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -435,6 +472,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// AVX2 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples.
 /// Alpha sourced from the `a` plane at native depth (no shift).
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX2 must be available (caller obligation).
@@ -442,7 +481,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -464,16 +503,28 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
     let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
     let mut x = 0usize;
     while x + 16 <= width {
-      let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
-      let a_lo = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128);
+      let r_lo = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_lo = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_lo = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
+      let a_lo = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
       write_rgba_u16_8(r_lo, g_lo, b_lo, a_lo, rgba_u16_out.as_mut_ptr().add(x * 4));
 
-      let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128);
-      let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128);
-      let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128);
-      let a_hi = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 8).cast()), mask128);
+      let r_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
+      let g_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
+      let b_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
+      let a_hi = _mm_and_si128(
+        load_endian_u16x8::<BE>(a.as_ptr().add(x + 8).cast()),
+        mask128,
+      );
       write_rgba_u16_8(
         r_hi,
         g_hi,
@@ -485,15 +536,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
       x += 16;
     }
     if x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
-      let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
+      let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
       write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4));
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_u16_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
diff --git a/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs
index 72225d19..505256fe 100644
--- a/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs
+++ b/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs
@@ -37,9 +37,9 @@ fn avx2_gbr_to_rgb_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_avx = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -60,9 +60,9 @@ fn avx2_gbr_to_rgb_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_avx = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -83,9 +83,9 @@ fn avx2_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -106,9 +106,9 @@ fn avx2_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -130,9 +130,9 @@ fn avx2_gbra_to_rgba_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -154,9 +154,9 @@ fn avx2_gbra_to_rgba_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -179,9 +179,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_avx = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -202,9 +202,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_avx = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -225,9 +225,9 @@ fn avx2_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -248,9 +248,9 @@ fn avx2_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -272,9 +272,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -296,9 +296,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -321,9 +321,9 @@ fn avx2_gbr_to_rgb_high_bit_upper_bits_masked_bits10() {
     let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_avx = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -345,9 +345,9 @@ fn avx2_gbra_to_rgba_high_bit_upper_bits_masked_bits10() {
     let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -368,9 +368,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() {
     let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_avx = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -392,9 +392,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
     let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -402,3 +402,329 @@ fn avx2_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
     );
   }
 }
+
+// ---- BE parity: AVX2<BITS, true> on byte-swapped planes == AVX2<BITS, false> ----
+
+fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> {
+  plane.iter().map(|v| v.swap_bytes()).collect()
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbr_to_rgb_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbr_to_rgb_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbra_to_rgba_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbra_to_rgba_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3];
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe {
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3];
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe {
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx2") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX2 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
diff --git a/src/row/arch/x86_avx512/alpha_extract.rs b/src/row/arch/x86_avx512/alpha_extract.rs
index f311e366..203e08e3 100644
--- a/src/row/arch/x86_avx512/alpha_extract.rs
+++ b/src/row/arch/x86_avx512/alpha_extract.rs
@@ -434,7 +434,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
     }
 
     if x < width {
-      scalar::copy_alpha_plane_u16_to_u8::<BITS>(
+      // Scalar tail uses `BE = false`: this AVX-512 helper does host-native
+      // u16 loads (`_mm256_loadu_si256`), which match LE-on-disk only on LE
+      // hosts. The dispatcher routes BE = true directly to scalar (see
+      // `dispatch::alpha_extract`), so the SIMD path here is BE = false by
+      // construction.
+      scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
         &alpha[x..width],
         &mut rgba_out[x * 4..width * 4],
         width - x,
@@ -519,7 +524,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
     }
 
     if x < width {
-      scalar::copy_alpha_plane_u16::<BITS>(
+      // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above.
+      scalar::copy_alpha_plane_u16::<BITS, false>(
         &alpha[x..width],
         &mut rgba_out[x * 4..width * 4],
         width - x,
@@ -670,7 +676,8 @@ mod tests {
       pseudo_random_u8(&mut rgba_simd, 0xBABE);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -696,7 +703,8 @@ mod tests {
       pseudo_random_u8(&mut rgba_simd, 0x5EED);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -719,7 +727,8 @@ mod tests {
       pseudo_random_u16(&mut rgba_simd, 0xFADE);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
diff --git a/src/row/arch/x86_avx512/planar_gbr_high_bit.rs b/src/row/arch/x86_avx512/planar_gbr_high_bit.rs
index 4f763434..afc8ccc3 100644
--- a/src/row/arch/x86_avx512/planar_gbr_high_bit.rs
+++ b/src/row/arch/x86_avx512/planar_gbr_high_bit.rs
@@ -1,6 +1,7 @@
 //! AVX-512 (F + BW) kernels for high-bit-depth planar GBR sources (Tier 10b).
 //!
-//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`.
+//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and
+//! `BE: bool` (endianness of the source u16 planes).
 //! Lane width: 32 pixels per iteration (32 × u16 per `__m512i`).
 //! Scalar tail handles the remainder.
 //!
@@ -20,16 +21,25 @@
 //!
 //! Process 32 pixels via four calls to `write_rgb_u16_8` /
 //! `write_rgba_u16_8` (8 pixels each, SSE4.1 128-bit helpers).
+//!
+//! # Big-endian (`BE = true`) mode
+//!
+//! u8-output 32-pixel iterations use `load_endian_u16x32::<BE>` from this
+//! backend's own `endian.rs` (512-bit shuffle). 8-pixel blocks (u16-output loops
+//! and tails) use `load_endian_u16x8::<BE>` from the SSE4.1 `endian.rs` (128-bit).
 
 use core::arch::x86_64::*;
 
-use super::*;
+use super::{endian::load_endian_u16x32, *};
+use crate::row::arch::x86_sse41::endian::load_endian_u16x8;
 
 // ---- u8 output, 3-channel (RGB) -----------------------------------------
 
 /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B` **bytes**.
 /// Downshifts each sample by `BITS - 8` and packs to u8.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX-512BW must be available (caller obligation).
@@ -37,7 +47,7 @@ use super::*;
 /// 3. `rgb_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -60,9 +70,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
     let mut x = 0usize;
     while x + 32 <= width {
       // Load 32 u16 pixels per plane via 512-bit loads, then mask.
-      let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512);
-      let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512);
-      let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512);
+      let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512);
+      let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512);
+      let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512);
 
       // Shift all 32 u16 lanes right by BITS-8.
       let r_sh = _mm512_srl_epi16(r_v, shr_count);
@@ -124,9 +134,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
     }
     // Drain remaining 8-pixel blocks before scalar tail.
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       let r_sh = _mm_srl_epi16(r_v, shr_count);
       let g_sh = _mm_srl_epi16(g_v, shr_count);
       let b_sh = _mm_srl_epi16(b_v, shr_count);
@@ -139,7 +149,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -155,6 +165,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes**
 /// with constant opaque alpha (`0xFF`).
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX-512BW must be available (caller obligation).
@@ -162,7 +174,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -185,9 +197,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 32 <= width {
-      let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512);
-      let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512);
-      let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512);
+      let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512);
+      let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512);
+      let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512);
 
       let r_sh = _mm512_srl_epi16(r_v, shr_count);
       let g_sh = _mm512_srl_epi16(g_v, shr_count);
@@ -245,9 +257,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
       x += 32;
     }
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       let r_sh = _mm_srl_epi16(r_v, shr_count);
       let g_sh = _mm_srl_epi16(g_v, shr_count);
       let b_sh = _mm_srl_epi16(b_v, shr_count);
@@ -260,7 +272,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -276,6 +288,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// AVX-512 (F+BW) high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**.
 /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX-512BW must be available (caller obligation).
@@ -283,7 +297,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -306,10 +320,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 32 <= width {
-      let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512);
-      let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512);
-      let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512);
-      let a_v = _mm512_and_si512(_mm512_loadu_si512(a.as_ptr().add(x).cast()), mask512);
+      let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512);
+      let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512);
+      let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512);
+      let a_v = _mm512_and_si512(load_endian_u16x32::<BE>(a.as_ptr().add(x).cast()), mask512);
 
       let r_sh = _mm512_srl_epi16(r_v, shr_count);
       let g_sh = _mm512_srl_epi16(g_v, shr_count);
@@ -376,10 +390,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
       x += 32;
     }
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
-      let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
+      let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
       let r_sh = _mm_srl_epi16(r_v, shr_count);
       let g_sh = _mm_srl_epi16(g_v, shr_count);
       let b_sh = _mm_srl_epi16(b_v, shr_count);
@@ -394,7 +408,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -412,6 +426,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// No shift — values copied directly, reordered G/B/R → R/G/B.
 /// Processes 32 pixels per outer loop via four 8-pixel `write_rgb_u16_8` calls.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX-512BW must be available (caller obligation).
@@ -419,7 +435,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// 3. `rgb_u16_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -438,40 +454,67 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
     while x + 32 <= width {
       // Four 8-pixel blocks (offsets 0, 8, 16, 24).
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+        let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+        let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+        let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
         write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3));
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
         write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 8) * 3));
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
         write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 16) * 3));
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 24).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 24).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
         write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 24) * 3));
       }
       x += 32;
     }
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3));
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -487,6 +530,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples
 /// with constant opaque alpha `(1 << BITS) - 1`.
 ///
+/// When `BE = true` the planes hold big-endian samples: each u16 is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX-512BW must be available (caller obligation).
@@ -494,7 +539,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -517,15 +562,24 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
     let mut x = 0usize;
     while x + 32 <= width {
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+        let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+        let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+        let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
         write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4));
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
         write_rgba_u16_8(
           r_v,
           g_v,
@@ -535,9 +589,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
         );
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
         write_rgba_u16_8(
           r_v,
           g_v,
@@ -547,9 +610,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
         );
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 24).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 24).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
         write_rgba_u16_8(
           r_v,
           g_v,
@@ -561,14 +633,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
       x += 32;
     }
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
       write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4));
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -584,6 +656,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// AVX-512 (F+BW) high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples.
 /// Alpha sourced from the `a` plane at native depth (no shift).
 ///
+/// When `BE = true` the planes hold big-endian samples: each u16 is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. AVX-512BW must be available (caller obligation).
@@ -591,7 +665,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -614,17 +688,29 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
     let mut x = 0usize;
     while x + 32 <= width {
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
-        let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128);
+        let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+        let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+        let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
+        let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
         write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4));
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128);
-        let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 8).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
+        let a_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(a.as_ptr().add(x + 8).cast()),
+          mask128,
+        );
         write_rgba_u16_8(
           r_v,
           g_v,
@@ -634,10 +720,22 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
         );
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128);
-        let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 16).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
+        let a_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(a.as_ptr().add(x + 16).cast()),
+          mask128,
+        );
         write_rgba_u16_8(
           r_v,
           g_v,
@@ -647,10 +745,22 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
         );
       }
       {
-        let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128);
-        let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 24).cast()), mask128);
-        let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 24).cast()), mask128);
-        let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 24).cast()), mask128);
+        let r_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(r.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
+        let g_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(g.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
+        let b_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(b.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
+        let a_v = _mm_and_si128(
+          load_endian_u16x8::<BE>(a.as_ptr().add(x + 24).cast()),
+          mask128,
+        );
         write_rgba_u16_8(
           r_v,
           g_v,
@@ -662,15 +772,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
       x += 32;
     }
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128);
-      let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
+      let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
       write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4));
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_u16_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
diff --git a/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs
index 3a5ca557..80bc153a 100644
--- a/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs
+++ b/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs
@@ -37,9 +37,9 @@ fn avx512_gbr_to_rgb_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_avx = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -60,9 +60,9 @@ fn avx512_gbr_to_rgb_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_avx = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -83,9 +83,9 @@ fn avx512_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -106,9 +106,9 @@ fn avx512_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -130,9 +130,9 @@ fn avx512_gbra_to_rgba_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -154,9 +154,9 @@ fn avx512_gbra_to_rgba_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -179,9 +179,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_avx = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -202,9 +202,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_avx = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -225,9 +225,9 @@ fn avx512_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -248,9 +248,9 @@ fn avx512_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -272,9 +272,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -296,9 +296,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -321,9 +321,9 @@ fn avx512_gbr_to_rgb_high_bit_upper_bits_masked_bits10() {
     let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_avx = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -345,9 +345,9 @@ fn avx512_gbra_to_rgba_high_bit_upper_bits_masked_bits10() {
     let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_avx = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -368,9 +368,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() {
     let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_avx = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -392,9 +392,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
     let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_avx = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w);
     }
     assert_eq!(
       out_scalar, out_avx,
@@ -402,3 +402,329 @@ fn avx512_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
     );
   }
 }
+
+// ---- BE parity: AVX-512<BITS, true> on byte-swapped planes must match <BITS, false> ----
+
+fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> {
+  plane.iter().map(|v| v.swap_bytes()).collect()
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbr_to_rgb_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbr_to_rgb_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbra_to_rgba_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbra_to_rgba_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { // BE parity, BITS = 10, u16 RGB output: `BE = true` on swapped planes must equal `BE = false` on the originals.
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return; // No AVX-512BW on this host: return early, so the test passes without running the kernel.
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // row widths in pixels, 1 through 130
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); // second argument differs per plane (presumably a seed; helper not in view)
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // `*_be` inputs: presumably every u16 byte-swapped (helper defined outside this view)
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3]; // three u16 outputs per pixel
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe { // SAFETY (kernel contract not in view, assumed): avx512bw was detected above; buffers are sized for `w` pixels.
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { // BE parity, BITS = 16, u16 RGB output: `BE = true` on swapped planes must equal `BE = false` on the originals.
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return; // No AVX-512BW on this host: return early, so the test passes without running the kernel.
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // row widths in pixels, 1 through 130
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); // second argument differs per plane (presumably a seed; helper not in view)
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // `*_be` inputs: presumably every u16 byte-swapped (helper defined outside this view)
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3]; // three u16 outputs per pixel
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe { // SAFETY (kernel contract not in view, assumed): avx512bw was detected above; buffers are sized for `w` pixels.
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { // BE parity, BITS = 10, u16 RGBA output: `BE = true` on swapped planes must equal `BE = false` on the originals.
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return; // No AVX-512BW on this host: return early, so the test passes without running the kernel.
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // row widths in pixels, 1 through 130
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); // second argument differs per plane (presumably a seed; helper not in view)
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // `*_be` inputs: presumably every u16 byte-swapped (helper defined outside this view)
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4]; // four u16 outputs per pixel; only three source planes are passed (no alpha plane)
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe { // SAFETY (kernel contract not in view, assumed): avx512bw was detected above; buffers are sized for `w` pixels.
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { // BE parity, BITS = 16, u16 RGBA output: `BE = true` on swapped planes must equal `BE = false` on the originals.
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return; // No AVX-512BW on this host: return early, so the test passes without running the kernel.
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // row widths in pixels, 1 through 130
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); // second argument differs per plane (presumably a seed; helper not in view)
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g); // `*_be` inputs: presumably every u16 byte-swapped (helper defined outside this view)
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4]; // four u16 outputs per pixel; only three source planes are passed (no alpha plane)
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe { // SAFETY (kernel contract not in view, assumed): avx512bw was detected above; buffers are sized for `w` pixels.
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { // BE parity, BITS = 10, u16 RGBA output: `BE = true` on swapped planes must equal `BE = false` on the originals.
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return; // No AVX-512BW on this host: return early, so the test passes without running the kernel.
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // row widths in pixels, 1 through 130
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); // second argument differs per plane (presumably a seed; helper not in view)
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g); // `*_be` inputs: presumably every u16 byte-swapped (helper defined outside this view)
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4]; // four u16 outputs per pixel
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe { // SAFETY (kernel contract not in view, assumed): avx512bw was detected above; buffers are sized for `w` pixels.
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { // BE parity, BITS = 16, u16 RGBA output: `BE = true` on swapped planes must equal `BE = false` on the originals.
+  if !std::arch::is_x86_feature_detected!("avx512bw") {
+    return; // No AVX-512BW on this host: return early, so the test passes without running the kernel.
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { // row widths in pixels, 1 through 130
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); // second argument differs per plane (presumably a seed; helper not in view)
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g); // `*_be` inputs: presumably every u16 byte-swapped (helper defined outside this view)
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4]; // four u16 outputs per pixel
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe { // SAFETY (kernel contract not in view, assumed): avx512bw was detected above; buffers are sized for `w` pixels.
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "AVX-512 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
diff --git a/src/row/arch/x86_sse41/alpha_extract.rs b/src/row/arch/x86_sse41/alpha_extract.rs
index 5abdfd08..d327e299 100644
--- a/src/row/arch/x86_sse41/alpha_extract.rs
+++ b/src/row/arch/x86_sse41/alpha_extract.rs
@@ -356,7 +356,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
     }
 
     if x < width {
-      scalar::copy_alpha_plane_u16_to_u8::<BITS>(
+      // Scalar tail uses `BE = false`: this SSE4.1 helper does host-native
+      // u16 loads (`_mm_loadl_epi64`), which match LE-on-disk only on LE
+      // hosts. The dispatcher routes BE = true directly to scalar (see
+      // `dispatch::alpha_extract`), so the SIMD path here is BE = false by
+      // construction.
+      scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
         &alpha[x..width],
         &mut rgba_out[x * 4..width * 4],
         width - x,
@@ -440,7 +445,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
     }
 
     if x < width {
-      scalar::copy_alpha_plane_u16::<BITS>(
+      // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above.
+      scalar::copy_alpha_plane_u16::<BITS, false>(
         &alpha[x..width],
         &mut rgba_out[x * 4..width * 4],
         width - x,
@@ -581,7 +587,8 @@ mod tests {
       pseudo_random_u8(&mut rgba_simd, 0xBABE);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -605,7 +612,8 @@ mod tests {
       pseudo_random_u8(&mut rgba_simd, 0x5EED);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
@@ -626,7 +634,8 @@ mod tests {
       pseudo_random_u16(&mut rgba_simd, 0xFADE);
       let mut rgba_scalar = rgba_simd.clone();
       unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) };
-      scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w);
+      // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host).
+      scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
       assert_eq!(rgba_simd, rgba_scalar, "width={w}");
     }
   }
diff --git a/src/row/arch/x86_sse41/planar_gbr_high_bit.rs b/src/row/arch/x86_sse41/planar_gbr_high_bit.rs
index 364eac74..f28d4fe3 100644
--- a/src/row/arch/x86_sse41/planar_gbr_high_bit.rs
+++ b/src/row/arch/x86_sse41/planar_gbr_high_bit.rs
@@ -1,6 +1,7 @@
 //! SSE4.1 kernels for high-bit-depth planar GBR sources (Tier 10b).
 //!
-//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`.
+//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and
+//! `BE: bool` (endianness of the source u16 planes).
 //! Lane width: 8 pixels per iteration (8 × u16 per `__m128i`).
 //! Scalar tail handles the remainder.
 //!
@@ -17,16 +18,26 @@
 //! Use the existing `write_rgb_u16_8` / `write_rgba_u16_8` helpers from
 //! `x86_common` which interleave 8 u16 lanes per channel into packed
 //! RGB / RGBA u16 output.
+//!
+//! # Big-endian (`BE = true`) mode
+//!
+//! Every 8-pixel load goes through `load_endian_u16x8::<BE>` (defined
+//! in `endian.rs`). With `BE = true` it applies `_mm_shuffle_epi8`
+//! (SSSE3 pshufb) to byte-swap every u16 lane; with `BE = false` it
+//! compiles to a plain `_mm_loadu_si128`. The branch is resolved at
+//! monomorphisation.
 
 use core::arch::x86_64::*;
 
-use super::*;
+use super::{endian::load_endian_u16x8, *};
 
 // ---- u8 output, 3-channel (RGB) -----------------------------------------
 
 /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**.
 /// Downshifts each sample by `BITS - 8` and packs to u8.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available (caller obligation).
@@ -34,7 +45,7 @@ use super::*;
 /// 3. `rgb_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -58,9 +69,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
 
       // Variable-count logical right-shift by BITS-8 per u16 lane.
       let r_sh = _mm_srl_epi16(r_v, shr_count);
@@ -81,7 +92,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -97,6 +108,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes**
 /// with constant opaque alpha (`0xFF`).
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available (caller obligation).
@@ -104,7 +117,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -127,9 +140,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
 
       let r_sh = _mm_srl_epi16(r_v, shr_count);
       let g_sh = _mm_srl_epi16(g_v, shr_count);
@@ -146,7 +159,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -162,6 +175,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// SSE4.1 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**.
 /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available (caller obligation).
@@ -169,7 +184,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// 3. `rgba_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -191,10 +206,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v);
-      let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask_v);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
+      let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v);
 
       let r_sh = _mm_srl_epi16(r_v, shr_count);
       let g_sh = _mm_srl_epi16(g_v, shr_count);
@@ -213,7 +228,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -230,6 +245,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples.
 /// No shift — values copied directly, reordered G/B/R → R/G/B.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available (caller obligation).
@@ -237,7 +254,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// 3. `rgb_u16_out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -254,14 +271,14 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
     let mask_v = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
       write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3));
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgb_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -277,6 +294,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples
 /// with constant opaque alpha `(1 << BITS) - 1`.
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available (caller obligation).
@@ -284,7 +303,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -307,14 +326,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
       write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4));
       x += 8;
     }
     if x < width {
-      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+      scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
@@ -330,6 +349,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// SSE4.1 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples.
 /// Alpha sourced from the `a` plane at native depth (no shift).
 ///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available (caller obligation).
@@ -337,7 +358,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
+pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -359,15 +380,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
     let mask_v = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
     let mut x = 0usize;
     while x + 8 <= width {
-      let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v);
-      let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v);
-      let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v);
-      let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask_v);
+      let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+      let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+      let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
+      let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v);
       write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4));
       x += 8;
     }
     if x < width {
-      scalar::gbra_to_rgba_u16_high_bit_row::<BITS>(
+      scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
         &g[x..width],
         &b[x..width],
         &r[x..width],
diff --git a/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs
index 7292b15e..f0c11bf1 100644
--- a/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs
+++ b/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs
@@ -37,9 +37,9 @@ fn sse41_gbr_to_rgb_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_sse = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -60,9 +60,9 @@ fn sse41_gbr_to_rgb_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_sse = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -83,9 +83,9 @@ fn sse41_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_sse = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -106,9 +106,9 @@ fn sse41_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_sse = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -130,9 +130,9 @@ fn sse41_gbra_to_rgba_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_sse = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -154,9 +154,9 @@ fn sse41_gbra_to_rgba_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_sse = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_sse, w);
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -179,9 +179,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_sse = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -202,9 +202,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_sse = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -225,9 +225,9 @@ fn sse41_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() {
     let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_sse = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -248,9 +248,9 @@ fn sse41_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() {
     let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_sse = std::vec![0u16; w * 4];
-    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -272,9 +272,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() {
     let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_sse = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -296,9 +296,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() {
     let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_sse = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_sse, w);
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -321,9 +321,9 @@ fn sse41_gbr_to_rgb_high_bit_upper_bits_masked_bits10() {
     let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_sse = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -345,9 +345,9 @@ fn sse41_gbra_to_rgba_high_bit_upper_bits_masked_bits10() {
     let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
     let mut out_scalar = std::vec![0u8; w * 4];
     let mut out_sse = std::vec![0u8; w * 4];
-    scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w);
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -368,9 +368,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() {
     let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
     let mut out_scalar = std::vec![0u16; w * 3];
     let mut out_sse = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-      gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w);
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -392,9 +392,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
     let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
     let mut out_scalar = std::vec![0u16; w * 4];
     let mut out_sse = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
     unsafe {
-      gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w);
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w);
     }
     assert_eq!(
       out_scalar, out_sse,
@@ -402,3 +402,332 @@ fn sse41_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
     );
   }
 }
+
+// ---- BE parity: SSE4.1<BITS, true> output must match SSE4.1<BITS, false> ---
+//
+// Byte-swap LE inputs to produce BE-encoded data; verify that BE=true kernel
+// output is byte-identical to BE=false kernel output on the original LE data.
+
+fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> {
+  plane.iter().map(|v| v.swap_bytes()).collect()
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbr_to_rgb_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbr_to_rgb_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 3];
+    let mut out_be = std::vec![0u8; w * 3];
+    unsafe {
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbra_to_rgba_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbra_to_rgba_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u8; w * 4];
+    let mut out_be = std::vec![0u8; w * 4];
+    unsafe {
+      gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbra_to_rgba_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3];
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe {
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 3];
+    let mut out_be = std::vec![0u16; w * 3];
+    unsafe {
+      gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() {
+  if !std::arch::is_x86_feature_detected!("sse4.1") {
+    return;
+  }
+  for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+    let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+    let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+    let g_be = byte_swap_plane(&g);
+    let b_be = byte_swap_plane(&b);
+    let r_be = byte_swap_plane(&r);
+    let a_be = byte_swap_plane(&a);
+    let mut out_le = std::vec![0u16; w * 4];
+    let mut out_be = std::vec![0u16; w * 4];
+    unsafe {
+      gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+    }
+    assert_eq!(
+      out_le, out_be,
+      "SSE4.1 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+    );
+  }
+}
diff --git a/src/row/dispatch/alpha_extract.rs b/src/row/dispatch/alpha_extract.rs
index 75cccb8d..00ecb61e 100644
--- a/src/row/dispatch/alpha_extract.rs
+++ b/src/row/dispatch/alpha_extract.rs
@@ -260,17 +260,45 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz
 /// scatter α plane (u16) into `rgba_out[3 + 4*n]` (u8) with
 /// depth-conv `>> (BITS - 8)`.
 ///
-/// Selects the highest available SIMD backend; falls back to scalar.
-/// When `use_simd` is `false`, calls scalar directly.
+/// `BE` selects the source α plane byte order (`false` = LE on disk/wire,
+/// `true` = BE on disk/wire). The SIMD α-extract helpers use host-native
+/// `u16` loads (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`) AND
+/// hardcode their scalar tail to `scalar::<BITS, false>`. So SIMD is only
+/// correct when BOTH the host CPU is little-endian AND the source data is
+/// little-endian — any other quadrant either loads the wrong byte order in
+/// the vector body (LE-data on BE-host / BE-data on LE-host) or feeds
+/// already-native u16 samples through `u16::from_le` in the scalar tail
+/// (BE-data on BE-host), corrupting the tail when `width` is not a multiple of the SIMD block size.
+///
+/// The dispatcher computes
+/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and routes to
+/// scalar in every other quadrant. The scalar helper is target-endian-aware
+/// via `u16::from_be` / `u16::from_le`, so this scalar fallback emits the
+/// correct α plane on every host. Phase 4 will plumb BE through the SIMD
+/// helpers if a BE-input sinker hot-path lands.
+///
+/// Truth table (`safe_for_simd = !BE && target_endian == "little"`):
+/// - LE data, LE host: `!false && true  = true`  → SIMD (host-native LE u16 loads correct, tail `from_le` is no-op)
+/// - LE data, BE host: `!false && false = false` → scalar (handles via `from_le`)
+/// - BE data, LE host: `!true  && true  = false` → scalar (handles via `from_be`)
+/// - BE data, BE host: `!true  && false = false` → scalar (handles via `from_be`; SIMD vector body would be correct but tail `from_le` would corrupt widths that are not a multiple of the SIMD block size — see codex 4th-pass review of PR #82)
+///
+/// Selects the highest available SIMD backend on LE-host with LE-data;
+/// falls back to scalar otherwise. When `use_simd` is `false`, calls
+/// scalar directly.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
+pub(crate) fn copy_alpha_plane_u16_to_u8<const BITS: u32, const BE: bool>(
   alpha: &[u16],
   rgba_out: &mut [u8],
   width: usize,
   use_simd: bool,
 ) {
-  if !use_simd {
-    return scalar::copy_alpha_plane_u16_to_u8::<BITS>(alpha, rgba_out, width);
+  // SIMD α-extract helpers use host-native u16 loads + a scalar tail
+  // hardcoded to BE=false. They are only correct on LE host with LE
+  // source data. Force scalar in every other quadrant.
+  let safe_for_simd = !BE && cfg!(target_endian = "little");
+  if !safe_for_simd || !use_simd {
+    return scalar::copy_alpha_plane_u16_to_u8::<BITS, BE>(alpha, rgba_out, width);
   }
   cfg_select! {
     target_arch = "aarch64" => {
@@ -306,7 +334,7 @@ pub(crate) fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
     },
     _ => {}
   }
-  scalar::copy_alpha_plane_u16_to_u8::<BITS>(alpha, rgba_out, width);
+  scalar::copy_alpha_plane_u16_to_u8::<BITS, BE>(alpha, rgba_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -317,17 +345,30 @@ pub(crate) fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
 /// scatter α plane (u16) into `rgba_out[3 + 4*n]` (u16). No depth
 /// conversion.
 ///
-/// Selects the highest available SIMD backend; falls back to scalar.
-/// When `use_simd` is `false`, calls scalar directly.
+/// `BE` selects the source α plane byte order (`false` = LE on disk/wire,
+/// `true` = BE on disk/wire). The dispatcher computes
+/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and routes to
+/// scalar in every other quadrant: see `copy_alpha_plane_u16_to_u8` above
+/// for the truth table and rationale (SIMD α-extract uses host-native u16
+/// loads AND hardcodes its scalar tail to `BE=false`, so it only handles
+/// the LE-host/LE-data quadrant correctly; scalar is target-endian-aware).
+///
+/// Selects the highest available SIMD backend on LE-host with LE-data;
+/// falls back to scalar otherwise. When `use_simd` is `false`, calls
+/// scalar directly.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn copy_alpha_plane_u16<const BITS: u32>(
+pub(crate) fn copy_alpha_plane_u16<const BITS: u32, const BE: bool>(
   alpha: &[u16],
   rgba_out: &mut [u16],
   width: usize,
   use_simd: bool,
 ) {
-  if !use_simd {
-    return scalar::copy_alpha_plane_u16::<BITS>(alpha, rgba_out, width);
+  // SIMD α-extract helpers use host-native u16 loads + a scalar tail
+  // hardcoded to BE=false. They are only correct on LE host with LE
+  // source data. Force scalar in every other quadrant.
+  let safe_for_simd = !BE && cfg!(target_endian = "little");
+  if !safe_for_simd || !use_simd {
+    return scalar::copy_alpha_plane_u16::<BITS, BE>(alpha, rgba_out, width);
   }
   cfg_select! {
     target_arch = "aarch64" => {
@@ -363,5 +404,5 @@ pub(crate) fn copy_alpha_plane_u16<const BITS: u32>(
     },
     _ => {}
   }
-  scalar::copy_alpha_plane_u16::<BITS>(alpha, rgba_out, width);
+  scalar::copy_alpha_plane_u16::<BITS, BE>(alpha, rgba_out, width);
 }
diff --git a/src/row/dispatch/planar_gbr_high_bit.rs b/src/row/dispatch/planar_gbr_high_bit.rs
index 544d8166..9511e662 100644
--- a/src/row/dispatch/planar_gbr_high_bit.rs
+++ b/src/row/dispatch/planar_gbr_high_bit.rs
@@ -1,6 +1,7 @@
 //! Runtime SIMD dispatchers for high-bit-depth planar GBR sources (Tier 10b).
 //!
-//! Seven kernel variants, all const-generic over `BITS ∈ {9, 10, 12, 14, 16}`:
+//! Seven kernel variants, all const-generic over `BITS ∈ {9, 10, 12, 14, 16}`
+//! and `BE` (big-endian input when `true`):
 //! - [`gbr_to_rgb_high_bit_row`] — interleave G/B/R → packed `R, G, B` bytes.
 //! - [`gbr_to_rgb_u16_high_bit_row`] — interleave G/B/R → packed `R, G, B` u16.
 //! - [`gbr_to_rgba_opaque_high_bit_row`] — interleave G/B/R → packed
@@ -39,8 +40,9 @@ use crate::{
 
 /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B` **bytes**.
 /// Downshifts each sample by `BITS - 8`. `use_simd = false` forces scalar.
+/// When `BE = true`, input u16 samples are big-endian and byte-swapped first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn gbr_to_rgb_high_bit_row<const BITS: u32>(
+pub fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -65,31 +67,33 @@ pub fn gbr_to_rgb_high_bit_row<const BITS: u32>(
       target_arch = "aarch64" => {
         if neon_available() {
           // SAFETY: NEON verified available.
-          unsafe { arch::neon::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); }
+          unsafe { arch::neon::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); }
           return;
         }
       },
       target_arch = "x86_64" => {
         if avx512_available() {
           // SAFETY: AVX-512BW verified available.
-          unsafe { arch::x86_avx512::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); }
+          unsafe { arch::x86_avx512::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); }
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified available.
-          unsafe { arch::x86_avx2::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); }
+          unsafe { arch::x86_avx2::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); }
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified available.
-          unsafe { arch::x86_sse41::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); }
+          unsafe { arch::x86_sse41::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); }
           return;
         }
       },
       target_arch = "wasm32" => {
         if simd128_available() {
           // SAFETY: simd128 compile-time enabled.
-          unsafe { arch::wasm_simd128::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); }
+          unsafe {
+            arch::wasm_simd128::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width);
+          }
           return;
         }
       },
@@ -97,7 +101,7 @@ pub fn gbr_to_rgb_high_bit_row<const BITS: u32>(
     }
   }
 
-  scalar::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width);
+  scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -107,8 +111,9 @@ pub fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B` **u16**
 /// elements. Samples are copied as-is (no depth conversion); values stay in
 /// `[0, (1 << BITS) - 1]`. `use_simd = false` forces scalar.
+/// When `BE = true`, input u16 samples are big-endian and byte-swapped first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
+pub fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -134,7 +139,7 @@ pub fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
         if neon_available() {
           // SAFETY: NEON verified available.
           unsafe {
-            arch::neon::gbr_to_rgb_u16_high_bit_row::<BITS>(g, b, r, rgb_u16_out, width);
+            arch::neon::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(g, b, r, rgb_u16_out, width);
           }
           return;
         }
@@ -143,21 +148,27 @@ pub fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
         if avx512_available() {
           // SAFETY: AVX-512BW verified available.
           unsafe {
-            arch::x86_avx512::gbr_to_rgb_u16_high_bit_row::<BITS>(g, b, r, rgb_u16_out, width);
+            arch::x86_avx512::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
+              g, b, r, rgb_u16_out, width,
+            );
           }
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified available.
           unsafe {
-            arch::x86_avx2::gbr_to_rgb_u16_high_bit_row::<BITS>(g, b, r, rgb_u16_out, width);
+            arch::x86_avx2::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
+              g, b, r, rgb_u16_out, width,
+            );
           }
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified available.
           unsafe {
-            arch::x86_sse41::gbr_to_rgb_u16_high_bit_row::<BITS>(g, b, r, rgb_u16_out, width);
+            arch::x86_sse41::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
+              g, b, r, rgb_u16_out, width,
+            );
           }
           return;
         }
@@ -166,7 +177,9 @@ pub fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
         if simd128_available() {
           // SAFETY: simd128 compile-time enabled.
           unsafe {
-            arch::wasm_simd128::gbr_to_rgb_u16_high_bit_row::<BITS>(g, b, r, rgb_u16_out, width);
+            arch::wasm_simd128::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
+              g, b, r, rgb_u16_out, width,
+            );
           }
           return;
         }
@@ -175,7 +188,7 @@ pub fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
     }
   }
 
-  scalar::gbr_to_rgb_u16_high_bit_row::<BITS>(g, b, r, rgb_u16_out, width);
+  scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(g, b, r, rgb_u16_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -185,8 +198,9 @@ pub fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B, A` **bytes**
 /// with constant α = `0xFF`. Used by `GbrpN` for standalone `with_rgba` path.
 /// `use_simd = false` forces scalar.
+/// When `BE = true`, input u16 samples are big-endian and byte-swapped first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
+pub fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -212,7 +226,7 @@ pub fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
         if neon_available() {
           // SAFETY: NEON verified available.
           unsafe {
-            arch::neon::gbr_to_rgba_opaque_high_bit_row::<BITS>(g, b, r, rgba_out, width);
+            arch::neon::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(g, b, r, rgba_out, width);
           }
           return;
         }
@@ -221,21 +235,27 @@ pub fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
         if avx512_available() {
           // SAFETY: AVX-512BW verified available.
           unsafe {
-            arch::x86_avx512::gbr_to_rgba_opaque_high_bit_row::<BITS>(g, b, r, rgba_out, width);
+            arch::x86_avx512::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
+              g, b, r, rgba_out, width,
+            );
           }
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified available.
           unsafe {
-            arch::x86_avx2::gbr_to_rgba_opaque_high_bit_row::<BITS>(g, b, r, rgba_out, width);
+            arch::x86_avx2::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
+              g, b, r, rgba_out, width,
+            );
           }
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified available.
           unsafe {
-            arch::x86_sse41::gbr_to_rgba_opaque_high_bit_row::<BITS>(g, b, r, rgba_out, width);
+            arch::x86_sse41::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
+              g, b, r, rgba_out, width,
+            );
           }
           return;
         }
@@ -244,7 +264,9 @@ pub fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
         if simd128_available() {
           // SAFETY: simd128 compile-time enabled.
           unsafe {
-            arch::wasm_simd128::gbr_to_rgba_opaque_high_bit_row::<BITS>(g, b, r, rgba_out, width);
+            arch::wasm_simd128::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
+              g, b, r, rgba_out, width,
+            );
           }
           return;
         }
@@ -253,7 +275,7 @@ pub fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
     }
   }
 
-  scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>(g, b, r, rgba_out, width);
+  scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(g, b, r, rgba_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -264,8 +286,9 @@ pub fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// **u16** elements with constant α = `(1 << BITS) - 1` (native-depth
 /// opaque). Used by `GbrpN` for standalone `with_rgba_u16` path.
 /// `use_simd = false` forces scalar.
+/// When `BE = true`, input u16 samples are big-endian and byte-swapped first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
+pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -291,7 +314,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
         if neon_available() {
           // SAFETY: NEON verified available.
           unsafe {
-            arch::neon::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+            arch::neon::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
               g, b, r, rgba_u16_out, width,
             );
           }
@@ -302,7 +325,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
         if avx512_available() {
           // SAFETY: AVX-512BW verified available.
           unsafe {
-            arch::x86_avx512::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+            arch::x86_avx512::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
               g, b, r, rgba_u16_out, width,
             );
           }
@@ -311,7 +334,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
         if avx2_available() {
           // SAFETY: AVX2 verified available.
           unsafe {
-            arch::x86_avx2::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+            arch::x86_avx2::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
               g, b, r, rgba_u16_out, width,
             );
           }
@@ -320,7 +343,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
         if sse41_available() {
           // SAFETY: SSE4.1 verified available.
           unsafe {
-            arch::x86_sse41::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+            arch::x86_sse41::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
               g, b, r, rgba_u16_out, width,
             );
           }
@@ -331,7 +354,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
         if simd128_available() {
           // SAFETY: simd128 compile-time enabled.
           unsafe {
-            arch::wasm_simd128::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+            arch::wasm_simd128::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
               g, b, r, rgba_u16_out, width,
             );
           }
@@ -342,7 +365,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
     }
   }
 
-  scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(g, b, r, rgba_u16_out, width);
+  scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(g, b, r, rgba_u16_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -352,9 +375,10 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A`
 /// **bytes**. Alpha is downshifted by `BITS - 8` (real source α, not
 /// constant). `use_simd = false` forces scalar.
+/// When `BE = true`, input u16 samples are big-endian and byte-swapped first.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn gbra_to_rgba_high_bit_row<const BITS: u32>(
+pub fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -382,7 +406,7 @@ pub fn gbra_to_rgba_high_bit_row<const BITS: u32>(
         if neon_available() {
           // SAFETY: NEON verified available.
           unsafe {
-            arch::neon::gbra_to_rgba_high_bit_row::<BITS>(g, b, r, a, rgba_out, width);
+            arch::neon::gbra_to_rgba_high_bit_row::<BITS, BE>(g, b, r, a, rgba_out, width);
           }
           return;
         }
@@ -391,21 +415,21 @@ pub fn gbra_to_rgba_high_bit_row<const BITS: u32>(
         if avx512_available() {
           // SAFETY: AVX-512BW verified available.
           unsafe {
-            arch::x86_avx512::gbra_to_rgba_high_bit_row::<BITS>(g, b, r, a, rgba_out, width);
+            arch::x86_avx512::gbra_to_rgba_high_bit_row::<BITS, BE>(g, b, r, a, rgba_out, width);
           }
           return;
         }
         if avx2_available() {
           // SAFETY: AVX2 verified available.
           unsafe {
-            arch::x86_avx2::gbra_to_rgba_high_bit_row::<BITS>(g, b, r, a, rgba_out, width);
+            arch::x86_avx2::gbra_to_rgba_high_bit_row::<BITS, BE>(g, b, r, a, rgba_out, width);
           }
           return;
         }
         if sse41_available() {
           // SAFETY: SSE4.1 verified available.
           unsafe {
-            arch::x86_sse41::gbra_to_rgba_high_bit_row::<BITS>(g, b, r, a, rgba_out, width);
+            arch::x86_sse41::gbra_to_rgba_high_bit_row::<BITS, BE>(g, b, r, a, rgba_out, width);
           }
           return;
         }
@@ -414,7 +438,9 @@ pub fn gbra_to_rgba_high_bit_row<const BITS: u32>(
         if simd128_available() {
           // SAFETY: simd128 compile-time enabled.
           unsafe {
-            arch::wasm_simd128::gbra_to_rgba_high_bit_row::<BITS>(g, b, r, a, rgba_out, width);
+            arch::wasm_simd128::gbra_to_rgba_high_bit_row::<BITS, BE>(
+              g, b, r, a, rgba_out, width,
+            );
           }
           return;
         }
@@ -423,7 +449,7 @@ pub fn gbra_to_rgba_high_bit_row<const BITS: u32>(
     }
   }
 
-  scalar::gbra_to_rgba_high_bit_row::<BITS>(g, b, r, a, rgba_out, width);
+  scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(g, b, r, a, rgba_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -433,9 +459,10 @@ pub fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A`
 /// **u16** elements. Alpha is copied directly without depth conversion (values
 /// stay in `[0, (1 << BITS) - 1]`). `use_simd = false` forces scalar.
+/// When `BE = true`, input u16 samples are big-endian and are decoded to host order first.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
+pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -463,7 +490,9 @@ pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
         if neon_available() {
           // SAFETY: NEON verified available.
           unsafe {
-            arch::neon::gbra_to_rgba_u16_high_bit_row::<BITS>(g, b, r, a, rgba_u16_out, width);
+            arch::neon::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
+              g, b, r, a, rgba_u16_out, width,
+            );
           }
           return;
         }
@@ -472,7 +501,7 @@ pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
         if avx512_available() {
           // SAFETY: AVX-512BW verified available.
           unsafe {
-            arch::x86_avx512::gbra_to_rgba_u16_high_bit_row::<BITS>(
+            arch::x86_avx512::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
               g, b, r, a, rgba_u16_out, width,
             );
           }
@@ -481,7 +510,7 @@ pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
         if avx2_available() {
           // SAFETY: AVX2 verified available.
           unsafe {
-            arch::x86_avx2::gbra_to_rgba_u16_high_bit_row::<BITS>(
+            arch::x86_avx2::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
               g, b, r, a, rgba_u16_out, width,
             );
           }
@@ -490,7 +519,7 @@ pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
         if sse41_available() {
           // SAFETY: SSE4.1 verified available.
           unsafe {
-            arch::x86_sse41::gbra_to_rgba_u16_high_bit_row::<BITS>(
+            arch::x86_sse41::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
               g, b, r, a, rgba_u16_out, width,
             );
           }
@@ -501,7 +530,7 @@ pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
         if simd128_available() {
           // SAFETY: simd128 compile-time enabled.
           unsafe {
-            arch::wasm_simd128::gbra_to_rgba_u16_high_bit_row::<BITS>(
+            arch::wasm_simd128::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
               g, b, r, a, rgba_u16_out, width,
             );
           }
@@ -512,7 +541,7 @@ pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
     }
   }
 
-  scalar::gbra_to_rgba_u16_high_bit_row::<BITS>(g, b, r, a, rgba_u16_out, width);
+  scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(g, b, r, a, rgba_u16_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -529,9 +558,10 @@ pub fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
 /// `use_simd` accepted for signature consistency with the rest of the
 /// row dispatcher family. Currently no SIMD path is wired (kernel is
 /// scalar-only); the flag is reserved for future backends.
+/// When `BE = true`, input u16 samples are big-endian and are decoded to host order first.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn gbr_to_luma_u16_high_bit_row<const BITS: u32>(
+pub fn gbr_to_luma_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -551,5 +581,5 @@ pub fn gbr_to_luma_u16_high_bit_row<const BITS: u32>(
   assert!(b.len() >= width, "b row too short");
   assert!(r.len() >= width, "r row too short");
   assert!(luma_out.len() >= width, "luma_out row too short");
-  scalar::gbr_to_luma_u16_high_bit_row::<BITS>(g, b, r, luma_out, width, matrix, full_range);
+  scalar::gbr_to_luma_u16_high_bit_row::<BITS, BE>(g, b, r, luma_out, width, matrix, full_range);
 }
diff --git a/src/row/mod.rs b/src/row/mod.rs
index 297f1c3c..a5d210f1 100644
--- a/src/row/mod.rs
+++ b/src/row/mod.rs
@@ -615,7 +615,7 @@ mod overflow_tests {
     let b: [u16; 0] = [];
     let r: [u16; 0] = [];
     let mut rgb: [u8; 0] = [];
-    gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false);
+    gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false);
   }
 
   #[cfg(target_pointer_width = "32")]
@@ -626,7 +626,7 @@ mod overflow_tests {
     let b: [u16; 0] = [];
     let r: [u16; 0] = [];
     let mut rgb: [u16; 0] = [];
-    gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false);
+    gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false);
   }
 
   #[cfg(target_pointer_width = "32")]
@@ -637,7 +637,7 @@ mod overflow_tests {
     let b: [u16; 0] = [];
     let r: [u16; 0] = [];
     let mut rgba: [u8; 0] = [];
-    gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false);
+    gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false);
   }
 
   #[cfg(target_pointer_width = "32")]
@@ -648,7 +648,7 @@ mod overflow_tests {
     let b: [u16; 0] = [];
     let r: [u16; 0] = [];
     let mut rgba: [u16; 0] = [];
-    gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false);
+    gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false);
   }
 
   #[cfg(target_pointer_width = "32")]
@@ -660,7 +660,7 @@ mod overflow_tests {
     let r: [u16; 0] = [];
     let a: [u16; 0] = [];
     let mut rgba: [u8; 0] = [];
-    gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false);
+    gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false);
   }
 
   #[cfg(target_pointer_width = "32")]
@@ -672,7 +672,7 @@ mod overflow_tests {
     let r: [u16; 0] = [];
     let a: [u16; 0] = [];
     let mut rgba: [u16; 0] = [];
-    gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false);
+    gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false);
   }
 
   // ---- Tier 11 gray dispatchers — `width × {3, 4}` overflow ----
diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs
index a5190496..6c77346a 100644
--- a/src/row/scalar/alpha_extract.rs
+++ b/src/row/scalar/alpha_extract.rs
@@ -96,20 +96,33 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz
   }
 }
 
-/// Yuva*p9/10/12/14 → u8 RGBA: scatter α plane (u16) into
-/// `rgba_out[3 + 4*n]` (u8) with depth-conv `>> (BITS - 8)`.
+/// Yuva*p9/10/12/14/16 + Gbrap10/12/14/16 → u8 RGBA: scatter α plane
+/// (u16) into `rgba_out[3 + 4*n]` (u8) with depth-conv `>> (BITS - 8)`.
 ///
-/// `BITS` is the source α bit depth (9, 10, 12, or 14).
+/// `BITS` is the source α bit depth (any value in `[8, 16]`; the runtime
+/// `assert!` enforces the range). In practice callers pass 9, 10, 12, 14,
+/// or 16. `BE` selects the **byte order** of the encoded source α plane:
+/// `false` = LE on disk/wire (e.g., AV `Yuva420p10le`, `Gbrap10le`),
+/// `true` = BE on disk/wire (e.g., `Yuva420p10be`, `Gbrap10be`).
 ///
-/// α is masked with `(1 << BITS) - 1` BEFORE the shift to canonicalize
-/// over-range source samples. Frame constructors admit raw u16 input
-/// (e.g., p010-style buffers store the 10 active bits in the HIGH bits
+/// Each raw u16 sample is converted from its disk byte order into host-native
+/// order via `u16::from_le` / `u16::from_be` BEFORE the BITS-mask + shift.
+/// On a host whose endianness matches the data, the conversion compiles to a
+/// no-op; otherwise it is a `swap_bytes`. This mirrors the
+/// `load_endian_u16x*::<BE>` SIMD pattern from #81 so scalar tails and SIMD
+/// paths stay byte-for-byte equivalent on every host. Without this, a
+/// big-endian host (e.g., s390x) processing LE source data would emit a
+/// byte-reversed α plane.
+///
+/// α is masked with `(1 << BITS) - 1` AFTER the endian conversion to
+/// canonicalize over-range source samples. Frame constructors admit raw u16
+/// input (e.g., p010-style buffers store the 10 active bits in the HIGH bits
 /// of u16), so an unmasked over-range value would otherwise leak through
 /// the shift and produce divergent output between scalar and SIMD paths.
 /// See sibling inline-α kernels (`yuva_4_*` row impls) for the same
 /// pattern with comment "silently turning over-range alpha into
 /// transparent output".
-pub(crate) fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
+pub(crate) fn copy_alpha_plane_u16_to_u8<const BITS: u32, const BE: bool>(
   alpha: &[u16],
   rgba_out: &mut [u8],
   width: usize,
@@ -122,7 +135,12 @@ pub(crate) fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
   let mask: u16 = ((1u32 << BITS) - 1) as u16;
   let shift = BITS - 8;
   for n in 0..width {
-    rgba_out[n * 4 + 3] = ((alpha[n] & mask) >> shift) as u8;
+    let raw = if BE {
+      u16::from_be(alpha[n])
+    } else {
+      u16::from_le(alpha[n])
+    };
+    rgba_out[n * 4 + 3] = ((raw & mask) >> shift) as u8;
   }
 }
 
@@ -131,7 +149,17 @@ pub(crate) fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
 /// depth, masked to `(1 << BITS) - 1` so over-range source samples
 /// don't leak through (parity with the inline-α kernels — frame
 /// constructors admit raw u16 input above the BITS-bit native range).
-pub(crate) fn copy_alpha_plane_u16<const BITS: u32>(
+///
+/// `BE` selects the **byte order** of the encoded source α plane:
+/// `false` = LE on disk/wire, `true` = BE on disk/wire. Each raw u16
+/// sample is converted to host-native order via `u16::from_le` /
+/// `u16::from_be` BEFORE masking. On a host whose endianness matches
+/// the data, the conversion compiles to a no-op; otherwise it is a
+/// `swap_bytes`. Mirrors the `load_endian_u16x*::<BE>` SIMD pattern
+/// from #81 so scalar and SIMD stay byte-for-byte equivalent on every
+/// host. Without this, a BE host processing LE source data would emit
+/// a byte-reversed α plane.
+pub(crate) fn copy_alpha_plane_u16<const BITS: u32, const BE: bool>(
   alpha: &[u16],
   rgba_out: &mut [u16],
   width: usize,
@@ -143,7 +171,12 @@ pub(crate) fn copy_alpha_plane_u16<const BITS: u32>(
   debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
   let mask: u16 = ((1u32 << BITS) - 1) as u16;
   for n in 0..width {
-    rgba_out[n * 4 + 3] = alpha[n] & mask;
+    let raw = if BE {
+      u16::from_be(alpha[n])
+    } else {
+      u16::from_le(alpha[n])
+    };
+    rgba_out[n * 4 + 3] = raw & mask;
   }
 }
 
@@ -263,33 +296,51 @@ mod tests {
     );
   }
 
+  // ---- LE-host fixture tests ----
+  //
+  // The tests below use host-native `u16` literals (e.g.
+  // `vec![0x3FFu16, 0x1FF]`) as if they were the on-disk LE encoding of
+  // those samples and then call the kernel with `<BITS, BE = false>`
+  // (LE path). On a BE host (e.g., s390x under miri-sb), host-native
+  // `u16` storage does NOT lay bytes out little-endian, so the kernel's
+  // `u16::from_le` byte-swap correctly reinterprets the host-native
+  // value and produces a different logical value than the literal —
+  // making the assertion fail. The kernel is correct: its BE-host
+  // scalar correctness is locked down by the dedicated
+  // `*_be_parity_with_swapped_buffer` tests below, which build
+  // BE-encoded fixtures via `swap_bytes` from LE inputs and assert
+  // byte-for-byte parity. Gating these LE-fixture tests on
+  // `target_endian = "little"` avoids fixture-vs-kernel byte-order
+  // confusion without weakening coverage.
   #[test]
+  #[cfg(target_endian = "little")]
   fn copy_alpha_plane_u16_to_u8_depth_converts_at_each_bits_value() {
     // BITS=10
     let alpha: std::vec::Vec<u16> = std::vec![0x3FF, 0x1FF];
     let mut rgba = std::vec![1u8; 8];
-    copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba, 2);
+    copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba, 2);
     assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x7F]);
 
     // BITS=12
     let alpha: std::vec::Vec<u16> = std::vec![0xFFF, 0x800];
     let mut rgba = std::vec![1u8; 8];
-    copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba, 2);
+    copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba, 2);
     assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x80]);
 
     // BITS=16
     let alpha: std::vec::Vec<u16> = std::vec![0xFFFF, 0x8000];
     let mut rgba = std::vec![1u8; 8];
-    copy_alpha_plane_u16_to_u8::<16>(&alpha, &mut rgba, 2);
+    copy_alpha_plane_u16_to_u8::<16, false>(&alpha, &mut rgba, 2);
     assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x80]);
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn copy_alpha_plane_u16_preserves_native_u16_within_bits_range() {
     // In-range values pass through unchanged.
     let alpha: std::vec::Vec<u16> = std::vec![0x3FF, 0x1FF, 0x000];
     let mut rgba = std::vec![1u16; 12];
-    copy_alpha_plane_u16::<10>(&alpha, &mut rgba, 3);
+    copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba, 3);
     assert_eq!(
       rgba,
       std::vec![1, 1, 1, 0x3FF, 1, 1, 1, 0x1FF, 1, 1, 1, 0x000]
@@ -297,6 +348,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn copy_alpha_plane_u16_masks_overrange_to_bits_range() {
     // Over-range α (e.g., 0xFFFF at BITS=10) must be masked to low BITS.
     // Without the mask, raw u16 0xFFFF would leak straight to output and
@@ -304,7 +356,7 @@ mod tests {
     // diverging from the inline-α scalar reference.
     let alpha: std::vec::Vec<u16> = std::vec![0xFFFF, 0x0500, 0x07FF];
     let mut rgba = std::vec![1u16; 12];
-    copy_alpha_plane_u16::<10>(&alpha, &mut rgba, 3);
+    copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba, 3);
     assert_eq!(
       rgba,
       std::vec![1, 1, 1, 0x3FF, 1, 1, 1, 0x100, 1, 1, 1, 0x3FF]
@@ -312,6 +364,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn copy_alpha_plane_u16_to_u8_masks_overrange_then_shifts() {
     // Without the BITS mask, 0x0500 at BITS=10 would shift `>> 2` to
     // 320 and either narrow as u8 to 64 (scalar `as u8`) or saturate to
@@ -319,10 +372,47 @@ mod tests {
     // & 0x3FF = 0x100 → 0x100 >> 2 = 64 consistently across all paths.
     let alpha: std::vec::Vec<u16> = std::vec![0x0500, 0xFFFF, 0x03FF];
     let mut rgba = std::vec![1u8; 12];
-    copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba, 3);
+    copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba, 3);
     assert_eq!(rgba, std::vec![1, 1, 1, 64, 1, 1, 1, 0xFF, 1, 1, 1, 0xFF]);
   }
 
+  /// BE parity: byte-swapping the source α plane and toggling the `BE`
+  /// flag must yield byte-for-byte identical output. Locks down the
+  /// codex-flagged corruption where a BE host processing LE input
+  /// would otherwise emit a byte-reversed α slot. The synthesized
+  /// "BE-encoded" buffer is built by host-side `swap_bytes` on the LE
+  /// fixture; both `from_le` (LE flag) and `from_be` (BE flag with the
+  /// swapped buffer) recover the same logical u16 values, so the
+  /// outputs match on every host.
+  #[test]
+  fn copy_alpha_plane_u16_to_u8_be_parity_with_swapped_buffer() {
+    let alpha_le: std::vec::Vec<u16> = std::vec![0x3FF, 0x1FF, 0x0500, 0xFFFF, 0x07FF, 0x0123];
+    let alpha_be: std::vec::Vec<u16> = alpha_le.iter().map(|x| x.swap_bytes()).collect();
+    let mut rgba_le = std::vec![1u8; 24];
+    let mut rgba_be = std::vec![1u8; 24];
+    copy_alpha_plane_u16_to_u8::<10, false>(&alpha_le, &mut rgba_le, 6);
+    copy_alpha_plane_u16_to_u8::<10, true>(&alpha_be, &mut rgba_be, 6);
+    assert_eq!(
+      rgba_le, rgba_be,
+      "BE flag + byte-swapped buffer must match LE path"
+    );
+  }
+
+  /// BE parity for the u16-output variant.
+  #[test]
+  fn copy_alpha_plane_u16_be_parity_with_swapped_buffer() {
+    let alpha_le: std::vec::Vec<u16> = std::vec![0xFFFF, 0x0500, 0x07FF, 0x0123, 0x3FF, 0x000];
+    let alpha_be: std::vec::Vec<u16> = alpha_le.iter().map(|x| x.swap_bytes()).collect();
+    let mut rgba_le = std::vec![7u16; 24];
+    let mut rgba_be = std::vec![7u16; 24];
+    copy_alpha_plane_u16::<10, false>(&alpha_le, &mut rgba_le, 6);
+    copy_alpha_plane_u16::<10, true>(&alpha_be, &mut rgba_be, 6);
+    assert_eq!(
+      rgba_le, rgba_be,
+      "BE flag + byte-swapped buffer must match LE path"
+    );
+  }
+
   #[test]
   fn copy_alpha_ya_u8_extracts_alpha_from_odd_byte_slots() {
     // Ya8 packed layout: [Y0, A0, Y1, A1, Y2, A2]
diff --git a/src/row/scalar/planar_gbr_high_bit.rs b/src/row/scalar/planar_gbr_high_bit.rs
index b9c966df..9cdc8568 100644
--- a/src/row/scalar/planar_gbr_high_bit.rs
+++ b/src/row/scalar/planar_gbr_high_bit.rs
@@ -1,13 +1,15 @@
 //! Scalar reference kernels for high-bit-depth planar GBR sources
-//! (Tier 10b — `AV_PIX_FMT_GBRP{9,10,12,14,16}LE` /
-//! `AV_PIX_FMT_GBRAP{10,12,14,16}LE`).
+//! (Tier 10b — `AV_PIX_FMT_GBRP{9,10,12,14,16}LE/BE` /
+//! `AV_PIX_FMT_GBRAP{10,12,14,16}LE/BE`).
 //!
 //! `gbr_*` kernels (3-plane, no α) are const-generic over
-//! `BITS ∈ {9, 10, 12, 14, 16}`. `gbra_*` kernels (4-plane, with α)
-//! are const-generic over `BITS ∈ {10, 12, 14, 16}` — FFmpeg has no
-//! `GBRAP9` variant; only the 3-plane `GBRP9` exists at 9 bits.
+//! `BITS ∈ {9, 10, 12, 14, 16}` **and** `BE: bool` (endianness of the
+//! source planes). `gbra_*` kernels (4-plane, with α) are const-generic
+//! over `BITS ∈ {10, 12, 14, 16}` — FFmpeg has no `GBRAP9` variant;
+//! only the 3-plane `GBRP9` exists at 9 bits.
 //! No runtime branching on `BITS` — every `BITS - 8` shift is a
-//! const-eval expression resolved at monomorphisation.
+//! const-eval expression resolved at monomorphisation.  The `BE` branch is
+//! also const-folded away at monomorphisation time.
 //!
 //! # Output variants
 //!
@@ -34,18 +36,27 @@
 //!
 //! - u8: `0xFF`
 //! - u16: `(1u16 << BITS) - 1` (i.e., `511`, `1023`, `4095`, …)
+//!
+//! # Big-endian (`BE = true`) mode
+//!
+//! Each u16 sample is decoded to host-native order before masking and
+//! arithmetic: `u16::from_be` when `BE = true`, `u16::from_le` otherwise.
+//! `BE` is a const generic; the decode is a no-op when it matches the host.
 
 /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B`
 /// **bytes**, downshifting each sample by `BITS - 8`.
 ///
 /// Output order is **R, G, B** per pixel (FFmpeg `RGB24` convention).
 ///
+/// When `BE = true` each source element is decoded with `u16::from_be`
+/// (big-endian wire format → host-native value); otherwise `u16::from_le`.
+///
 /// # Panics (debug builds)
 ///
 /// Asserts that `g`, `b`, `r` each have at least `width` samples and
 /// `rgb_out` has at least `width * 3` bytes.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn gbr_to_rgb_high_bit_row<const BITS: u32>(
+pub(crate) fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -65,9 +76,24 @@ pub(crate) fn gbr_to_rgb_high_bit_row<const BITS: u32>(
   let mask: u16 = ((1u32 << BITS) - 1) as u16;
   let shift = BITS - 8;
   for x in 0..width {
-    let r_val = r[x] & mask;
-    let g_val = g[x] & mask;
-    let b_val = b[x] & mask;
+    let r_raw = if BE {
+      u16::from_be(r[x])
+    } else {
+      u16::from_le(r[x])
+    };
+    let g_raw = if BE {
+      u16::from_be(g[x])
+    } else {
+      u16::from_le(g[x])
+    };
+    let b_raw = if BE {
+      u16::from_be(b[x])
+    } else {
+      u16::from_le(b[x])
+    };
+    let r_val = r_raw & mask;
+    let g_val = g_raw & mask;
+    let b_val = b_raw & mask;
     let dst = x * 3;
     rgb_out[dst] = (r_val >> shift) as u8;
     rgb_out[dst + 1] = (g_val >> shift) as u8;
@@ -79,12 +105,14 @@ pub(crate) fn gbr_to_rgb_high_bit_row<const BITS: u32>(
 /// **`u16`** samples. Copies samples directly without shifting —
 /// output values are in `[0, (1 << BITS) - 1]`.
 ///
+/// When `BE = true` each source element is converted from big-endian to host order first.
+///
 /// # Panics (debug builds)
 ///
 /// Asserts that `g`, `b`, `r` each have at least `width` samples and
 /// `rgb_u16_out` has at least `width * 3` samples.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
+pub(crate) fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -103,9 +131,24 @@ pub(crate) fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
   debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short");
   let mask: u16 = ((1u32 << BITS) - 1) as u16;
   for x in 0..width {
-    let r_val = r[x] & mask;
-    let g_val = g[x] & mask;
-    let b_val = b[x] & mask;
+    let r_raw = if BE {
+      u16::from_be(r[x])
+    } else {
+      u16::from_le(r[x])
+    };
+    let g_raw = if BE {
+      u16::from_be(g[x])
+    } else {
+      u16::from_le(g[x])
+    };
+    let b_raw = if BE {
+      u16::from_be(b[x])
+    } else {
+      u16::from_le(b[x])
+    };
+    let r_val = r_raw & mask;
+    let g_val = g_raw & mask;
+    let b_val = b_raw & mask;
     let dst = x * 3;
     rgb_u16_out[dst] = r_val;
     rgb_u16_out[dst + 1] = g_val;
@@ -118,8 +161,9 @@ pub(crate) fn gbr_to_rgb_u16_high_bit_row<const BITS: u32>(
 /// `Gbrp*` sources (no alpha plane) when `with_rgba` is requested.
 ///
 /// Each sample is downshifted by `BITS - 8`.
+/// When `BE = true` each source element is converted from big-endian to host order first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
+pub(crate) fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -139,9 +183,24 @@ pub(crate) fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
   let mask: u16 = ((1u32 << BITS) - 1) as u16;
   let shift = BITS - 8;
   for x in 0..width {
-    let r_val = r[x] & mask;
-    let g_val = g[x] & mask;
-    let b_val = b[x] & mask;
+    let r_raw = if BE {
+      u16::from_be(r[x])
+    } else {
+      u16::from_le(r[x])
+    };
+    let g_raw = if BE {
+      u16::from_be(g[x])
+    } else {
+      u16::from_le(g[x])
+    };
+    let b_raw = if BE {
+      u16::from_be(b[x])
+    } else {
+      u16::from_le(b[x])
+    };
+    let r_val = r_raw & mask;
+    let g_val = g_raw & mask;
+    let b_val = b_raw & mask;
     let dst = x * 4;
     rgba_out[dst] = (r_val >> shift) as u8;
     rgba_out[dst + 1] = (g_val >> shift) as u8;
@@ -154,8 +213,9 @@ pub(crate) fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32>(
 /// **`u16`** samples with a constant **opaque** alpha
 /// (`(1u16 << BITS) - 1`). Used for `Gbrp*` sources (no alpha plane)
 /// when `with_rgba_u16` is requested. Copies samples directly.
+/// When `BE = true` each source element is converted from big-endian to host order first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
+pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -178,9 +238,24 @@ pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
   let mask: u16 = ((1u32 << BITS) - 1) as u16;
   let opaque: u16 = mask;
   for x in 0..width {
-    let r_val = r[x] & mask;
-    let g_val = g[x] & mask;
-    let b_val = b[x] & mask;
+    let r_raw = if BE {
+      u16::from_be(r[x])
+    } else {
+      u16::from_le(r[x])
+    };
+    let g_raw = if BE {
+      u16::from_be(g[x])
+    } else {
+      u16::from_le(g[x])
+    };
+    let b_raw = if BE {
+      u16::from_be(b[x])
+    } else {
+      u16::from_le(b[x])
+    };
+    let r_val = r_raw & mask;
+    let g_val = g_raw & mask;
+    let b_val = b_raw & mask;
     let dst = x * 4;
     rgba_u16_out[dst] = r_val;
     rgba_u16_out[dst + 1] = g_val;
@@ -192,8 +267,9 @@ pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32>(
 /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A`
 /// **bytes**. Alpha is sourced from the `a` plane (real per-pixel α).
 /// Each sample (including α) is downshifted by `BITS - 8`.
+/// When `BE = true` each source element is converted from big-endian to host order first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn gbra_to_rgba_high_bit_row<const BITS: u32>(
+pub(crate) fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -215,10 +291,30 @@ pub(crate) fn gbra_to_rgba_high_bit_row<const BITS: u32>(
   let mask: u16 = ((1u32 << BITS) - 1) as u16;
   let shift = BITS - 8;
   for x in 0..width {
-    let r_val = r[x] & mask;
-    let g_val = g[x] & mask;
-    let b_val = b[x] & mask;
-    let a_val = a[x] & mask;
+    let r_raw = if BE {
+      u16::from_be(r[x])
+    } else {
+      u16::from_le(r[x])
+    };
+    let g_raw = if BE {
+      u16::from_be(g[x])
+    } else {
+      u16::from_le(g[x])
+    };
+    let b_raw = if BE {
+      u16::from_be(b[x])
+    } else {
+      u16::from_le(b[x])
+    };
+    let a_raw = if BE {
+      u16::from_be(a[x])
+    } else {
+      u16::from_le(a[x])
+    };
+    let r_val = r_raw & mask;
+    let g_val = g_raw & mask;
+    let b_val = b_raw & mask;
+    let a_val = a_raw & mask;
     let dst = x * 4;
     rgba_out[dst] = (r_val >> shift) as u8;
     rgba_out[dst + 1] = (g_val >> shift) as u8;
@@ -230,8 +326,9 @@ pub(crate) fn gbra_to_rgba_high_bit_row<const BITS: u32>(
 /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A`
 /// **`u16`** samples. Alpha is sourced from the `a` plane at native
 /// depth (no shift). Copies all four channels directly.
+/// When `BE = true` each source element is converted from big-endian to host order first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
+pub(crate) fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -255,10 +352,30 @@ pub(crate) fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
   );
   let mask: u16 = ((1u32 << BITS) - 1) as u16;
   for x in 0..width {
-    let r_val = r[x] & mask;
-    let g_val = g[x] & mask;
-    let b_val = b[x] & mask;
-    let a_val = a[x] & mask;
+    let r_raw = if BE {
+      u16::from_be(r[x])
+    } else {
+      u16::from_le(r[x])
+    };
+    let g_raw = if BE {
+      u16::from_be(g[x])
+    } else {
+      u16::from_le(g[x])
+    };
+    let b_raw = if BE {
+      u16::from_be(b[x])
+    } else {
+      u16::from_le(b[x])
+    };
+    let a_raw = if BE {
+      u16::from_be(a[x])
+    } else {
+      u16::from_le(a[x])
+    };
+    let r_val = r_raw & mask;
+    let g_val = g_raw & mask;
+    let b_val = b_raw & mask;
+    let a_val = a_raw & mask;
     let dst = x * 4;
     rgba_u16_out[dst] = r_val;
     rgba_u16_out[dst + 1] = g_val;
@@ -280,8 +397,9 @@ pub(crate) fn gbra_to_rgba_u16_high_bit_row<const BITS: u32>(
 /// `full_range = false` → Y' ∈ `[16 << (BITS - 8), 235 << (BITS - 8)]`
 /// (limited / studio swing). The limited-range formula mirrors
 /// `rgb_to_luma_row` but scaled to native depth.
+/// When `BE = true` each source element is converted from big-endian to host order first.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn gbr_to_luma_u16_high_bit_row<const BITS: u32>(
+pub(crate) fn gbr_to_luma_u16_high_bit_row<const BITS: u32, const BE: bool>(
   g: &[u16],
   b: &[u16],
   r: &[u16],
@@ -311,9 +429,24 @@ pub(crate) fn gbr_to_luma_u16_high_bit_row<const BITS: u32>(
 
   if full_range {
     for x in 0..width {
-      let rv = (r[x] & mask) as i64;
-      let gv = (g[x] & mask) as i64;
-      let bv = (b[x] & mask) as i64;
+      let r_raw = if BE {
+        u16::from_be(r[x])
+      } else {
+        u16::from_le(r[x])
+      };
+      let g_raw = if BE {
+        u16::from_be(g[x])
+      } else {
+        u16::from_le(g[x])
+      };
+      let b_raw = if BE {
+        u16::from_be(b[x])
+      } else {
+        u16::from_le(b[x])
+      };
+      let rv = (r_raw & mask) as i64;
+      let gv = (g_raw & mask) as i64;
+      let bv = (b_raw & mask) as i64;
       let y = ((k_r * rv + k_g * gv + k_b * bv + RND) >> 15) as i32;
       luma_out[x] = y.clamp(0, native_max as i32) as u16;
     }
@@ -339,9 +472,24 @@ pub(crate) fn gbr_to_luma_u16_high_bit_row<const BITS: u32>(
     let y_max = (235i64) << (BITS - 8);
     let y_min = y_off;
     for x in 0..width {
-      let rv = (r[x] & mask) as i64;
-      let gv = (g[x] & mask) as i64;
-      let bv = (b[x] & mask) as i64;
+      let r_raw = if BE {
+        u16::from_be(r[x])
+      } else {
+        u16::from_le(r[x])
+      };
+      let g_raw = if BE {
+        u16::from_be(g[x])
+      } else {
+        u16::from_le(g[x])
+      };
+      let b_raw = if BE {
+        u16::from_be(b[x])
+      } else {
+        u16::from_le(b[x])
+      };
+      let rv = (r_raw & mask) as i64;
+      let gv = (g_raw & mask) as i64;
+      let bv = (b_raw & mask) as i64;
       let y_full = (k_r * rv + k_g * gv + k_b * bv + RND) >> 15;
       let y_full_clamped = y_full.clamp(0, native_max_i64);
       let y_lim = y_off + (y_full_clamped * range + native_max_i64 / 2) / native_max_i64;
@@ -357,29 +505,51 @@ mod tests {
   use super::*;
   use crate::ColorMatrix;
 
+  // ---- LE-host fixture tests ----
+  //
+  // The tests below use host-native `u16` literals (e.g. `[100u16; 1]`,
+  // `vec![400u16, 200u16, 0u16]`) as if they were the on-disk LE
+  // encoding of those samples and then call the kernel with
+  // `<BITS, BE = false>` (LE path). On a BE host (e.g., s390x under
+  // miri-sb), host-native `u16` storage does NOT lay bytes out
+  // little-endian, so the kernel's `u16::from_le` byte-swap correctly
+  // reinterprets the host-native value and produces a different
+  // logical value than the literal — making the assertion fail. The
+  // kernel is correct: its BE-host scalar correctness is locked down
+  // by the dedicated `scalar_*_be_parity_*` tests further below, which
+  // build BE-encoded fixtures via `byte_swap_vec` from LE inputs and
+  // assert byte-for-byte parity. Gating these LE-fixture tests on
+  // `target_endian = "little"` avoids fixture-vs-kernel byte-order
+  // confusion without weakening coverage.
+  // Tests with all-zero / all-`u16::MAX` (byte-symmetric) literals are
+  // intentionally NOT gated — `from_le` is a no-op on those bit
+  // patterns regardless of host endianness.
+
   // ---- gbr_to_rgb_high_bit_row: u8 output, downshift ----------------------
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_high_bit_bits10_channel_reorder() {
     // G=0, B=100, R=1000 → packed R,G,B = 1000>>2, 0>>2, 100>>2 = 250, 0, 25
     let g = [0u16; 1];
     let b = [100u16; 1];
     let r = [1000u16; 1];
     let mut out = [0u8; 3];
-    gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[0], 250); // R
     assert_eq!(out[1], 0); // G
     assert_eq!(out[2], 25); // B
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_high_bit_bits10_max_value_becomes_0xff() {
     let max = (1u16 << 10) - 1; // 1023
     let g = [max; 4];
     let b = [max; 4];
     let r = [max; 4];
     let mut out = [0u8; 12];
-    gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 4);
+    gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4);
     assert!(out.iter().all(|&v| v == 0xFF), "all pixels must be 0xFF");
   }
 
@@ -390,7 +560,7 @@ mod tests {
     let b = [max; 2];
     let r = [max; 2];
     let mut out = [0u8; 6];
-    gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out, 2);
+    gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out, 2);
     assert!(out.iter().all(|&v| v == 0xFF));
   }
 
@@ -400,33 +570,36 @@ mod tests {
     let b = [0u16; 2];
     let r = [0u16; 2];
     let mut out = [0xFFu8; 6];
-    gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 2);
+    gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2);
     assert!(out.iter().all(|&v| v == 0));
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_high_bit_bits9_downshift_by_1() {
     // BITS=9: shift = 1. Value 510 >> 1 = 255.
     let g = [510u16; 1];
     let b = [0u16; 1];
     let r = [0u16; 1];
     let mut out = [0u8; 3];
-    gbr_to_rgb_high_bit_row::<9>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgb_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[1], 255); // G channel
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_high_bit_bits12_downshift_by_4() {
     // BITS=12: shift = 4. Value 4080 >> 4 = 255.
     let r = [4080u16; 1];
     let g = [0u16; 1];
     let b = [0u16; 1];
     let mut out = [0u8; 3];
-    gbr_to_rgb_high_bit_row::<12>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgb_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[0], 255); // R channel
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_high_bit_multiple_pixels_correct_layout() {
     // 3 pixels: (R,G,B) = (100,200,300>>2=75), (200>>2=50,0,0), (0,150>>2=37,50>>2=12)
     // BITS=10, shift=2
@@ -434,7 +607,7 @@ mod tests {
     let g = [800u16, 0u16, 600u16];
     let b = [300u16, 0u16, 200u16];
     let mut out = [0u8; 9];
-    gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 3);
+    gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 3);
     // pixel 0: R=400>>2=100, G=800>>2=200, B=300>>2=75
     assert_eq!(out[0], 100);
     assert_eq!(out[1], 200);
@@ -452,25 +625,27 @@ mod tests {
   // ---- gbr_to_rgb_u16_high_bit_row: u16 output, no shift ------------------
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_u16_high_bit_channel_reorder() {
     let g = [111u16; 1];
     let b = [222u16; 1];
     let r = [333u16; 1];
     let mut out = [0u16; 3];
-    gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[0], 333); // R
     assert_eq!(out[1], 111); // G
     assert_eq!(out[2], 222); // B
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_u16_high_bit_bits10_max_preserved() {
     let max = (1u16 << 10) - 1; // 1023
     let g = [max; 4];
     let b = [max; 4];
     let r = [max; 4];
     let mut out = [0u16; 12];
-    gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 4);
+    gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4);
     assert!(out.iter().all(|&v| v == max));
   }
 
@@ -481,18 +656,19 @@ mod tests {
     let b = [max; 2];
     let r = [max; 2];
     let mut out = [0u16; 6];
-    gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 2);
+    gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 2);
     assert!(out.iter().all(|&v| v == max));
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_u16_high_bit_values_not_shifted() {
     // Verify that u16 output does NOT shift values (unlike u8 output).
     let g = [1000u16; 1];
     let b = [2000u16; 1];
     let r = [3000u16; 1];
     let mut out = [0u16; 3];
-    gbr_to_rgb_u16_high_bit_row::<12>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgb_u16_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[0], 3000); // R — unchanged
     assert_eq!(out[1], 1000); // G — unchanged
     assert_eq!(out[2], 2000); // B — unchanged
@@ -501,13 +677,14 @@ mod tests {
   // ---- gbr_to_rgba_opaque_high_bit_row: u8 RGBA with constant alpha --------
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgba_opaque_high_bit_bits10_alpha_is_0xff() {
     let max = (1u16 << 10) - 1;
     let g = [max; 4];
     let b = [max; 4];
     let r = [max; 4];
     let mut out = [0u8; 16];
-    gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out, 4);
+    gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4);
     for i in 0..4 {
       assert_eq!(out[i * 4 + 3], 0xFF, "alpha must be 0xFF at pixel {i}");
       assert_eq!(out[i * 4], 0xFF, "R must be 0xFF at pixel {i}");
@@ -515,13 +692,14 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgba_opaque_high_bit_bits9_downshift_correct() {
     // BITS=9, shift=1. Value 510 >> 1 = 255.
     let g = [510u16; 1];
     let b = [0u16; 1];
     let r = [0u16; 1];
     let mut out = [0u8; 4];
-    gbr_to_rgba_opaque_high_bit_row::<9>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgba_opaque_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[1], 255); // G
     assert_eq!(out[3], 0xFF); // alpha
   }
@@ -529,12 +707,13 @@ mod tests {
   // ---- gbr_to_rgba_opaque_u16_high_bit_row: u16 RGBA with constant alpha ---
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgba_opaque_u16_high_bit_bits10_alpha_is_1023() {
     let g = [500u16; 2];
     let b = [200u16; 2];
     let r = [800u16; 2];
     let mut out = [0u16; 8];
-    gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 2);
+    gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2);
     let opaque = (1u16 << 10) - 1; // 1023
     assert_eq!(out[3], opaque); // pixel 0 alpha
     assert_eq!(out[7], opaque); // pixel 1 alpha
@@ -549,7 +728,7 @@ mod tests {
     let b = [0u16; 1];
     let r = [0u16; 1];
     let mut out = [0u16; 4];
-    gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[3], u16::MAX);
   }
 
@@ -559,13 +738,14 @@ mod tests {
     let b = [0u16; 1];
     let r = [0u16; 1];
     let mut out = [0u16; 4];
-    gbr_to_rgba_opaque_u16_high_bit_row::<9>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgba_opaque_u16_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[3], (1u16 << 9) - 1); // 511
   }
 
   // ---- gbra_to_rgba_high_bit_row: u8 RGBA with source alpha ----------------
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbra_rgba_high_bit_bits10_source_alpha_downshifted() {
     // BITS=10, shift=2. Alpha value 512 >> 2 = 128.
     let g = [0u16; 1];
@@ -573,11 +753,12 @@ mod tests {
     let r = [0u16; 1];
     let a = [512u16; 1];
     let mut out = [0u8; 4];
-    gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1);
+    gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1);
     assert_eq!(out[3], 128); // alpha = 512 >> 2
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbra_rgba_high_bit_bits10_max_alpha_is_0xff() {
     let max = (1u16 << 10) - 1;
     let g = [max; 2];
@@ -585,13 +766,14 @@ mod tests {
     let r = [max; 2];
     let a = [max; 2];
     let mut out = [0u8; 8];
-    gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 2);
+    gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 2);
     for i in 0..2 {
       assert_eq!(out[i * 4 + 3], 0xFF, "alpha must be 0xFF at pixel {i}");
     }
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbra_rgba_high_bit_bits14_channel_reorder_and_shift() {
     // BITS=14, shift=6. R=16320 >> 6 = 255, G=0, B=0, A=8192 >> 6 = 128.
     let g = [0u16; 1];
@@ -599,7 +781,7 @@ mod tests {
     let r = [16320u16; 1];
     let a = [8192u16; 1];
     let mut out = [0u8; 4];
-    gbra_to_rgba_high_bit_row::<14>(&g, &b, &r, &a, &mut out, 1);
+    gbra_to_rgba_high_bit_row::<14, false>(&g, &b, &r, &a, &mut out, 1);
     assert_eq!(out[0], 255); // R
     assert_eq!(out[1], 0); // G
     assert_eq!(out[2], 0); // B
@@ -609,13 +791,14 @@ mod tests {
   // ---- gbra_to_rgba_u16_high_bit_row: u16 RGBA with source alpha -----------
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbra_rgba_u16_high_bit_source_alpha_preserved() {
     let g = [100u16; 1];
     let b = [200u16; 1];
     let r = [300u16; 1];
     let a = [777u16; 1];
     let mut out = [0u16; 4];
-    gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1);
+    gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1);
     assert_eq!(out[0], 300); // R
     assert_eq!(out[1], 100); // G
     assert_eq!(out[2], 200); // B
@@ -623,13 +806,14 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbra_rgba_u16_high_bit_bits16_all_channels_preserved() {
     let g = [10000u16; 2];
     let b = [20000u16; 2];
     let r = [30000u16; 2];
     let a = [40000u16; 2];
     let mut out = [0u16; 8];
-    gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out, 2);
+    gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out, 2);
     for i in 0..2 {
       assert_eq!(out[i * 4], 30000);
       assert_eq!(out[i * 4 + 1], 10000);
@@ -641,6 +825,7 @@ mod tests {
   // ---- Round-trip parity: high-bit u8 output matches 8-bit source ----------
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_high_bit_bits10_parity_with_scaled_8bit() {
     // val=128 in 8-bit; in 10-bit: 128 << 2 = 512. 512 >> 2 = 128.
     let val: u16 = 128u16 << 2;
@@ -648,11 +833,12 @@ mod tests {
     let b = [val; 8];
     let r = [val; 8];
     let mut out = [0u8; 24];
-    gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 8);
+    gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 8);
     assert!(out.iter().all(|&v| v == 128));
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn rgb_high_bit_bits12_parity_with_scaled_8bit() {
     // val=200 in 8-bit; in 12-bit: 200 << 4 = 3200. 3200 >> 4 = 200.
     let val: u16 = 200u16 << 4;
@@ -660,7 +846,7 @@ mod tests {
     let b = [val; 4];
     let r = [val; 4];
     let mut out = [0u8; 12];
-    gbr_to_rgb_high_bit_row::<12>(&g, &b, &r, &mut out, 4);
+    gbr_to_rgb_high_bit_row::<12, false>(&g, &b, &r, &mut out, 4);
     assert!(out.iter().all(|&v| v == 200));
   }
 
@@ -669,6 +855,7 @@ mod tests {
   // correctly before processing, ensuring scalar/SIMD produce identical output.
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbr_to_rgb_high_bit_masks_upper_bits_bits10() {
     // BITS=10, mask=0x03FF. Input 0x0CFF has upper bits set.
     // masked = 0x0CFF & 0x03FF = 0x00FF = 255. 255 >> 2 = 63 as u8.
@@ -679,7 +866,7 @@ mod tests {
     let b = [dirty; 1];
     let r = [dirty; 1];
     let mut out = [0u8; 3];
-    gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(
       out[0], expected_u8,
       "R must equal masked-then-shifted value"
@@ -695,6 +882,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbr_to_rgb_high_bit_masks_upper_bits_multiple_widths_bits10() {
     // Width sweep: [1, 7, 8, 16, 17, 32, 33, 64, 128, 130].
     let dirty: u16 = 0x0500; // BITS=10: mask&0x0500 = 0x0100=256; 256>>2=64.
@@ -705,7 +893,7 @@ mod tests {
       let b = std::vec![dirty; w];
       let r = std::vec![dirty; w];
       let mut out = std::vec![0u8; w * 3];
-      gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, w);
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, w);
       for i in 0..w {
         assert_eq!(out[i * 3], expected_u8, "R pixel {i} wrong at width {w}");
         assert_eq!(
@@ -723,6 +911,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbra_to_rgba_high_bit_masks_upper_bits_alpha_bits10() {
     // Verify that the alpha channel is also masked before shifting.
     // BITS=10: dirty_alpha = 0x0800 | 512 = 0x0A00 = 2560.
@@ -734,7 +923,7 @@ mod tests {
     let r = [dirty_rgb; 1];
     let a = [dirty_alpha; 1];
     let mut out = [0u8; 4];
-    gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1);
+    gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1);
     assert_eq!(out[0], 0, "R (dirty, masked to 0)");
     assert_eq!(out[1], 0, "G (dirty, masked to 0)");
     assert_eq!(out[2], 0, "B (dirty, masked to 0)");
@@ -742,6 +931,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbr_to_rgb_u16_high_bit_masks_upper_bits_bits10() {
     // u16-output: verify that masked sample is in the output (not raw dirty value).
     let dirty: u16 = 0x0CFF;
@@ -750,13 +940,14 @@ mod tests {
     let b = [dirty; 1];
     let r = [dirty; 1];
     let mut out = [0u16; 3];
-    gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[0], clean, "R u16 must be masked value");
     assert_eq!(out[1], clean, "G u16 must be masked value");
     assert_eq!(out[2], clean, "B u16 must be masked value");
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbra_to_rgba_u16_high_bit_masks_upper_bits_bits10() {
     // u16 RGBA output: all channels masked.
     let dirty: u16 = 0x0555; // BITS=10: masked = 0x0555 & 0x03FF = 0x0155 = 341.
@@ -766,7 +957,7 @@ mod tests {
     let r = [dirty; 1];
     let a = [dirty; 1];
     let mut out = [0u16; 4];
-    gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1);
+    gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1);
     assert_eq!(out[0], clean, "R u16 must be masked");
     assert_eq!(out[1], clean, "G u16 must be masked");
     assert_eq!(out[2], clean, "B u16 must be masked");
@@ -774,6 +965,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbr_to_rgba_opaque_high_bit_masks_upper_bits_bits10() {
     // u8 RGBA opaque: RGB channels masked, alpha always 0xFF.
     let dirty: u16 = 0x0CFF; // masked & 0x03FF = 0x00FF = 255. 255>>2=63.
@@ -783,7 +975,7 @@ mod tests {
     let b = [dirty; 1];
     let r = [dirty; 1];
     let mut out = [0u8; 4];
-    gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[0], expected_u8, "R must be masked");
     assert_eq!(out[1], expected_u8, "G must be masked");
     assert_eq!(out[2], expected_u8, "B must be masked");
@@ -791,6 +983,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbr_to_rgba_opaque_u16_high_bit_masks_upper_bits_bits10() {
     // u16 RGBA opaque: RGB masked, alpha is opaque mask value.
     let dirty: u16 = 0x0CFF; // masked = 0x00FF = 255.
@@ -799,7 +992,7 @@ mod tests {
     let b = [dirty; 1];
     let r = [dirty; 1];
     let mut out = [0u16; 4];
-    gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1);
+    gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1);
     assert_eq!(out[0], clean, "R u16 must be masked");
     assert_eq!(out[1], clean, "G u16 must be masked");
     assert_eq!(out[2], clean, "B u16 must be masked");
@@ -815,7 +1008,7 @@ mod tests {
     let b = [val; 2];
     let r = [val; 2];
     let mut out = [0u8; 6];
-    gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out, 2);
+    gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out, 2);
     assert!(
       out.iter().all(|&v| v == 0xFF),
       "BITS=16: max sample => 0xFF"
@@ -825,6 +1018,7 @@ mod tests {
   // ---- Cross-path consistency: direct GBRA vs masked RGB + separate alpha ---
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn gbra_to_rgba_high_bit_cross_path_consistency_bits10() {
     // With upper-bits-set alpha: direct gbra_to_rgba == manual masking.
     // BITS=10, dirty_alpha = 0x0800 | 0x0100 = 0x0900; masked=0x0100=256; 256>>2=64.
@@ -839,12 +1033,12 @@ mod tests {
 
     // Direct path
     let mut out_direct = [0u8; 4];
-    gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_direct, 1);
+    gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_direct, 1);
 
     // Manual path: apply mask to alpha, call with clean value
     let a_clean = [clean_alpha; 1];
     let mut out_manual = [0u8; 4];
-    gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a_clean, &mut out_manual, 1);
+    gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a_clean, &mut out_manual, 1);
 
     assert_eq!(
       out_direct, out_manual,
@@ -856,6 +1050,7 @@ mod tests {
   // ---- gbr_to_luma_u16_high_bit_row: native-depth luma --------------------
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn luma_u16_high_bit_bits10_max_white_not_banded() {
     // BITS=10: max = 1023. Old path gave (255 as u16) << 2 = 1020, not 1023.
     // New kernel must produce a value near 1023 for all-white input.
@@ -864,7 +1059,7 @@ mod tests {
     let b = [max; 1];
     let r = [max; 1];
     let mut out = [0u16; 1];
-    gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true);
+    gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true);
     // For BT.709 full-range all-white: Y = round(Kr*max + Kg*max + Kb*max).
     // = round((6966 + 23436 + 2366) / 32768 * 1023) ≈ round(32768/32768 * 1023) = 1023.
     assert!(
@@ -879,6 +1074,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn luma_u16_high_bit_bits12_max_white_not_banded() {
     // BITS=12: max = 4095. Old path: (255 as u16) << 4 = 4080.
     // New kernel should give a value in [4090, 4095].
@@ -887,7 +1083,7 @@ mod tests {
     let b = [max; 1];
     let r = [max; 1];
     let mut out = [0u16; 1];
-    gbr_to_luma_u16_high_bit_row::<12>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt601, true);
+    gbr_to_luma_u16_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt601, true);
     assert!(
       out[0] >= 4090,
       "max-white luma_u16 bits12 must be near 4095 (was {})",
@@ -905,7 +1101,7 @@ mod tests {
     let b = [max; 1];
     let r = [max; 1];
     let mut out = [0u16; 1];
-    gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true);
+    gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true);
     assert!(
       out[0] >= 65520,
       "max-white luma_u16 bits16 must be near 65535 (was {}), old banded gives 65280",
@@ -915,6 +1111,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn luma_u16_high_bit_bits10_neutral_gray_midrange() {
     // BITS=10: mid = 512. Luma of neutral gray ≈ 512.
     let mid = 512u16;
@@ -922,7 +1119,7 @@ mod tests {
     let b = [mid; 1];
     let r = [mid; 1];
     let mut out = [0u16; 1];
-    gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true);
+    gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true);
     assert!(
       out[0] >= 510 && out[0] <= 514,
       "neutral gray luma_u16 must be ~512 (was {})",
@@ -936,11 +1133,12 @@ mod tests {
     let b = [0u16; 2];
     let r = [0u16; 2];
     let mut out = [0xFFFFu16; 2];
-    gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 2, ColorMatrix::Bt709, true);
+    gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2, ColorMatrix::Bt709, true);
     assert!(out.iter().all(|&v| v == 0), "all-black must give zero luma");
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn luma_u16_high_bit_bits10_full_range_vs_limited_range() {
     // For mid-gray input, limited-range luma should be in [16<<2, 235<<2] = [64, 940].
     let mid = 512u16;
@@ -949,8 +1147,24 @@ mod tests {
     let r = [mid; 1];
     let mut out_full = [0u16; 1];
     let mut out_lim = [0u16; 1];
-    gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out_full, 1, ColorMatrix::Bt601, true);
-    gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out_lim, 1, ColorMatrix::Bt601, false);
+    gbr_to_luma_u16_high_bit_row::<10, false>(
+      &g,
+      &b,
+      &r,
+      &mut out_full,
+      1,
+      ColorMatrix::Bt601,
+      true,
+    );
+    gbr_to_luma_u16_high_bit_row::<10, false>(
+      &g,
+      &b,
+      &r,
+      &mut out_lim,
+      1,
+      ColorMatrix::Bt601,
+      false,
+    );
     let y_off = 16u16 << 2; // 64
     let y_max = 235u16 << 2; // 940
     assert!(
@@ -976,7 +1190,7 @@ mod tests {
     let b = [0u16; 1];
     let r = [0u16; 1];
     let mut out = [0u16; 1];
-    gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false);
+    gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false);
     let y_off = 16u16 << 8; // 4096
     assert_eq!(
       out[0], y_off,
@@ -1001,7 +1215,7 @@ mod tests {
     let b = [u16::MAX; 1];
     let r = [u16::MAX; 1];
     let mut out = [0u16; 1];
-    gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false);
+    gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false);
     let y_max = 235u16 << 8; // 60160
     assert_eq!(
       out[0], y_max,
@@ -1010,6 +1224,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn luma_u16_high_bit_bits16_limited_range_near_white_keeps_gradation() {
     // BITS=16, BT.709 luma weights ≈ Kr=0.2126, Kg=0.7152, Kb=0.0722.
     // Setting all 3 channels equal makes the matrix multiply produce
@@ -1022,7 +1237,7 @@ mod tests {
       let b = [v; 1];
       let r = [v; 1];
       let mut out = [0u16; 1];
-      gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false);
+      gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false);
       // Native-depth limited-range: y_lim = 4096 + v × 56064 / 65535
       let expected = 4096 + ((v as u64 * 56064 + 65535 / 2) / 65535) as u16;
       // Allow ±1 LSB for matrix-multiply rounding (BT.709 weights aren't
@@ -1044,6 +1259,7 @@ mod tests {
   }
 
   #[test]
+  #[cfg(target_endian = "little")]
   fn luma_u16_high_bit_bits10_limited_range_endpoints() {
     // BITS=10: y_off=64 (=16<<2), y_max=940 (=235<<2), native_max=1023.
     // BT.709 luma at all-equal channels passes y_full ≈ input through.
@@ -1054,7 +1270,7 @@ mod tests {
       let b = [input; 1];
       let r = [input; 1];
       let mut out = [0u16; 1];
-      gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false);
+      gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false);
       let diff = (out[0] as i32 - expected as i32).abs();
       assert!(
         diff <= 1,
@@ -1063,4 +1279,164 @@ mod tests {
       );
     }
   }
+
+  // ---- BE vs LE parity: scalar<BITS, true> on byte-swapped input must match --
+  // scalar<BITS, false> on the original. 5 kernels at BITS 10; gbr_to_rgb at 16.
+
+  fn byte_swap_vec(v: &[u16]) -> std::vec::Vec<u16> {
+    v.iter().map(|x| x.swap_bytes()).collect()
+  }
+
+  fn rand_plane<const BITS: u32>(seed: u32, n: usize) -> std::vec::Vec<u16> {
+    let mask = (1u32 << BITS) - 1;
+    let mut s = seed;
+    (0..n)
+      .map(|_| {
+        s = s.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+        (s & mask) as u16
+      })
+      .collect()
+  }
+
+  #[test]
+  fn scalar_gbr_to_rgb_high_bit_be_parity_bits10() {
+    for w in [1usize, 7, 8, 9, 17, 33, 65] {
+      let g = rand_plane::<10>(0xAAAA, w);
+      let b = rand_plane::<10>(0xBBBB, w);
+      let r = rand_plane::<10>(0xCCCC, w);
+      let mut out_le = std::vec![0u8; w * 3];
+      let mut out_be = std::vec![0u8; w * 3];
+      gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<10, true>(
+        &byte_swap_vec(&g),
+        &byte_swap_vec(&b),
+        &byte_swap_vec(&r),
+        &mut out_be,
+        w,
+      );
+      assert_eq!(
+        out_le, out_be,
+        "scalar BE/LE mismatch gbr_to_rgb bits10 w={w}"
+      );
+    }
+  }
+
+  #[test]
+  fn scalar_gbr_to_rgb_high_bit_be_parity_bits16() {
+    for w in [1usize, 7, 8, 9, 17, 33, 65] {
+      let g = rand_plane::<16>(0xAAAA, w);
+      let b = rand_plane::<16>(0xBBBB, w);
+      let r = rand_plane::<16>(0xCCCC, w);
+      let mut out_le = std::vec![0u8; w * 3];
+      let mut out_be = std::vec![0u8; w * 3];
+      gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_high_bit_row::<16, true>(
+        &byte_swap_vec(&g),
+        &byte_swap_vec(&b),
+        &byte_swap_vec(&r),
+        &mut out_be,
+        w,
+      );
+      assert_eq!(
+        out_le, out_be,
+        "scalar BE/LE mismatch gbr_to_rgb bits16 w={w}"
+      );
+    }
+  }
+
+  #[test]
+  fn scalar_gbr_to_rgba_opaque_high_bit_be_parity_bits10() {
+    for w in [1usize, 7, 8, 9, 17] {
+      let g = rand_plane::<10>(0xAAAA, w);
+      let b = rand_plane::<10>(0xBBBB, w);
+      let r = rand_plane::<10>(0xCCCC, w);
+      let mut out_le = std::vec![0u8; w * 4];
+      let mut out_be = std::vec![0u8; w * 4];
+      gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgba_opaque_high_bit_row::<10, true>(
+        &byte_swap_vec(&g),
+        &byte_swap_vec(&b),
+        &byte_swap_vec(&r),
+        &mut out_be,
+        w,
+      );
+      assert_eq!(
+        out_le, out_be,
+        "scalar BE/LE mismatch gbr_to_rgba_opaque bits10 w={w}"
+      );
+    }
+  }
+
+  #[test]
+  fn scalar_gbra_to_rgba_high_bit_be_parity_bits10() {
+    for w in [1usize, 7, 8, 9, 17] {
+      let g = rand_plane::<10>(0xAAAA, w);
+      let b = rand_plane::<10>(0xBBBB, w);
+      let r = rand_plane::<10>(0xCCCC, w);
+      let a = rand_plane::<10>(0xDDDD, w);
+      let mut out_le = std::vec![0u8; w * 4];
+      let mut out_be = std::vec![0u8; w * 4];
+      gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_high_bit_row::<10, true>(
+        &byte_swap_vec(&g),
+        &byte_swap_vec(&b),
+        &byte_swap_vec(&r),
+        &byte_swap_vec(&a),
+        &mut out_be,
+        w,
+      );
+      assert_eq!(
+        out_le, out_be,
+        "scalar BE/LE mismatch gbra_to_rgba bits10 w={w}"
+      );
+    }
+  }
+
+  #[test]
+  fn scalar_gbr_to_rgb_u16_high_bit_be_parity_bits10() {
+    for w in [1usize, 7, 8, 9, 17] {
+      let g = rand_plane::<10>(0xAAAA, w);
+      let b = rand_plane::<10>(0xBBBB, w);
+      let r = rand_plane::<10>(0xCCCC, w);
+      let mut out_le = std::vec![0u16; w * 3];
+      let mut out_be = std::vec![0u16; w * 3];
+      gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+      gbr_to_rgb_u16_high_bit_row::<10, true>(
+        &byte_swap_vec(&g),
+        &byte_swap_vec(&b),
+        &byte_swap_vec(&r),
+        &mut out_be,
+        w,
+      );
+      assert_eq!(
+        out_le, out_be,
+        "scalar BE/LE mismatch gbr_to_rgb_u16 bits10 w={w}"
+      );
+    }
+  }
+
+  #[test]
+  fn scalar_gbra_to_rgba_u16_high_bit_be_parity_bits10() {
+    for w in [1usize, 7, 8, 9, 17] {
+      let g = rand_plane::<10>(0xAAAA, w);
+      let b = rand_plane::<10>(0xBBBB, w);
+      let r = rand_plane::<10>(0xCCCC, w);
+      let a = rand_plane::<10>(0xDDDD, w);
+      let mut out_le = std::vec![0u16; w * 4];
+      let mut out_be = std::vec![0u16; w * 4];
+      gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+      gbra_to_rgba_u16_high_bit_row::<10, true>(
+        &byte_swap_vec(&g),
+        &byte_swap_vec(&b),
+        &byte_swap_vec(&r),
+        &byte_swap_vec(&a),
+        &mut out_be,
+        w,
+      );
+      assert_eq!(
+        out_le, out_be,
+        "scalar BE/LE mismatch gbra_to_rgba_u16 bits10 w={w}"
+      );
+    }
+  }
 }
diff --git a/src/sinker/mixed/planar_gbr_high_bit.rs b/src/sinker/mixed/planar_gbr_high_bit.rs
index f28432b9..f1a6479c 100644
--- a/src/sinker/mixed/planar_gbr_high_bit.rs
+++ b/src/sinker/mixed/planar_gbr_high_bit.rs
@@ -237,7 +237,14 @@ macro_rules! impl_gbrp_high_bit {
           let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
           let rgba_u16_row =
             rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-          gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(g_in, b_in, r_in, rgba_u16_row, w, use_simd);
+          gbr_to_rgba_opaque_u16_high_bit_row::<BITS, false>(
+            g_in,
+            b_in,
+            r_in,
+            rgba_u16_row,
+            w,
+            use_simd,
+          );
         } else if want_rgb_u16 {
           let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
           let rgb_plane_end =
@@ -250,7 +257,7 @@ macro_rules! impl_gbrp_high_bit {
               })?;
           let rgb_plane_start = one_plane_start * 3;
           let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-          gbr_to_rgb_u16_high_bit_row::<BITS>(g_in, b_in, r_in, rgb_u16_row, w, use_simd);
+          gbr_to_rgb_u16_high_bit_row::<BITS, false>(g_in, b_in, r_in, rgb_u16_row, w, use_simd);
           if want_rgba_u16 {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
@@ -264,7 +271,7 @@ macro_rules! impl_gbrp_high_bit {
         // going through the u8 staging path, so it is independent of whether
         // RGB staging happens below.
         if let Some(luma_u16_buf) = luma_u16.as_deref_mut() {
-          gbr_to_luma_u16_high_bit_row::<BITS>(
+          gbr_to_luma_u16_high_bit_row::<BITS, false>(
             g_in,
             b_in,
             r_in,
@@ -287,7 +294,7 @@ macro_rules! impl_gbrp_high_bit {
         if want_rgba && !need_rgb_staging {
           let rgba_buf = rgba.as_deref_mut().unwrap();
           let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-          gbr_to_rgba_opaque_high_bit_row::<BITS>(g_in, b_in, r_in, rgba_row, w, use_simd);
+          gbr_to_rgba_opaque_high_bit_row::<BITS, false>(g_in, b_in, r_in, rgba_row, w, use_simd);
           return Ok(());
         }
 
@@ -304,7 +311,7 @@ macro_rules! impl_gbrp_high_bit {
           w,
           h,
         )?;
-        gbr_to_rgb_high_bit_row::<BITS>(g_in, b_in, r_in, rgb_row, w, use_simd);
+        gbr_to_rgb_high_bit_row::<BITS, false>(g_in, b_in, r_in, rgb_row, w, use_simd);
 
         if let Some(luma) = luma.as_deref_mut() {
           rgb_to_luma_row(
@@ -519,7 +526,15 @@ macro_rules! impl_gbrap_high_bit {
           let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
           let rgba_u16_row =
             rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-          gbra_to_rgba_u16_high_bit_row::<BITS>(g_in, b_in, r_in, a_in, rgba_u16_row, w, use_simd);
+          gbra_to_rgba_u16_high_bit_row::<BITS, false>(
+            g_in,
+            b_in,
+            r_in,
+            a_in,
+            rgba_u16_row,
+            w,
+            use_simd,
+          );
         } else if want_rgb_u16 {
           let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
           let rgb_plane_end =
@@ -532,7 +547,7 @@ macro_rules! impl_gbrap_high_bit {
               })?;
           let rgb_plane_start = one_plane_start * 3;
           let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-          gbr_to_rgb_u16_high_bit_row::<BITS>(g_in, b_in, r_in, rgb_u16_row, w, use_simd);
+          gbr_to_rgb_u16_high_bit_row::<BITS, false>(g_in, b_in, r_in, rgb_u16_row, w, use_simd);
           if want_rgba_u16 {
             // Strategy A+: expand RGB → RGBA, then overwrite α from source plane.
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
@@ -540,7 +555,11 @@ macro_rules! impl_gbrap_high_bit {
               rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
             expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
             // Overwrite α slot from source plane (native depth, no shift).
-            alpha_extract::copy_alpha_plane_u16::<BITS>(a_in, rgba_u16_row, w, use_simd);
+            // BE flag hard-wired to `false`: this sinker only handles LE-encoded
+            // GBR/GBRA inputs today (Tier 10b), matching the LE-only `false` in
+            // the sibling `gbr_to_rgb_u16_high_bit_row::<BITS, false>` call.
+            // Phase 4 will wire the kernel's `<const BE: bool>` through here.
+            alpha_extract::copy_alpha_plane_u16::<BITS, false>(a_in, rgba_u16_row, w, use_simd);
           }
         }
 
@@ -549,7 +568,7 @@ macro_rules! impl_gbrap_high_bit {
         // going through the u8 staging path, so it is independent of whether
         // RGB staging happens below.
         if let Some(luma_u16_buf) = luma_u16.as_deref_mut() {
-          gbr_to_luma_u16_high_bit_row::<BITS>(
+          gbr_to_luma_u16_high_bit_row::<BITS, false>(
             g_in,
             b_in,
             r_in,
@@ -572,7 +591,7 @@ macro_rules! impl_gbrap_high_bit {
         if want_rgba && !need_rgb_staging {
           let rgba_buf = rgba.as_deref_mut().unwrap();
           let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-          gbra_to_rgba_high_bit_row::<BITS>(g_in, b_in, r_in, a_in, rgba_row, w, use_simd);
+          gbra_to_rgba_high_bit_row::<BITS, false>(g_in, b_in, r_in, a_in, rgba_row, w, use_simd);
           return Ok(());
         }
 
@@ -589,7 +608,7 @@ macro_rules! impl_gbrap_high_bit {
           w,
           h,
         )?;
-        gbr_to_rgb_high_bit_row::<BITS>(g_in, b_in, r_in, rgb_row, w, use_simd);
+        gbr_to_rgb_high_bit_row::<BITS, false>(g_in, b_in, r_in, rgb_row, w, use_simd);
 
         if let Some(luma) = luma.as_deref_mut() {
           rgb_to_luma_row(
@@ -618,7 +637,8 @@ macro_rules! impl_gbrap_high_bit {
           // overwrite α bytes from the source A plane.
           let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
           expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
-          alpha_extract::copy_alpha_plane_u16_to_u8::<BITS>(a_in, rgba_row, w, use_simd);
+          // BE flag hard-wired to `false`: see the rgba_u16 branch above.
+          alpha_extract::copy_alpha_plane_u16_to_u8::<BITS, false>(a_in, rgba_row, w, use_simd);
         }
 
         Ok(())
diff --git a/src/sinker/mixed/tests/planar_gbr_high_bit.rs b/src/sinker/mixed/tests/planar_gbr_high_bit.rs
index 83cf13c4..d49e573e 100644
--- a/src/sinker/mixed/tests/planar_gbr_high_bit.rs
+++ b/src/sinker/mixed/tests/planar_gbr_high_bit.rs
@@ -140,10 +140,13 @@ test_gbrp_channel_reorder!(gbrp16_channel_reorder, Gbrp16, gbrp16_to, 16);
 
 macro_rules! test_gbrap_strategy_a_plus {
   ($name:ident, $marker:ident, $walker:ident, $bits:literal) => {
+    test_gbrap_strategy_a_plus!($name, $marker, $walker, $bits, 32);
+  };
+  ($name:ident, $marker:ident, $walker:ident, $bits:literal, $w:literal) => {
     #[test]
     #[cfg_attr(miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri")]
     fn $name() {
-      let w = 32usize;
+      let w = $w as usize;
       let h = 8usize;
       let n = w * h;
       let mut g = std::vec![0u16; n];
@@ -177,7 +180,7 @@ macro_rules! test_gbrap_strategy_a_plus {
       // RGBA bytes must be identical between standalone and combo paths.
       assert_eq!(
         rgba_ref, rgba_combo,
-        "Strategy A+ RGBA mismatch for BITS={}", $bits,
+        "Strategy A+ RGBA mismatch for BITS={} w={}", $bits, $w,
       );
     }
   };
@@ -208,6 +211,151 @@ test_gbrap_strategy_a_plus!(
   16
 );
 
+// ---- Strategy A+: Gbrap combo RGB_u16+RGBA_u16 matches standalone RGBA_u16 -
+//
+// Mirrors the u8 Strategy A+ test above, but covers the native-depth combo
+// path (`with_rgb_u16` + `with_rgba_u16`) that routes through
+// `copy_alpha_plane_u16` rather than `copy_alpha_plane_u16_to_u8`. Without
+// this, a regression in the dispatcher's endianness routing (SIMD only for
+// LE-host/LE-data) or in the scalar α-extract helper would not be caught
+// for the native-depth path.
+//
+// Source planes are filled with full-range u16 values (`bits=16` argument
+// to `pseudo_random_u16_low_n_bits`) so the upper bits beyond BITS are
+// "dirty" — both paths must mask via `(1 << BITS) - 1`, so any drift between
+// them surfaces here.
+macro_rules! test_gbrap_strategy_a_plus_u16 {
+  ($name:ident, $marker:ident, $walker:ident, $bits:literal) => {
+    test_gbrap_strategy_a_plus_u16!($name, $marker, $walker, $bits, 32);
+  };
+  ($name:ident, $marker:ident, $walker:ident, $bits:literal, $w:literal) => {
+    #[test]
+    #[cfg_attr(miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri")]
+    fn $name() {
+      let w = $w as usize;
+      let h = 8usize;
+      let n = w * h;
+      let mut g = std::vec![0u16; n];
+      let mut b = std::vec![0u16; n];
+      let mut r = std::vec![0u16; n];
+      let mut a = std::vec![0u16; n];
+      // Use full-range u16 (bits=16) so upper bits beyond BITS are dirty,
+      // exercising the mask in both the direct kernel and α-extract paths.
+      pseudo_random_u16_low_n_bits(&mut g, 0x55_u32.wrapping_add($bits), 16);
+      pseudo_random_u16_low_n_bits(&mut b, 0x66_u32.wrapping_add($bits), 16);
+      pseudo_random_u16_low_n_bits(&mut r, 0x77_u32.wrapping_add($bits), 16);
+      pseudo_random_u16_low_n_bits(&mut a, 0x88_u32.wrapping_add($bits), 16);
+
+      // Reference: standalone with_rgba_u16 (direct 4-channel kernel).
+      let src_ref = solid_gbrap_frame::<$bits>(&g, &b, &r, &a, w as u32, h as u32);
+      let mut rgba_u16_ref = std::vec![0u16; n * 4];
+      let mut sink_ref = MixedSinker::<crate::yuv::$marker>::new(w, h)
+        .with_rgba_u16(&mut rgba_u16_ref)
+        .unwrap();
+      crate::yuv::$walker(&src_ref, false, ColorMatrix::Bt709, &mut sink_ref).unwrap();
+
+      // Combo: with_rgb_u16 + with_rgba_u16 (Strategy A+ native-depth).
+      let src_combo = solid_gbrap_frame::<$bits>(&g, &b, &r, &a, w as u32, h as u32);
+      let mut rgb_u16_combo = std::vec![0u16; n * 3];
+      let mut rgba_u16_combo = std::vec![0u16; n * 4];
+      let mut sink_combo = MixedSinker::<crate::yuv::$marker>::new(w, h)
+        .with_rgb_u16(&mut rgb_u16_combo)
+        .unwrap()
+        .with_rgba_u16(&mut rgba_u16_combo)
+        .unwrap();
+      crate::yuv::$walker(&src_combo, false, ColorMatrix::Bt709, &mut sink_combo).unwrap();
+
+      // RGBA u16 elements must be byte-exact between standalone and combo paths.
+      assert_eq!(
+        rgba_u16_ref, rgba_u16_combo,
+        "Strategy A+ native-depth RGBA u16 mismatch for BITS={} w={}", $bits, $w,
+      );
+    }
+  };
+}
+
+test_gbrap_strategy_a_plus_u16!(
+  gbrap10_strategy_a_plus_u16_matches_standalone,
+  Gbrap10,
+  gbrap10_to,
+  10
+);
+test_gbrap_strategy_a_plus_u16!(
+  gbrap12_strategy_a_plus_u16_matches_standalone,
+  Gbrap12,
+  gbrap12_to,
+  12
+);
+test_gbrap_strategy_a_plus_u16!(
+  gbrap14_strategy_a_plus_u16_matches_standalone,
+  Gbrap14,
+  gbrap14_to,
+  14
+);
+test_gbrap_strategy_a_plus_u16!(
+  gbrap16_strategy_a_plus_u16_matches_standalone,
+  Gbrap16,
+  gbrap16_to,
+  16
+);
+
+// ---- Strategy A+ at width 31 (not a SIMD-block multiple) — scalar tail ----
+//
+// The SIMD α-extract backends (`copy_alpha_plane_u16{_to_u8}`) hardcode
+// `scalar::<BITS, false>` for the tail (e.g. NEON block size 8 + width 31
+// leaves 7 px in the tail; AVX2/AVX-512 likewise). Codex's 4th-pass review
+// of PR #82 found that the prior dispatcher routing
+// (`need_swap = BE != cfg!(target_endian = "big")`) admitted SIMD on
+// BE-host/BE-data: the vector body's host-native loads are correct there,
+// but the LE-only scalar tail then byte-swaps already-native u16 samples,
+// silently corrupting α at non-multiple widths. The fix is to route SIMD
+// only for the LE-host/LE-data quadrant; these tests at width 31 exercise
+// the SIMD tail path on supported (LE) hosts, locking in the parity
+// guarantee for the LE/LE quadrant. (The LE/BE, BE/LE, BE/BE quadrants
+// are exercised at the scalar level by the `target_endian`-aware scalar
+// helper itself; the new dispatcher routes them to scalar always.)
+
+test_gbrap_strategy_a_plus_u16!(
+  gbrap10_strategy_a_plus_u16_matches_standalone_w31,
+  Gbrap10,
+  gbrap10_to,
+  10,
+  31
+);
+test_gbrap_strategy_a_plus_u16!(
+  gbrap12_strategy_a_plus_u16_matches_standalone_w31,
+  Gbrap12,
+  gbrap12_to,
+  12,
+  31
+);
+test_gbrap_strategy_a_plus_u16!(
+  gbrap14_strategy_a_plus_u16_matches_standalone_w31,
+  Gbrap14,
+  gbrap14_to,
+  14,
+  31
+);
+test_gbrap_strategy_a_plus_u16!(
+  gbrap16_strategy_a_plus_u16_matches_standalone_w31,
+  Gbrap16,
+  gbrap16_to,
+  16,
+  31
+);
+
+// u8-path Strategy A+ at width 31 — exercises the SIMD tail of
+// `copy_alpha_plane_u16_to_u8` (depth-conv `>> (BITS - 8)`). One BITS value
+// is sufficient to cover the same dispatcher path as the u16 set above;
+// Gbrap10 chosen for parity with the existing u8 Strategy A+ coverage.
+test_gbrap_strategy_a_plus!(
+  gbrap10_strategy_a_plus_matches_standalone_w31,
+  Gbrap10,
+  gbrap10_to,
+  10,
+  31
+);
+
 // ---- Gbrap alpha downshift correctness -------------------------------------
 
 macro_rules! test_gbrap_alpha_downshift {
diff --git a/src/sinker/mixed/yuva_4_2_0.rs b/src/sinker/mixed/yuva_4_2_0.rs
index e32af5ba..d543f0a6 100644
--- a/src/sinker/mixed/yuva_4_2_0.rs
+++ b/src/sinker/mixed/yuva_4_2_0.rs
@@ -657,7 +657,14 @@ fn yuva420p_high_bit_process<
       let rgba_buf = rgba_u16.as_deref_mut().unwrap();
       let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
       expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
-      crate::row::alpha_extract::copy_alpha_plane_u16::<BITS>(a_row, rgba_u16_row, w, use_simd);
+      // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs
+      // today. Phase 4 will plumb a `<const BE: bool>` from the row type here.
+      crate::row::alpha_extract::copy_alpha_plane_u16::<BITS, false>(
+        a_row,
+        rgba_u16_row,
+        w,
+        use_simd,
+      );
     }
   } else if want_rgba_u16 {
     // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher.
@@ -727,7 +734,10 @@ fn yuva420p_high_bit_process<
     let rgba_buf = rgba.as_deref_mut().unwrap();
     let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
     expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
-    crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::<BITS>(a_row, rgba_row, w, use_simd);
+    // BE = false: see the rgba_u16 branch above for rationale.
+    crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::<BITS, false>(
+      a_row, rgba_row, w, use_simd,
+    );
   }
 
   Ok(())
diff --git a/src/sinker/mixed/yuva_4_2_2.rs b/src/sinker/mixed/yuva_4_2_2.rs
index 6174c7d7..c7e861a6 100644
--- a/src/sinker/mixed/yuva_4_2_2.rs
+++ b/src/sinker/mixed/yuva_4_2_2.rs
@@ -757,7 +757,14 @@ fn yuva422p_high_bit_process<
       let rgba_buf = rgba_u16.as_deref_mut().unwrap();
       let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
       expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
-      crate::row::alpha_extract::copy_alpha_plane_u16::<BITS>(a_row, rgba_u16_row, w, use_simd);
+      // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs
+      // today. Phase 4 will plumb a `<const BE: bool>` from the row type here.
+      crate::row::alpha_extract::copy_alpha_plane_u16::<BITS, false>(
+        a_row,
+        rgba_u16_row,
+        w,
+        use_simd,
+      );
     }
   } else if want_rgba_u16 {
     // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher.
@@ -826,7 +833,10 @@ fn yuva422p_high_bit_process<
     let rgba_buf = rgba.as_deref_mut().unwrap();
     let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
     expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
-    crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::<BITS>(a_row, rgba_row, w, use_simd);
+    // BE = false: see the rgba_u16 branch above for rationale.
+    crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::<BITS, false>(
+      a_row, rgba_row, w, use_simd,
+    );
   }
 
   Ok(())
diff --git a/src/sinker/mixed/yuva_4_4_4.rs b/src/sinker/mixed/yuva_4_4_4.rs
index be76e51a..c9d46e9d 100644
--- a/src/sinker/mixed/yuva_4_4_4.rs
+++ b/src/sinker/mixed/yuva_4_4_4.rs
@@ -868,7 +868,14 @@ fn yuva444p_high_bit_process<
       let rgba_buf = rgba_u16.as_deref_mut().unwrap();
       let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
       expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
-      crate::row::alpha_extract::copy_alpha_plane_u16::<BITS>(a_row, rgba_u16_row, w, use_simd);
+      // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs
+      // today. Phase 4 will plumb a `<const BE: bool>` from the row type here.
+      crate::row::alpha_extract::copy_alpha_plane_u16::<BITS, false>(
+        a_row,
+        rgba_u16_row,
+        w,
+        use_simd,
+      );
     }
   } else if want_rgba_u16 {
     // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher.
@@ -938,7 +945,10 @@ fn yuva444p_high_bit_process<
     let rgba_buf = rgba.as_deref_mut().unwrap();
     let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
     expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
-    crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::<BITS>(a_row, rgba_row, w, use_simd);
+    // BE = false: see the rgba_u16 branch above for rationale.
+    crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::<BITS, false>(
+      a_row, rgba_row, w, use_simd,
+    );
   }
 
   Ok(())