From 8c6b6dc4df1404b1aa5995329216f230788e7dd9 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Thu, 7 May 2026 23:28:42 +1200
Subject: [PATCH 1/7] feat(be-tier10b): BE support for Gbrp/Gbrap 9-16-bit row kernels

Add a `BE: bool` const generic to all Gbrp/Gbrap high-bit row kernels
(scalar, NEON, SSE4.1, AVX2, AVX512, wasm-simd128) and their dispatchers.
The sinker gets a minimal compile fix (hardcoded `false`). Adds BE parity
tests in all six test modules (72 new test cases).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/row/arch/neon/planar_gbr_high_bit.rs      | 101 +++--
 .../arch/neon/tests/planar_gbr_high_bit.rs    | 361 ++++++++++++++--
 .../arch/wasm_simd128/planar_gbr_high_bit.rs  |  75 ++--
 .../wasm_simd128/tests/planar_gbr_high_bit.rs | 326 +++++++++++++--
 src/row/arch/x86_avx2/planar_gbr_high_bit.rs  | 179 +++++---
 .../x86_avx2/tests/planar_gbr_high_bit.rs     | 390 +++++++++++++++--
 .../arch/x86_avx512/planar_gbr_high_bit.rs    | 278 +++++++++----
 .../x86_avx512/tests/planar_gbr_high_bit.rs   | 390 +++++++++++++++--
 src/row/arch/x86_sse41/planar_gbr_high_bit.rs |  89 ++--
 .../x86_sse41/tests/planar_gbr_high_bit.rs    | 393 ++++++++++++++++--
 src/row/dispatch/planar_gbr_high_bit.rs       | 120 ++++--
 src/row/mod.rs                                |  12 +-
 src/row/scalar/planar_gbr_high_bit.rs         | 386 +++++++++++++----
 src/sinker/mixed/planar_gbr_high_bit.rs       |  35 +-
 14 files changed, 2590 insertions(+), 545 deletions(-)

diff --git a/src/row/arch/neon/planar_gbr_high_bit.rs b/src/row/arch/neon/planar_gbr_high_bit.rs
index 44996068..0d839324 100644
--- a/src/row/arch/neon/planar_gbr_high_bit.rs
+++ b/src/row/arch/neon/planar_gbr_high_bit.rs
@@ -1,6 +1,7 @@
//! NEON kernels for high-bit-depth planar GBR sources (Tier 10b).
//!
-//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`.
+//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and
+//! `BE: bool` (endianness of the source u16 planes).
//! Lane width: 8 pixels per iteration (`vld1q_u16` = 8 × u16).
//! `vst3q_u16` / `vst4q_u16` do the 3-way / 4-way u16 interleave in a
//! single hardware instruction. Scalar tails handle the remainder.
@@ -11,16 +12,27 @@
//! using a negative-count vector shift (`vshlq_u16` with a negative
//! shift), then narrowed with `vqmovn_u16` to u8x8. Two such halves are
//! recombined with `vcombine_u8` before `vst3q_u8` / `vst4q_u8`.
+//!
+//! # Big-endian (`BE = true`) mode
+//!
+//! When `BE = true` each 8-pixel NEON load goes through
+//! `load_endian_u16x8::<BE>` (defined in `endian.rs`) which applies a
+//! per-lane byte-swap via `vrev16q_u8`. The branch is resolved at
+//! monomorphisation — `BE = false` compiles to a plain `vld1q_u16`.

use core::arch::aarch64::*;

use crate::row::scalar;

+use super::endian::load_endian_u16x8;
+
// ---- u8 output, 3-channel (RGB) -----------------------------------------

/// NEON high-bit-depth G/B/R planar → packed `R, G, B` **bytes**.
/// Downshifts each sample by `BITS - 8` and narrows to u8.
///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
@@ -28,7 +40,7 @@
/// 3. `rgb_out.len()` ≥ `3 * width`.
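// A minimal sketch of the `load_endian_u16x8` helper the module docs above
// reference. The real definition lives in this backend's `endian.rs`; the
// exact name, signature, and pointer type here are assumptions, but the
// `vrev16q_u8` byte-swap and the monomorphised `if BE` branch match the
// documented behaviour.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn load_endian_u16x8_sketch<const BE: bool>(ptr: *const u16) -> uint16x8_t {
    let raw = vld1q_u16(ptr);
    if BE {
        // vrev16q_u8 reverses the two bytes inside each 16-bit element,
        // turning big-endian lanes into native little-endian values.
        vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(raw)))
    } else {
        // LE input: the branch folds away and this is a plain vld1q_u16.
        raw
    }
}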
#[inline]
#[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -48,9 +60,13 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(
    let mut x = 0usize;
    while x + 8 <= width {
-        let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-        let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
-        let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
+        let g_raw = load_endian_u16x8::<BE>(g.as_ptr().add(x).cast());
+        let b_raw = load_endian_u16x8::<BE>(b.as_ptr().add(x).cast());
+        let r_raw = load_endian_u16x8::<BE>(r.as_ptr().add(x).cast());
+
+        let g_v = vandq_u16(g_raw, mask_v);
+        let b_v = vandq_u16(b_raw, mask_v);
+        let r_v = vandq_u16(r_raw, mask_v);

        // Right-shift each 8-pixel vector by BITS-8, then narrow to u8x8.
        let r_sh = vqmovn_u16(vshlq_u16(r_v, shr));
@@ -70,7 +86,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(
        x += 8;
    }
    if x < width {
-        scalar::gbr_to_rgb_high_bit_row::<BITS>(
+        scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -86,6 +102,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(

/// NEON high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes**
/// with constant opaque alpha (`0xFF`). Used by `Gbrp*` (no alpha plane).
///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 3. `rgba_out.len()` ≥ `4 * width`.
#[inline]
#[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -113,9 +131,13 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(
    let mut x = 0usize;
    while x + 8 <= width {
-        let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-        let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
-        let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
+        let g_raw = load_endian_u16x8::<BE>(g.as_ptr().add(x).cast());
+        let b_raw = load_endian_u16x8::<BE>(b.as_ptr().add(x).cast());
+        let r_raw = load_endian_u16x8::<BE>(r.as_ptr().add(x).cast());
+
+        let g_v = vandq_u16(g_raw, mask_v);
+        let b_v = vandq_u16(b_raw, mask_v);
+        let r_v = vandq_u16(r_raw, mask_v);

        let r_sh = vqmovn_u16(vshlq_u16(r_v, shr));
        let g_sh = vqmovn_u16(vshlq_u16(g_v, shr));
@@ -132,7 +154,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(
        x += 8;
    }
    if x < width {
-        scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>(
+        scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -148,6 +170,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(

/// NEON high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**.
/// Alpha sourced from the `a` plane, downshifted by `BITS - 8`.
///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 3. `rgba_out.len()` ≥ `4 * width`.
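// Hedged sketch of the downshift-and-narrow step the module docs describe:
// AArch64 has no variable-count logical right shift for u16 lanes, so the
// kernels shift left by a *negative* splatted count instead. The helper
// name is illustrative only; the kernels inline this logic directly.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn downshift_to_u8x8_sketch<const BITS: usize>(v: uint16x8_t) -> uint8x8_t {
    // A splatted count of -(BITS - 8) makes vshlq_u16 act as `>> (BITS - 8)`.
    let shr = vdupq_n_s16(-((BITS - 8) as i16));
    // vqmovn_u16 then saturating-narrows the 8 u16 lanes to 8 u8 lanes; two
    // such halves are recombined with vcombine_u8 before vst3q_u8 / vst4q_u8.
    vqmovn_u16(vshlq_u16(v, shr))
}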
#[inline]
#[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -176,10 +200,15 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(
    let mut x = 0usize;
    while x + 8 <= width {
-        let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-        let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
-        let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
-        let a_v = vandq_u16(vld1q_u16(a.as_ptr().add(x)), mask_v);
+        let g_raw = load_endian_u16x8::<BE>(g.as_ptr().add(x).cast());
+        let b_raw = load_endian_u16x8::<BE>(b.as_ptr().add(x).cast());
+        let r_raw = load_endian_u16x8::<BE>(r.as_ptr().add(x).cast());
+        let a_raw = load_endian_u16x8::<BE>(a.as_ptr().add(x).cast());
+
+        let g_v = vandq_u16(g_raw, mask_v);
+        let b_v = vandq_u16(b_raw, mask_v);
+        let r_v = vandq_u16(r_raw, mask_v);
+        let a_v = vandq_u16(a_raw, mask_v);

        let r_sh = vqmovn_u16(vshlq_u16(r_v, shr));
        let g_sh = vqmovn_u16(vshlq_u16(g_v, shr));
@@ -197,7 +226,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(
        x += 8;
    }
    if x < width {
-        scalar::gbra_to_rgba_high_bit_row::<BITS>(
+        scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -214,6 +243,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(

/// NEON high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples.
/// Copies samples without shifting — output values in `[0, (1 << BITS) - 1]`.
///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 3. `rgb_u16_out.len()` ≥ `3 * width`.
#[inline]
#[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -238,16 +269,16 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>(
    let mask_v = vdupq_n_u16(((1u32 << BITS) - 1) as u16);
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
-        let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-        let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
+        let r_v = vandq_u16(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = vandq_u16(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = vandq_u16(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
        // vst3q_u16 stores 8×3 = 24 u16 interleaved as R,G,B per pixel.
        let triple = uint16x8x3_t(r_v, g_v, b_v);
        vst3q_u16(rgb_u16_out.as_mut_ptr().add(x * 3), triple);
        x += 8;
    }
    if x < width {
-        scalar::gbr_to_rgb_u16_high_bit_row::<BITS>(
+        scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -263,6 +294,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>(

/// NEON high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples
/// with constant opaque alpha `(1 << BITS) - 1`.
///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 3. `rgba_u16_out.len()` ≥ `4 * width`.
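// Scalar reference for one pixel of the opaque-u16 path documented above,
// written out to make the masking, optional byte-swap, and constant alpha
// explicit. This is an illustrative sketch of the documented semantics,
// not the crate's actual scalar implementation.
fn rgba_u16_opaque_pixel_sketch<const BITS: usize, const BE: bool>(g: u16, b: u16, r: u16) -> [u16; 4] {
    let mask = ((1u32 << BITS) - 1) as u16;
    let fix = |v: u16| (if BE { v.swap_bytes() } else { v }) & mask;
    // G/B/R planes reorder to R/G/B; alpha is the full-scale constant.
    [fix(r), fix(g), fix(b), mask]
}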
#[inline]
#[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -292,15 +325,15 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>(
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
-        let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-        let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
+        let r_v = vandq_u16(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = vandq_u16(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = vandq_u16(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
        let quad = uint16x8x4_t(r_v, g_v, b_v, opaque);
        vst4q_u16(rgba_u16_out.as_mut_ptr().add(x * 4), quad);
        x += 8;
    }
    if x < width {
-        scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+        scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -316,6 +349,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>(

/// NEON high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples.
/// Alpha sourced from the `a` plane at native depth (no shift).
///
+/// When `BE = true` each source u16 element is byte-swapped on load.
+///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 3. `rgba_u16_out.len()` ≥ `4 * width`.
#[inline]
#[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -345,16 +380,16 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>(
    let mask_v = vdupq_n_u16(((1u32 << BITS) - 1) as u16);
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = vandq_u16(vld1q_u16(r.as_ptr().add(x)), mask_v);
-        let g_v = vandq_u16(vld1q_u16(g.as_ptr().add(x)), mask_v);
-        let b_v = vandq_u16(vld1q_u16(b.as_ptr().add(x)), mask_v);
-        let a_v = vandq_u16(vld1q_u16(a.as_ptr().add(x)), mask_v);
+        let r_v = vandq_u16(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = vandq_u16(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = vandq_u16(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
+        let a_v = vandq_u16(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v);
        let quad = uint16x8x4_t(r_v, g_v, b_v, a_v);
        vst4q_u16(rgba_u16_out.as_mut_ptr().add(x * 4), quad);
        x += 8;
    }
    if x < width {
-        scalar::gbra_to_rgba_u16_high_bit_row::<BITS>(
+        scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
diff --git a/src/row/arch/neon/tests/planar_gbr_high_bit.rs b/src/row/arch/neon/tests/planar_gbr_high_bit.rs
index 3f7762ba..0a9c3301 100644
--- a/src/row/arch/neon/tests/planar_gbr_high_bit.rs
+++ b/src/row/arch/neon/tests/planar_gbr_high_bit.rs
@@ -37,9 +37,9 @@ fn neon_gbr_to_rgb_high_bit_matches_scalar_bits10() {
    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
    let mut out_scalar = std::vec![0u8; w * 3];
    let mut out_neon = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
    unsafe {
-        gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w);
+        gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w);
    }
    assert_eq!(
        out_scalar, out_neon,
@@ -57,9 +57,9 @@ fn neon_gbr_to_rgb_high_bit_matches_scalar_bits16() {
    let r = gbr_plane_u16::<16>(w,
0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -77,9 +77,9 @@ fn neon_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -97,9 +97,9 @@ fn neon_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -118,9 +118,9 @@ fn neon_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -139,9 +139,9 @@ fn neon_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -161,9 +161,9 @@ fn neon_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -181,9 +181,9 @@ fn neon_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + 
scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -201,9 +201,9 @@ fn neon_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -221,9 +221,9 @@ fn neon_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -242,9 +242,9 @@ fn neon_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -263,9 +263,9 @@ fn neon_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_neon, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -286,9 +286,9 @@ fn neon_gbr_to_rgb_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w); } assert_eq!( out_scalar, out_neon, @@ -307,9 +307,9 @@ fn neon_gbra_to_rgba_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe 
{
-        gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w);
+        gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w);
    }
    assert_eq!(
        out_scalar, out_neon,
@@ -327,9 +327,9 @@ fn neon_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() {
    let r = gbr_plane_u16_dirty::<10>(w, 0x0400);
    let mut out_scalar = std::vec![0u16; w * 3];
    let mut out_neon = std::vec![0u16; w * 3];
-    scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
    unsafe {
-        gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_neon, w);
+        gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_neon, w);
    }
    assert_eq!(
        out_scalar, out_neon,
@@ -348,9 +348,9 @@ fn neon_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
    let a = gbr_plane_u16_dirty::<10>(w, 0x0C00);
    let mut out_scalar = std::vec![0u16; w * 4];
    let mut out_neon = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
    unsafe {
-        gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_neon, w);
+        gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_neon, w);
    }
    assert_eq!(
        out_scalar, out_neon,
@@ -358,3 +358,300 @@ fn neon_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() {
        );
    }
}
+
+// ---- BE parity: NEON BE output must match NEON LE output --------
+//
+// For each kernel:
+// 1. Generate LE plane data.
+// 2. Byte-swap each element to produce BE-encoded plane data.
+// 3. Run the kernel with BE=true on the byte-swapped input.
+// 4. Run the kernel with BE=false on the original LE input.
+// 5. Assert outputs are byte-identical.
+
+fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> {
+    plane.iter().map(|v| v.swap_bytes()).collect()
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgb_high_bit_be_matches_le_bits10() {
+    for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+        let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+        let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+        let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+        let g_be = byte_swap_plane(&g);
+        let b_be = byte_swap_plane(&b);
+        let r_be = byte_swap_plane(&r);
+        let mut out_le = std::vec![0u8; w * 3];
+        let mut out_be = std::vec![0u8; w * 3];
+        unsafe {
+            gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+            gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+        }
+        assert_eq!(
+            out_le, out_be,
+            "NEON gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+        );
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbr_to_rgb_high_bit_be_matches_le_bits16() {
+    for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+        let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+        let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+        let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+        let g_be = byte_swap_plane(&g);
+        let b_be = byte_swap_plane(&b);
+        let r_be = byte_swap_plane(&r);
+        let mut out_le = std::vec![0u8; w * 3];
+        let mut out_be = std::vec![0u8; w * 3];
+        unsafe {
+            gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+            gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+        }
+        assert_eq!(
+            out_le, out_be,
+            "NEON gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+        );
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn
neon_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbra_to_rgba_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbra_to_rgba_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = 
gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 
4];
+        let mut out_be = std::vec![0u16; w * 4];
+        unsafe {
+            gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w);
+            gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+        }
+        assert_eq!(
+            out_le, out_be,
+            "NEON gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+        );
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() {
+    for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+        let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+        let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+        let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+        let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
+        let g_be = byte_swap_plane(&g);
+        let b_be = byte_swap_plane(&b);
+        let r_be = byte_swap_plane(&r);
+        let a_be = byte_swap_plane(&a);
+        let mut out_le = std::vec![0u16; w * 4];
+        let mut out_be = std::vec![0u16; w * 4];
+        unsafe {
+            gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w);
+            gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w);
+        }
+        assert_eq!(
+            out_le, out_be,
+            "NEON gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})"
+        );
+    }
+}
diff --git a/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs b/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs
index 7102afa3..94dcfbd5 100644
--- a/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs
+++ b/src/row/arch/wasm_simd128/planar_gbr_high_bit.rs
@@ -1,6 +1,7 @@
//! wasm-simd128 kernels for high-bit-depth planar GBR sources (Tier 10b).
//!
-//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`.
+//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and
+//! `BE` (big-endian input when `true`).
//! Lane width: 8 pixels per iteration (8 × u16 per `v128`).
//! Scalar tail handles the remainder.
//!
@@ -21,12 +22,13 @@
use core::arch::wasm32::*;

use crate::row::scalar;

-use super::*;
+use super::{endian::load_endian_u16x8, *};

// ---- u8 output, 3-channel (RGB) -----------------------------------------

/// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**.
/// Downshifts each sample by `BITS - 8` and narrows to u8.
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
///
/// # Safety
///
@@ -35,7 +37,7 @@ use super::*;
/// 3. `rgb_out.len()` ≥ `3 * width`.
#[inline]
#[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -55,9 +57,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-        let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-        let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
+        let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);

        // Shift right by BITS-8, then narrow u16x8 → u8x8 (in low half).
let r_sh = u16x8_shr(r_v, shift);
@@ -80,7 +82,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(
        x += 8;
    }
    if x < width {
-        scalar::gbr_to_rgb_high_bit_row::<BITS>(
+        scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -95,6 +97,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(

/// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes**
/// with constant opaque alpha (`0xFF`).
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
///
/// # Safety
///
@@ -103,7 +106,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>(
/// 3. `rgba_out.len()` ≥ `4 * width`.
#[inline]
#[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -124,9 +127,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-        let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-        let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
+        let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);

        let r_sh = u16x8_shr(r_v, shift);
        let g_sh = u16x8_shr(g_v, shift);
@@ -144,7 +147,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(
        x += 8;
    }
    if x < width {
-        scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>(
+        scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -159,6 +162,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(

/// wasm-simd128 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**.
/// Alpha sourced from the `a` plane, downshifted by `BITS - 8`.
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
///
/// # Safety
///
@@ -167,7 +171,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>(
/// 3. `rgba_out.len()` ≥ `4 * width`.
#[inline]
#[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -189,10 +193,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-        let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-        let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
-        let a_v = v128_and(v128_load(a.as_ptr().add(x).cast()), mask_v);
+        let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
+        let a_v = v128_and(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v);

        let r_sh = u16x8_shr(r_v, shift);
        let g_sh = u16x8_shr(g_v, shift);
@@ -211,7 +215,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(
        x += 8;
    }
    if x < width {
-        scalar::gbra_to_rgba_high_bit_row::<BITS>(
+        scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -227,6 +231,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(

/// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples.
/// No shift — values copied directly, reordered G/B/R → R/G/B.
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
///
/// # Safety
///
@@ -235,7 +240,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>(
/// 3. `rgb_u16_out.len()` ≥ `3 * width`.
#[inline]
#[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -252,14 +257,14 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>(
    let mask_v = u16x8_splat(((1u32 << BITS) - 1) as u16);
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-        let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-        let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
+        let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
        write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3));
        x += 8;
    }
    if x < width {
-        scalar::gbr_to_rgb_u16_high_bit_row::<BITS>(
+        scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -274,6 +279,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>(

/// wasm-simd128 high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples
/// with constant opaque alpha `(1 << BITS) - 1`.
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
///
/// # Safety
///
@@ -282,7 +288,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>(
/// 3. `rgba_u16_out.len()` ≥ `4 * width`.
#[inline]
#[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -304,14 +310,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>(
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-        let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-        let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
+        let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
        write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4));
        x += 8;
    }
    if x < width {
-        scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>(
+        scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
@@ -326,6 +332,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>(

/// wasm-simd128 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples.
/// Alpha sourced from the `a` plane at native depth (no shift).
+/// When `BE = true`, input u16 lanes are byte-swapped before processing.
///
/// # Safety
///
@@ -334,7 +341,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>(
/// 3. `rgba_u16_out.len()` ≥ `4 * width`.
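// Hedged sketch of the wasm-simd128 byte-swap load used by the kernels in
// this file. The real helper lives in this backend's `endian.rs`; the name,
// signature, and shuffle-based implementation here are assumptions.
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn load_endian_u16x8_sketch<const BE: bool>(ptr: *const v128) -> v128 {
    let raw = v128_load(ptr);
    if BE {
        // Swap the two bytes of every u16 lane with a constant byte shuffle;
        // the branch is resolved at monomorphisation time.
        i8x16_shuffle::<1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14>(raw, raw)
    } else {
        raw
    }
}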
#[inline]
#[target_feature(enable = "simd128")]
-pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>(
+pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
@@ -356,15 +363,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>(
    let mask_v = u16x8_splat(((1u32 << BITS) - 1) as u16);
    let mut x = 0usize;
    while x + 8 <= width {
-        let r_v = v128_and(v128_load(r.as_ptr().add(x).cast()), mask_v);
-        let g_v = v128_and(v128_load(g.as_ptr().add(x).cast()), mask_v);
-        let b_v = v128_and(v128_load(b.as_ptr().add(x).cast()), mask_v);
-        let a_v = v128_and(v128_load(a.as_ptr().add(x).cast()), mask_v);
+        let r_v = v128_and(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v);
+        let g_v = v128_and(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v);
+        let b_v = v128_and(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v);
+        let a_v = v128_and(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v);
        write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4));
        x += 8;
    }
    if x < width {
-        scalar::gbra_to_rgba_u16_high_bit_row::<BITS>(
+        scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
            &g[x..width],
            &b[x..width],
            &r[x..width],
diff --git a/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs b/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs
index 8fb1faef..5b041673 100644
--- a/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs
+++ b/src/row/arch/wasm_simd128/tests/planar_gbr_high_bit.rs
@@ -31,9 +31,9 @@ fn simd128_gbr_to_rgb_high_bit_matches_scalar_bits10() {
    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
    let mut out_scalar = std::vec![0u8; w * 3];
    let mut out_wasm = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
    unsafe {
-        gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w);
+        gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w);
    }
    assert_eq!(
        out_scalar, out_wasm,
@@ -50,9 +50,9 @@ fn simd128_gbr_to_rgb_high_bit_matches_scalar_bits16() {
    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
    let mut out_scalar = std::vec![0u8; w * 3];
    let mut out_wasm = std::vec![0u8; w * 3];
-    scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
    unsafe {
-        gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w);
+        gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w);
    }
    assert_eq!(
        out_scalar, out_wasm,
@@ -69,9 +69,9 @@ fn simd128_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() {
    let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
    let mut out_scalar = std::vec![0u8; w * 4];
    let mut out_wasm = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w);
    unsafe {
-        gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w);
+        gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w);
    }
    assert_eq!(
        out_scalar, out_wasm,
@@ -88,9 +88,9 @@ fn simd128_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() {
    let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
    let mut out_scalar = std::vec![0u8; w * 4];
    let mut out_wasm = std::vec![0u8; w * 4];
-    scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w);
    unsafe {
-        gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_wasm,
w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -108,9 +108,9 @@ fn simd128_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_wasm, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -128,9 +128,9 @@ fn simd128_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_wasm, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -149,9 +149,9 @@ fn simd128_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -168,9 +168,9 @@ fn simd128_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -187,9 +187,9 @@ fn simd128_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_wasm = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -206,9 +206,9 @@ fn simd128_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_wasm = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_wasm, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_wasm, w); } assert_eq!( 
out_scalar, out_wasm,
@@ -226,9 +226,9 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() {
    let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D);
    let mut out_scalar = std::vec![0u16; w * 4];
    let mut out_wasm = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w);
    unsafe {
-        gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_wasm, w);
+        gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_wasm, w);
    }
    assert_eq!(
        out_scalar, out_wasm,
@@ -246,9 +246,9 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() {
    let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D);
    let mut out_scalar = std::vec![0u16; w * 4];
    let mut out_wasm = std::vec![0u16; w * 4];
-    scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w);
+    scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w);
    unsafe {
-        gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_wasm, w);
+        gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_wasm, w);
    }
    assert_eq!(
        out_scalar, out_wasm,
@@ -256,3 +256,281 @@ fn simd128_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() {
        );
    }
}
+
+// ---- BE parity: simd128 BE output must match simd128 LE output --
+
+fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> {
+    plane.iter().map(|v| v.swap_bytes()).collect()
+}
+
+#[test]
+fn simd128_gbr_to_rgb_high_bit_be_matches_le_bits10() {
+    for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+        let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+        let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+        let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+        let g_be = byte_swap_plane(&g);
+        let b_be = byte_swap_plane(&b);
+        let r_be = byte_swap_plane(&r);
+        let mut out_le = std::vec![0u8; w * 3];
+        let mut out_be = std::vec![0u8; w * 3];
+        unsafe {
+            gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+            gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+        }
+        assert_eq!(
+            out_le, out_be,
+            "simd128 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+        );
+    }
+}
+
+#[test]
+fn simd128_gbr_to_rgb_high_bit_be_matches_le_bits16() {
+    for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+        let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B);
+        let b = gbr_plane_u16::<16>(w, 0x12AB_34CD);
+        let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF);
+        let g_be = byte_swap_plane(&g);
+        let b_be = byte_swap_plane(&b);
+        let r_be = byte_swap_plane(&r);
+        let mut out_le = std::vec![0u8; w * 3];
+        let mut out_be = std::vec![0u8; w * 3];
+        unsafe {
+            gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w);
+            gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+        }
+        assert_eq!(
+            out_le, out_be,
+            "simd128 gbr_to_rgb_high_bit BE/LE mismatch (w={w})"
+        );
+    }
+}
+
+#[test]
+fn simd128_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() {
+    for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] {
+        let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B);
+        let b = gbr_plane_u16::<10>(w, 0x12AB_34CD);
+        let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF);
+        let g_be = byte_swap_plane(&g);
+        let b_be = byte_swap_plane(&b);
+        let r_be = byte_swap_plane(&r);
+        let mut out_le = std::vec![0u8; w * 4];
+        let mut out_be = std::vec![0u8; w * 4];
+        unsafe {
+            gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w);
+            gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w);
+        }
+        assert_eq!(
+            out_le, out_be,
+
"simd128 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbra_to_rgba_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbra_to_rgba_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut 
out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +fn simd128_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "simd128 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git 
a/src/row/arch/x86_avx2/planar_gbr_high_bit.rs b/src/row/arch/x86_avx2/planar_gbr_high_bit.rs index 23c76e15..26d9e298 100644 --- a/src/row/arch/x86_avx2/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_avx2/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! AVX2 kernels for high-bit-depth planar GBR sources (Tier 10b). //! -//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`. +//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and +//! `BE: bool` (endianness of the source u16 planes). //! Lane width: 16 pixels per iteration (16 × u16 per `__m256i`). //! Scalar tail handles the remainder. //! @@ -18,16 +19,26 @@ //! //! Process 16 u16 pixels per outer iteration via two calls to the 128-bit //! `write_rgb_u16_8` / `write_rgba_u16_8` helpers (8 pixels each). +//! +//! # Big-endian (`BE = true`) mode +//! +//! Wide (16-pixel) iterations use `load_endian_u16x16::<BE>` from this +//! backend's own `endian.rs` (256-bit shuffle). 8-pixel tail iterations use +//! `load_endian_u16x8::<BE>` from the SSE4.1 `endian.rs` (128-bit shuffle). +//! Both branches are resolved at monomorphisation time. use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u16x16, *}; +use crate::row::arch::x86_sse41::endian::load_endian_u16x8; // ---- u8 output, 3-channel (RGB) ----------------------------------------- /// AVX2 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8` and packs to u8. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -35,7 +46,7 @@ use super::*; /// 3. `rgb_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -58,9 +69,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 16 <= width { - let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256); - let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256); - let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256); + let r_v = _mm256_and_si256(load_endian_u16x16::<BE>(r.as_ptr().add(x).cast()), mask256); + let g_v = _mm256_and_si256(load_endian_u16x16::<BE>(g.as_ptr().add(x).cast()), mask256); + let b_v = _mm256_and_si256(load_endian_u16x16::<BE>(b.as_ptr().add(x).cast()), mask256); // Variable-count logical right-shift for all 16 u16 lanes. let r_sh = _mm256_srl_epi16(r_v, shr_count); @@ -85,9 +96,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( // Drain remaining 8-pixel blocks with the SSE-width path.
if x + 8 <= width { let zero = _mm_setzero_si128(); - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -100,7 +111,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbr_to_rgb_high_bit_row::<BITS>( + scalar::gbr_to_rgb_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -116,6 +127,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( /// AVX2 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes** /// with constant opaque alpha (`0xFF`). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -123,7 +136,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( /// 3. `rgba_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -147,9 +160,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 16 <= width { - let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256); - let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256); - let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256); + let r_v = _mm256_and_si256(load_endian_u16x16::<BE>(r.as_ptr().add(x).cast()), mask256); + let g_v = _mm256_and_si256(load_endian_u16x16::<BE>(g.as_ptr().add(x).cast()), mask256); + let b_v = _mm256_and_si256(load_endian_u16x16::<BE>(b.as_ptr().add(x).cast()), mask256); let r_sh = _mm256_srl_epi16(r_v, shr_count); let g_sh = _mm256_srl_epi16(g_v, shr_count); @@ -174,9 +187,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( x += 16; } if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -189,7 +202,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>( + scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -205,6 +218,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( /// AVX2 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**. /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation).
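For context on the helper these hunks lean on: the SSE4.1 `load_endian_u16x8` lives in that backend's `endian.rs`, which this patch does not touch, so its body is not visible in the diff. A minimal sketch of such a helper, assuming a `pshufb`-based byte-pair swap (the name `load_endian_u16x8_sketch` and the mask below are illustrative, not quoted from the crate):

use core::arch::x86_64::*;

// Sketch only: unaligned 8 x u16 load that byte-swaps each lane when BE = true.
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn load_endian_u16x8_sketch<const BE: bool>(ptr: *const u16) -> __m128i {
    let v = _mm_loadu_si128(ptr.cast());
    if BE {
        // Swap the two bytes of every 16-bit lane: 0<->1, 2<->3, ..., 14<->15.
        let swap = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
        _mm_shuffle_epi8(v, swap)
    } else {
        v // BE = false monomorphises to a bare unaligned load.
    }
}

Because `BE` is a const generic, the branch disappears at monomorphisation, which is why the LE kernels pay nothing for the new parameter.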
@@ -212,7 +227,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( /// 3. `rgba_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -236,10 +251,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 16 <= width { - let r_v = _mm256_and_si256(_mm256_loadu_si256(r.as_ptr().add(x).cast()), mask256); - let g_v = _mm256_and_si256(_mm256_loadu_si256(g.as_ptr().add(x).cast()), mask256); - let b_v = _mm256_and_si256(_mm256_loadu_si256(b.as_ptr().add(x).cast()), mask256); - let a_v = _mm256_and_si256(_mm256_loadu_si256(a.as_ptr().add(x).cast()), mask256); + let r_v = _mm256_and_si256(load_endian_u16x16::<BE>(r.as_ptr().add(x).cast()), mask256); + let g_v = _mm256_and_si256(load_endian_u16x16::<BE>(g.as_ptr().add(x).cast()), mask256); + let b_v = _mm256_and_si256(load_endian_u16x16::<BE>(b.as_ptr().add(x).cast()), mask256); + let a_v = _mm256_and_si256(load_endian_u16x16::<BE>(a.as_ptr().add(x).cast()), mask256); let r_sh = _mm256_srl_epi16(r_v, shr_count); let g_sh = _mm256_srl_epi16(g_v, shr_count); @@ -261,10 +276,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( x += 16; } if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -279,7 +294,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbra_to_rgba_high_bit_row::<BITS>( + scalar::gbra_to_rgba_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -297,6 +312,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( /// No shift — values copied directly, reordered G/B/R → R/G/B. /// Processes 16 pixels per outer loop via two 8-pixel `write_rgb_u16_8` calls. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -304,7 +321,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( /// 3. `rgb_u16_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -322,27 +339,36 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 16 <= width { // Two 8-pixel halves using the SSE helper.
- let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_lo = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_lo = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_lo = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_u16_out.as_mut_ptr().add(x * 3)); - let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); + let r_hi = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_hi = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_hi = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_u16_out.as_mut_ptr().add((x + 8) * 3)); x += 16; } if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::gbr_to_rgb_u16_high_bit_row::<BITS>( + scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -358,6 +384,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( /// AVX2 high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples /// with constant opaque alpha `(1 << BITS) - 1`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -365,7 +393,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -387,9 +415,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 16 <= width { - let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_lo = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_lo = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_lo = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); write_rgba_u16_8( r_lo, g_lo, @@ -398,9 +426,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( rgba_u16_out.as_mut_ptr().add(x * 4), ); - let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); + let r_hi = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_hi = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_hi = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgba_u16_8( r_hi, g_hi, @@ -412,14 +449,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( x += 16; } if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>( + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -435,6 +472,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( /// AVX2 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples. /// Alpha sourced from the `a` plane at native depth (no shift). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX2 must be available (caller obligation). @@ -442,7 +481,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -464,16 +503,28 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16); let mut x = 0usize; while x + 16 <= width { - let r_lo = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_lo = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_lo = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_lo = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_lo = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_lo = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_lo = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); + let a_lo = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_lo, g_lo, b_lo, a_lo, rgba_u16_out.as_mut_ptr().add(x * 4)); - let r_hi = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_hi = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_hi = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); - let a_hi = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 8).cast()), mask128); + let r_hi = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_hi = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_hi = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()), + mask128, + ); + let a_hi = _mm_and_si128( + load_endian_u16x8::<BE>(a.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgba_u16_8( r_hi, g_hi, @@ -485,15 +536,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( x += 16; } if x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbra_to_rgba_u16_high_bit_row::<BITS>( + scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], diff --git a/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs index 72225d19..505256fe 100644 --- a/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_avx2/tests/planar_gbr_high_bit.rs @@ -37,9 +37,9 @@ fn avx2_gbr_to_rgb_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); +
gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -60,9 +60,9 @@ fn avx2_gbr_to_rgb_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -83,9 +83,9 @@ fn avx2_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -106,9 +106,9 @@ fn avx2_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -130,9 +130,9 @@ fn avx2_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -154,9 +154,9 @@ fn avx2_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -179,9 +179,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -202,9 +202,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 
0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -225,9 +225,9 @@ fn avx2_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -248,9 +248,9 @@ fn avx2_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -272,9 +272,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -296,9 +296,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -321,9 +321,9 @@ fn avx2_gbr_to_rgb_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -345,9 +345,9 @@ fn avx2_gbra_to_rgba_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - 
scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -368,9 +368,9 @@ fn avx2_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -392,9 +392,9 @@ fn avx2_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -402,3 +402,329 @@ fn avx2_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { ); } } + +// ---- BE parity: BE input must reproduce the LE output -------- + +fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> { + plane.iter().map(|v| v.swap_bytes()).collect() +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgb_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgb_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn
avx2_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbra_to_rgba_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbra_to_rgba_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + 
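All of these parity cases check the same underlying identity: a `BE = true` kernel byte-swaps every u16 it loads, so feeding it byte-swapped planes must reproduce the `BE = false` output bit-for-bit, independent of host endianness. A minimal scalar illustration of that property (`decode` below is an illustrative stand-in for the kernels' per-sample load step, not a function from this crate):

// Stand-in for the per-sample load: BE sources are byte-swapped on load.
fn decode<const BE: bool>(raw: u16) -> u16 {
    if BE { raw.swap_bytes() } else { raw }
}

#[test]
fn be_decode_of_swapped_sample_matches_le_decode() {
    for v in [0u16, 0x0001, 0x03FF, 0x12AB, 0xFFFF] {
        // v.swap_bytes() is the sample as a big-endian plane would store it.
        assert_eq!(decode::<true>(v.swap_bytes()), decode::<false>(v));
    }
}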
+#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD 
intrinsics unsupported by Miri")] +fn avx2_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git a/src/row/arch/x86_avx512/planar_gbr_high_bit.rs b/src/row/arch/x86_avx512/planar_gbr_high_bit.rs index 4f763434..afc8ccc3 100644 --- a/src/row/arch/x86_avx512/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_avx512/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! AVX-512 (F + BW) kernels for high-bit-depth planar GBR sources (Tier 10b). //! -//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`. +//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and +//! `BE: bool` (endianness of the source u16 planes). //! Lane width: 32 pixels per iteration (32 × u16 per `__m512i`). //! Scalar tail handles the remainder. //! @@ -20,16 +21,25 @@ //! //! Process 32 pixels via four calls to `write_rgb_u16_8` / //! `write_rgba_u16_8` (8 pixels each, SSE4.1 128-bit helpers). +//! +//! # Big-endian (`BE = true`) mode +//! +//! Wide (32-pixel) iterations use `load_endian_u16x32::<BE>` from this +//! backend's own `endian.rs` (512-bit shuffle). 8-pixel tail iterations use +//! `load_endian_u16x8::<BE>` from the SSE4.1 `endian.rs` (128-bit shuffle). use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u16x32, *}; +use crate::row::arch::x86_sse41::endian::load_endian_u16x8; // ---- u8 output, 3-channel (RGB) ----------------------------------------- /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8` and packs to u8. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -37,7 +47,7 @@ use super::*; /// 3. `rgb_out.len()` ≥ `3 * width`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -60,9 +70,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 32 <= width { // Load 32 u16 pixels per plane via 512-bit loads, then mask. - let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512); - let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512); - let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512); + let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512); + let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512); + let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512); // Shift all 32 u16 lanes right by BITS-8. let r_sh = _mm512_srl_epi16(r_v, shr_count); @@ -124,9 +134,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( } // Drain remaining 8-pixel blocks before scalar tail. while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -139,7 +149,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbr_to_rgb_high_bit_row::<BITS>( + scalar::gbr_to_rgb_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -155,6 +165,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes** /// with constant opaque alpha (`0xFF`). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -162,7 +174,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( /// 3. `rgba_out.len()` ≥ `4 * width`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -185,9 +197,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 32 <= width { - let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512); - let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512); - let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512); + let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512); + let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512); + let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512); let r_sh = _mm512_srl_epi16(r_v, shr_count); let g_sh = _mm512_srl_epi16(g_v, shr_count); @@ -245,9 +257,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -260,7 +272,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>( + scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -276,6 +288,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( /// AVX-512 (F+BW) high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**. /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -283,7 +297,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( /// 3. `rgba_out.len()` ≥ `4 * width`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -306,10 +320,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 32 <= width { - let r_v = _mm512_and_si512(_mm512_loadu_si512(r.as_ptr().add(x).cast()), mask512); - let g_v = _mm512_and_si512(_mm512_loadu_si512(g.as_ptr().add(x).cast()), mask512); - let b_v = _mm512_and_si512(_mm512_loadu_si512(b.as_ptr().add(x).cast()), mask512); - let a_v = _mm512_and_si512(_mm512_loadu_si512(a.as_ptr().add(x).cast()), mask512); + let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512); + let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512); + let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512); + let a_v = _mm512_and_si512(load_endian_u16x32::<BE>(a.as_ptr().add(x).cast()), mask512); let r_sh = _mm512_srl_epi16(r_v, shr_count); let g_sh = _mm512_srl_epi16(g_v, shr_count); @@ -376,10 +390,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); let b_sh = _mm_srl_epi16(b_v, shr_count); @@ -394,7 +408,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbra_to_rgba_high_bit_row::<BITS>( + scalar::gbra_to_rgba_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -412,6 +426,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( /// No shift — values copied directly, reordered G/B/R → R/G/B. /// Processes 32 pixels per outer loop via four 8-pixel `write_rgb_u16_8` calls. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -419,7 +435,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( /// 3. `rgb_u16_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -438,40 +454,67 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( while x + 32 <= width { // Four 8-pixel blocks (offsets 0, 8, 16, 24).
{ - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 8) * 3)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()), + mask128, + ); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 16) * 3)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 24).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 24).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 24).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 24).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 24).cast()), + mask128, + ); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 24) * 3)); } x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::gbr_to_rgb_u16_high_bit_row::<BITS>( + scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -487,6 +530,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( /// AVX-512 (F+BW) high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples /// with constant opaque alpha `(1 << BITS) - 1`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation).
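As with the SSE4.1 helper sketched earlier, the AVX-512 `load_endian_u16x32` referenced by this file's wide loops lives in an `endian.rs` this diff does not show. A minimal sketch of what it could look like, assuming a broadcast `vpshufb` mask (the name `load_endian_u16x32_sketch` and the mask are illustrative, not quoted from the crate). `vpshufb` shuffles within each 128-bit lane and the byte-pair swap pattern repeats every 16 bytes, so one broadcast 128-bit mask covers all four lanes:

use core::arch::x86_64::*;

// Sketch only: unaligned 32 x u16 load that byte-swaps each lane when BE = true.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn load_endian_u16x32_sketch<const BE: bool>(ptr: *const u16) -> __m512i {
    let v = _mm512_loadu_si512(ptr.cast());
    if BE {
        // Same byte-pair swap pattern as the 128-bit version, broadcast 4x.
        let swap = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
        _mm512_shuffle_epi8(v, _mm512_broadcast_i32x4(swap))
    } else {
        v // BE = false monomorphises to a plain unaligned load.
    }
}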
@@ -494,7 +539,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -517,15 +562,24 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 32 <= width { { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -535,9 +589,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( ); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -547,9 +610,18 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( ); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 24).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 24).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 24).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 24).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 24).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -561,14 +633,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( x += 32; } while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, opaque, 
rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>( + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -584,6 +656,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( /// AVX-512 (F+BW) high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples. /// Alpha sourced from the `a` plane at native depth (no shift). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. AVX-512BW must be available (caller obligation). @@ -591,7 +665,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -614,17 +688,29 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 32 <= width { { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask128); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128); + let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128); write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4)); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 8).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 8).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 8).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 8).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()), + mask128, + ); + let a_v = _mm_and_si128( + load_endian_u16x8::<BE>(a.as_ptr().add(x + 8).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -634,10 +720,22 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( ); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 16).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x + 16).cast()), mask128); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x + 16).cast()), mask128); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x + 16).cast()), mask128); + let r_v = _mm_and_si128( + load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()), + mask128, + ); + let g_v = _mm_and_si128( + load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()), + mask128, + ); + let b_v = _mm_and_si128( + load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()), + mask128, + ); + let a_v = _mm_and_si128( + load_endian_u16x8::<BE>(a.as_ptr().add(x + 16).cast()), + mask128, + ); write_rgba_u16_8( r_v, g_v, @@ -647,10 +745,22 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( ); } { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x + 24).cast()), mask128); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x 
diff --git a/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs index 3a5ca557..80bc153a 100644 --- a/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_avx512/tests/planar_gbr_high_bit.rs @@ -37,9 +37,9 @@ fn avx512_gbr_to_rgb_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -60,9 +60,9 @@ fn avx512_gbr_to_rgb_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -83,9 +83,9 @@ fn avx512_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -106,9 +106,9 @@ fn
avx512_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -130,9 +130,9 @@ fn avx512_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -154,9 +154,9 @@ fn avx512_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -179,9 +179,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -202,9 +202,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -225,9 +225,9 @@ fn avx512_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -248,9 +248,9 @@ fn avx512_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 
4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -272,9 +272,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -296,9 +296,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -321,9 +321,9 @@ fn avx512_gbr_to_rgb_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -345,9 +345,9 @@ fn avx512_gbra_to_rgba_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -368,9 +368,9 @@ fn avx512_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_avx, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -392,9 +392,9 @@ fn avx512_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_avx = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + 
scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_avx, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -402,3 +402,332 @@ fn avx512_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { ); } } + +// ---- BE parity: AVX-512 BE output must match LE output -------------------- +// +// Byte-swap LE inputs to produce BE-encoded data; verify that BE=true kernel +// output is byte-identical to BE=false kernel output on the original LE data. + +fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> { + plane.iter().map(|v| v.swap_bytes()).collect() +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgb_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgb_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be
= byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbra_to_rgba_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbra_to_rgba_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = 
gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [1usize, 7, 8, 
16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git a/src/row/arch/x86_sse41/planar_gbr_high_bit.rs b/src/row/arch/x86_sse41/planar_gbr_high_bit.rs index 364eac74..f28d4fe3 100644 --- a/src/row/arch/x86_sse41/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_sse41/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! SSE4.1 kernels for high-bit-depth planar GBR sources (Tier 10b). //! -//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}`. +//! All functions are const-generic over `BITS ∈ {9, 10, 12, 14, 16}` and +//! `BE: bool` (endianness of the source u16 planes). //! Lane width: 8 pixels per iteration (8 × u16 per `__m128i`). //! Scalar tail handles the remainder. //! @@ -17,16 +18,26 @@ //! Use the existing `write_rgb_u16_8` / `write_rgba_u16_8` helpers from //! `x86_common` which interleave 8 u16 lanes per channel into packed //! RGB / RGBA u16 output. +//! +//! # Big-endian (`BE = true`) mode +//! +//! When `BE = true` each 8-pixel load goes through +//! `load_endian_u16x8::<BE>` (defined in `endian.rs`) which applies +//! `_mm_shuffle_epi8` (SSSE3 pshufb) to byte-swap every u16 lane. +//! The branch is resolved at monomorphisation — `BE = false` compiles +//! to a plain `_mm_loadu_si128`.
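The `endian.rs` helper named in the doc comment above is not part of this hunk. A minimal sketch of the load it describes, assuming it is a plain pshufb byte-swap (the `_sketch` suffix and the `swap` mask below are illustrative, not the crate's actual code):

```rust
use core::arch::x86_64::*;

/// Sketch of an endian-aware load of 8 u16 lanes. SSE4.1 implies SSSE3,
/// so `_mm_shuffle_epi8` is always available in these kernels.
#[target_feature(enable = "ssse3")]
unsafe fn load_endian_u16x8_sketch<const BE: bool>(ptr: *const __m128i) -> __m128i {
    let v = _mm_loadu_si128(ptr);
    if BE {
        // Byte indices (1,0), (3,2), ... swap the two bytes of every 16-bit lane.
        let swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        _mm_shuffle_epi8(v, swap)
    } else {
        v
    }
}
```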
use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u16x8, *}; // ---- u8 output, 3-channel (RGB) ----------------------------------------- /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8` and packs to u8. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -34,7 +45,7 @@ use super::*; /// 3. `rgb_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -58,9 +69,9 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v); // Variable-count logical right-shift by BITS-8 per u16 lane. let r_sh = _mm_srl_epi16(r_v, shr_count); @@ -81,7 +92,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbr_to_rgb_high_bit_row::<BITS>( + scalar::gbr_to_rgb_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -97,6 +108,8 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B, A` **bytes** /// with constant opaque alpha (`0xFF`). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -104,7 +117,7 @@ pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: usize>( /// 3. `rgba_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -127,9 +140,9 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); @@ -146,7 +159,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_high_bit_row::<BITS>( + scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width],
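The `mask_v` AND ahead of each shift is what the `*_upper_bits_masked_*` tests later exercise: a 10-bit plane may carry garbage above bit 9 (the dirty fixtures OR in `0x0400` or `0x0C00`), and it must never leak into the output. In miniature, with an illustrative sample value:

```rust
// A 10-bit sample with a stray bit 10 set, as in the *_dirty test fixtures.
let dirty: u16 = 0x0400 | 0x03A5;
let mask: u16 = (1 << 10) - 1;
// Mask first, then downshift by BITS - 8 = 2: the garbage bit is gone.
assert_eq!((dirty & mask) >> 2, 0x03A5 >> 2);
```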
@@ -162,6 +175,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( /// SSE4.1 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **bytes**. /// Alpha sourced from the `a` plane, downshifted by `BITS - 8`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -169,7 +184,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: usize>( /// 3. `rgba_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -191,10 +206,10 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v); + let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v); let r_sh = _mm_srl_epi16(r_v, shr_count); let g_sh = _mm_srl_epi16(g_v, shr_count); @@ -213,7 +228,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( x += 8; } if x < width { - scalar::gbra_to_rgba_high_bit_row::<BITS>( + scalar::gbra_to_rgba_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -230,6 +245,8 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B` **u16** samples. /// No shift — values copied directly, reordered G/B/R → R/G/B. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -237,7 +254,7 @@ pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: usize>( /// 3. `rgb_u16_out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -254,14 +271,14 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( let mask_v = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16); let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v); write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::gbr_to_rgb_u16_high_bit_row::<BITS>( + scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -277,6 +294,8 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( /// SSE4.1 high-bit-depth G/B/R planar → packed `R, G, B, A` **u16** samples /// with constant opaque alpha `(1 << BITS) - 1`. /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -284,7 +303,7 @@ pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: usize>( /// 3. `rgba_u16_out.len()` ≥ `4 * width`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -307,14 +326,14 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v); write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS>( + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], @@ -330,6 +349,8 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( /// SSE4.1 high-bit-depth G/B/R/A planar → packed `R, G, B, A` **u16** samples. /// Alpha sourced from the `a` plane at native depth (no shift). /// +/// When `BE = true` each source u16 element is byte-swapped on load. +/// /// # Safety /// /// 1. SSE4.1 must be available (caller obligation). @@ -337,7 +358,7 @@ pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: usize>( /// 3. `rgba_u16_out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( +pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -359,15 +380,15 @@ pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: usize>( let mask_v = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16); let mut x = 0usize; while x + 8 <= width { - let r_v = _mm_and_si128(_mm_loadu_si128(r.as_ptr().add(x).cast()), mask_v); - let g_v = _mm_and_si128(_mm_loadu_si128(g.as_ptr().add(x).cast()), mask_v); - let b_v = _mm_and_si128(_mm_loadu_si128(b.as_ptr().add(x).cast()), mask_v); - let a_v = _mm_and_si128(_mm_loadu_si128(a.as_ptr().add(x).cast()), mask_v); + let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask_v); + let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask_v); + let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask_v); + let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask_v); write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::gbra_to_rgba_u16_high_bit_row::<BITS>( + scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>( &g[x..width], &b[x..width], &r[x..width], diff --git a/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs b/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs index 7292b15e..f0c11bf1 100644 --- a/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs +++ b/src/row/arch/x86_sse41/tests/planar_gbr_high_bit.rs @@ -37,9 +37,9 @@ fn sse41_gbr_to_rgb_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!(
out_scalar, out_sse, @@ -60,9 +60,9 @@ fn sse41_gbr_to_rgb_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -83,9 +83,9 @@ fn sse41_gbr_to_rgba_opaque_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -106,9 +106,9 @@ fn sse41_gbr_to_rgba_opaque_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -130,9 +130,9 @@ fn sse41_gbra_to_rgba_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -154,9 +154,9 @@ fn sse41_gbra_to_rgba_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<16>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -179,9 +179,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -202,9 +202,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse 
= std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -225,9 +225,9 @@ fn sse41_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits10() { let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -248,9 +248,9 @@ fn sse41_gbr_to_rgba_opaque_u16_high_bit_matches_scalar_bits16() { let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -272,9 +272,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_matches_scalar_bits10() { let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -296,9 +296,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_matches_scalar_bits16() { let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -321,9 +321,9 @@ fn sse41_gbr_to_rgb_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -345,9 +345,9 @@ fn sse41_gbra_to_rgba_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + 
scalar::gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -368,9 +368,9 @@ fn sse41_gbr_to_rgb_u16_high_bit_upper_bits_masked_bits10() { let r = gbr_plane_u16_dirty::<10>(w, 0x0400); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_scalar, w); + scalar::gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_scalar, w); unsafe { - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out_sse, w); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -392,9 +392,9 @@ fn sse41_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { let a = gbr_plane_u16_dirty::<10>(w, 0x0C00); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_sse = std::vec![0u16; w * 4]; - scalar::gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_scalar, w); + scalar::gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out_sse, w); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -402,3 +402,332 @@ fn sse41_gbra_to_rgba_u16_high_bit_upper_bits_masked_bits10() { ); } } + +// ---- BE parity: SSE4.1 BE output must match LE output --------------------- +// +// Byte-swap LE inputs to produce BE-encoded data; verify that BE=true kernel +// output is byte-identical to BE=false kernel output on the original LE data. + +fn byte_swap_plane(plane: &[u16]) -> std::vec::Vec<u16> { + plane.iter().map(|v| v.swap_bytes()).collect() +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgb_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgb_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgb_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgba_opaque_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgba_opaque_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbr_to_rgba_opaque_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgba_opaque_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbra_to_rgba_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbra_to_rgba_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + gbra_to_rgba_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( 
+ out_le, out_be, + "SSE4.1 gbra_to_rgba_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgb_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgb_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgb_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbr_to_rgba_opaque_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 
gbr_to_rgba_opaque_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbra_to_rgba_u16_high_bit_be_matches_le_bits10() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<10>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<10>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<10>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<10>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_gbra_to_rgba_u16_high_bit_be_matches_le_bits16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 7, 8, 16, 17, 32, 33, 64, 128, 130] { + let g = gbr_plane_u16::<16>(w, 0x6CCD_5C7B); + let b = gbr_plane_u16::<16>(w, 0x12AB_34CD); + let r = gbr_plane_u16::<16>(w, 0xDEAD_BEEF); + let a = gbr_plane_u16::<16>(w, 0xCAFE_F00D); + let g_be = byte_swap_plane(&g); + let b_be = byte_swap_plane(&b); + let r_be = byte_swap_plane(&r); + let a_be = byte_swap_plane(&a); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<16, true>(&g_be, &b_be, &r_be, &a_be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 gbra_to_rgba_u16_high_bit BE/LE mismatch (w={w})" + ); + } +} diff --git a/src/row/dispatch/planar_gbr_high_bit.rs b/src/row/dispatch/planar_gbr_high_bit.rs index 544d8166..9511e662 100644 --- a/src/row/dispatch/planar_gbr_high_bit.rs +++ b/src/row/dispatch/planar_gbr_high_bit.rs @@ -1,6 +1,7 @@ //! Runtime SIMD dispatchers for high-bit-depth planar GBR sources (Tier 10b). //! -//! Seven kernel variants, all const-generic over `BITS ∈ {9, 10, 12, 14, 16}`: +//! Seven kernel variants, all const-generic over `BITS ∈ {9, 10, 12, 14, 16}` +//! and `BE` (big-endian input when `true`): //! - [`gbr_to_rgb_high_bit_row`] — interleave G/B/R → packed `R, G, B` bytes. //! - [`gbr_to_rgb_u16_high_bit_row`] — interleave G/B/R → packed `R, G, B` u16. //! - [`gbr_to_rgba_opaque_high_bit_row`] — interleave G/B/R → packed
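For orientation, a hypothetical call site for the first dispatcher below. The parameter list is truncated in this diff, so the trailing `use_simd` position is an assumption based on the doc comment:

```rust
// Convert one 10-bit big-endian Gbrp row to packed RGB bytes. The dispatcher
// picks NEON / AVX-512 / AVX2 / SSE4.1 / simd128 at runtime, or scalar when
// `use_simd` is false; BITS = 10 and BE = true are fixed at compile time.
gbr_to_rgb_high_bit_row::<10, true>(&g_plane, &b_plane, &r_plane, &mut rgb_row, width, use_simd);
```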
@@ -39,8 +40,9 @@ use crate::{ /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B` **bytes**. /// Downshifts each sample by `BITS - 8`. `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn gbr_to_rgb_high_bit_row<const BITS: usize>( +pub fn gbr_to_rgb_high_bit_row<const BITS: usize, const BE: bool>( g: &[u16], b: &[u16], r: &[u16], @@ -65,31 +67,33 @@ pub fn gbr_to_rgb_high_bit_row<const BITS: usize>( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); } + unsafe { arch::neon::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified available. - unsafe { arch::x86_avx512::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); } + unsafe { arch::x86_avx512::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); } + unsafe { arch::x86_avx2::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. - unsafe { arch::x86_sse41::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); } + unsafe { arch::x86_sse41::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time enabled. - unsafe { arch::wasm_simd128::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); } + unsafe { + arch::wasm_simd128::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); + } return; } }, @@ -97,7 +101,7 @@ } } - scalar::gbr_to_rgb_high_bit_row::<BITS>(g, b, r, rgb_out, width); + scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(g, b, r, rgb_out, width); } // ---------------------------------------------------------------------------
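Because `BE` is a const generic, a caller that only learns endianness at runtime branches once to pick an instantiation; inside each arm the endian check has already been compiled away. A sketch of that pattern (names and arm set illustrative, with depths other than 10 and 16 elided):

```rust
match (bits, big_endian) {
    (10, false) => gbr_to_rgb_high_bit_row::<10, false>(g, b, r, out, width, use_simd),
    (10, true) => gbr_to_rgb_high_bit_row::<10, true>(g, b, r, out, width, use_simd),
    (16, false) => gbr_to_rgb_high_bit_row::<16, false>(g, b, r, out, width, use_simd),
    (16, true) => gbr_to_rgb_high_bit_row::<16, true>(g, b, r, out, width, use_simd),
    _ => unreachable!("9-, 12- and 14-bit arms elided in this sketch"),
}
```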
unsafe { - arch::wasm_simd128::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); + arch::wasm_simd128::gbr_to_rgb_u16_high_bit_row::( + g, b, r, rgb_u16_out, width, + ); } return; } @@ -175,7 +188,7 @@ pub fn gbr_to_rgb_u16_high_bit_row( } } - scalar::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); + scalar::gbr_to_rgb_u16_high_bit_row::(g, b, r, rgb_u16_out, width); } // --------------------------------------------------------------------------- @@ -185,8 +198,9 @@ pub fn gbr_to_rgb_u16_high_bit_row( /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B, A` **bytes** /// with constant α = `0xFF`. Used by `GbrpN` for standalone `with_rgba` path. /// `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn gbr_to_rgba_opaque_high_bit_row( +pub fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -212,7 +226,7 @@ pub fn gbr_to_rgba_opaque_high_bit_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::neon::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); } return; } @@ -221,21 +235,27 @@ pub fn gbr_to_rgba_opaque_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. unsafe { - arch::x86_avx512::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::x86_avx512::gbr_to_rgba_opaque_high_bit_row::( + g, b, r, rgba_out, width, + ); } return; } if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::x86_avx2::gbr_to_rgba_opaque_high_bit_row::( + g, b, r, rgba_out, width, + ); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::x86_sse41::gbr_to_rgba_opaque_high_bit_row::( + g, b, r, rgba_out, width, + ); } return; } @@ -244,7 +264,9 @@ pub fn gbr_to_rgba_opaque_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. unsafe { - arch::wasm_simd128::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + arch::wasm_simd128::gbr_to_rgba_opaque_high_bit_row::( + g, b, r, rgba_out, width, + ); } return; } @@ -253,7 +275,7 @@ pub fn gbr_to_rgba_opaque_high_bit_row( } } - scalar::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); + scalar::gbr_to_rgba_opaque_high_bit_row::(g, b, r, rgba_out, width); } // --------------------------------------------------------------------------- @@ -264,8 +286,9 @@ pub fn gbr_to_rgba_opaque_high_bit_row( /// **u16** elements with constant α = `(1 << BITS) - 1` (native-depth /// opaque). Used by `GbrpN` for standalone `with_rgba_u16` path. /// `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn gbr_to_rgba_opaque_u16_high_bit_row( +pub fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -291,7 +314,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::neon::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -302,7 +325,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. 
unsafe { - arch::x86_avx512::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::x86_avx512::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -311,7 +334,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::x86_avx2::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -320,7 +343,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::x86_sse41::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -331,7 +354,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. unsafe { - arch::wasm_simd128::gbr_to_rgba_opaque_u16_high_bit_row::( + arch::wasm_simd128::gbr_to_rgba_opaque_u16_high_bit_row::( g, b, r, rgba_u16_out, width, ); } @@ -342,7 +365,7 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( } } - scalar::gbr_to_rgba_opaque_u16_high_bit_row::(g, b, r, rgba_u16_out, width); + scalar::gbr_to_rgba_opaque_u16_high_bit_row::(g, b, r, rgba_u16_out, width); } // --------------------------------------------------------------------------- @@ -352,9 +375,10 @@ pub fn gbr_to_rgba_opaque_u16_high_bit_row( /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A` /// **bytes**. Alpha is downshifted by `BITS - 8` (real source α, not /// constant). `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn gbra_to_rgba_high_bit_row( +pub fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -382,7 +406,7 @@ pub fn gbra_to_rgba_high_bit_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::neon::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } return; } @@ -391,21 +415,21 @@ pub fn gbra_to_rgba_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. unsafe { - arch::x86_avx512::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::x86_avx512::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::x86_avx2::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::x86_sse41::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } return; } @@ -414,7 +438,9 @@ pub fn gbra_to_rgba_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. 
unsafe { - arch::wasm_simd128::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + arch::wasm_simd128::gbra_to_rgba_high_bit_row::( + g, b, r, a, rgba_out, width, + ); } return; } @@ -423,7 +449,7 @@ pub fn gbra_to_rgba_high_bit_row( } } - scalar::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); + scalar::gbra_to_rgba_high_bit_row::(g, b, r, a, rgba_out, width); } // --------------------------------------------------------------------------- @@ -433,9 +459,10 @@ pub fn gbra_to_rgba_high_bit_row( /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A` /// **u16** elements. Alpha is copied directly without depth conversion (values /// stay in `[0, (1 << BITS) - 1]`). `use_simd = false` forces scalar. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn gbra_to_rgba_u16_high_bit_row( +pub fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -463,7 +490,9 @@ pub fn gbra_to_rgba_u16_high_bit_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbra_to_rgba_u16_high_bit_row::(g, b, r, a, rgba_u16_out, width); + arch::neon::gbra_to_rgba_u16_high_bit_row::( + g, b, r, a, rgba_u16_out, width, + ); } return; } @@ -472,7 +501,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( if avx512_available() { // SAFETY: AVX-512BW verified available. unsafe { - arch::x86_avx512::gbra_to_rgba_u16_high_bit_row::( + arch::x86_avx512::gbra_to_rgba_u16_high_bit_row::( g, b, r, a, rgba_u16_out, width, ); } @@ -481,7 +510,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbra_to_rgba_u16_high_bit_row::( + arch::x86_avx2::gbra_to_rgba_u16_high_bit_row::( g, b, r, a, rgba_u16_out, width, ); } @@ -490,7 +519,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbra_to_rgba_u16_high_bit_row::( + arch::x86_sse41::gbra_to_rgba_u16_high_bit_row::( g, b, r, a, rgba_u16_out, width, ); } @@ -501,7 +530,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( if simd128_available() { // SAFETY: simd128 compile-time enabled. unsafe { - arch::wasm_simd128::gbra_to_rgba_u16_high_bit_row::( + arch::wasm_simd128::gbra_to_rgba_u16_high_bit_row::( g, b, r, a, rgba_u16_out, width, ); } @@ -512,7 +541,7 @@ pub fn gbra_to_rgba_u16_high_bit_row( } } - scalar::gbra_to_rgba_u16_high_bit_row::(g, b, r, a, rgba_u16_out, width); + scalar::gbra_to_rgba_u16_high_bit_row::(g, b, r, a, rgba_u16_out, width); } // --------------------------------------------------------------------------- @@ -529,9 +558,10 @@ pub fn gbra_to_rgba_u16_high_bit_row( /// `use_simd` accepted for signature consistency with the rest of the /// row dispatcher family. Currently no SIMD path is wired (kernel is /// scalar-only); the flag is reserved for future backends. +/// When `BE = true`, input u16 samples are big-endian and byte-swapped first. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn gbr_to_luma_u16_high_bit_row( +pub fn gbr_to_luma_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -551,5 +581,5 @@ pub fn gbr_to_luma_u16_high_bit_row( assert!(b.len() >= width, "b row too short"); assert!(r.len() >= width, "r row too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - scalar::gbr_to_luma_u16_high_bit_row::(g, b, r, luma_out, width, matrix, full_range); + scalar::gbr_to_luma_u16_high_bit_row::(g, b, r, luma_out, width, matrix, full_range); } diff --git a/src/row/mod.rs b/src/row/mod.rs index 297f1c3c..a5d210f1 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -615,7 +615,7 @@ mod overflow_tests { let b: [u16; 0] = []; let r: [u16; 0] = []; let mut rgb: [u8; 0] = []; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -626,7 +626,7 @@ mod overflow_tests { let b: [u16; 0] = []; let r: [u16; 0] = []; let mut rgb: [u16; 0] = []; - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut rgb, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -637,7 +637,7 @@ mod overflow_tests { let b: [u16; 0] = []; let r: [u16; 0] = []; let mut rgba: [u8; 0] = []; - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -648,7 +648,7 @@ mod overflow_tests { let b: [u16; 0] = []; let r: [u16; 0] = []; let mut rgba: [u16; 0] = []; - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut rgba, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -660,7 +660,7 @@ mod overflow_tests { let r: [u16; 0] = []; let a: [u16; 0] = []; let mut rgba: [u8; 0] = []; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false); } #[cfg(target_pointer_width = "32")] @@ -672,7 +672,7 @@ mod overflow_tests { let r: [u16; 0] = []; let a: [u16; 0] = []; let mut rgba: [u16; 0] = []; - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut rgba, OVERFLOW_WIDTH, false); } // ---- Tier 11 gray dispatchers — `width × {3, 4}` overflow ---- diff --git a/src/row/scalar/planar_gbr_high_bit.rs b/src/row/scalar/planar_gbr_high_bit.rs index b9c966df..ac97ef38 100644 --- a/src/row/scalar/planar_gbr_high_bit.rs +++ b/src/row/scalar/planar_gbr_high_bit.rs @@ -1,13 +1,15 @@ //! Scalar reference kernels for high-bit-depth planar GBR sources -//! (Tier 10b — `AV_PIX_FMT_GBRP{9,10,12,14,16}LE` / -//! `AV_PIX_FMT_GBRAP{10,12,14,16}LE`). +//! (Tier 10b — `AV_PIX_FMT_GBRP{9,10,12,14,16}LE/BE` / +//! `AV_PIX_FMT_GBRAP{10,12,14,16}LE/BE`). //! //! `gbr_*` kernels (3-plane, no α) are const-generic over -//! `BITS ∈ {9, 10, 12, 14, 16}`. `gbra_*` kernels (4-plane, with α) -//! are const-generic over `BITS ∈ {10, 12, 14, 16}` — FFmpeg has no -//! `GBRAP9` variant; only the 3-plane `GBRP9` exists at 9 bits. +//! `BITS ∈ {9, 10, 12, 14, 16}` **and** `BE: bool` (endianness of the +//! source planes). 
`gbra_*` kernels (4-plane, with α) are const-generic +//! over `BITS ∈ {10, 12, 14, 16}` — FFmpeg has no `GBRAP9` variant; +//! only the 3-plane `GBRP9` exists at 9 bits. //! No runtime branching on `BITS` — every `BITS - 8` shift is a -//! const-eval expression resolved at monomorphisation. +//! const-eval expression resolved at monomorphisation. The `BE` branch is +//! also const-folded away at monomorphisation time. //! //! # Output variants //! @@ -34,18 +36,27 @@ //! //! - u8: `0xFF` //! - u16: `(1u16 << BITS) - 1` (i.e., `511`, `1023`, `4095`, …) +//! +//! # Big-endian (`BE = true`) mode +//! +//! When `BE = true` each u16 sample is byte-swapped before masking and +//! arithmetic. The swap is a compile-time branch: the `BE = false` path +//! compiles to a no-op and the call overhead is zero. /// Interleaves three planar G/B/R `u16` rows into packed `R, G, B` /// **bytes**, downshifting each sample by `BITS - 8`. /// /// Output order is **R, G, B** per pixel (FFmpeg `RGB24` convention). /// +/// When `BE = true` each source element is byte-swapped before processing +/// (big-endian wire format → host-native arithmetic value). +/// /// # Panics (debug builds) /// /// Asserts that `g`, `b`, `r` each have at least `width` samples and /// `rgb_out` has at least `width * 3` bytes. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_rgb_high_bit_row( +pub(crate) fn gbr_to_rgb_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -65,9 +76,12 @@ pub(crate) fn gbr_to_rgb_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; + let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; + let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; + let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; let dst = x * 3; rgb_out[dst] = (r_val >> shift) as u8; rgb_out[dst + 1] = (g_val >> shift) as u8; @@ -79,12 +93,14 @@ pub(crate) fn gbr_to_rgb_high_bit_row( /// **`u16`** samples. Copies samples directly without shifting — /// output values are in `[0, (1 << BITS) - 1]`. /// +/// When `BE = true` each source element is byte-swapped before processing. +/// /// # Panics (debug builds) /// /// Asserts that `g`, `b`, `r` each have at least `width` samples and /// `rgb_u16_out` has at least `width * 3` samples. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_rgb_u16_high_bit_row( +pub(crate) fn gbr_to_rgb_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -103,9 +119,12 @@ pub(crate) fn gbr_to_rgb_u16_high_bit_row( debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); let mask: u16 = ((1u32 << BITS) - 1) as u16; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; + let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; + let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; + let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; let dst = x * 3; rgb_u16_out[dst] = r_val; rgb_u16_out[dst + 1] = g_val; @@ -118,8 +137,9 @@ pub(crate) fn gbr_to_rgb_u16_high_bit_row( /// `Gbrp*` sources (no alpha plane) when `with_rgba` is requested. /// /// Each sample is downshifted by `BITS - 8`. +/// When `BE = true` each source element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_rgba_opaque_high_bit_row( +pub(crate) fn gbr_to_rgba_opaque_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -139,9 +159,12 @@ pub(crate) fn gbr_to_rgba_opaque_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; + let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; + let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; + let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; let dst = x * 4; rgba_out[dst] = (r_val >> shift) as u8; rgba_out[dst + 1] = (g_val >> shift) as u8; @@ -154,8 +177,9 @@ pub(crate) fn gbr_to_rgba_opaque_high_bit_row( /// **`u16`** samples with a constant **opaque** alpha /// (`(1u16 << BITS) - 1`). Used for `Gbrp*` sources (no alpha plane) /// when `with_rgba_u16` is requested. Copies samples directly. +/// When `BE = true` each source element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( +pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -178,9 +202,12 @@ pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let opaque: u16 = mask; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; + let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; + let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; + let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; let dst = x * 4; rgba_u16_out[dst] = r_val; rgba_u16_out[dst + 1] = g_val; @@ -192,8 +219,9 @@ pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A` /// **bytes**. Alpha is sourced from the `a` plane (real per-pixel α). /// Each sample (including α) is downshifted by `BITS - 8`. +/// When `BE = true` each source element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbra_to_rgba_high_bit_row( +pub(crate) fn gbra_to_rgba_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -215,10 +243,14 @@ pub(crate) fn gbra_to_rgba_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; - let a_val = a[x] & mask; + let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; + let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; + let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let a_raw = if BE { a[x].swap_bytes() } else { a[x] }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; + let a_val = a_raw & mask; let dst = x * 4; rgba_out[dst] = (r_val >> shift) as u8; rgba_out[dst + 1] = (g_val >> shift) as u8; @@ -230,8 +262,9 @@ pub(crate) fn gbra_to_rgba_high_bit_row( /// Interleaves four planar G/B/R/A `u16` rows into packed `R, G, B, A` /// **`u16`** samples. Alpha is sourced from the `a` plane at native /// depth (no shift). Copies all four channels directly. +/// When `BE = true` each source element is byte-swapped before processing. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbra_to_rgba_u16_high_bit_row( +pub(crate) fn gbra_to_rgba_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -255,10 +288,14 @@ pub(crate) fn gbra_to_rgba_u16_high_bit_row( ); let mask: u16 = ((1u32 << BITS) - 1) as u16; for x in 0..width { - let r_val = r[x] & mask; - let g_val = g[x] & mask; - let b_val = b[x] & mask; - let a_val = a[x] & mask; + let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; + let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; + let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let a_raw = if BE { a[x].swap_bytes() } else { a[x] }; + let r_val = r_raw & mask; + let g_val = g_raw & mask; + let b_val = b_raw & mask; + let a_val = a_raw & mask; let dst = x * 4; rgba_u16_out[dst] = r_val; rgba_u16_out[dst + 1] = g_val; @@ -280,8 +317,9 @@ pub(crate) fn gbra_to_rgba_u16_high_bit_row( /// `full_range = false` → Y' ∈ `[16 << (BITS - 8), 235 << (BITS - 8)]` /// (limited / studio swing). The limited-range formula mirrors /// `rgb_to_luma_row` but scaled to native depth. +/// When `BE = true` each source element is byte-swapped before processing. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbr_to_luma_u16_high_bit_row( +pub(crate) fn gbr_to_luma_u16_high_bit_row( g: &[u16], b: &[u16], r: &[u16], @@ -311,9 +349,12 @@ pub(crate) fn gbr_to_luma_u16_high_bit_row( if full_range { for x in 0..width { - let rv = (r[x] & mask) as i64; - let gv = (g[x] & mask) as i64; - let bv = (b[x] & mask) as i64; + let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; + let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; + let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let rv = (r_raw & mask) as i64; + let gv = (g_raw & mask) as i64; + let bv = (b_raw & mask) as i64; let y = ((k_r * rv + k_g * gv + k_b * bv + RND) >> 15) as i32; luma_out[x] = y.clamp(0, native_max as i32) as u16; } @@ -339,9 +380,12 @@ pub(crate) fn gbr_to_luma_u16_high_bit_row( let y_max = (235i64) << (BITS - 8); let y_min = y_off; for x in 0..width { - let rv = (r[x] & mask) as i64; - let gv = (g[x] & mask) as i64; - let bv = (b[x] & mask) as i64; + let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; + let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; + let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let rv = (r_raw & mask) as i64; + let gv = (g_raw & mask) as i64; + let bv = (b_raw & mask) as i64; let y_full = (k_r * rv + k_g * gv + k_b * bv + RND) >> 15; let y_full_clamped = y_full.clamp(0, native_max_i64); let y_lim = y_off + (y_full_clamped * range + native_max_i64 / 2) / native_max_i64; @@ -366,7 +410,7 @@ mod tests { let b = [100u16; 1]; let r = [1000u16; 1]; let mut out = [0u8; 3]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 250); // R assert_eq!(out[1], 0); // G assert_eq!(out[2], 25); // B @@ -379,7 +423,7 @@ mod tests { let b = [max; 4]; let r = [max; 4]; let mut out = [0u8; 12]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 4); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4); assert!(out.iter().all(|&v| v == 0xFF), "all pixels must be 0xFF"); } @@ -390,7 +434,7 @@ mod tests { let b = [max; 2]; let r = [max; 2]; let mut out = [0u8; 6]; - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out, 2); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out, 2); assert!(out.iter().all(|&v| v == 0xFF)); } @@ -400,7 +444,7 @@ mod tests { let b = [0u16; 2]; let r = [0u16; 2]; let mut 
out = [0xFFu8; 6]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 2); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2); assert!(out.iter().all(|&v| v == 0)); } @@ -411,7 +455,7 @@ mod tests { let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u8; 3]; - gbr_to_rgb_high_bit_row::<9>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[1], 255); // G channel } @@ -422,7 +466,7 @@ mod tests { let g = [0u16; 1]; let b = [0u16; 1]; let mut out = [0u8; 3]; - gbr_to_rgb_high_bit_row::<12>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 255); // R channel } @@ -434,7 +478,7 @@ mod tests { let g = [800u16, 0u16, 600u16]; let b = [300u16, 0u16, 200u16]; let mut out = [0u8; 9]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 3); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 3); // pixel 0: R=400>>2=100, G=800>>2=200, B=300>>2=75 assert_eq!(out[0], 100); assert_eq!(out[1], 200); @@ -457,7 +501,7 @@ mod tests { let b = [222u16; 1]; let r = [333u16; 1]; let mut out = [0u16; 3]; - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 333); // R assert_eq!(out[1], 111); // G assert_eq!(out[2], 222); // B @@ -470,7 +514,7 @@ mod tests { let b = [max; 4]; let r = [max; 4]; let mut out = [0u16; 12]; - gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 4); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4); assert!(out.iter().all(|&v| v == max)); } @@ -481,7 +525,7 @@ mod tests { let b = [max; 2]; let r = [max; 2]; let mut out = [0u16; 6]; - gbr_to_rgb_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 2); + gbr_to_rgb_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 2); assert!(out.iter().all(|&v| v == max)); } @@ -492,7 +536,7 @@ mod tests { let b = [2000u16; 1]; let r = [3000u16; 1]; let mut out = [0u16; 3]; - gbr_to_rgb_u16_high_bit_row::<12>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_u16_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 3000); // R — unchanged assert_eq!(out[1], 1000); // G — unchanged assert_eq!(out[2], 2000); // B — unchanged @@ -507,7 +551,7 @@ mod tests { let b = [max; 4]; let r = [max; 4]; let mut out = [0u8; 16]; - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out, 4); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out, 4); for i in 0..4 { assert_eq!(out[i * 4 + 3], 0xFF, "alpha must be 0xFF at pixel {i}"); assert_eq!(out[i * 4], 0xFF, "R must be 0xFF at pixel {i}"); @@ -521,7 +565,7 @@ mod tests { let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u8; 4]; - gbr_to_rgba_opaque_high_bit_row::<9>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[1], 255); // G assert_eq!(out[3], 0xFF); // alpha } @@ -534,7 +578,7 @@ mod tests { let b = [200u16; 2]; let r = [800u16; 2]; let mut out = [0u16; 8]; - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 2); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2); let opaque = (1u16 << 10) - 1; // 1023 assert_eq!(out[3], opaque); // pixel 0 alpha assert_eq!(out[7], opaque); // pixel 1 alpha @@ -549,7 +593,7 @@ mod tests { let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u16; 4]; - gbr_to_rgba_opaque_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1); 
assert_eq!(out[3], u16::MAX); } @@ -559,7 +603,7 @@ mod tests { let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u16; 4]; - gbr_to_rgba_opaque_u16_high_bit_row::<9>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_u16_high_bit_row::<9, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[3], (1u16 << 9) - 1); // 511 } @@ -573,7 +617,7 @@ mod tests { let r = [0u16; 1]; let a = [512u16; 1]; let mut out = [0u8; 4]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[3], 128); // alpha = 512 >> 2 } @@ -585,7 +629,7 @@ mod tests { let r = [max; 2]; let a = [max; 2]; let mut out = [0u8; 8]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 2); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 2); for i in 0..2 { assert_eq!(out[i * 4 + 3], 0xFF, "alpha must be 0xFF at pixel {i}"); } @@ -599,7 +643,7 @@ mod tests { let r = [16320u16; 1]; let a = [8192u16; 1]; let mut out = [0u8; 4]; - gbra_to_rgba_high_bit_row::<14>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_high_bit_row::<14, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], 255); // R assert_eq!(out[1], 0); // G assert_eq!(out[2], 0); // B @@ -615,7 +659,7 @@ mod tests { let r = [300u16; 1]; let a = [777u16; 1]; let mut out = [0u16; 4]; - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], 300); // R assert_eq!(out[1], 100); // G assert_eq!(out[2], 200); // B @@ -629,7 +673,7 @@ mod tests { let r = [30000u16; 2]; let a = [40000u16; 2]; let mut out = [0u16; 8]; - gbra_to_rgba_u16_high_bit_row::<16>(&g, &b, &r, &a, &mut out, 2); + gbra_to_rgba_u16_high_bit_row::<16, false>(&g, &b, &r, &a, &mut out, 2); for i in 0..2 { assert_eq!(out[i * 4], 30000); assert_eq!(out[i * 4 + 1], 10000); @@ -648,7 +692,7 @@ mod tests { let b = [val; 8]; let r = [val; 8]; let mut out = [0u8; 24]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 8); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 8); assert!(out.iter().all(|&v| v == 128)); } @@ -660,7 +704,7 @@ mod tests { let b = [val; 4]; let r = [val; 4]; let mut out = [0u8; 12]; - gbr_to_rgb_high_bit_row::<12>(&g, &b, &r, &mut out, 4); + gbr_to_rgb_high_bit_row::<12, false>(&g, &b, &r, &mut out, 4); assert!(out.iter().all(|&v| v == 200)); } @@ -679,7 +723,7 @@ mod tests { let b = [dirty; 1]; let r = [dirty; 1]; let mut out = [0u8; 3]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!( out[0], expected_u8, "R must equal masked-then-shifted value" @@ -705,7 +749,7 @@ mod tests { let b = std::vec![dirty; w]; let r = std::vec![dirty; w]; let mut out = std::vec![0u8; w * 3]; - gbr_to_rgb_high_bit_row::<10>(&g, &b, &r, &mut out, w); + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out, w); for i in 0..w { assert_eq!(out[i * 3], expected_u8, "R pixel {i} wrong at width {w}"); assert_eq!( @@ -734,7 +778,7 @@ mod tests { let r = [dirty_rgb; 1]; let a = [dirty_alpha; 1]; let mut out = [0u8; 4]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], 0, "R (dirty, masked to 0)"); assert_eq!(out[1], 0, "G (dirty, masked to 0)"); assert_eq!(out[2], 0, "B (dirty, masked to 0)"); @@ -750,7 +794,7 @@ mod tests { let b = [dirty; 1]; let r = [dirty; 1]; let mut out = [0u16; 3]; - 
gbr_to_rgb_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], clean, "R u16 must be masked value"); assert_eq!(out[1], clean, "G u16 must be masked value"); assert_eq!(out[2], clean, "B u16 must be masked value"); @@ -766,7 +810,7 @@ mod tests { let r = [dirty; 1]; let a = [dirty; 1]; let mut out = [0u16; 4]; - gbra_to_rgba_u16_high_bit_row::<10>(&g, &b, &r, &a, &mut out, 1); + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], clean, "R u16 must be masked"); assert_eq!(out[1], clean, "G u16 must be masked"); assert_eq!(out[2], clean, "B u16 must be masked"); @@ -783,7 +827,7 @@ mod tests { let b = [dirty; 1]; let r = [dirty; 1]; let mut out = [0u8; 4]; - gbr_to_rgba_opaque_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], expected_u8, "R must be masked"); assert_eq!(out[1], expected_u8, "G must be masked"); assert_eq!(out[2], expected_u8, "B must be masked"); @@ -799,7 +843,7 @@ mod tests { let b = [dirty; 1]; let r = [dirty; 1]; let mut out = [0u16; 4]; - gbr_to_rgba_opaque_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1); + gbr_to_rgba_opaque_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], clean, "R u16 must be masked"); assert_eq!(out[1], clean, "G u16 must be masked"); assert_eq!(out[2], clean, "B u16 must be masked"); @@ -815,7 +859,7 @@ mod tests { let b = [val; 2]; let r = [val; 2]; let mut out = [0u8; 6]; - gbr_to_rgb_high_bit_row::<16>(&g, &b, &r, &mut out, 2); + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out, 2); assert!( out.iter().all(|&v| v == 0xFF), "BITS=16: max sample => 0xFF" @@ -839,12 +883,12 @@ mod tests { // Direct path let mut out_direct = [0u8; 4]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a, &mut out_direct, 1); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_direct, 1); // Manual path: apply mask to alpha, call with clean value let a_clean = [clean_alpha; 1]; let mut out_manual = [0u8; 4]; - gbra_to_rgba_high_bit_row::<10>(&g, &b, &r, &a_clean, &mut out_manual, 1); + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a_clean, &mut out_manual, 1); assert_eq!( out_direct, out_manual, @@ -864,7 +908,7 @@ mod tests { let b = [max; 1]; let r = [max; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); // For BT.709 full-range all-white: Y = round(Kr*max + Kg*max + Kb*max). // = round((6966 + 23436 + 2366) / 32768 * 1023) ≈ round(32768/32768 * 1023) = 1023. 
assert!( @@ -887,7 +931,7 @@ mod tests { let b = [max; 1]; let r = [max; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<12>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt601, true); + gbr_to_luma_u16_high_bit_row::<12, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt601, true); assert!( out[0] >= 4090, "max-white luma_u16 bits12 must be near 4095 (was {})", @@ -905,7 +949,7 @@ mod tests { let b = [max; 1]; let r = [max; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); assert!( out[0] >= 65520, "max-white luma_u16 bits16 must be near 65535 (was {}), old banded gives 65280", @@ -922,7 +966,7 @@ mod tests { let b = [mid; 1]; let r = [mid; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); assert!( out[0] >= 510 && out[0] <= 514, "neutral gray luma_u16 must be ~512 (was {})", @@ -936,7 +980,7 @@ mod tests { let b = [0u16; 2]; let r = [0u16; 2]; let mut out = [0xFFFFu16; 2]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 2, ColorMatrix::Bt709, true); + gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 2, ColorMatrix::Bt709, true); assert!(out.iter().all(|&v| v == 0), "all-black must give zero luma"); } @@ -949,8 +993,24 @@ mod tests { let r = [mid; 1]; let mut out_full = [0u16; 1]; let mut out_lim = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out_full, 1, ColorMatrix::Bt601, true); - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out_lim, 1, ColorMatrix::Bt601, false); + gbr_to_luma_u16_high_bit_row::<10, false>( + &g, + &b, + &r, + &mut out_full, + 1, + ColorMatrix::Bt601, + true, + ); + gbr_to_luma_u16_high_bit_row::<10, false>( + &g, + &b, + &r, + &mut out_lim, + 1, + ColorMatrix::Bt601, + false, + ); let y_off = 16u16 << 2; // 64 let y_max = 235u16 << 2; // 940 assert!( @@ -976,7 +1036,7 @@ mod tests { let b = [0u16; 1]; let r = [0u16; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); + gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); let y_off = 16u16 << 8; // 4096 assert_eq!( out[0], y_off, @@ -1001,7 +1061,7 @@ mod tests { let b = [u16::MAX; 1]; let r = [u16::MAX; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); + gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); let y_max = 235u16 << 8; // 60160 assert_eq!( out[0], y_max, @@ -1022,7 +1082,7 @@ mod tests { let b = [v; 1]; let r = [v; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<16>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); + gbr_to_luma_u16_high_bit_row::<16, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); // Native-depth limited-range: y_lim = 4096 + v × 56064 / 65535 let expected = 4096 + ((v as u64 * 56064 + 65535 / 2) / 65535) as u16; // Allow ±1 LSB for matrix-multiply rounding (BT.709 weights aren't @@ -1054,7 +1114,7 @@ mod tests { let b = [input; 1]; let r = [input; 1]; let mut out = [0u16; 1]; - gbr_to_luma_u16_high_bit_row::<10>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); + gbr_to_luma_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, false); let 
diff = (out[0] as i32 - expected as i32).abs(); assert!( diff <= 1, @@ -1063,4 +1123,164 @@ mod tests { ); } } + + // ---- BE vs LE parity: scalar must produce same output as ------- + // scalar on byte-swapped input. Covers 6 kernels at BITS 10/16. - + + fn byte_swap_vec(v: &[u16]) -> std::vec::Vec { + v.iter().map(|x| x.swap_bytes()).collect() + } + + fn rand_plane(seed: u32, n: usize) -> std::vec::Vec { + let mask = (1u32 << BITS) - 1; + let mut s = seed; + (0..n) + .map(|_| { + s = s.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + (s & mask) as u16 + }) + .collect() + } + + #[test] + fn scalar_gbr_to_rgb_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17, 33, 65] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + gbr_to_rgb_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbr_to_rgb bits10 w={w}" + ); + } + } + + #[test] + fn scalar_gbr_to_rgb_high_bit_be_parity_bits16() { + for w in [1usize, 7, 8, 9, 17, 33, 65] { + let g = rand_plane::<16>(0xAAAA, w); + let b = rand_plane::<16>(0xBBBB, w); + let r = rand_plane::<16>(0xCCCC, w); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + gbr_to_rgb_high_bit_row::<16, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_high_bit_row::<16, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbr_to_rgb bits16 w={w}" + ); + } + } + + #[test] + fn scalar_gbr_to_rgba_opaque_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + gbr_to_rgba_opaque_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgba_opaque_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbr_to_rgba_opaque bits10 w={w}" + ); + } + } + + #[test] + fn scalar_gbra_to_rgba_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let a = rand_plane::<10>(0xDDDD, w); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + gbra_to_rgba_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &byte_swap_vec(&a), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbra_to_rgba bits10 w={w}" + ); + } + } + + #[test] + fn scalar_gbr_to_rgb_u16_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + gbr_to_rgb_u16_high_bit_row::<10, false>(&g, &b, &r, &mut out_le, w); + gbr_to_rgb_u16_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + 
&byte_swap_vec(&r), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbr_to_rgb_u16 bits10 w={w}" + ); + } + } + + #[test] + fn scalar_gbra_to_rgba_u16_high_bit_be_parity_bits10() { + for w in [1usize, 7, 8, 9, 17] { + let g = rand_plane::<10>(0xAAAA, w); + let b = rand_plane::<10>(0xBBBB, w); + let r = rand_plane::<10>(0xCCCC, w); + let a = rand_plane::<10>(0xDDDD, w); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + gbra_to_rgba_u16_high_bit_row::<10, false>(&g, &b, &r, &a, &mut out_le, w); + gbra_to_rgba_u16_high_bit_row::<10, true>( + &byte_swap_vec(&g), + &byte_swap_vec(&b), + &byte_swap_vec(&r), + &byte_swap_vec(&a), + &mut out_be, + w, + ); + assert_eq!( + out_le, out_be, + "scalar BE/LE mismatch gbra_to_rgba_u16 bits10 w={w}" + ); + } + } } diff --git a/src/sinker/mixed/planar_gbr_high_bit.rs b/src/sinker/mixed/planar_gbr_high_bit.rs index f28432b9..69ba1982 100644 --- a/src/sinker/mixed/planar_gbr_high_bit.rs +++ b/src/sinker/mixed/planar_gbr_high_bit.rs @@ -237,7 +237,14 @@ macro_rules! impl_gbrp_high_bit { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; - gbr_to_rgba_opaque_u16_high_bit_row::(g_in, b_in, r_in, rgba_u16_row, w, use_simd); + gbr_to_rgba_opaque_u16_high_bit_row::( + g_in, + b_in, + r_in, + rgba_u16_row, + w, + use_simd, + ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = @@ -250,7 +257,7 @@ macro_rules! impl_gbrp_high_bit { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - gbr_to_rgb_u16_high_bit_row::(g_in, b_in, r_in, rgb_u16_row, w, use_simd); + gbr_to_rgb_u16_high_bit_row::(g_in, b_in, r_in, rgb_u16_row, w, use_simd); if want_rgba_u16 { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = @@ -264,7 +271,7 @@ macro_rules! impl_gbrp_high_bit { // going through the u8 staging path, so it is independent of whether // RGB staging happens below. if let Some(luma_u16_buf) = luma_u16.as_deref_mut() { - gbr_to_luma_u16_high_bit_row::( + gbr_to_luma_u16_high_bit_row::( g_in, b_in, r_in, @@ -287,7 +294,7 @@ macro_rules! impl_gbrp_high_bit { if want_rgba && !need_rgb_staging { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbr_to_rgba_opaque_high_bit_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbr_to_rgba_opaque_high_bit_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -304,7 +311,7 @@ macro_rules! impl_gbrp_high_bit { w, h, )?; - gbr_to_rgb_high_bit_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbr_to_rgb_high_bit_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -519,7 +526,15 @@ macro_rules! impl_gbrap_high_bit { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; - gbra_to_rgba_u16_high_bit_row::(g_in, b_in, r_in, a_in, rgba_u16_row, w, use_simd); + gbra_to_rgba_u16_high_bit_row::( + g_in, + b_in, + r_in, + a_in, + rgba_u16_row, + w, + use_simd, + ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = @@ -532,7 +547,7 @@ macro_rules! 
impl_gbrap_high_bit { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - gbr_to_rgb_u16_high_bit_row::(g_in, b_in, r_in, rgb_u16_row, w, use_simd); + gbr_to_rgb_u16_high_bit_row::(g_in, b_in, r_in, rgb_u16_row, w, use_simd); if want_rgba_u16 { // Strategy A+: expand RGB → RGBA, then overwrite α from source plane. let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); @@ -549,7 +564,7 @@ macro_rules! impl_gbrap_high_bit { // going through the u8 staging path, so it is independent of whether // RGB staging happens below. if let Some(luma_u16_buf) = luma_u16.as_deref_mut() { - gbr_to_luma_u16_high_bit_row::( + gbr_to_luma_u16_high_bit_row::( g_in, b_in, r_in, @@ -572,7 +587,7 @@ macro_rules! impl_gbrap_high_bit { if want_rgba && !need_rgb_staging { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbra_to_rgba_high_bit_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbra_to_rgba_high_bit_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); return Ok(()); } @@ -589,7 +604,7 @@ macro_rules! impl_gbrap_high_bit { w, h, )?; - gbr_to_rgb_high_bit_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbr_to_rgb_high_bit_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row(
From 26e50773d5175eb3818ee32e24c3d1eca7e59f91 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Fri, 8 May 2026 00:48:47 +1200
Subject: [PATCH 2/7] fix(be-tier10b): make scalar BE conversion target-endian aware

Codex adversarial review of #82 caught a high-severity bug: the scalar `if BE { x.swap_bytes() } else { x }` pattern unconditionally swaps bytes whenever `BE = true`, regardless of target endianness; the correct conversion is a no-op whenever the data byte order already matches the host CPU's byte order. That diverges from the SIMD `load_endian_u16x*::<BE>` helpers from #81, which are target-endian-aware (a swap happens only when the data byte order differs from the host). Mismatched semantics between scalar and SIMD paths means scalar tails (and luma kernels, which are scalar-only) would corrupt rows on a big-endian host (s390x), in BOTH BE=true and BE=false cases.

The fix replaces every scalar load with the standard `u16::from_be` / `u16::from_le` pair, which matches the SIMD helper semantics exactly: each is a no-op when the data byte order matches the host, and a `swap_bytes()` when they differ.

    if BE { u16::from_be(r[x]) } else { u16::from_le(r[x]) }

26 call sites across the planar_gbr_high_bit kernels (g, b, r, a) updated.

Test helper `byte_swap_vec` left as-is; it intentionally synthesizes BE-encoded buffers from LE inputs for parity tests on LE-host CI (a future follow-up should make the test helper target-endian aware too, when Phase 3 s390x QEMU coverage lands).

Verified: 2177 tests pass; cargo fmt clean.
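For reference, a minimal standalone sketch (not from the patch) of why the `from_be` / `from_le` pair is target-endian aware while `swap_bytes()` is not; the sample value is made up:

    // Sketch: `u16::from_be(x)` means "x holds big-endian bytes; give me the
    // native value": a no-op on a BE host, a swap on an LE host. By contrast,
    // `x.swap_bytes()` always swaps, whatever the host is.
    fn main() {
        // 0x0102 encoded as big-endian bytes, reinterpreted on the host:
        let be_encoded = u16::from_ne_bytes([0x01, 0x02]);
        // Target-endian-aware load: holds on every host.
        assert_eq!(u16::from_be(be_encoded), 0x0102);
        // The old pattern only happens to agree on little-endian hosts:
        if cfg!(target_endian = "little") {
            assert_eq!(be_encoded.swap_bytes(), 0x0102);
        } else {
            // On s390x, be_encoded is already 0x0102; swapping would corrupt it.
            assert_eq!(be_encoded, 0x0102);
        }
    }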
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/planar_gbr_high_bit.rs | 156 +++++++++++++++++++++----- 1 file changed, 130 insertions(+), 26 deletions(-) diff --git a/src/row/scalar/planar_gbr_high_bit.rs b/src/row/scalar/planar_gbr_high_bit.rs index ac97ef38..22a60771 100644 --- a/src/row/scalar/planar_gbr_high_bit.rs +++ b/src/row/scalar/planar_gbr_high_bit.rs @@ -76,9 +76,21 @@ pub(crate) fn gbr_to_rgb_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; - let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; - let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; let r_val = r_raw & mask; let g_val = g_raw & mask; let b_val = b_raw & mask; @@ -119,9 +131,21 @@ pub(crate) fn gbr_to_rgb_u16_high_bit_row( debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); let mask: u16 = ((1u32 << BITS) - 1) as u16; for x in 0..width { - let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; - let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; - let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; let r_val = r_raw & mask; let g_val = g_raw & mask; let b_val = b_raw & mask; @@ -159,9 +183,21 @@ pub(crate) fn gbr_to_rgba_opaque_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; - let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; - let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; let r_val = r_raw & mask; let g_val = g_raw & mask; let b_val = b_raw & mask; @@ -202,9 +238,21 @@ pub(crate) fn gbr_to_rgba_opaque_u16_high_bit_row( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for x in 0..width { - let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; - let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; - let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; - let a_raw = if BE { a[x].swap_bytes() } else { a[x] }; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let a_raw = if BE { + u16::from_be(a[x]) + } else { + u16::from_le(a[x]) + }; let r_val = r_raw & mask; let g_val = g_raw & mask; let b_val = b_raw & mask; @@ -288,10 +352,26 @@ pub(crate) fn gbra_to_rgba_u16_high_bit_row( ); let mask: u16 = ((1u32 << BITS) - 1) as u16; for x in 0..width { - let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; - let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; - let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; - let a_raw = if BE { a[x].swap_bytes() } else { a[x] }; + let r_raw = if BE { + u16::from_be(r[x]) + } else 
{ + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; + let a_raw = if BE { + u16::from_be(a[x]) + } else { + u16::from_le(a[x]) + }; let r_val = r_raw & mask; let g_val = g_raw & mask; let b_val = b_raw & mask; @@ -349,9 +429,21 @@ pub(crate) fn gbr_to_luma_u16_high_bit_row( if full_range { for x in 0..width { - let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; - let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; - let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; let rv = (r_raw & mask) as i64; let gv = (g_raw & mask) as i64; let bv = (b_raw & mask) as i64; @@ -380,9 +472,21 @@ pub(crate) fn gbr_to_luma_u16_high_bit_row( let y_max = (235i64) << (BITS - 8); let y_min = y_off; for x in 0..width { - let r_raw = if BE { r[x].swap_bytes() } else { r[x] }; - let g_raw = if BE { g[x].swap_bytes() } else { g[x] }; - let b_raw = if BE { b[x].swap_bytes() } else { b[x] }; + let r_raw = if BE { + u16::from_be(r[x]) + } else { + u16::from_le(r[x]) + }; + let g_raw = if BE { + u16::from_be(g[x]) + } else { + u16::from_le(g[x]) + }; + let b_raw = if BE { + u16::from_be(b[x]) + } else { + u16::from_le(b[x]) + }; let rv = (r_raw & mask) as i64; let gv = (g_raw & mask) as i64; let bv = (b_raw & mask) as i64;
From 3418d0ea91ce8caf49bc207149f7a63e8fa78ae7 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Fri, 8 May 2026 01:15:33 +1200
Subject: [PATCH 3/7] fix(be-tier10b): make Strategy A+ alpha scatter target-endian aware
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex adversarial review of #82 caught a high-severity bug in the Strategy A+ alpha-only fast-path helpers. `copy_alpha_plane_u16_to_u8` and `copy_alpha_plane_u16` mask + shift raw u16 values without endian awareness. On a big-endian host (s390x) processing LE-encoded Gbrap input, the alpha plane is byte-reversed when both `with_rgb` and `with_rgba` outputs are requested — the same class of bug we already fixed for the direct `gbra_to_rgba_*` kernels in 26e5077.

The direct kernels are correct because they were threaded through `<BE>` in PR #82 + 26e5077. The α-extract helpers were left at `<BITS>` with raw `alpha[n]` reads and silently produce incorrect output on a BE host.

Fix: thread `<BE>` through both scalar α-extract helpers and apply the same `u16::from_be` / `u16::from_le` pattern as the direct-kernel scalar fix:

    let raw = if BE { u16::from_be(alpha[n]) } else { u16::from_le(alpha[n]) };
    rgba_out[n * 4 + 3] = ((raw & mask) >> shift) as u8; // u16-to-u8 variant

Each conversion compiles to a no-op when the data byte order matches the host CPU and a byte-swap otherwise, mirroring the SIMD `load_endian_u16x*::<BE>` semantics from #81. Scalar tails and SIMD hot paths now stay byte-for-byte equivalent on every host for BE = false (the case currently exercised).

The dispatcher (`row::dispatch::alpha_extract`) gained the matching `<BE>` parameter. When `BE = true` it routes directly to scalar — the SIMD α-extract backends use raw native-u16 loads (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`) and have no byte-swap path, so feeding them BE-encoded input would re-introduce the same corruption.
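For reference, the routing shape described above in a condensed, self-contained sketch (not the verbatim source; the `use_simd` flag and per-arch backend dispatch are simplified assumptions):

    // Condensed sketch: BE input never reaches the SIMD backends,
    // which only do host-native u16 loads.
    mod scalar {
        pub fn copy_alpha_plane_u16_to_u8<const BITS: usize, const BE: bool>(
            alpha: &[u16],
            rgba_out: &mut [u8],
            width: usize,
        ) {
            let mask: u16 = ((1u32 << BITS) - 1) as u16;
            let shift = BITS - 8;
            for n in 0..width {
                // Target-endian-aware load, as in the direct-kernel fix.
                let raw = if BE { u16::from_be(alpha[n]) } else { u16::from_le(alpha[n]) };
                rgba_out[n * 4 + 3] = ((raw & mask) >> shift) as u8;
            }
        }
    }

    pub fn copy_alpha_plane_u16_to_u8<const BITS: usize, const BE: bool>(
        alpha: &[u16],
        rgba_out: &mut [u8],
        width: usize,
        use_simd: bool,
    ) {
        if BE || !use_simd {
            // BE = true (or scalar forced): the scalar kernel handles the swap.
            scalar::copy_alpha_plane_u16_to_u8::<BITS, BE>(alpha, rgba_out, width);
            return;
        }
        // ... per-arch SIMD dispatch (NEON / SSE4.1 / AVX2 / AVX-512 / simd128)
        // would go here; every backend assumes BE = false ...
        scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(alpha, rgba_out, width);
    }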
Per the spec ("Don't touch SIMD α-extract paths ... codex didn't flag those"), the SIMD kernels keep their existing LE-oriented loads. Phase 4 will plumb `<BE>` through SIMD if/when a real BE-input sinker hot-path lands. All sinker call sites pass `<BE = false>` for now (LE-only sinkers today; matches the `false` already passed to the sibling `gbr_to_rgb_u16_high_bit_row::<BITS, false>` calls).

Eight call sites updated:
- sinker/mixed/planar_gbr_high_bit.rs (Tier 10b, 2 sites)
- sinker/mixed/yuva_4_4_4.rs (Tier 9, 2 sites)
- sinker/mixed/yuva_4_2_2.rs (Tier 9, 2 sites)
- sinker/mixed/yuva_4_2_0.rs (Tier 9, 2 sites)

Each call site has an inline `// BE = false: ...` comment naming Phase 4 as the follow-up that will plumb a real `<BE>` from the row type.

The 8/16-bit variants `copy_alpha_plane_u8`, `copy_alpha_packed_u8x4_at_3`, `copy_alpha_packed_u16x4_to_u8_at_0`, `copy_alpha_packed_u16x4_at_0`, and `copy_alpha_ya_*` are unchanged: the 8-bit ones have no endianness; the AYUV64 / Rgba64 / Bgra64 / Ya16 variants take packed sources whose endianness is already a property of the source's row-type wrapper rather than this helper.

The f32 helpers (`copy_alpha_plane_f32*`) are left untouched — they belong to Tier 10 float (Gbrapf32 / Gbrpf16), out of scope for this PR. They will be addressed in a separate PR when Phase 4 rolls up through the float sinkers.

Tests added:
- `copy_alpha_plane_u16_to_u8_be_parity_with_swapped_buffer` — builds a host-side `swap_bytes` of the LE fixture, calls the helper with `<10, false>` on the LE buffer and `<10, true>` on the BE-encoded buffer, asserts identical output. Locks down the BE-flag round-trip on every host.
- `copy_alpha_plane_u16_be_parity_with_swapped_buffer` — same pattern for the u16-output variant.

Existing scalar tests retargeted to `<BE = false>` (LE) to preserve current behavior. SIMD parity tests in `row/arch/{neon,x86_*,wasm_simd128}/alpha_extract.rs` retargeted the scalar reference call to `<BE = false>` — the SIMD helpers do host-native loads, which matches scalar BE = false on LE hosts.
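A condensed sketch of that parity-test pattern (fixture values are illustrative; assumes the simplified scalar helper shape from the sketch above):

    #[test]
    fn copy_alpha_plane_u16_to_u8_be_parity_sketch() {
        // swap_bytes() on the fixture inverts exactly what the BE = true
        // load undoes, so parity holds on LE and BE hosts alike.
        let alpha: Vec<u16> = (0..17u16).map(|i| (i * 53) & 0x03FF).collect();
        let alpha_swapped: Vec<u16> = alpha.iter().map(|v| v.swap_bytes()).collect();
        let w = alpha.len();
        let mut out_a = vec![0u8; w * 4];
        let mut out_b = vec![0u8; w * 4];
        scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut out_a, w);
        scalar::copy_alpha_plane_u16_to_u8::<10, true>(&alpha_swapped, &mut out_b, w);
        assert_eq!(out_a, out_b, "BE flag must exactly undo the byte swap");
    }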
Verification: - cargo test --target aarch64-apple-darwin --lib → 2179 passed - cargo test --target x86_64-apple-darwin --lib → 2873 passed - cargo build --target x86_64-apple-darwin --tests → 0 warnings - RUSTFLAGS=+simd128 cargo build --target wasm32-unknown-unknown --tests → only pre-existing unused-import warnings - cargo build --no-default-features → ok - cargo fmt --check → clean - cargo clippy --all-targets --all-features -- -D warnings → clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/alpha_extract.rs | 19 +++-- src/row/arch/wasm_simd128/alpha_extract.rs | 19 +++-- src/row/arch/x86_avx2/alpha_extract.rs | 19 +++-- src/row/arch/x86_avx512/alpha_extract.rs | 19 +++-- src/row/arch/x86_sse41/alpha_extract.rs | 19 +++-- src/row/dispatch/alpha_extract.rs | 38 ++++++--- src/row/scalar/alpha_extract.rs | 96 ++++++++++++++++++---- src/sinker/mixed/planar_gbr_high_bit.rs | 9 +- src/sinker/mixed/yuva_4_2_0.rs | 14 +++- src/sinker/mixed/yuva_4_2_2.rs | 14 +++- src/sinker/mixed/yuva_4_4_4.rs | 14 +++- 11 files changed, 221 insertions(+), 59 deletions(-) diff --git a/src/row/arch/neon/alpha_extract.rs b/src/row/arch/neon/alpha_extract.rs index 8b870dc2..ffb04e6a 100644 --- a/src/row/arch/neon/alpha_extract.rs +++ b/src/row/arch/neon/alpha_extract.rs @@ -241,7 +241,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this NEON helper does host-native u16 + // loads (`vld1q_u16`), which match LE-on-disk only on LE hosts. The + // dispatcher routes the BE = true case directly to scalar (see + // `dispatch::alpha_extract`), so the SIMD path here is BE = false by + // construction. + scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -286,7 +291,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -409,7 +415,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xBABE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -430,7 +437,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0x5EED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -448,7 +456,8 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0xFADE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). 
+ scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/wasm_simd128/alpha_extract.rs b/src/row/arch/wasm_simd128/alpha_extract.rs index 105910be..b999b618 100644 --- a/src/row/arch/wasm_simd128/alpha_extract.rs +++ b/src/row/arch/wasm_simd128/alpha_extract.rs @@ -357,7 +357,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this wasm-simd128 helper does + // host-native u16 loads (`v128_load64_zero`), which match LE-on-disk + // only on LE hosts. The dispatcher routes BE = true directly to scalar + // (see `dispatch::alpha_extract`), so the SIMD path here is BE = false + // by construction. + scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -438,7 +443,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -575,7 +581,8 @@ mod tests { unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w); } - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -598,7 +605,8 @@ mod tests { unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w); } - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -618,7 +626,8 @@ mod tests { unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w); } - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_avx2/alpha_extract.rs b/src/row/arch/x86_avx2/alpha_extract.rs index ba4ade4f..1ebe97c1 100644 --- a/src/row/arch/x86_avx2/alpha_extract.rs +++ b/src/row/arch/x86_avx2/alpha_extract.rs @@ -450,7 +450,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this AVX2 helper does host-native + // u16 loads (`_mm_loadu_si128`), which match LE-on-disk only on LE + // hosts. The dispatcher routes BE = true directly to scalar (see + // `dispatch::alpha_extract`), so the SIMD path here is BE = false by + // construction. + scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -554,7 +559,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. 
+ scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -696,7 +702,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xBABE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -720,7 +727,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0x5EED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -741,7 +749,8 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0xFADE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_avx512/alpha_extract.rs b/src/row/arch/x86_avx512/alpha_extract.rs index f311e366..203e08e3 100644 --- a/src/row/arch/x86_avx512/alpha_extract.rs +++ b/src/row/arch/x86_avx512/alpha_extract.rs @@ -434,7 +434,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this AVX-512 helper does host-native + // u16 loads (`_mm256_loadu_si256`), which match LE-on-disk only on LE + // hosts. The dispatcher routes BE = true directly to scalar (see + // `dispatch::alpha_extract`), so the SIMD path here is BE = false by + // construction. + scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -519,7 +524,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -670,7 +676,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xBABE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -696,7 +703,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0x5EED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). 
+ scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -719,7 +727,8 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0xFADE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_sse41/alpha_extract.rs b/src/row/arch/x86_sse41/alpha_extract.rs index 5abdfd08..d327e299 100644 --- a/src/row/arch/x86_sse41/alpha_extract.rs +++ b/src/row/arch/x86_sse41/alpha_extract.rs @@ -356,7 +356,12 @@ pub(crate) unsafe fn copy_alpha_plane_u16_to_u8( } if x < width { - scalar::copy_alpha_plane_u16_to_u8::( + // Scalar tail uses `BE = false`: this SSE4.1 helper does host-native + // u16 loads (`_mm_loadl_epi64`), which match LE-on-disk only on LE + // hosts. The dispatcher routes BE = true directly to scalar (see + // `dispatch::alpha_extract`), so the SIMD path here is BE = false by + // construction. + scalar::copy_alpha_plane_u16_to_u8::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -440,7 +445,8 @@ pub(crate) unsafe fn copy_alpha_plane_u16( } if x < width { - scalar::copy_alpha_plane_u16::( + // Scalar tail uses `BE = false`: see `copy_alpha_plane_u16_to_u8` above. + scalar::copy_alpha_plane_u16::( &alpha[x..width], &mut rgba_out[x * 4..width * 4], width - x, @@ -581,7 +587,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xBABE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -605,7 +612,8 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0x5EED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -626,7 +634,8 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0xFADE); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) }; - scalar::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_scalar, w); + // SIMD reads native u16; pair with scalar BE = false (LE-on-LE-host). + scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/dispatch/alpha_extract.rs b/src/row/dispatch/alpha_extract.rs index 75cccb8d..0364dc21 100644 --- a/src/row/dispatch/alpha_extract.rs +++ b/src/row/dispatch/alpha_extract.rs @@ -260,17 +260,26 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz /// scatter α plane (u16) into `rgba_out[3 + 4*n]` (u8) with /// depth-conv `>> (BITS - 8)`. /// -/// Selects the highest available SIMD backend; falls back to scalar. -/// When `use_simd` is `false`, calls scalar directly. 
+/// `BE` selects the source α plane byte order (`false` = LE on disk/wire, +/// `true` = BE on disk/wire). When `BE = true` the dispatcher routes to +/// scalar directly: the SIMD α-extract backends use raw native-u16 loads +/// (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`) and have no +/// byte-swap path. Per the codex review of #82 the scalar helper is now +/// target-endian-aware via `u16::from_be` / `u16::from_le`, so this +/// scalar fallback emits the correct α plane on every host. Phase 4 will +/// plumb BE through SIMD if a real BE-input sinker hot-path lands. +/// +/// Selects the highest available SIMD backend (`BE = false`); falls back +/// to scalar. When `use_simd` is `false`, calls scalar directly. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_plane_u16_to_u8( +pub(crate) fn copy_alpha_plane_u16_to_u8( alpha: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool, ) { - if !use_simd { - return scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); + if !use_simd || BE { + return scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); } cfg_select! { target_arch = "aarch64" => { @@ -306,7 +315,7 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( }, _ => {} } - scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); + scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); } // --------------------------------------------------------------------------- @@ -317,17 +326,22 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( /// scatter α plane (u16) into `rgba_out[3 + 4*n]` (u16). No depth /// conversion. /// -/// Selects the highest available SIMD backend; falls back to scalar. -/// When `use_simd` is `false`, calls scalar directly. +/// `BE` selects the source α plane byte order (`false` = LE on disk/wire, +/// `true` = BE on disk/wire). When `BE = true` the dispatcher routes to +/// scalar directly: see `copy_alpha_plane_u16_to_u8` above for the +/// rationale (SIMD α-extract is BE-naïve; scalar is target-endian-aware). +/// +/// Selects the highest available SIMD backend (`BE = false`); falls back +/// to scalar. When `use_simd` is `false`, calls scalar directly. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_plane_u16( +pub(crate) fn copy_alpha_plane_u16( alpha: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool, ) { - if !use_simd { - return scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); + if !use_simd || BE { + return scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); } cfg_select! { target_arch = "aarch64" => { @@ -363,5 +377,5 @@ pub(crate) fn copy_alpha_plane_u16( }, _ => {} } - scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); + scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); } diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs index a5190496..23463a72 100644 --- a/src/row/scalar/alpha_extract.rs +++ b/src/row/scalar/alpha_extract.rs @@ -99,17 +99,28 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz /// Yuva*p9/10/12/14 → u8 RGBA: scatter α plane (u16) into /// `rgba_out[3 + 4*n]` (u8) with depth-conv `>> (BITS - 8)`. /// -/// `BITS` is the source α bit depth (9, 10, 12, or 14). +/// `BITS` is the source α bit depth (9, 10, 12, or 14). `BE` selects the +/// **byte order** of the encoded source α plane: `false` = LE on disk/wire +/// (e.g., AV `Yuva420p10le`), `true` = BE on disk/wire (e.g., `Yuva420p10be`). 
/// -/// α is masked with `(1 << BITS) - 1` BEFORE the shift to canonicalize -/// over-range source samples. Frame constructors admit raw u16 input -/// (e.g., p010-style buffers store the 10 active bits in the HIGH bits +/// Each raw u16 sample is converted from its disk byte order into host-native +/// order via `u16::from_le` / `u16::from_be` BEFORE the BITS-mask + shift. +/// On a host whose endianness matches the data, the conversion compiles to a +/// no-op; otherwise it is a `swap_bytes`. This mirrors the +/// `load_endian_u16x*::` SIMD pattern from #81 so scalar tails and SIMD +/// paths stay byte-for-byte equivalent on every host. Without this, a +/// big-endian host (e.g., s390x) processing LE source data would emit a +/// byte-reversed α plane. +/// +/// α is masked with `(1 << BITS) - 1` AFTER the endian conversion to +/// canonicalize over-range source samples. Frame constructors admit raw u16 +/// input (e.g., p010-style buffers store the 10 active bits in the HIGH bits /// of u16), so an unmasked over-range value would otherwise leak through /// the shift and produce divergent output between scalar and SIMD paths. /// See sibling inline-α kernels (`yuva_4_*` row impls) for the same /// pattern with comment "silently turning over-range alpha into /// transparent output". -pub(crate) fn copy_alpha_plane_u16_to_u8( +pub(crate) fn copy_alpha_plane_u16_to_u8( alpha: &[u16], rgba_out: &mut [u8], width: usize, @@ -122,7 +133,12 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( let mask: u16 = ((1u32 << BITS) - 1) as u16; let shift = BITS - 8; for n in 0..width { - rgba_out[n * 4 + 3] = ((alpha[n] & mask) >> shift) as u8; + let raw = if BE { + u16::from_be(alpha[n]) + } else { + u16::from_le(alpha[n]) + }; + rgba_out[n * 4 + 3] = ((raw & mask) >> shift) as u8; } } @@ -131,7 +147,17 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( /// depth, masked to `(1 << BITS) - 1` so over-range source samples /// don't leak through (parity with the inline-α kernels — frame /// constructors admit raw u16 input above the BITS-bit native range). -pub(crate) fn copy_alpha_plane_u16( +/// +/// `BE` selects the **byte order** of the encoded source α plane: +/// `false` = LE on disk/wire, `true` = BE on disk/wire. Each raw u16 +/// sample is converted to host-native order via `u16::from_le` / +/// `u16::from_be` BEFORE masking. On a host whose endianness matches +/// the data, the conversion compiles to a no-op; otherwise it is a +/// `swap_bytes`. Mirrors the `load_endian_u16x*::` SIMD pattern +/// from #81 so scalar and SIMD stay byte-for-byte equivalent on every +/// host. Without this, a BE host processing LE source data would emit +/// a byte-reversed α plane. 
+pub(crate) fn copy_alpha_plane_u16( alpha: &[u16], rgba_out: &mut [u16], width: usize, @@ -143,7 +169,12 @@ pub(crate) fn copy_alpha_plane_u16( debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); let mask: u16 = ((1u32 << BITS) - 1) as u16; for n in 0..width { - rgba_out[n * 4 + 3] = alpha[n] & mask; + let raw = if BE { + u16::from_be(alpha[n]) + } else { + u16::from_le(alpha[n]) + }; + rgba_out[n * 4 + 3] = raw & mask; } } @@ -268,19 +299,19 @@ mod tests { // BITS=10 let alpha: std::vec::Vec = std::vec![0x3FF, 0x1FF]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba, 2); + copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x7F]); // BITS=12 let alpha: std::vec::Vec = std::vec![0xFFF, 0x800]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba, 2); + copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x80]); // BITS=16 let alpha: std::vec::Vec = std::vec![0xFFFF, 0x8000]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_plane_u16_to_u8::<16>(&alpha, &mut rgba, 2); + copy_alpha_plane_u16_to_u8::<16, false>(&alpha, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xFF, 1, 1, 1, 0x80]); } @@ -289,7 +320,7 @@ mod tests { // In-range values pass through unchanged. let alpha: std::vec::Vec = std::vec![0x3FF, 0x1FF, 0x000]; let mut rgba = std::vec![1u16; 12]; - copy_alpha_plane_u16::<10>(&alpha, &mut rgba, 3); + copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba, 3); assert_eq!( rgba, std::vec![1, 1, 1, 0x3FF, 1, 1, 1, 0x1FF, 1, 1, 1, 0x000] @@ -304,7 +335,7 @@ mod tests { // diverging from the inline-α scalar reference. let alpha: std::vec::Vec = std::vec![0xFFFF, 0x0500, 0x07FF]; let mut rgba = std::vec![1u16; 12]; - copy_alpha_plane_u16::<10>(&alpha, &mut rgba, 3); + copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba, 3); assert_eq!( rgba, std::vec![1, 1, 1, 0x3FF, 1, 1, 1, 0x100, 1, 1, 1, 0x3FF] @@ -319,10 +350,47 @@ mod tests { // & 0x3FF = 0x100 → 0x100 >> 2 = 64 consistently across all paths. let alpha: std::vec::Vec = std::vec![0x0500, 0xFFFF, 0x03FF]; let mut rgba = std::vec![1u8; 12]; - copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba, 3); + copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba, 3); assert_eq!(rgba, std::vec![1, 1, 1, 64, 1, 1, 1, 0xFF, 1, 1, 1, 0xFF]); } + /// BE parity: byte-swapping the source α plane and toggling the `BE` + /// flag must yield byte-for-byte identical output. Locks down the + /// codex-flagged corruption where a BE host processing LE input + /// would otherwise emit a byte-reversed α slot. The synthesized + /// "BE-encoded" buffer is built by host-side `swap_bytes` on the LE + /// fixture; both `from_le` (LE flag) and `from_be` (BE flag with the + /// swapped buffer) recover the same logical u16 values, so the + /// outputs match on every host. 
+ #[test] + fn copy_alpha_plane_u16_to_u8_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = std::vec![0x3FF, 0x1FF, 0x0500, 0xFFFF, 0x07FF, 0x0123]; + let alpha_be: std::vec::Vec = alpha_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 24]; + let mut rgba_be = std::vec![1u8; 24]; + copy_alpha_plane_u16_to_u8::<10, false>(&alpha_le, &mut rgba_le, 6); + copy_alpha_plane_u16_to_u8::<10, true>(&alpha_be, &mut rgba_be, 6); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for the u16-output variant. + #[test] + fn copy_alpha_plane_u16_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = std::vec![0xFFFF, 0x0500, 0x07FF, 0x0123, 0x3FF, 0x000]; + let alpha_be: std::vec::Vec = alpha_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 24]; + let mut rgba_be = std::vec![7u16; 24]; + copy_alpha_plane_u16::<10, false>(&alpha_le, &mut rgba_le, 6); + copy_alpha_plane_u16::<10, true>(&alpha_be, &mut rgba_be, 6); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + #[test] fn copy_alpha_ya_u8_extracts_alpha_from_odd_byte_slots() { // Ya8 packed layout: [Y0, A0, Y1, A1, Y2, A2] diff --git a/src/sinker/mixed/planar_gbr_high_bit.rs b/src/sinker/mixed/planar_gbr_high_bit.rs index 69ba1982..f1a6479c 100644 --- a/src/sinker/mixed/planar_gbr_high_bit.rs +++ b/src/sinker/mixed/planar_gbr_high_bit.rs @@ -555,7 +555,11 @@ macro_rules! impl_gbrap_high_bit { rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); // Overwrite α slot from source plane (native depth, no shift). - alpha_extract::copy_alpha_plane_u16::(a_in, rgba_u16_row, w, use_simd); + // BE flag hard-wired to `false`: this sinker only handles LE-encoded + // GBR/GBRA inputs today (Tier 10b). Phase 4 will wire the kernel's + // `` through here (matches the LE-only `false` in + // the sibling `gbr_to_rgb_u16_high_bit_row::` call). + alpha_extract::copy_alpha_plane_u16::(a_in, rgba_u16_row, w, use_simd); } } @@ -633,7 +637,8 @@ macro_rules! impl_gbrap_high_bit { // overwrite α bytes from the source A plane. let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - alpha_extract::copy_alpha_plane_u16_to_u8::(a_in, rgba_row, w, use_simd); + // BE flag hard-wired to `false`: see the rgba_u16 branch above. + alpha_extract::copy_alpha_plane_u16_to_u8::(a_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/yuva_4_2_0.rs b/src/sinker/mixed/yuva_4_2_0.rs index e32af5ba..d543f0a6 100644 --- a/src/sinker/mixed/yuva_4_2_0.rs +++ b/src/sinker/mixed/yuva_4_2_0.rs @@ -657,7 +657,14 @@ fn yuva420p_high_bit_process< let rgba_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16::(a_row, rgba_u16_row, w, use_simd); + // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs + // today. Phase 4 will plumb a `` from the row type here. + crate::row::alpha_extract::copy_alpha_plane_u16::( + a_row, + rgba_u16_row, + w, + use_simd, + ); } } else if want_rgba_u16 { // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher. 
@@ -727,7 +734,10 @@ fn yuva420p_high_bit_process< let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::(a_row, rgba_row, w, use_simd); + // BE = false: see the rgba_u16 branch above for rationale. + crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::( + a_row, rgba_row, w, use_simd, + ); } Ok(()) diff --git a/src/sinker/mixed/yuva_4_2_2.rs b/src/sinker/mixed/yuva_4_2_2.rs index 6174c7d7..c7e861a6 100644 --- a/src/sinker/mixed/yuva_4_2_2.rs +++ b/src/sinker/mixed/yuva_4_2_2.rs @@ -757,7 +757,14 @@ fn yuva422p_high_bit_process< let rgba_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16::(a_row, rgba_u16_row, w, use_simd); + // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs + // today. Phase 4 will plumb a `` from the row type here. + crate::row::alpha_extract::copy_alpha_plane_u16::( + a_row, + rgba_u16_row, + w, + use_simd, + ); } } else if want_rgba_u16 { // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher. @@ -826,7 +833,10 @@ fn yuva422p_high_bit_process< let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::(a_row, rgba_row, w, use_simd); + // BE = false: see the rgba_u16 branch above for rationale. + crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::( + a_row, rgba_row, w, use_simd, + ); } Ok(()) diff --git a/src/sinker/mixed/yuva_4_4_4.rs b/src/sinker/mixed/yuva_4_4_4.rs index be76e51a..c9d46e9d 100644 --- a/src/sinker/mixed/yuva_4_4_4.rs +++ b/src/sinker/mixed/yuva_4_4_4.rs @@ -868,7 +868,14 @@ fn yuva444p_high_bit_process< let rgba_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16::(a_row, rgba_u16_row, w, use_simd); + // BE = false: this sinker handles only LE-encoded high-bit Yuva*p inputs + // today. Phase 4 will plumb a `` from the row type here. + crate::row::alpha_extract::copy_alpha_plane_u16::( + a_row, + rgba_u16_row, + w, + use_simd, + ); } } else if want_rgba_u16 { // Standalone rgba_u16: delegate to the alpha-source-aware dispatcher. @@ -938,7 +945,10 @@ fn yuva444p_high_bit_process< let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::(a_row, rgba_row, w, use_simd); + // BE = false: see the rgba_u16 branch above for rationale. 
+        crate::row::alpha_extract::copy_alpha_plane_u16_to_u8::<BITS, false>(
+            a_row, rgba_row, w, use_simd,
+        );
     }
 
     Ok(())

From 56867721473c08a1c0279890bd6641ab6179f57d Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Fri, 8 May 2026 01:32:22 +1200
Subject: [PATCH 4/7] fix(be-tier10b): force alpha SIMD to scalar when host endian differs from data endian
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex 3rd-pass review of PR #82 caught two issues that survived the prior two rounds. This commit fixes both.

Finding 1 [high]: LE-Strategy-A+ alpha SIMD path corrupts on BE hosts.

The previous routing in `row::dispatch::alpha_extract::{copy_alpha_plane_u16_to_u8, copy_alpha_plane_u16}` used `if !use_simd || BE { scalar } else { SIMD }`. That correctly routed BE-encoded input to scalar but silently broke the mirror case: an LE-encoded Gbrap source on a BE host (`BE = false`, `target_endian = "big"`) would still take the SIMD path, which uses raw host-native u16 loads (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`). Those reads byte-swap the LE bytes on a BE host — silently corrupting the α plane.

The fix replaces the narrow `BE` check with a real "do data and host disagree?" check:

    let need_swap = BE != cfg!(target_endian = "big");
    if need_swap || !use_simd {
        // scalar — `u16::from_le` / `u16::from_be` handles the swap.
    } else {
        // SIMD — host-native loads are correct because data byte order
        // already matches the host CPU.
    }

Truth table:

- LE data, LE host: need_swap = false != false = false → SIMD ok (host-native LE u16 reads match LE encoding).
- LE data, BE host: need_swap = false != true = true → scalar (scalar uses `u16::from_le`, swaps on BE host as needed).
- BE data, LE host: need_swap = true != false = true → scalar (scalar uses `u16::from_be`, swaps on LE host as needed).
- BE data, BE host: need_swap = true != true = false → SIMD ok (host-native BE u16 reads match BE encoding).

Both u16 alpha-plane dispatchers (`copy_alpha_plane_u16_to_u8` and `copy_alpha_plane_u16`) get the same fix. Doc comments updated with the truth table for future readers.

SIMD α-extract internals are untouched — per the spec, they remain native-host-only by design; Phase 4 will plumb `BE` through SIMD if a real BE-input sinker hot-path lands. Sinker call sites are unchanged (they continue to pass `BE = false`).

Finding 2 [medium]: Native-depth Strategy A+ alpha scatter had no test coverage.

The existing Strategy A+ integration tests for Gbrap10/12/14/16 in `src/sinker/mixed/tests/planar_gbr_high_bit.rs` only covered the u8 alpha-scatter path (`with_rgb` + `with_rgba`, which routes through `copy_alpha_plane_u16_to_u8`). The native-depth combo path `with_rgb_u16` + `with_rgba_u16` calls `copy_alpha_plane_u16` and was unexercised — a regression there would not have been caught.

Fix: added a `test_gbrap_strategy_a_plus_u16!` macro mirroring the existing u8 macro, with one instance per bit depth (10, 12, 14, 16):

- Build a Gbrap source with full-range u16 G/B/R/α plane values (using `pseudo_random_u16_low_n_bits` with `bits=16` so the upper bits beyond BITS are dirty — exercises the `(1 << BITS) - 1` mask in both the direct kernel and α-extract paths).
- Run standalone: attach only `with_rgba_u16`; drives the direct 4-channel `gbra_to_rgba_u16_high_bit_row` kernel.
- Run combo: attach both `with_rgb_u16` AND `with_rgba_u16`; drives the Strategy A+ path (`gbr_to_rgb_u16_high_bit_row` → `expand_rgb_u16_to_rgba_u16_row` → `copy_alpha_plane_u16`).
- Assert byte-exact equality between the two RGBA u16 buffers. This mirrors the existing `test_gbrap_strategy_a_plus!` macro pattern exactly. Verification: - cargo test --target aarch64-apple-darwin --lib → 2183 passed (+4) - cargo test --target x86_64-apple-darwin --lib → 2877 passed (+4) - cargo build --target x86_64-apple-darwin --tests → no new warnings - RUSTFLAGS=+simd128 cargo build --target wasm32-unknown-unknown --tests → only pre-existing unused-import warnings - cargo build --no-default-features → ok - cargo fmt --check → clean - cargo clippy --all-targets --all-features -- -D warnings → clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/alpha_extract.rs | 43 +++++++--- src/sinker/mixed/tests/planar_gbr_high_bit.rs | 85 +++++++++++++++++++ 2 files changed, 115 insertions(+), 13 deletions(-) diff --git a/src/row/dispatch/alpha_extract.rs b/src/row/dispatch/alpha_extract.rs index 0364dc21..8b7dabb8 100644 --- a/src/row/dispatch/alpha_extract.rs +++ b/src/row/dispatch/alpha_extract.rs @@ -261,16 +261,26 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz /// depth-conv `>> (BITS - 8)`. /// /// `BE` selects the source α plane byte order (`false` = LE on disk/wire, -/// `true` = BE on disk/wire). When `BE = true` the dispatcher routes to -/// scalar directly: the SIMD α-extract backends use raw native-u16 loads -/// (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`) and have no -/// byte-swap path. Per the codex review of #82 the scalar helper is now +/// `true` = BE on disk/wire). The SIMD α-extract backends use raw +/// native-u16 loads (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`) +/// and have no byte-swap path, so SIMD is only correct when the source +/// byte order matches the host CPU's native byte order. The dispatcher +/// computes `need_swap = BE != cfg!(target_endian = "big")` and routes +/// to scalar whenever a swap would be required (LE-on-BE-host or +/// BE-on-LE-host). Per the codex review of #82 the scalar helper is /// target-endian-aware via `u16::from_be` / `u16::from_le`, so this /// scalar fallback emits the correct α plane on every host. Phase 4 will /// plumb BE through SIMD if a real BE-input sinker hot-path lands. /// -/// Selects the highest available SIMD backend (`BE = false`); falls back -/// to scalar. When `use_simd` is `false`, calls scalar directly. +/// Truth table (`need_swap = BE != target_endian == "big"`): +/// - LE data, LE host: `false != false = false` → SIMD (host-native LE u16 loads correct) +/// - LE data, BE host: `false != true = true` → scalar (uses `u16::from_le`) +/// - BE data, LE host: `true != false = true` → scalar (uses `u16::from_be`) +/// - BE data, BE host: `true != true = false` → SIMD (host-native BE u16 loads correct) +/// +/// Selects the highest available SIMD backend when host endian == data +/// endian; falls back to scalar otherwise. When `use_simd` is `false`, +/// calls scalar directly. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn copy_alpha_plane_u16_to_u8( alpha: &[u16], @@ -278,7 +288,9 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( width: usize, use_simd: bool, ) { - if !use_simd || BE { + // Need a byte-swap if data byte order differs from host CPU's native order. + let need_swap = BE != cfg!(target_endian = "big"); + if need_swap || !use_simd { return scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); } cfg_select! { @@ -327,12 +339,15 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( /// conversion. 
/// /// `BE` selects the source α plane byte order (`false` = LE on disk/wire, -/// `true` = BE on disk/wire). When `BE = true` the dispatcher routes to -/// scalar directly: see `copy_alpha_plane_u16_to_u8` above for the -/// rationale (SIMD α-extract is BE-naïve; scalar is target-endian-aware). +/// `true` = BE on disk/wire). The dispatcher computes +/// `need_swap = BE != cfg!(target_endian = "big")` and routes to scalar +/// whenever a swap would be required: see `copy_alpha_plane_u16_to_u8` +/// above for the truth table and rationale (SIMD α-extract uses +/// host-native u16 loads; scalar is target-endian-aware). /// -/// Selects the highest available SIMD backend (`BE = false`); falls back -/// to scalar. When `use_simd` is `false`, calls scalar directly. +/// Selects the highest available SIMD backend when host endian == data +/// endian; falls back to scalar otherwise. When `use_simd` is `false`, +/// calls scalar directly. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn copy_alpha_plane_u16( alpha: &[u16], @@ -340,7 +355,9 @@ pub(crate) fn copy_alpha_plane_u16( width: usize, use_simd: bool, ) { - if !use_simd || BE { + // Need a byte-swap if data byte order differs from host CPU's native order. + let need_swap = BE != cfg!(target_endian = "big"); + if need_swap || !use_simd { return scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); } cfg_select! { diff --git a/src/sinker/mixed/tests/planar_gbr_high_bit.rs b/src/sinker/mixed/tests/planar_gbr_high_bit.rs index 83cf13c4..ff006040 100644 --- a/src/sinker/mixed/tests/planar_gbr_high_bit.rs +++ b/src/sinker/mixed/tests/planar_gbr_high_bit.rs @@ -208,6 +208,91 @@ test_gbrap_strategy_a_plus!( 16 ); +// ---- Strategy A+: Gbrap combo RGB_u16+RGBA_u16 matches standalone RGBA_u16 - +// +// Mirrors the u8 Strategy A+ test above, but covers the native-depth combo +// path (`with_rgb_u16` + `with_rgba_u16`) that routes through +// `copy_alpha_plane_u16` rather than `copy_alpha_plane_u16_to_u8`. Without +// this, a regression in the `BE != cfg!(target_endian)` dispatcher routing +// or in the scalar α-extract helper would not be caught for the native-depth +// path. +// +// Source planes are filled with full-range u16 values (`bits=16` argument +// to `pseudo_random_u16_low_n_bits`) so the upper bits beyond BITS are +// "dirty" — both paths must mask via `(1 << BITS) - 1`, so any drift between +// them surfaces here. +macro_rules! test_gbrap_strategy_a_plus_u16 { + ($name:ident, $marker:ident, $walker:ident, $bits:literal) => { + #[test] + #[cfg_attr(miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri")] + fn $name() { + let w = 32usize; + let h = 8usize; + let n = w * h; + let mut g = std::vec![0u16; n]; + let mut b = std::vec![0u16; n]; + let mut r = std::vec![0u16; n]; + let mut a = std::vec![0u16; n]; + // Use full-range u16 (bits=16) so upper bits beyond BITS are dirty, + // exercising the mask in both the direct kernel and α-extract paths. + pseudo_random_u16_low_n_bits(&mut g, 0x55_u32.wrapping_add($bits), 16); + pseudo_random_u16_low_n_bits(&mut b, 0x66_u32.wrapping_add($bits), 16); + pseudo_random_u16_low_n_bits(&mut r, 0x77_u32.wrapping_add($bits), 16); + pseudo_random_u16_low_n_bits(&mut a, 0x88_u32.wrapping_add($bits), 16); + + // Reference: standalone with_rgba_u16 (direct 4-channel kernel). 
+            let src_ref = solid_gbrap_frame::<$bits>(&g, &b, &r, &a, w as u32, h as u32);
+            let mut rgba_u16_ref = std::vec![0u16; n * 4];
+            let mut sink_ref = MixedSinker::<$marker>::new(w, h)
+                .with_rgba_u16(&mut rgba_u16_ref)
+                .unwrap();
+            crate::yuv::$walker(&src_ref, false, ColorMatrix::Bt709, &mut sink_ref).unwrap();
+
+            // Combo: with_rgb_u16 + with_rgba_u16 (Strategy A+ native-depth).
+            let src_combo = solid_gbrap_frame::<$bits>(&g, &b, &r, &a, w as u32, h as u32);
+            let mut rgb_u16_combo = std::vec![0u16; n * 3];
+            let mut rgba_u16_combo = std::vec![0u16; n * 4];
+            let mut sink_combo = MixedSinker::<$marker>::new(w, h)
+                .with_rgb_u16(&mut rgb_u16_combo)
+                .unwrap()
+                .with_rgba_u16(&mut rgba_u16_combo)
+                .unwrap();
+            crate::yuv::$walker(&src_combo, false, ColorMatrix::Bt709, &mut sink_combo).unwrap();
+
+            // RGBA u16 elements must be byte-exact between standalone and combo paths.
+            assert_eq!(
+                rgba_u16_ref, rgba_u16_combo,
+                "Strategy A+ native-depth RGBA u16 mismatch for BITS={}", $bits,
+            );
+        }
+    };
+}
+
+test_gbrap_strategy_a_plus_u16!(
+    gbrap10_strategy_a_plus_u16_matches_standalone,
+    Gbrap10,
+    gbrap10_to,
+    10
+);
+test_gbrap_strategy_a_plus_u16!(
+    gbrap12_strategy_a_plus_u16_matches_standalone,
+    Gbrap12,
+    gbrap12_to,
+    12
+);
+test_gbrap_strategy_a_plus_u16!(
+    gbrap14_strategy_a_plus_u16_matches_standalone,
+    Gbrap14,
+    gbrap14_to,
+    14
+);
+test_gbrap_strategy_a_plus_u16!(
+    gbrap16_strategy_a_plus_u16_matches_standalone,
+    Gbrap16,
+    gbrap16_to,
+    16
+);
+
 // ---- Gbrap alpha downshift correctness -------------------------------------
 
 macro_rules! test_gbrap_alpha_downshift {

From 9c7d533770eee89c465f9aaa95474eb8941ec07c Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Fri, 8 May 2026 10:55:46 +1200
Subject: [PATCH 5/7] fix(be-tier10b): restrict alpha SIMD to LE-host/LE-data; add non-multiple-width tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex 4th-pass review of PR #82 found a remaining ship-blocker: the prior dispatcher routing in `src/row/dispatch/alpha_extract.rs` (`need_swap = BE != cfg!(target_endian = "big")`) admitted SIMD on BE-host/BE-data. The vector body's host-native u16 loads are correct in that quadrant, but every alpha SIMD backend hardcodes its scalar tail to `scalar::<BITS, false>` (NEON `src/row/arch/neon/alpha_extract.rs:249,295`; SSE/AVX2/AVX-512/wasm mirror this). On a BE host with BE data and a non-multiple width, the LE-only scalar tail then runs `u16::from_le` over already-native samples, byte-swapping them before mask/shift — at BITS=10, sample `0x0123` becomes `0x2301 & 0x03ff = 0x0301`. Silent α corruption.

The existing Strategy A+ tests in `src/sinker/mixed/tests/planar_gbr_high_bit.rs` use width 32, which is a multiple of every backend's SIMD block, so the tail path was never exercised.

Fix (option B from codex's recommendation — simpler than threading BE through the SIMD helpers):

    let safe_for_simd = !BE && cfg!(target_endian = "little");
    if !safe_for_simd || !use_simd {
        // scalar — handles all (host_endian, BE) combinations correctly
    } else {
        // SIMD — only LE host + LE data
    }

Applied to both `copy_alpha_plane_u16_to_u8` and `copy_alpha_plane_u16`.
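For illustration, the same routing as a standalone predicate with all four quadrants pinned by a test; a sketch only, with the host side passed as an explicit parameter so the table can be checked on any machine (the real dispatcher reads it from `cfg!(target_endian = "little")`, and `alpha_simd_ok` is a hypothetical name):

    // Hypothetical stand-in for the dispatcher's routing decision.
    fn alpha_simd_ok(be_data: bool, host_is_le: bool) -> bool {
        // SIMD is safe only when the data is LE and the host is LE;
        // every other quadrant must take the endian-aware scalar path.
        !be_data && host_is_le
    }

    #[test]
    fn alpha_simd_only_in_le_host_le_data_quadrant() {
        assert!(alpha_simd_ok(false, true));   // LE data, LE host -> SIMD
        assert!(!alpha_simd_ok(false, false)); // LE data, BE host -> scalar
        assert!(!alpha_simd_ok(true, true));   // BE data, LE host -> scalar
        assert!(!alpha_simd_ok(true, false));  // BE data, BE host -> scalar (tail hazard)
    }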
Truth table (`safe_for_simd = !BE && target_endian == "little"`): - LE data, LE host: `!false && true = true` → SIMD (correct; tail `from_le` is a no-op) - LE data, BE host: `!false && false = false` → scalar (correct; uses `from_le`) - BE data, LE host: `!true && true = false` → scalar (correct; uses `from_be`) - BE data, BE host: `!true && false = false` → scalar (correct; uses `from_be`. SIMD vector body would be correct but the tail hardcodes BE=false and would corrupt non-multiple widths via `from_le` on already-native samples — until SIMD helpers are made const-generic over BE in a future Phase, scalar covers this rare quadrant correctly.) Trade-off: BE-host/BE-data callers pay the scalar cost. Acceptable — this is a rare quadrant; eventual Phase 4 work can thread BE into the SIMD helpers if a real BE-input hot path lands. Doc comments in both dispatchers updated to reflect the new contract. New tests at non-multiple width 31 in `src/sinker/mixed/tests/planar_gbr_high_bit.rs` exercise the SIMD tail path on supported (LE) hosts: - `gbrap10/12/14/16_strategy_a_plus_u16_matches_standalone_w31` — covers `copy_alpha_plane_u16` (no depth conv, u16→u16 RGBA path). - `gbrap10_strategy_a_plus_matches_standalone_w31` — covers `copy_alpha_plane_u16_to_u8` (depth-conv `>> (BITS - 8)` path). The two existing Strategy A+ macros (`test_gbrap_strategy_a_plus`, `test_gbrap_strategy_a_plus_u16`) gained a 5-arg form taking width; existing 4-arg callers default to width 32 unchanged. Test count moves from 2183 to 2188 (+5). Out of scope for this commit: touching the SIMD α-extract helpers themselves (option A would require threading BE through 5 backends — deferred until a real BE-input hot path needs it). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/alpha_extract.rs | 72 +++++++++++-------- src/sinker/mixed/tests/planar_gbr_high_bit.rs | 71 ++++++++++++++++-- 2 files changed, 108 insertions(+), 35 deletions(-) diff --git a/src/row/dispatch/alpha_extract.rs b/src/row/dispatch/alpha_extract.rs index 8b7dabb8..00ecb61e 100644 --- a/src/row/dispatch/alpha_extract.rs +++ b/src/row/dispatch/alpha_extract.rs @@ -261,26 +261,31 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz /// depth-conv `>> (BITS - 8)`. /// /// `BE` selects the source α plane byte order (`false` = LE on disk/wire, -/// `true` = BE on disk/wire). The SIMD α-extract backends use raw -/// native-u16 loads (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`) -/// and have no byte-swap path, so SIMD is only correct when the source -/// byte order matches the host CPU's native byte order. The dispatcher -/// computes `need_swap = BE != cfg!(target_endian = "big")` and routes -/// to scalar whenever a swap would be required (LE-on-BE-host or -/// BE-on-LE-host). Per the codex review of #82 the scalar helper is -/// target-endian-aware via `u16::from_be` / `u16::from_le`, so this -/// scalar fallback emits the correct α plane on every host. Phase 4 will -/// plumb BE through SIMD if a real BE-input sinker hot-path lands. +/// `true` = BE on disk/wire). The SIMD α-extract helpers use host-native +/// `u16` loads (`vld1q_u16` / `_mm_loadu_si128` / `v128_load64_zero`) AND +/// hardcode their scalar tail to `scalar::`. 
So SIMD is only +/// correct when BOTH the host CPU is little-endian AND the source data is +/// little-endian — any other quadrant either loads the wrong byte order in +/// the vector body (LE-data on BE-host / BE-data on LE-host) or feeds +/// already-native u16 samples through `u16::from_le` in the scalar tail +/// (BE-data on BE-host), corrupting the tail at non-multiple widths. /// -/// Truth table (`need_swap = BE != target_endian == "big"`): -/// - LE data, LE host: `false != false = false` → SIMD (host-native LE u16 loads correct) -/// - LE data, BE host: `false != true = true` → scalar (uses `u16::from_le`) -/// - BE data, LE host: `true != false = true` → scalar (uses `u16::from_be`) -/// - BE data, BE host: `true != true = false` → SIMD (host-native BE u16 loads correct) +/// The dispatcher computes +/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and routes to +/// scalar in every other quadrant. The scalar helper is target-endian-aware +/// via `u16::from_be` / `u16::from_le`, so this scalar fallback emits the +/// correct α plane on every host. Phase 4 will plumb BE through the SIMD +/// helpers if a BE-input sinker hot-path lands. /// -/// Selects the highest available SIMD backend when host endian == data -/// endian; falls back to scalar otherwise. When `use_simd` is `false`, -/// calls scalar directly. +/// Truth table (`safe_for_simd = !BE && target_endian == "little"`): +/// - LE data, LE host: `!false && true = true` → SIMD (host-native LE u16 loads correct, tail `from_le` is no-op) +/// - LE data, BE host: `!false && false = false` → scalar (handles via `from_le`) +/// - BE data, LE host: `!true && true = false` → scalar (handles via `from_be`) +/// - BE data, BE host: `!true && false = false` → scalar (handles via `from_be`; SIMD vector body would be correct but tail `from_le` would corrupt non-multiple widths — see codex 4th-pass review of PR #82) +/// +/// Selects the highest available SIMD backend on LE-host with LE-data; +/// falls back to scalar otherwise. When `use_simd` is `false`, calls +/// scalar directly. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn copy_alpha_plane_u16_to_u8( alpha: &[u16], @@ -288,9 +293,11 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( width: usize, use_simd: bool, ) { - // Need a byte-swap if data byte order differs from host CPU's native order. - let need_swap = BE != cfg!(target_endian = "big"); - if need_swap || !use_simd { + // SIMD α-extract helpers use host-native u16 loads + a scalar tail + // hardcoded to BE=false. They are only correct on LE host with LE + // source data. Force scalar in every other quadrant. + let safe_for_simd = !BE && cfg!(target_endian = "little"); + if !safe_for_simd || !use_simd { return scalar::copy_alpha_plane_u16_to_u8::(alpha, rgba_out, width); } cfg_select! { @@ -340,14 +347,15 @@ pub(crate) fn copy_alpha_plane_u16_to_u8( /// /// `BE` selects the source α plane byte order (`false` = LE on disk/wire, /// `true` = BE on disk/wire). The dispatcher computes -/// `need_swap = BE != cfg!(target_endian = "big")` and routes to scalar -/// whenever a swap would be required: see `copy_alpha_plane_u16_to_u8` -/// above for the truth table and rationale (SIMD α-extract uses -/// host-native u16 loads; scalar is target-endian-aware). 
+/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and routes to +/// scalar in every other quadrant: see `copy_alpha_plane_u16_to_u8` above +/// for the truth table and rationale (SIMD α-extract uses host-native u16 +/// loads AND hardcodes its scalar tail to `BE=false`, so it only handles +/// the LE-host/LE-data quadrant correctly; scalar is target-endian-aware). /// -/// Selects the highest available SIMD backend when host endian == data -/// endian; falls back to scalar otherwise. When `use_simd` is `false`, -/// calls scalar directly. +/// Selects the highest available SIMD backend on LE-host with LE-data; +/// falls back to scalar otherwise. When `use_simd` is `false`, calls +/// scalar directly. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn copy_alpha_plane_u16( alpha: &[u16], @@ -355,9 +363,11 @@ pub(crate) fn copy_alpha_plane_u16( width: usize, use_simd: bool, ) { - // Need a byte-swap if data byte order differs from host CPU's native order. - let need_swap = BE != cfg!(target_endian = "big"); - if need_swap || !use_simd { + // SIMD α-extract helpers use host-native u16 loads + a scalar tail + // hardcoded to BE=false. They are only correct on LE host with LE + // source data. Force scalar in every other quadrant. + let safe_for_simd = !BE && cfg!(target_endian = "little"); + if !safe_for_simd || !use_simd { return scalar::copy_alpha_plane_u16::(alpha, rgba_out, width); } cfg_select! { diff --git a/src/sinker/mixed/tests/planar_gbr_high_bit.rs b/src/sinker/mixed/tests/planar_gbr_high_bit.rs index ff006040..d49e573e 100644 --- a/src/sinker/mixed/tests/planar_gbr_high_bit.rs +++ b/src/sinker/mixed/tests/planar_gbr_high_bit.rs @@ -140,10 +140,13 @@ test_gbrp_channel_reorder!(gbrp16_channel_reorder, Gbrp16, gbrp16_to, 16); macro_rules! test_gbrap_strategy_a_plus { ($name:ident, $marker:ident, $walker:ident, $bits:literal) => { + test_gbrap_strategy_a_plus!($name, $marker, $walker, $bits, 32); + }; + ($name:ident, $marker:ident, $walker:ident, $bits:literal, $w:literal) => { #[test] #[cfg_attr(miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri")] fn $name() { - let w = 32usize; + let w = $w as usize; let h = 8usize; let n = w * h; let mut g = std::vec![0u16; n]; @@ -177,7 +180,7 @@ macro_rules! test_gbrap_strategy_a_plus { // RGBA bytes must be identical between standalone and combo paths. assert_eq!( rgba_ref, rgba_combo, - "Strategy A+ RGBA mismatch for BITS={}", $bits, + "Strategy A+ RGBA mismatch for BITS={} w={}", $bits, $w, ); } }; @@ -223,10 +226,13 @@ test_gbrap_strategy_a_plus!( // them surfaces here. macro_rules! test_gbrap_strategy_a_plus_u16 { ($name:ident, $marker:ident, $walker:ident, $bits:literal) => { + test_gbrap_strategy_a_plus_u16!($name, $marker, $walker, $bits, 32); + }; + ($name:ident, $marker:ident, $walker:ident, $bits:literal, $w:literal) => { #[test] #[cfg_attr(miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri")] fn $name() { - let w = 32usize; + let w = $w as usize; let h = 8usize; let n = w * h; let mut g = std::vec![0u16; n]; @@ -262,7 +268,7 @@ macro_rules! test_gbrap_strategy_a_plus_u16 { // RGBA u16 elements must be byte-exact between standalone and combo paths. 
assert_eq!( rgba_u16_ref, rgba_u16_combo, - "Strategy A+ native-depth RGBA u16 mismatch for BITS={}", $bits, + "Strategy A+ native-depth RGBA u16 mismatch for BITS={} w={}", $bits, $w, ); } }; @@ -293,6 +299,63 @@ test_gbrap_strategy_a_plus_u16!( 16 ); +// ---- Strategy A+ at non-multiple width (31) — exercises SIMD scalar tail --- +// +// The SIMD α-extract backends (`copy_alpha_plane_u16{_to_u8}`) hardcode +// `scalar::` for the tail (e.g. NEON block size 8 + width 31 +// leaves 7 px in the tail; AVX2/AVX-512 likewise). Codex's 4th-pass review +// of PR #82 found that the prior dispatcher routing +// (`need_swap = BE != cfg!(target_endian = "big")`) admitted SIMD on +// BE-host/BE-data: the vector body's host-native loads are correct there, +// but the LE-only scalar tail then byte-swaps already-native u16 samples, +// silently corrupting α at non-multiple widths. The fix is to route SIMD +// only for the LE-host/LE-data quadrant; these tests at width 31 exercise +// the SIMD tail path on supported (LE) hosts, locking in the parity +// guarantee for the LE/LE quadrant. (The LE/BE, BE/LE, BE/BE quadrants +// are exercised at the scalar level by the `target_endian`-aware scalar +// helper itself; the new dispatcher routes them to scalar always.) + +test_gbrap_strategy_a_plus_u16!( + gbrap10_strategy_a_plus_u16_matches_standalone_w31, + Gbrap10, + gbrap10_to, + 10, + 31 +); +test_gbrap_strategy_a_plus_u16!( + gbrap12_strategy_a_plus_u16_matches_standalone_w31, + Gbrap12, + gbrap12_to, + 12, + 31 +); +test_gbrap_strategy_a_plus_u16!( + gbrap14_strategy_a_plus_u16_matches_standalone_w31, + Gbrap14, + gbrap14_to, + 14, + 31 +); +test_gbrap_strategy_a_plus_u16!( + gbrap16_strategy_a_plus_u16_matches_standalone_w31, + Gbrap16, + gbrap16_to, + 16, + 31 +); + +// u8-path Strategy A+ at width 31 — exercises the SIMD tail of +// `copy_alpha_plane_u16_to_u8` (depth-conv `>> (BITS - 8)`). One BITS value +// is sufficient to cover the same dispatcher path as the u16 set above; +// Gbrap10 chosen for parity with the existing u8 Strategy A+ coverage. +test_gbrap_strategy_a_plus!( + gbrap10_strategy_a_plus_matches_standalone_w31, + Gbrap10, + gbrap10_to, + 10, + 31 +); + // ---- Gbrap alpha downshift correctness ------------------------------------- macro_rules! test_gbrap_alpha_downshift { From 8f2e329bdad7d002731cc41b84a7f15fcd07a25c Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 11:29:37 +1200 Subject: [PATCH 6/7] test(be-tier10b): gate LE-fixture scalar tests on cfg(target_endian = "little") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The miri-sb-s390x CI job (BE host) on PR #82 was failing 36 scalar tests in src/row/scalar/{alpha_extract,planar_gbr_high_bit}.rs. The tests use host-native u16 literals (e.g. `vec![0x3FFu16, 0x1FF]`, `[100u16; 1]`) as if they were on-disk LE encodings, then call kernels with `` (LE path). On a BE host (s390x), host-native u16 storage does NOT lay bytes out little-endian, so the kernel's `u16::from_le` byte-swap correctly reinterprets the host-native value and produces a different logical value than the literal — making the assertion fail. The kernel itself is correct; this is purely a test fixture-vs-kernel byte-order mismatch on BE hosts. The kernel's BE-host scalar correctness is locked down by the dedicated `*_be_parity_*` / `*_be_parity_with_swapped_buffer` tests in the same files. 
Those tests build BE-encoded fixtures via `byte_swap_vec` / `swap_bytes` from LE inputs and assert that `` on the swapped buffer matches `` on the original buffer — byte-for-byte identical output on every host. They are intentionally NOT gated. Tests with byte-symmetric literals only (`0u16`, `u16::MAX`) are also NOT gated — `from_le` is a no-op on those bit patterns regardless of host endianness, so the assertions pass on BE without modification. Gated tests: - alpha_extract.rs: 4 tests (copy_alpha_plane_u16{,_to_u8}_*) - planar_gbr_high_bit.rs: 32 tests (rgb_high_bit_*, rgb_u16_high_bit_*, rgba_opaque_*, gbra_rgba_*, gbr_to_rgb*_masks_*, gbra_to_rgba_*_masks_*, luma_u16_high_bit_*) Test counts (aarch64-apple-darwin lib): Before: 2188 passed After: 2188 passed (no change on LE host — gates are no-ops) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/alpha_extract.rs | 20 +++++++++++ src/row/scalar/planar_gbr_high_bit.rs | 52 +++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs index 23463a72..4d849485 100644 --- a/src/row/scalar/alpha_extract.rs +++ b/src/row/scalar/alpha_extract.rs @@ -294,7 +294,24 @@ mod tests { ); } + // ---- LE-host fixture tests ---- + // + // The tests below use host-native `u16` literals (e.g. + // `vec![0x3FFu16, 0x1FF]`) as if they were the on-disk LE encoding of + // those samples and then call the kernel with `` + // (LE path). On a BE host (e.g., s390x under miri-sb), host-native + // `u16` storage does NOT lay bytes out little-endian, so the kernel's + // `u16::from_le` byte-swap correctly reinterprets the host-native + // value and produces a different logical value than the literal — + // making the assertion fail. The kernel is correct: its BE-host + // scalar correctness is locked down by the dedicated + // `*_be_parity_with_swapped_buffer` tests below, which build + // BE-encoded fixtures via `swap_bytes` from LE inputs and assert + // byte-for-byte parity. Gating these LE-fixture tests on + // `target_endian = "little"` avoids fixture-vs-kernel byte-order + // confusion without weakening coverage. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_u16_to_u8_depth_converts_at_each_bits_value() { // BITS=10 let alpha: std::vec::Vec = std::vec![0x3FF, 0x1FF]; @@ -316,6 +333,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_u16_preserves_native_u16_within_bits_range() { // In-range values pass through unchanged. let alpha: std::vec::Vec = std::vec![0x3FF, 0x1FF, 0x000]; @@ -328,6 +346,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_u16_masks_overrange_to_bits_range() { // Over-range α (e.g., 0xFFFF at BITS=10) must be masked to low BITS. // Without the mask, raw u16 0xFFFF would leak straight to output and @@ -343,6 +362,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_u16_to_u8_masks_overrange_then_shifts() { // Without the BITS mask, 0x0500 at BITS=10 would shift `>> 2` to // 320 and either narrow as u8 to 64 (scalar `as u8`) or saturate to diff --git a/src/row/scalar/planar_gbr_high_bit.rs b/src/row/scalar/planar_gbr_high_bit.rs index 22a60771..9cdc8568 100644 --- a/src/row/scalar/planar_gbr_high_bit.rs +++ b/src/row/scalar/planar_gbr_high_bit.rs @@ -505,9 +505,30 @@ mod tests { use super::*; use crate::ColorMatrix; + // ---- LE-host fixture tests ---- + // + // The tests below use host-native `u16` literals (e.g. 
`[100u16; 1]`, + // `vec![400u16, 200u16, 0u16]`) as if they were the on-disk LE + // encoding of those samples and then call the kernel with + // `` (LE path). On a BE host (e.g., s390x under + // miri-sb), host-native `u16` storage does NOT lay bytes out + // little-endian, so the kernel's `u16::from_le` byte-swap correctly + // reinterprets the host-native value and produces a different + // logical value than the literal — making the assertion fail. The + // kernel is correct: its BE-host scalar correctness is locked down + // by the dedicated `scalar_*_be_parity_*` tests further below, which + // build BE-encoded fixtures via `byte_swap_vec` from LE inputs and + // assert byte-for-byte parity. Gating these LE-fixture tests on + // `target_endian = "little"` avoids fixture-vs-kernel byte-order + // confusion without weakening coverage. + // Tests with all-zero / all-`u16::MAX` (byte-symmetric) literals are + // intentionally NOT gated — `from_le` is a no-op on those bit + // patterns regardless of host endianness. + // ---- gbr_to_rgb_high_bit_row: u8 output, downshift ---------------------- #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits10_channel_reorder() { // G=0, B=100, R=1000 → packed R,G,B = 1000>>2, 0>>2, 100>>2 = 250, 0, 25 let g = [0u16; 1]; @@ -521,6 +542,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits10_max_value_becomes_0xff() { let max = (1u16 << 10) - 1; // 1023 let g = [max; 4]; @@ -553,6 +575,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits9_downshift_by_1() { // BITS=9: shift = 1. Value 510 >> 1 = 255. let g = [510u16; 1]; @@ -564,6 +587,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_bits12_downshift_by_4() { // BITS=12: shift = 4. Value 4080 >> 4 = 255. let r = [4080u16; 1]; @@ -575,6 +599,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn rgb_high_bit_multiple_pixels_correct_layout() { // 3 pixels: (R,G,B) = (100,200,300>>2=75), (200>>2=50,0,0), (0,150>>2=37,50>>2=12) // BITS=10, shift=2 @@ -600,6 +625,7 @@ mod tests { // ---- gbr_to_rgb_u16_high_bit_row: u16 output, no shift ------------------ #[test] + #[cfg(target_endian = "little")] fn rgb_u16_high_bit_channel_reorder() { let g = [111u16; 1]; let b = [222u16; 1]; @@ -612,6 +638,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn rgb_u16_high_bit_bits10_max_preserved() { let max = (1u16 << 10) - 1; // 1023 let g = [max; 4]; @@ -634,6 +661,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn rgb_u16_high_bit_values_not_shifted() { // Verify that u16 output does NOT shift values (unlike u8 output). let g = [1000u16; 1]; @@ -649,6 +677,7 @@ mod tests { // ---- gbr_to_rgba_opaque_high_bit_row: u8 RGBA with constant alpha -------- #[test] + #[cfg(target_endian = "little")] fn rgba_opaque_high_bit_bits10_alpha_is_0xff() { let max = (1u16 << 10) - 1; let g = [max; 4]; @@ -663,6 +692,7 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn rgba_opaque_high_bit_bits9_downshift_correct() { // BITS=9, shift=1. Value 510 >> 1 = 255. 
         let g = [510u16; 1];
@@ -677,6 +707,7 @@ mod tests {
     // ---- gbr_to_rgba_opaque_u16_high_bit_row: u16 RGBA with constant alpha ---
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn rgba_opaque_u16_high_bit_bits10_alpha_is_1023() {
         let g = [500u16; 2];
         let b = [200u16; 2];
@@ -714,6 +745,7 @@ mod tests {
     // ---- gbra_to_rgba_high_bit_row: u8 RGBA with source alpha ----------------
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbra_rgba_high_bit_bits10_source_alpha_downshifted() {
         // BITS=10, shift=2. Alpha value 512 >> 2 = 128.
         let g = [0u16; 1];
@@ -726,6 +758,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbra_rgba_high_bit_bits10_max_alpha_is_0xff() {
         let max = (1u16 << 10) - 1;
         let g = [max; 2];
@@ -740,6 +773,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbra_rgba_high_bit_bits14_channel_reorder_and_shift() {
         // BITS=14, shift=6. R=16320 >> 6 = 255, G=0, B=0, A=8192 >> 6 = 128.
         let g = [0u16; 1];
@@ -757,6 +791,7 @@ mod tests {
     // ---- gbra_to_rgba_u16_high_bit_row: u16 RGBA with source alpha -----------
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbra_rgba_u16_high_bit_source_alpha_preserved() {
         let g = [100u16; 1];
         let b = [200u16; 1];
@@ -771,6 +806,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbra_rgba_u16_high_bit_bits16_all_channels_preserved() {
         let g = [10000u16; 2];
         let b = [20000u16; 2];
@@ -789,6 +825,7 @@ mod tests {
     // ---- Round-trip parity: high-bit u8 output matches 8-bit source ----------
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn rgb_high_bit_bits10_parity_with_scaled_8bit() {
         // val=128 in 8-bit; in 10-bit: 128 << 2 = 512. 512 >> 2 = 128.
         let val: u16 = 128u16 << 2;
@@ -801,6 +838,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn rgb_high_bit_bits12_parity_with_scaled_8bit() {
         // val=200 in 8-bit; in 12-bit: 200 << 4 = 3200. 3200 >> 4 = 200.
         let val: u16 = 200u16 << 4;
@@ -817,6 +855,7 @@ mod tests {
     // correctly before processing, ensuring scalar/SIMD produce identical output.
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbr_to_rgb_high_bit_masks_upper_bits_bits10() {
         // BITS=10, mask=0x03FF. Input 0x0CFF has upper bits set.
         // masked = 0x0CFF & 0x03FF = 0x00FF = 255. 255 >> 2 = 63 as u8.
@@ -843,6 +882,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbr_to_rgb_high_bit_masks_upper_bits_multiple_widths_bits10() {
         // Width sweep: [1, 7, 8, 16, 17, 32, 33, 64, 128, 130].
         let dirty: u16 = 0x0500; // BITS=10: mask&0x0500 = 0x0100=256; 256>>2=64.
@@ -871,6 +911,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbra_to_rgba_high_bit_masks_upper_bits_alpha_bits10() {
         // Verify that the alpha channel is also masked before shifting.
         // BITS=10: dirty_alpha = 0x0800 | 512 = 0x0A00 = 2560.
@@ -890,6 +931,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbr_to_rgb_u16_high_bit_masks_upper_bits_bits10() {
         // u16-output: verify that masked sample is in the output (not raw dirty value).
         let dirty: u16 = 0x0CFF;
@@ -905,6 +947,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbra_to_rgba_u16_high_bit_masks_upper_bits_bits10() {
         // u16 RGBA output: all channels masked.
         let dirty: u16 = 0x0555; // BITS=10: masked = 0x0555 & 0x03FF = 0x0155 = 341.
@@ -922,6 +965,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbr_to_rgba_opaque_high_bit_masks_upper_bits_bits10() {
         // u8 RGBA opaque: RGB channels masked, alpha always 0xFF.
         let dirty: u16 = 0x0CFF; // masked & 0x03FF = 0x00FF = 255. 255>>2=63.
@@ -939,6 +983,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbr_to_rgba_opaque_u16_high_bit_masks_upper_bits_bits10() {
         // u16 RGBA opaque: RGB masked, alpha is opaque mask value.
         let dirty: u16 = 0x0CFF; // masked = 0x00FF = 255.
@@ -973,6 +1018,7 @@ mod tests {
     // ---- Cross-path consistency: direct GBRA vs masked RGB + separate alpha ---
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn gbra_to_rgba_high_bit_cross_path_consistency_bits10() {
         // With upper-bits-set alpha: direct gbra_to_rgba == manual masking.
         // BITS=10, dirty_alpha = 0x0800 | 0x0100 = 0x0900; masked=0x0100=256; 256>>2=64.
@@ -1004,6 +1050,7 @@ mod tests {
     // ---- gbr_to_luma_u16_high_bit_row: native-depth luma --------------------
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn luma_u16_high_bit_bits10_max_white_not_banded() {
         // BITS=10: max = 1023. Old path gave (255 as u16) << 2 = 1020, not 1023.
         // New kernel must produce a value near 1023 for all-white input.
@@ -1027,6 +1074,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn luma_u16_high_bit_bits12_max_white_not_banded() {
         // BITS=12: max = 4095. Old path: (255 as u16) << 4 = 4080.
         // New kernel should give a value in [4090, 4095].
@@ -1063,6 +1111,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn luma_u16_high_bit_bits10_neutral_gray_midrange() {
         // BITS=10: mid = 512. Luma of neutral gray ≈ 512.
         let mid = 512u16;
@@ -1089,6 +1138,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn luma_u16_high_bit_bits10_full_range_vs_limited_range() {
         // For mid-gray input, limited-range luma should be in [16<<2, 235<<2] = [64, 940].
         let mid = 512u16;
@@ -1174,6 +1224,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn luma_u16_high_bit_bits16_limited_range_near_white_keeps_gradation() {
         // BITS=16, BT.709 luma weights ≈ Kr=0.2126, Kg=0.7152, Kb=0.0722.
         // Setting all 3 channels equal makes the matrix multiply produce
@@ -1208,6 +1259,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn luma_u16_high_bit_bits10_limited_range_endpoints() {
         // BITS=10: y_off=64 (=16<<2), y_max=940 (=235<<2), native_max=1023.
         // BT.709 luma at all-equal channels passes y_full ≈ input through.

From 8af8d6e3bac0729b1074ad195fa75ebef237cbb3 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Fri, 8 May 2026 11:32:42 +1200
Subject: [PATCH 7/7] docs(be-tier10b): correct BITS range in copy_alpha_plane_u16_to_u8 doc

Copilot review on PR #82 caught: doc comment said `BITS` is
"9, 10, 12, or 14" but the runtime `assert!(BITS >= 8 && BITS <= 16)`
allows the full [8, 16] range, and real call sites pass `BITS = 16`
(Yuva420p16le, Gbrap16, etc.). Updated to reflect actual behavior +
enumerate the formats that consume this helper
(Yuva*p9/10/12/14/16 + Gbrap10/12/14/16).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/row/scalar/alpha_extract.rs | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs
index 4d849485..6c77346a 100644
--- a/src/row/scalar/alpha_extract.rs
+++ b/src/row/scalar/alpha_extract.rs
@@ -96,12 +96,14 @@ pub(crate) fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usiz
     }
 }
 
-/// Yuva*p9/10/12/14 → u8 RGBA: scatter α plane (u16) into
-/// `rgba_out[3 + 4*n]` (u8) with depth-conv `>> (BITS - 8)`.
+/// Yuva*p9/10/12/14/16 + Gbrap10/12/14/16 → u8 RGBA: scatter α plane
+/// (u16) into `rgba_out[3 + 4*n]` (u8) with depth-conv `>> (BITS - 8)`.
 ///
-/// `BITS` is the source α bit depth (9, 10, 12, or 14). `BE` selects the
-/// **byte order** of the encoded source α plane: `false` = LE on disk/wire
-/// (e.g., AV `Yuva420p10le`), `true` = BE on disk/wire (e.g., `Yuva420p10be`).
+/// `BITS` is the source α bit depth (any value in `[8, 16]`; the runtime
+/// `assert!` enforces the range). In practice callers pass 9, 10, 12, 14,
+/// or 16. `BE` selects the **byte order** of the encoded source α plane:
+/// `false` = LE on disk/wire (e.g., AV `Yuva420p10le`, `Gbrap10le`),
+/// `true` = BE on disk/wire (e.g., `Yuva420p10be`, `Gbrap10be`).
 ///
 /// Each raw u16 sample is converted from its disk byte order into host-native
 /// order via `u16::from_le` / `u16::from_be` BEFORE the BITS-mask + shift.
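
The scheme these patches rely on can be sketched in isolation. The model
below is illustrative only: `alpha_to_u8_model` and `swap_fixture` are
hypothetical stand-ins for the crate's kernel and for the tests'
`byte_swap_vec` / `swap_bytes` helpers. It follows the documented order
of operations (disk byte order to host-native via `u16::from_le` /
`u16::from_be`, then the BITS-mask, then `>> (BITS - 8)`) and shows why
the swapped-buffer parity assertion holds on any host while a
literal-vs-output assertion holds only on LE hosts.

    // Illustrative model only; not this crate's API.
    fn alpha_to_u8_model<const BITS: u32, const BE: bool>(
        alpha: &[u16],
        rgba_out: &mut [u8],
    ) {
        assert!(BITS >= 8 && BITS <= 16);
        let mask = ((1u32 << BITS) - 1) as u16;
        for (n, &raw) in alpha.iter().enumerate() {
            // Disk byte order -> host-native order BEFORE the BITS-mask + shift.
            let native = if BE { u16::from_be(raw) } else { u16::from_le(raw) };
            rgba_out[3 + 4 * n] = ((native & mask) >> (BITS - 8)) as u8;
        }
    }

    // Re-encode an LE-encoded fixture as BE by swapping each sample's bytes.
    fn swap_fixture(le: &[u16]) -> Vec<u16> {
        le.iter().map(|v| v.swap_bytes()).collect()
    }

    fn main() {
        let le: Vec<u16> = vec![0x03FF, 0x01FF, 0xFFFF]; // 0xFFFF exercises the mask
        let be = swap_fixture(&le);

        let (mut out_le, mut out_be) = ([0u8; 12], [0u8; 12]);
        alpha_to_u8_model::<10, false>(&le, &mut out_le);
        alpha_to_u8_model::<10, true>(&be, &mut out_be);

        // Parity holds on every host: from_be(swap_bytes(x)) == from_le(x)
        // whatever the target endianness.
        assert_eq!(out_le, out_be);

        // A literal-vs-output check is only meaningful on an LE host, the
        // same reason the LE-fixture tests above are gated.
        #[cfg(target_endian = "little")]
        assert_eq!(out_le[3], 0xFF); // 0x03FF & 0x03FF = 1023; 1023 >> 2 = 255
    }

Under this model the parity assertion reduces to the identity
`u16::from_be(x.swap_bytes()) == u16::from_le(x)`, which holds on both
LE and BE targets; that is the byte-for-byte claim the ungated parity
tests lock down, and the gated final assert mirrors why the LE-fixture
tests cannot run unmodified on a BE host.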