From 749971125e31369a67ea7d86e2c55ee3d30882c0 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Thu, 7 May 2026 23:36:21 +1200 Subject: [PATCH 01/10] feat(be-tier9): BE support for Rgbf32/Rgbf16 row kernels Add `<const BE: bool>` to all Rgbf32 and Rgbf16 row kernels across scalar, NEON, SSE4.1, AVX2, AVX-512, and wasm-simd128 backends, plus both dispatchers. BE parity tests added to every backend; existing callers (sinkers, scalar tests, arch tests) updated to `::<false>`. Co-Authored-By: Claude Sonnet 4.6 --- src/row/arch/neon/packed_rgb_float.rs | 306 ++++++++++++---- src/row/arch/neon/tests/packed_rgb_float.rs | 282 +++++++++++++-- src/row/arch/wasm_simd128/packed_rgb_float.rs | 220 +++++++++--- .../wasm_simd128/tests/packed_rgb_float.rs | 255 +++++++++++-- src/row/arch/x86_avx2/packed_rgb_float.rs | 225 ++++++++---- .../arch/x86_avx2/tests/packed_rgb_float.rs | 308 ++++++++++++++-- src/row/arch/x86_avx512/packed_rgb_float.rs | 231 ++++++++---- .../arch/x86_avx512/tests/packed_rgb_float.rs | 338 ++++++++++++++++-- src/row/arch/x86_sse41/packed_rgb_float.rs | 233 ++++++++---- .../arch/x86_sse41/tests/packed_rgb_float.rs | 328 +++++++++++++++-- src/row/dispatch/rgb_f16_ops.rs | 94 ++--- src/row/dispatch/rgb_float_ops.rs | 95 +++-- src/row/scalar/packed_rgb_float.rs | 187 ++++++++-- src/row/scalar/tests.rs | 20 +- src/sinker/mixed/packed_rgb_f16.rs | 14 +- src/sinker/mixed/packed_rgb_float.rs | 12 +- 16 files changed, 2566 insertions(+), 582 deletions(-) diff --git a/src/row/arch/neon/packed_rgb_float.rs b/src/row/arch/neon/packed_rgb_float.rs index 548a81f0..d775e016 100644 --- a/src/row/arch/neon/packed_rgb_float.rs +++ b/src/row/arch/neon/packed_rgb_float.rs @@ -15,14 +15,37 @@ //! vector to 4 bytes / 4 u16 elements with `vst*` straight into the //! `R, G, B, R, …` packed output) and trivially fine for the lossless //! `f32` pass-through (just `vst1q_f32`). +//! +//! For `<const BE: bool>` kernels, each 4-lane f32 load is replaced by +//! an endian-aware u32x4 load (via `load_endian_u32x4::<BE>`) followed +//! by a `vreinterpretq_f32_u32` cast. For LE (BE=false) this is a +//! pure load; for BE it adds a `vrev32q_u8` byte-swap. use core::arch::aarch64::*; -use super::scalar; +use super::{endian::load_endian_u32x4, scalar}; + +/// Load 4 `f32` lanes from `ptr` in endian-aware fashion. +/// `BE = false` → host-native load (identical to `vld1q_f32`). +/// `BE = true` → load as u32 with byte-swap, then reinterpret as f32. +/// +/// # Safety +/// +/// * NEON must be available. +/// * `ptr` must be valid for 16 bytes. +#[inline(always)] +unsafe fn load_f32x4<const BE: bool>(ptr: *const f32) -> float32x4_t { + unsafe { + let u = load_endian_u32x4::<BE>(ptr as *const u8); + vreinterpretq_f32_u32(u) + } +} /// f32 RGB → u8 RGB. Clamp `[0, 1]` × 255, saturating round-to-nearest /// cast. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// 1. NEON must be available (`is_aarch64_feature_detected!("neon")`). @@ -30,7 +53,11 @@ use super::scalar; /// 3. `rgb_in` / `rgb_out` must not alias. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row<const BE: bool>( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -45,9 +72,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let total_lanes = width * 3; let mut lane = 0usize; while lane + 12 <= total_lanes { - let v0 = vld1q_f32(rgb_in.as_ptr().add(lane)); - let v1 = vld1q_f32(rgb_in.as_ptr().add(lane + 4)); - let v2 = vld1q_f32(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::<BE>(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::<BE>(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::<BE>(rgb_in.as_ptr().add(lane + 8)); let s0 = vmulq_f32(vminq_f32(vmaxq_f32(v0, zero), one), scale); let s1 = vmulq_f32(vminq_f32(vmaxq_f32(v1, zero), one), scale); @@ -84,7 +111,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let pix_done = lane / 3; let tail_pix = width - pix_done; if tail_pix > 0 { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::<BE>( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], tail_pix, @@ -95,12 +122,18 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row<const BE: bool>( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -126,10 +159,50 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid // — the f32→u8 cast itself is the cost, not the gather. for sub in 0..4 { let base = (x + sub * 4) * 3; - let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); - let r_clamped = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.0, zero), one), scale); - let g_clamped = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.1, zero), one), scale); - let b_clamped = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.2, zero), one), scale); + let v_rgb = if BE { + // For BE we cannot use vld3q_f32 directly (it always loads + // native-endian bytes). Load the 12 interleaved f32 values as + // 3 × float32x4_t via the endian-aware helper; the R/G/B + // channels are deinterleaved in the `(r_v, g_v, b_v)` block + // below. + let raw0 = load_f32x4::<BE>(rgb_in.as_ptr().add(base)); + let raw1 = load_f32x4::<BE>(rgb_in.as_ptr().add(base + 4)); + let raw2 = load_f32x4::<BE>(rgb_in.as_ptr().add(base + 8)); + // raw0 = [R0,G0,B0,R1], raw1 = [G1,B1,R2,G2], raw2 = [B2,R3,G3,B3] + // (still interleaved at this point). + float32x4x3_t(raw0, raw1, raw2) + } else { + vld3q_f32(rgb_in.as_ptr().add(base)) + }; + + let (r_v, g_v, b_v) = if BE { + // Manual deinterleave: the raw interleaved [R0,G0,B0,R1,G1,B1,R2,G2,B2,R3,G3,B3] + // lanes are split into three 4-element f32 arrays via a temporary scalar round-trip. 
+ let mut r_arr = [0.0f32; 4]; + let mut g_arr = [0.0f32; 4]; + let mut b_arr = [0.0f32; 4]; + vst1q_f32(r_arr.as_mut_ptr(), v_rgb.0); + vst1q_f32(g_arr.as_mut_ptr(), v_rgb.1); + vst1q_f32(b_arr.as_mut_ptr(), v_rgb.2); + // r_arr = [R0,G0,B0,R1], g_arr = [G1,B1,R2,G2], b_arr = [B2,R3,G3,B3] + let r_deint = [r_arr[0], r_arr[3], g_arr[2], b_arr[1]]; + let g_deint = [r_arr[1], g_arr[0], g_arr[3], b_arr[2]]; + let b_deint = [r_arr[2], g_arr[1], b_arr[0], b_arr[3]]; + ( + vld1q_f32(r_deint.as_ptr()), + vld1q_f32(g_deint.as_ptr()), + vld1q_f32(b_deint.as_ptr()), + ) + } else { + (v_rgb.0, v_rgb.1, v_rgb.2) + }; + + let r_clamped = vmulq_f32(vminq_f32(vmaxq_f32(r_v, zero), one), scale); + let g_clamped = vmulq_f32(vminq_f32(vmaxq_f32(g_v, zero), one), scale); + let b_clamped = vmulq_f32(vminq_f32(vmaxq_f32(b_v, zero), one), scale); let r_u32 = vcvtnq_u32_f32(r_clamped); let g_u32 = vcvtnq_u32_f32(g_clamped); @@ -163,7 +236,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid } if x < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::<BE>( &rgb_in[x * 3..width * 3], &mut rgba_out[x * 4..width * 4], width - x, @@ -174,13 +247,19 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid /// f32 RGB → u16 RGB. Clamp `[0, 1]` × 65535, saturating cast. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row<const BE: bool>( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -193,9 +272,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let total_lanes = width * 3; let mut lane = 0usize; while lane + 12 <= total_lanes { - let v0 = vld1q_f32(rgb_in.as_ptr().add(lane)); - let v1 = vld1q_f32(rgb_in.as_ptr().add(lane + 4)); - let v2 = vld1q_f32(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::<BE>(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::<BE>(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::<BE>(rgb_in.as_ptr().add(lane + 8)); let s0 = vmulq_f32(vminq_f32(vmaxq_f32(v0, zero), one), scale); let s1 = vmulq_f32(vminq_f32(vmaxq_f32(v1, zero), one), scale); @@ -212,7 +291,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let pix_done = lane / 3; let tail_pix = width - pix_done; if tail_pix > 0 { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::<BE>( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], tail_pix, @@ -223,13 +302,19 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_u16_row`] but the output is `&mut [u16]` /// with `len() >= 4 * width` u16 elements. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -253,10 +338,32 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut b_h = [0u16; 8]; for sub in 0..2 { let base = (x + sub * 4) * 3; - let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); - let r_s = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.0, zero), one), scale); - let g_s = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.1, zero), one), scale); - let b_s = vmulq_f32(vminq_f32(vmaxq_f32(v_rgb.2, zero), one), scale); + let (r_v, g_v, b_v) = if BE { + let raw0 = load_f32x4::(rgb_in.as_ptr().add(base)); + let raw1 = load_f32x4::(rgb_in.as_ptr().add(base + 4)); + let raw2 = load_f32x4::(rgb_in.as_ptr().add(base + 8)); + let mut r_arr = [0.0f32; 4]; + let mut g_arr = [0.0f32; 4]; + let mut b_arr = [0.0f32; 4]; + vst1q_f32(r_arr.as_mut_ptr(), raw0); + vst1q_f32(g_arr.as_mut_ptr(), raw1); + vst1q_f32(b_arr.as_mut_ptr(), raw2); + let r_deint = [r_arr[0], r_arr[3], g_arr[2], b_arr[1]]; + let g_deint = [r_arr[1], g_arr[0], g_arr[3], b_arr[2]]; + let b_deint = [r_arr[2], g_arr[1], b_arr[0], b_arr[3]]; + ( + vld1q_f32(r_deint.as_ptr()), + vld1q_f32(g_deint.as_ptr()), + vld1q_f32(b_deint.as_ptr()), + ) + } else { + let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); + (v_rgb.0, v_rgb.1, v_rgb.2) + }; + + let r_s = vmulq_f32(vminq_f32(vmaxq_f32(r_v, zero), one), scale); + let g_s = vmulq_f32(vminq_f32(vmaxq_f32(g_v, zero), one), scale); + let b_s = vmulq_f32(vminq_f32(vmaxq_f32(b_v, zero), one), scale); let r_u = vqmovn_u32(vcvtnq_u32_f32(r_s)); let g_u = vqmovn_u32(vcvtnq_u32_f32(g_s)); let b_u = vqmovn_u32(vcvtnq_u32_f32(b_s)); @@ -273,7 +380,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] } if x < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[x * 3..width * 3], &mut rgba_out[x * 4..width * 4], width - x, @@ -284,27 +391,48 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] /// f32 RGB → f32 RGB lossless pass-through. /// +/// When `BE = true` the input values are byte-swapped to host-native +/// before being written (big-endian input → host-native output). +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); unsafe { let total = width * 3; let mut i = 0usize; - while i + 4 <= total { - let v = vld1q_f32(rgb_in.as_ptr().add(i)); - vst1q_f32(rgb_out.as_mut_ptr().add(i), v); - i += 4; - } - while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); - i += 1; + if BE { + // For BE pass-through: load as u32 with byte-swap, store as f32. 
+ while i + 4 <= total { + let v = load_f32x4::(rgb_in.as_ptr().add(i)); + vst1q_f32(rgb_out.as_mut_ptr().add(i), v); + i += 4; + } + while i < total { + let bits = (*rgb_in.get_unchecked(i)).to_bits().swap_bytes(); + *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + i += 1; + } + } else { + while i + 4 <= total { + let v = vld1q_f32(rgb_in.as_ptr().add(i)); + vst1q_f32(rgb_out.as_mut_ptr().add(i), v); + i += 4; + } + while i < total { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + i += 1; + } } } } @@ -316,25 +444,42 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // 4 pixels (= 12 f16 values) which matches the Rgbf32 loop granularity. // // `vcvt_f32_f16` widens 4 × f16 to 4 × f32 in a single FCVT instruction. -// We load the raw u16 bits with `vld1_u16` and reinterpret the result as -// `float16x4_t` before calling `vcvt_f32_f16`. +// +// For BE: we load the u16 bits via `load_endian_u16x8::` (loads 8 u16 +// with byte-swap for BE), extract the low 4 lanes into a `uint16x4_t`, then +// reinterpret as `float16x4_t` before widening with `vcvt_f32_f16`. + +use super::endian::load_endian_u16x8; /// Widen 4 half-precision floats (`f16x4`, i.e. 8 bytes starting at `ptr`) /// to 4 single-precision floats into `out[0..4]`. /// +/// For `BE = true` the f16 values are stored big-endian (bytes swapped); +/// the byte-swap is applied before the widening conversion. +/// /// # Safety /// /// * NEON must be available. -/// * `ptr` must be valid for 4 × u16 reads (8 bytes, no alignment required -/// because `vld1_u16` accepts unaligned pointers on AArch64). +/// * `ptr` must be valid for 4 × u16 reads (8 bytes). /// * `out` must be valid for 4 × f32 writes. #[inline(always)] -unsafe fn widen_f16x4(ptr: *const half::f16, out: *mut f32) { +unsafe fn widen_f16x4(ptr: *const half::f16, out: *mut f32) { unsafe { - let u16s = vld1_u16(ptr as *const u16); - let f16s = vreinterpret_f16_u16(u16s); - let f32s = vcvt_f32_f16(f16s); - vst1q_f32(out, f32s); + if BE { + // Load 8 bytes as u16x8, byte-swap each u16, take low 4. + let u8_ptr = ptr as *const u8; + let u16x8 = load_endian_u16x8::(u8_ptr); + // Extract low 4 lanes (the ones we need for 4 f16 values). + let u16x4 = vget_low_u16(u16x8); + let f16x4 = vreinterpret_f16_u16(u16x4); + let f32x4 = vcvt_f32_f16(f16x4); + vst1q_f32(out, f32x4); + } else { + let u16s = vld1_u16(ptr as *const u16); + let f16s = vreinterpret_f16_u16(u16s); + let f32s = vcvt_f32_f16(f16s); + vst1q_f32(out, f32s); + } } } @@ -353,6 +498,8 @@ unsafe fn widen_f16_tail(src: &[half::f16], dst: &mut [f32], n: usize) { /// f16 RGB → u8 RGB. /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// 1. NEON must be available. @@ -360,7 +507,11 @@ unsafe fn widen_f16_tail(src: &[half::f16], dst: &mut [f32], n: usize) { /// 3. `rgb_in` / `rgb_out` must not alias. 
#[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -370,16 +521,16 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); - widen_f16x4(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); - widen_f16x4(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); + widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); + widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -389,12 +540,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// f16 RGB → u8 RGBA (alpha forced to `0xFF`). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -404,16 +561,16 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); - widen_f16x4(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); - widen_f16x4(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); + widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); + widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); + rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -423,13 +580,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// f16 RGB → u16 RGB. /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. 
#[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -442,16 +601,16 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); - widen_f16x4(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); - widen_f16x4(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); + widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); + widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -461,13 +620,15 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// f16 RGB → u16 RGBA (alpha forced to `0xFFFF`). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_u16_row`] but the output is `&mut [u16]` with /// `len() >= 4 * width` u16 elements. #[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -481,16 +642,16 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); - widen_f16x4(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); - widen_f16x4(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); + widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); + widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); + rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -500,13 +661,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// f16 RGB → f32 RGB (lossless widen). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. #[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -518,7 +681,7 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let mut lane = 0usize; while lane + 4 <= total_lanes { unsafe { - widen_f16x4(rgb_in.as_ptr().add(lane), rgb_out.as_mut_ptr().add(lane)); + widen_f16x4::(rgb_in.as_ptr().add(lane), rgb_out.as_mut_ptr().add(lane)); } lane += 4; } @@ -534,13 +697,16 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( /// f16 RGB → f16 RGB lossless pass-through. 
/// +/// When `BE = true` the input values are byte-swapped to host-native order +/// on output. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [half::f16]` with /// `len() >= 3 * width` f16 elements. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, @@ -548,6 +714,6 @@ pub(crate) unsafe fn rgbf16_to_rgb_f16_row( debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - // Bit-exact copy: reuse scalar which is already just copy_from_slice. - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + // Bit-exact copy / byte-swap: delegate to scalar. + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } diff --git a/src/row/arch/neon/tests/packed_rgb_float.rs b/src/row/arch/neon/tests/packed_rgb_float.rs index 42f566fe..6f0dadbc 100644 --- a/src/row/arch/neon/tests/packed_rgb_float.rs +++ b/src/row/arch/neon/tests/packed_rgb_float.rs @@ -34,9 +34,9 @@ fn rgbf32_to_rgb_neon_matches_scalar_widths() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_neon, w); + rgbf32_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -49,9 +49,9 @@ fn rgbf32_to_rgba_neon_matches_scalar_widths() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_neon, w); + rgbf32_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -64,9 +64,9 @@ fn rgbf32_to_rgb_u16_neon_matches_scalar_widths() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_neon, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -79,9 +79,9 @@ fn rgbf32_to_rgba_u16_neon_matches_scalar_widths() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_neon, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -94,9 +94,9 @@ fn rgbf32_to_rgb_f32_neon_matches_scalar_widths() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_neon = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_neon, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); // Lossless: output should equal input bit-exact. 
@@ -131,9 +131,9 @@ fn neon_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_neon, w); + rgbf16_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -152,9 +152,9 @@ fn neon_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_neon, w); + rgbf16_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -173,9 +173,9 @@ fn neon_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_neon, w); + rgbf16_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -194,9 +194,9 @@ fn neon_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_neon = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_neon, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -215,9 +215,9 @@ fn neon_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_neon = std::vec![0.0f32; w * 3]; - scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_neon, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -233,12 +233,250 @@ fn neon_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_neon = std::vec![half::f16::ZERO; w * 3]; - scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f16_row(&input, &mut out_neon, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); // Lossless: output should equal input bit-exact. assert_eq!(out_neon, input[..w * 3], "lossless width {w}"); } } + +// ---- BE parity tests — Rgbf32 ----------------------------------------------- +// +// For each kernel: byte-swap the LE f32 inputs into a BE buffer, call the +// kernel with `BE=true`, and assert the output matches the LE run (`BE=false`). + +/// Build a BE-encoded f32 slice by byte-swapping every 32-bit element. +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +/// Build a BE-encoded f16 slice by byte-swapping every 16-bit element. 
+fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgb_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgba_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgb_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgba_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgb_f32_be_is_byteswap() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + // BE path byte-swaps each f32, producing host-native = same as LE. 
+ assert_eq!(out_le, out_be, "NEON rgbf32_to_rgb_f32 BE parity width {w}"); + } +} + +// ---- BE parity tests — Rgbf16 ----------------------------------------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgb_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgba_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgb_u16_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgba_u16_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "NEON rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgb_f32_be_matches_le() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_f32 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_to_rgb_f16_be_is_byteswap() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = 
pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + // BE byte-swap should reconstruct original LE output bit-exact. + assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_f16 BE parity width {w}"); + } +} diff --git a/src/row/arch/wasm_simd128/packed_rgb_float.rs b/src/row/arch/wasm_simd128/packed_rgb_float.rs index 82b6c538..06e7b2f7 100644 --- a/src/row/arch/wasm_simd128/packed_rgb_float.rs +++ b/src/row/arch/wasm_simd128/packed_rgb_float.rs @@ -10,7 +10,9 @@ use core::arch::wasm32::*; -use super::scalar; +use super::{endian::load_endian_u32x4, scalar}; + +// ---- helpers ------------------------------------------------------------------ #[inline(always)] fn clamp_scale_to_i32(v: v128, zero: v128, one: v128, scale: v128) -> v128 { @@ -21,6 +23,23 @@ fn clamp_scale_to_i32(v: v128, zero: v128, one: v128, scale: v128) -> v128 { i32x4_trunc_sat_f32x4(rounded) } +/// Load 4 f32 values from `ptr`, byte-swapping each 32-bit element when +/// `BE = true`. The returned `v128` holds f32 bit patterns in host-native +/// order so downstream float arithmetic is correct. +/// +/// # Safety +/// +/// `ptr` must point to at least 16 readable bytes. simd128 must be +/// available (compile-time `target_feature`). +#[inline(always)] +unsafe fn load_f32x4(ptr: *const f32) -> v128 { + // load_endian_u32x4 byte-swaps each 32-bit lane when BE=true, giving us + // host-native f32 bit patterns. + unsafe { load_endian_u32x4::(ptr as *const u8) } +} + +// ---- Tier 9 — Rgbf32 wasm-simd128 kernels ------------------------------------ + /// f32 RGB → u8 RGB. /// /// # Safety @@ -30,7 +49,11 @@ fn clamp_scale_to_i32(v: v128, zero: v128, one: v128, scale: v128) -> v128 { /// 3. `rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -43,9 +66,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width // 4 pixels = 12 lanes per iter. while lane + 12 <= total_lanes { unsafe { - let v0 = v128_load(rgb_in.as_ptr().add(lane) as *const v128); - let v1 = v128_load(rgb_in.as_ptr().add(lane + 4) as *const v128); - let v2 = v128_load(rgb_in.as_ptr().add(lane + 8) as *const v128); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_i32(v0, zero, one, scale); let i1 = clamp_scale_to_i32(v1, zero, one, scale); @@ -68,7 +91,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -79,7 +102,11 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -92,9 +119,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid let mut pix = 0usize; while lane + 12 <= total_lanes { unsafe { - let v0 = v128_load(rgb_in.as_ptr().add(lane) as *const v128); - let v1 = v128_load(rgb_in.as_ptr().add(lane + 4) as *const v128); - let v2 = v128_load(rgb_in.as_ptr().add(lane + 8) as *const v128); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_i32(v0, zero, one, scale); let i1 = clamp_scale_to_i32(v1, zero, one, scale); @@ -118,7 +145,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid pix += 4; } if pix < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -129,7 +156,11 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid /// f32 RGB → u16 RGB. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -141,9 +172,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let mut lane = 0usize; while lane + 12 <= total_lanes { unsafe { - let v0 = v128_load(rgb_in.as_ptr().add(lane) as *const v128); - let v1 = v128_load(rgb_in.as_ptr().add(lane + 4) as *const v128); - let v2 = v128_load(rgb_in.as_ptr().add(lane + 8) as *const v128); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_i32(v0, zero, one, scale); let i1 = clamp_scale_to_i32(v1, zero, one, scale); @@ -167,7 +198,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -178,7 +209,11 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -191,9 +226,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut pix = 0usize; while lane + 12 <= total_lanes { unsafe { - let v0 = v128_load(rgb_in.as_ptr().add(lane) as *const v128); - let v1 = v128_load(rgb_in.as_ptr().add(lane + 4) as *const v128); - let v2 = v128_load(rgb_in.as_ptr().add(lane + 8) as *const v128); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_i32(v0, zero, one, scale); let i1 = clamp_scale_to_i32(v1, zero, one, scale); @@ -217,7 +252,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] pix += 4; } if pix < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -225,27 +260,55 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] } } -/// f32 RGB → f32 RGB lossless pass-through. +/// f32 RGB → f32 RGB lossless pass-through / byte-swap. +/// +/// - `BE = false`: fast `v128_load` → `v128_store` copy (no math). +/// - `BE = true`: load each element as u32, byte-swap, store as f32. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); - let total = width * 3; - let mut i = 0usize; - while i + 4 <= total { - unsafe { - let v = v128_load(rgb_in.as_ptr().add(i) as *const v128); - v128_store(rgb_out.as_mut_ptr().add(i) as *mut v128, v); + if !BE { + let total = width * 3; + let mut i = 0usize; + while i + 4 <= total { + unsafe { + let v = v128_load(rgb_in.as_ptr().add(i) as *const v128); + v128_store(rgb_out.as_mut_ptr().add(i) as *mut v128, v); + } + i += 4; } - i += 4; - } - while i < total { - unsafe { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + while i < total { + unsafe { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + } + i += 1; + } + } else { + // BE: byte-swap each f32 element via u32 lane reinterpretation. + let total = width * 3; + let mut i = 0usize; + while i + 4 <= total { + unsafe { + // load_endian_u32x4:: byte-swaps each 32-bit lane. + let swapped = load_f32x4::(rgb_in.as_ptr().add(i)); + v128_store(rgb_out.as_mut_ptr().add(i) as *mut v128, swapped); + } + i += 4; + } + while i < total { + unsafe { + let bits = rgb_in.get_unchecked(i).to_bits().swap_bytes(); + *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + } + i += 1; } - i += 1; } } @@ -256,9 +319,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // `[f32; CHUNK_PIXELS * 3]` buffer, then call the existing wasm-simd128 // Rgbf32 downstream kernels for the f32→u8/u16/f32 work. 
// -// The widening loop is cheap relative to the subsequent SIMD integer conversion, -// so this hybrid strategy avoids a full scalar fallback while keeping the -// heavier per-sample math in SIMD. +// For BE inputs the byte-swap is applied before widening so the widened f32 +// buffer is already host-native; downstream f32 kernels are called with +// `BE=false` to avoid a second swap. // // CHUNK_PIXELS = 4 (= 12 f32 lanes), matching the simd128 Rgbf32 loop stride. @@ -271,7 +334,11 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], /// 3. `rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -281,16 +348,23 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; for k in 0..12 { - buf[k] = unsafe { rgb_in.get_unchecked(lane + k).to_f32() }; + let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let bits = if BE { + f.to_bits().swap_bytes() + } else { + f.to_bits() + }; + buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is now host-native f32; call LE downstream. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -305,7 +379,11 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -315,16 +393,22 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; for k in 0..12 { - buf[k] = unsafe { rgb_in.get_unchecked(lane + k).to_f32() }; + let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let bits = if BE { + f.to_bits().swap_bytes() + } else { + f.to_bits() + }; + buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -340,7 +424,7 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// `len() >= 3 * width` u16 elements. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -353,16 +437,22 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; for k in 0..12 { - buf[k] = unsafe { rgb_in.get_unchecked(lane + k).to_f32() }; + let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let bits = if BE { + f.to_bits().swap_bytes() + } else { + f.to_bits() + }; + buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -377,7 +467,7 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// Same as [`rgbf16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -391,16 +481,22 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; for k in 0..12 { - buf[k] = unsafe { rgb_in.get_unchecked(lane + k).to_f32() }; + let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let bits = if BE { + f.to_bits().swap_bytes() + } else { + f.to_bits() + }; + buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -416,7 +512,7 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// `len() >= 3 * width` f32 elements. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -428,12 +524,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let total_lanes = width * 3; for i in 0..total_lanes { unsafe { - *rgb_out.get_unchecked_mut(i) = rgb_in.get_unchecked(i).to_f32(); + let f = rgb_in.get_unchecked(i); + let bits = if BE { + f.to_bits().swap_bytes() + } else { + f.to_bits() + }; + *rgb_out.get_unchecked_mut(i) = half::f16::from_bits(bits).to_f32(); } } } -/// f16 RGB → f16 RGB lossless pass-through (wasm-simd128). +/// f16 RGB → f16 RGB lossless pass-through / byte-swap (wasm-simd128). /// /// # Safety /// @@ -441,12 +543,12 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( /// `len() >= 3 * width` f16 elements. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs b/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs index 280889d3..6a13b394 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs @@ -26,9 +26,9 @@ fn wasm_rgbf32_to_rgb_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_simd, w); + rgbf32_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgb width {w}"); } @@ -40,9 +40,9 @@ fn wasm_rgbf32_to_rgba_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_simd, w); + rgbf32_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgba width {w}"); } @@ -54,9 +54,9 @@ fn wasm_rgbf32_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgb_u16 width {w}"); } @@ -68,9 +68,9 @@ fn wasm_rgbf32_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgba_u16 width {w}"); } @@ -82,9 +82,9 @@ fn wasm_rgbf32_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf32_to_rgb_f32 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); @@ -110,9 +110,9 @@ fn wasm_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { 
- rgbf16_to_rgb_row(&input, &mut out_simd, w); + rgbf16_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgb width {w}"); } @@ -128,9 +128,9 @@ fn wasm_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_simd, w); + rgbf16_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgba width {w}"); } @@ -146,9 +146,9 @@ fn wasm_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgb_u16 width {w}"); } @@ -164,9 +164,9 @@ fn wasm_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgba_u16 width {w}"); } @@ -182,9 +182,9 @@ fn wasm_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgb_f32 width {w}"); } @@ -200,11 +200,222 @@ fn wasm_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_simd = std::vec![half::f16::ZERO; w * 3]; - scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm rgbf16_to_rgb_f16 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); } } + +// ---- BE parity tests — wasm-simd128 Rgbf32 ----------------------------------- + +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +fn wasm_rgbf32_to_rgb_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf32_to_rgb BE parity width {w}"); + } +} + 
+#[test] +fn wasm_rgbf32_to_rgba_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +fn wasm_rgbf32_to_rgb_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf32_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +fn wasm_rgbf32_to_rgba_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "wasm rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +fn wasm_rgbf32_to_rgb_f32_be_is_byteswap() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf32_to_rgb_f32 BE parity width {w}"); + } +} + +// ---- BE parity tests — wasm-simd128 Rgbf16 ----------------------------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgb_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf16_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgba_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf16_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgb_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm 
rgbf16_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgba_u16_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "wasm rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgb_f32_be_matches_le() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf16_to_rgb_f32 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn wasm_rgbf16_to_rgb_f16_be_is_byteswap() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "wasm rgbf16_to_rgb_f16 BE parity width {w}"); + } +} diff --git a/src/row/arch/x86_avx2/packed_rgb_float.rs b/src/row/arch/x86_avx2/packed_rgb_float.rs index d40d622b..80712abf 100644 --- a/src/row/arch/x86_avx2/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/packed_rgb_float.rs @@ -6,12 +6,33 @@ //! narrow); cross-lane unpacks need `_mm256_permute4x64_epi64` to fix //! the 128-bit lane interleave that AVX2 packs leave behind. //! +//! For `` kernels, each 8-lane f32 load is replaced by +//! `load_endian_u32x8::` (a `__m256i` with byte-swapped u32 lanes +//! for BE inputs) followed by `_mm256_castsi256_ps` to reinterpret as f32. +//! //! Pixel-aligned chunks of 8 pixels = 24 lanes per iteration so the //! tail handles 0–7 leftover pixels. use core::arch::x86_64::*; +use super::endian::load_endian_u32x8; +// For f16 widen we need 128-bit u16 load (8 × u16). use super::scalar; +use crate::row::arch::x86_sse41::endian::load_endian_u16x8; + +/// Load 8 f32 lanes from `ptr` in endian-aware fashion. +/// +/// # Safety +/// +/// AVX2 must be available; `ptr` must be valid for 32 bytes. +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn load_f32x8(ptr: *const f32) -> __m256 { + unsafe { + let u = load_endian_u32x8::(ptr as *const u8); + _mm256_castsi256_ps(u) + } +} #[inline(always)] unsafe fn clamp_scale_to_u32_256(v: __m256, zero: __m256, one: __m256, scale: __m256) -> __m256i { @@ -29,6 +50,8 @@ unsafe fn clamp_scale_to_u32_256(v: __m256, zero: __m256, one: __m256, scale: __ /// f32 RGB → u8 RGB. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -36,7 +59,11 @@ unsafe fn clamp_scale_to_u32_256(v: __m256, zero: __m256, one: __m256, scale: __ /// 3. `rgb_in` / `rgb_out` must not alias. 
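+///
+/// The `BE = true` path only changes how lanes are loaded: each u32 lane is
+/// byte-swapped before its bit pattern is reinterpreted as `f32`. The real
+/// swap lives in `load_endian_u32x8` in the arch `endian` module (not part
+/// of this hunk); a plausible pshufb-style shape, shown purely for
+/// illustration:
+///
+/// ```ignore
+/// let raw = _mm256_loadu_si256(ptr as *const __m256i);
+/// // Reverse the 4 bytes of every u32, independently within each 128-bit lane.
+/// let idx = _mm256_setr_epi8(
+///     3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+///     3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+/// );
+/// let swapped = _mm256_shuffle_epi8(raw, idx);
+/// ```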
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -49,9 +76,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let mut lane = 0usize; // 8 pixels = 24 lanes per iter. Three 256-bit f32 loads → 24 lanes. while lane + 24 <= total_lanes { - let v0 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 8)); - let v2 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 16)); + let v0 = load_f32x8::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x8::(rgb_in.as_ptr().add(lane + 8)); + let v2 = load_f32x8::(rgb_in.as_ptr().add(lane + 16)); let i0 = clamp_scale_to_u32_256(v0, zero, one, scale); let i1 = clamp_scale_to_u32_256(v1, zero, one, scale); @@ -88,7 +115,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -98,9 +125,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). +/// +/// When `BE = true` the input `f32` values are big-endian encoded. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -113,9 +146,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid let mut lane = 0usize; let mut pix = 0usize; while lane + 24 <= total_lanes { - let v0 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 8)); - let v2 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 16)); + let v0 = load_f32x8::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x8::(rgb_in.as_ptr().add(lane + 8)); + let v2 = load_f32x8::(rgb_in.as_ptr().add(lane + 16)); let i0 = clamp_scale_to_u32_256(v0, zero, one, scale); let i1 = clamp_scale_to_u32_256(v1, zero, one, scale); @@ -144,7 +177,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid pix += 8; } if pix < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -154,9 +187,15 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid } /// f32 RGB → u16 RGB. +/// +/// When `BE = true` the input `f32` values are big-endian encoded. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -168,9 +207,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let total_lanes = width * 3; let mut lane = 0usize; while lane + 24 <= total_lanes { - let v0 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 8)); - let v2 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 16)); + let v0 = load_f32x8::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x8::(rgb_in.as_ptr().add(lane + 8)); + let v2 = load_f32x8::(rgb_in.as_ptr().add(lane + 16)); let i0 = clamp_scale_to_u32_256(v0, zero, one, scale); let i1 = clamp_scale_to_u32_256(v1, zero, one, scale); @@ -197,7 +236,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -207,9 +246,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). +/// +/// When `BE = true` the input `f32` values are big-endian encoded. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -222,9 +267,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut lane = 0usize; let mut pix = 0usize; while lane + 24 <= total_lanes { - let v0 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 8)); - let v2 = _mm256_loadu_ps(rgb_in.as_ptr().add(lane + 16)); + let v0 = load_f32x8::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x8::(rgb_in.as_ptr().add(lane + 8)); + let v2 = load_f32x8::(rgb_in.as_ptr().add(lane + 16)); let i0 = clamp_scale_to_u32_256(v0, zero, one, scale); let i1 = clamp_scale_to_u32_256(v1, zero, one, scale); @@ -250,7 +295,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] pix += 8; } if pix < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -260,23 +305,43 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] } /// f32 RGB → f32 RGB lossless pass-through. +/// +/// When `BE = true` the input values are byte-swapped to host-native before +/// being written. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); unsafe { let total = width * 3; let mut i = 0usize; - while i + 8 <= total { - let v = _mm256_loadu_ps(rgb_in.as_ptr().add(i)); - _mm256_storeu_ps(rgb_out.as_mut_ptr().add(i), v); - i += 8; - } - while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); - i += 1; + if BE { + while i + 8 <= total { + let v = load_f32x8::(rgb_in.as_ptr().add(i)); + _mm256_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 8; + } + while i < total { + let bits = (*rgb_in.get_unchecked(i)).to_bits().swap_bytes(); + *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + i += 1; + } + } else { + while i + 8 <= total { + let v = _mm256_loadu_ps(rgb_in.as_ptr().add(i)); + _mm256_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 8; + } + while i < total { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + i += 1; + } } } } @@ -285,31 +350,38 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // // `_mm256_cvtph_ps` (F16C) widens 8 × f16 (stored as 8 × i16 in a __m128i) // to 8 × f32 in a __m256. We load 16 bytes (8 f16 values) via -// `_mm_loadu_si128`. -// -// Downstream: after widening a 24-lane chunk (= 8 pixels) to f32, we call the -// existing AVX2 Rgbf32 kernels. The scalar tail uses -// `crate::row::scalar::rgbf16_to_*_row`. +// `_mm_loadu_si128` (LE) or `load_endian_u16x8::` (with byte-swap for BE). // // `#[target_feature(enable = "avx2,f16c")]` ensures both features are active. /// Widen 8 × f16 (at `ptr`, 16 bytes) to 8 × f32 (returned as `__m256`). /// +/// For `BE = true` the f16 values are stored big-endian; bytes are swapped +/// before the F16C widening conversion. +/// /// # Safety /// /// * AVX2 + F16C must be available. /// * `ptr` must be valid for 16 bytes (8 × u16 / f16). #[inline] #[target_feature(enable = "avx2,f16c")] -unsafe fn widen_f16x8_avx(ptr: *const half::f16) -> __m256 { +unsafe fn widen_f16x8_avx(ptr: *const half::f16) -> __m256 { unsafe { - let raw = _mm_loadu_si128(ptr as *const __m128i); - _mm256_cvtph_ps(raw) + if BE { + // Load 16 bytes as u16x8 with byte-swap, then widen to f32x8. + let raw = load_endian_u16x8::(ptr as *const u8); + _mm256_cvtph_ps(raw) + } else { + let raw = _mm_loadu_si128(ptr as *const __m128i); + _mm256_cvtph_ps(raw) + } } } /// f16 RGB → u8 RGB (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// 1. AVX2 and F16C must be available. @@ -317,7 +389,11 @@ unsafe fn widen_f16x8_avx(ptr: *const half::f16) -> __m256 { /// 3. `rgb_in` / `rgb_out` must not alias. 
#[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -327,19 +403,19 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 24 <= total_lanes { let mut buf = [0.0f32; 24]; unsafe { - let f0 = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 8)); - let f2 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 16)); + let f0 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 8)); + let f2 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 16)); _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); } lane += 24; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -349,12 +425,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// f16 RGB → u8 RGBA (alpha `0xFF`) (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -364,19 +446,19 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 24 <= total_lanes { let mut buf = [0.0f32; 24]; unsafe { - let f0 = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 8)); - let f2 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 16)); + let f0 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 8)); + let f2 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 16)); _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), 8); + rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), 8); } lane += 24; pix += 8; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -386,13 +468,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// f16 RGB → u16 RGB (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. 
#[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -405,19 +489,19 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 24 <= total_lanes { let mut buf = [0.0f32; 24]; unsafe { - let f0 = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 8)); - let f2 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 16)); + let f0 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 8)); + let f2 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 16)); _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); } lane += 24; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -427,12 +511,14 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// f16 RGB → u16 RGBA (alpha `0xFFFF`) (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -446,19 +532,19 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 24 <= total_lanes { let mut buf = [0.0f32; 24]; unsafe { - let f0 = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 8)); - let f2 = widen_f16x8_avx(rgb_in.as_ptr().add(lane + 16)); + let f0 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 8)); + let f2 = widen_f16x8_avx::(rgb_in.as_ptr().add(lane + 16)); _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), 8); + rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), 8); } lane += 24; pix += 8; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -468,13 +554,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// f16 RGB → f32 RGB (lossless widen) (AVX2 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. 
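+///
+/// Per lane, the `BE = true` widen is equivalent to this scalar form (the
+/// kernel itself goes through `widen_f16x8_avx::<BE>` for full 8-lane chunks
+/// and applies the same byte swap in its scalar tail):
+///
+/// ```ignore
+/// let bits = raw.to_bits();                 // raw: half::f16, BE-encoded
+/// let host = half::f16::from_bits(bits.swap_bytes());
+/// let widened: f32 = host.to_f32();
+/// ```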
#[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -486,33 +574,38 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let mut lane = 0usize; while lane + 8 <= total_lanes { unsafe { - let f = widen_f16x8_avx(rgb_in.as_ptr().add(lane)); + let f = widen_f16x8_avx::(rgb_in.as_ptr().add(lane)); _mm256_storeu_ps(rgb_out.as_mut_ptr().add(lane), f); } lane += 8; } // Scalar tail for the last 0-7 lanes. + #[allow(clippy::needless_range_loop)] for i in lane..total_lanes { + let bits = rgb_in[i].to_bits(); + let h = half::f16::from_bits(if BE { bits.swap_bytes() } else { bits }); unsafe { - *rgb_out.get_unchecked_mut(i) = rgb_in.get_unchecked(i).to_f32(); + *rgb_out.get_unchecked_mut(i) = h.to_f32(); } } } /// f16 RGB → f16 RGB lossless pass-through (AVX2 + F16C). /// +/// When `BE = true` the input values are byte-swapped to host-native order. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [half::f16]` with /// `len() >= 3 * width` f16 elements. #[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs index ab32d1a0..9fde3182 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs @@ -29,9 +29,9 @@ fn avx2_rgbf32_to_rgb_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_simd, w); + rgbf32_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgb width {w}"); } @@ -46,9 +46,9 @@ fn avx2_rgbf32_to_rgba_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_simd, w); + rgbf32_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgba width {w}"); } @@ -63,9 +63,9 @@ fn avx2_rgbf32_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgb_u16 width {w}"); } @@ -80,9 +80,9 @@ fn avx2_rgbf32_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + 
scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgba_u16 width {w}"); } @@ -97,9 +97,9 @@ fn avx2_rgbf32_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2 rgbf32_to_rgb_f32 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); @@ -128,9 +128,9 @@ fn avx2_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_simd, w); + rgbf16_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2+F16C rgbf16_to_rgb width {w}"); } @@ -149,9 +149,9 @@ fn avx2_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_simd, w); + rgbf16_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX2+F16C rgbf16_to_rgba width {w}"); } @@ -170,9 +170,9 @@ fn avx2_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -194,9 +194,9 @@ fn avx2_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -218,9 +218,9 @@ fn avx2_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -242,9 +242,9 @@ fn avx2_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_simd = std::vec![half::f16::ZERO; w * 3]; - scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f16_row(&input, 
&mut out_simd, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -253,3 +253,267 @@ fn avx2_rgbf16_to_rgb_f16_matches_scalar() { assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); } } + +// ---- BE parity tests — AVX2 Rgbf32 ------------------------------------------ + +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX2 rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX2 rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX2 rgbf32_to_rgb_u16 BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2 rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgb_f32_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX2 rgbf32_to_rgb_f32 BE parity width {w}"); + } +} + 
+// ---- BE parity tests — AVX2 + F16C Rgbf16 ------------------------------------ + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgb BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgba BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_f32_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgb_f32 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = 
"SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_f16_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX2+F16C rgbf16_to_rgb_f16 BE parity width {w}" + ); + } +} diff --git a/src/row/arch/x86_avx512/packed_rgb_float.rs b/src/row/arch/x86_avx512/packed_rgb_float.rs index 90bdbd78..4db3e62d 100644 --- a/src/row/arch/x86_avx512/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/packed_rgb_float.rs @@ -2,13 +2,34 @@ //! (`Rgbf32`) source. 16-lane `__m512` registers; same lane-aligned //! pixel chunking as the AVX2 backend at twice the throughput. //! +//! For `` kernels, each 16-lane f32 load is replaced by +//! `load_endian_u32x16::` (a `__m512i` with byte-swapped u32 lanes +//! for BE inputs) followed by `_mm512_castsi512_ps` to reinterpret as f32. +//! //! Process 16 pixels = 48 lanes per iteration so the loop boundary //! lands on a pixel boundary; the scalar tail handles the leftover //! 0–15 pixels. use core::arch::x86_64::*; +use super::endian::load_endian_u32x16; +// For f16 widen we need a 256-bit u16 load (16 × u16 = 32 bytes). use super::scalar; +use crate::row::arch::x86_avx2::endian::load_endian_u16x16; + +/// Load 16 f32 lanes from `ptr` in endian-aware fashion. +/// +/// # Safety +/// +/// AVX-512F must be available; `ptr` must be valid for 64 bytes. +#[inline] +#[target_feature(enable = "avx512f")] +unsafe fn load_f32x16(ptr: *const f32) -> __m512 { + unsafe { + let u = load_endian_u32x16::(ptr as *const u8); + _mm512_castsi512_ps(u) + } +} #[inline(always)] unsafe fn clamp_scale_to_u32_512(v: __m512, zero: __m512, one: __m512, scale: __m512) -> __m512i { @@ -24,6 +45,8 @@ unsafe fn clamp_scale_to_u32_512(v: __m512, zero: __m512, one: __m512, scale: __ /// f32 RGB → u8 RGB. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. @@ -31,7 +54,11 @@ unsafe fn clamp_scale_to_u32_512(v: __m512, zero: __m512, one: __m512, scale: __ /// 3. `rgb_in` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -44,9 +71,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let mut lane = 0usize; // 16 pixels = 48 lanes per iter (3 × 16-lane f32 loads). 
while lane + 48 <= total_lanes { - let v0 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 16)); - let v2 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 32)); + let v0 = load_f32x16::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x16::(rgb_in.as_ptr().add(lane + 16)); + let v2 = load_f32x16::(rgb_in.as_ptr().add(lane + 32)); let i0 = clamp_scale_to_u32_512(v0, zero, one, scale); let i1 = clamp_scale_to_u32_512(v1, zero, one, scale); @@ -67,7 +94,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -77,9 +104,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). +/// +/// When `BE = true` the input `f32` values are big-endian encoded. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -92,9 +125,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid let mut lane = 0usize; let mut pix = 0usize; while lane + 48 <= total_lanes { - let v0 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 16)); - let v2 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 32)); + let v0 = load_f32x16::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x16::(rgb_in.as_ptr().add(lane + 16)); + let v2 = load_f32x16::(rgb_in.as_ptr().add(lane + 32)); let i0 = clamp_scale_to_u32_512(v0, zero, one, scale); let i1 = clamp_scale_to_u32_512(v1, zero, one, scale); @@ -122,7 +155,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid pix += 16; } if pix < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -132,9 +165,15 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid } /// f32 RGB → u16 RGB. +/// +/// When `BE = true` the input `f32` values are big-endian encoded. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -146,9 +185,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let total_lanes = width * 3; let mut lane = 0usize; while lane + 48 <= total_lanes { - let v0 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 16)); - let v2 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 32)); + let v0 = load_f32x16::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x16::(rgb_in.as_ptr().add(lane + 16)); + let v2 = load_f32x16::(rgb_in.as_ptr().add(lane + 32)); let i0 = clamp_scale_to_u32_512(v0, zero, one, scale); let i1 = clamp_scale_to_u32_512(v1, zero, one, scale); @@ -168,7 +207,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -178,9 +217,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). +/// +/// When `BE = true` the input `f32` values are big-endian encoded. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -193,9 +238,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut lane = 0usize; let mut pix = 0usize; while lane + 48 <= total_lanes { - let v0 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 16)); - let v2 = _mm512_loadu_ps(rgb_in.as_ptr().add(lane + 32)); + let v0 = load_f32x16::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x16::(rgb_in.as_ptr().add(lane + 16)); + let v2 = load_f32x16::(rgb_in.as_ptr().add(lane + 32)); let i0 = clamp_scale_to_u32_512(v0, zero, one, scale); let i1 = clamp_scale_to_u32_512(v1, zero, one, scale); @@ -221,7 +266,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] pix += 16; } if pix < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -231,23 +276,43 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] } /// f32 RGB → f32 RGB lossless pass-through. +/// +/// When `BE = true` the input values are byte-swapped to host-native before +/// being written. 
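+///
+/// Per element this is the plain byte swap that the scalar tail in the body
+/// spells out:
+///
+/// ```ignore
+/// rgb_out[i] = f32::from_bits(rgb_in[i].to_bits().swap_bytes());
+/// ```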
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); unsafe { let total = width * 3; let mut i = 0usize; - while i + 16 <= total { - let v = _mm512_loadu_ps(rgb_in.as_ptr().add(i)); - _mm512_storeu_ps(rgb_out.as_mut_ptr().add(i), v); - i += 16; - } - while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); - i += 1; + if BE { + while i + 16 <= total { + let v = load_f32x16::(rgb_in.as_ptr().add(i)); + _mm512_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 16; + } + while i < total { + let bits = (*rgb_in.get_unchecked(i)).to_bits().swap_bytes(); + *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + i += 1; + } + } else { + while i + 16 <= total { + let v = _mm512_loadu_ps(rgb_in.as_ptr().add(i)); + _mm512_storeu_ps(rgb_out.as_mut_ptr().add(i), v); + i += 16; + } + while i < total { + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + i += 1; + } } } } @@ -255,36 +320,41 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], // ---- Tier 9 — Rgbf16 AVX-512 + F16C entry points --------------------------- // // `_mm512_cvtph_ps` (F16C + AVX-512F) widens 16 × f16 (stored as 16 × i16 in -// a __m256i) to 16 × f32 in a __m512. We load 32 bytes (16 f16 values) via -// `_mm256_loadu_si256`. +// a __m256i) to 16 × f32 in a __m512. // -// Downstream: after widening a 48-lane chunk (= 16 pixels) to f32, we call the -// existing AVX-512 Rgbf32 kernels. The scalar tail uses -// `crate::row::scalar::rgbf16_to_*_row`. +// For BE: load 32 bytes as __m256i via `load_endian_u16x16::` (which +// byte-swaps each u16 for big-endian inputs), then call `_mm512_cvtph_ps`. // // `#[target_feature(enable = "avx512f,f16c")]` — `f16c` is the half↔single -// narrowing/widening extension. AVX-512F + F16C is the minimum for -// `_mm512_cvtph_ps`. AVX-512BW is a separate CPU-feature bit and is NOT -// implied by AVX-512F; only enable `avx512bw` on functions that actually -// use byte/word AVX-512 ops. +// narrowing/widening extension. /// Widen 16 × f16 (at `ptr`, 32 bytes) to 16 × f32 (returned as `__m512`). /// +/// For `BE = true` the f16 values are stored big-endian; bytes are swapped +/// before the F16C widening conversion. +/// /// # Safety /// /// * AVX-512F + F16C must be available. /// * `ptr` must be valid for 32 bytes (16 × u16 / f16). #[inline] #[target_feature(enable = "avx512f,f16c")] -unsafe fn widen_f16x16_avx512(ptr: *const half::f16) -> __m512 { +unsafe fn widen_f16x16_avx512(ptr: *const half::f16) -> __m512 { unsafe { - let raw = _mm256_loadu_si256(ptr as *const __m256i); - _mm512_cvtph_ps(raw) + if BE { + let raw = load_endian_u16x16::(ptr as *const u8); + _mm512_cvtph_ps(raw) + } else { + let raw = _mm256_loadu_si256(ptr as *const __m256i); + _mm512_cvtph_ps(raw) + } } } /// f16 RGB → u8 RGB (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// 1. AVX-512F, AVX-512BW, and F16C must be available. @@ -292,7 +362,11 @@ unsafe fn widen_f16x16_avx512(ptr: *const half::f16) -> __m512 { /// 3. `rgb_in` / `rgb_out` must not alias. 
#[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -302,19 +376,19 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 48 <= total_lanes { let mut buf = [0.0f32; 48]; unsafe { - let f0 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 16)); - let f2 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 32)); + let f0 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 16)); + let f2 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 32)); _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); } lane += 48; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -324,12 +398,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// f16 RGB → u8 RGBA (alpha `0xFF`) (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -339,19 +419,19 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 48 <= total_lanes { let mut buf = [0.0f32; 48]; unsafe { - let f0 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 16)); - let f2 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 32)); + let f0 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 16)); + let f2 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 32)); _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), 16); + rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), 16); } lane += 48; pix += 16; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -361,13 +441,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// f16 RGB → u16 RGB (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. 
+/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -380,19 +462,19 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 48 <= total_lanes { let mut buf = [0.0f32; 48]; unsafe { - let f0 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 16)); - let f2 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 32)); + let f0 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 16)); + let f2 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 32)); _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); } lane += 48; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -402,12 +484,14 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// f16 RGB → u16 RGBA (alpha `0xFFFF`) (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -421,19 +505,19 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 48 <= total_lanes { let mut buf = [0.0f32; 48]; unsafe { - let f0 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 16)); - let f2 = widen_f16x16_avx512(rgb_in.as_ptr().add(lane + 32)); + let f0 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 16)); + let f2 = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane + 32)); _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), 16); + rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), 16); } lane += 48; pix += 16; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -443,13 +527,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// f16 RGB → f32 RGB (lossless widen) (AVX-512F + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. 
#[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -461,33 +547,38 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let mut lane = 0usize; while lane + 16 <= total_lanes { unsafe { - let f = widen_f16x16_avx512(rgb_in.as_ptr().add(lane)); + let f = widen_f16x16_avx512::(rgb_in.as_ptr().add(lane)); _mm512_storeu_ps(rgb_out.as_mut_ptr().add(lane), f); } lane += 16; } // Scalar tail for the last 0-15 lanes. + #[allow(clippy::needless_range_loop)] for i in lane..total_lanes { + let bits = rgb_in[i].to_bits(); + let h = half::f16::from_bits(if BE { bits.swap_bytes() } else { bits }); unsafe { - *rgb_out.get_unchecked_mut(i) = rgb_in.get_unchecked(i).to_f32(); + *rgb_out.get_unchecked_mut(i) = h.to_f32(); } } } /// f16 RGB → f16 RGB lossless pass-through (AVX-512F + F16C). /// +/// When `BE = true` the input values are byte-swapped to host-native order. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [half::f16]` with /// `len() >= 3 * width` f16 elements. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs index 38c27804..493f07e0 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs @@ -29,9 +29,9 @@ fn avx512_rgbf32_to_rgb_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_simd, w); + rgbf32_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgb width {w}"); } @@ -46,9 +46,9 @@ fn avx512_rgbf32_to_rgba_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_simd, w); + rgbf32_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgba width {w}"); } @@ -63,9 +63,9 @@ fn avx512_rgbf32_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgb_u16 width {w}"); } @@ -80,9 +80,9 @@ fn avx512_rgbf32_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - 
scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgba_u16 width {w}"); } @@ -97,9 +97,9 @@ fn avx512_rgbf32_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512 rgbf32_to_rgb_f32 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); @@ -130,9 +130,9 @@ fn avx512_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_simd, w); + rgbf16_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "AVX-512+F16C rgbf16_to_rgb width {w}"); } @@ -153,9 +153,9 @@ fn avx512_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_simd, w); + rgbf16_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -179,9 +179,9 @@ fn avx512_rgbf16_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -205,9 +205,9 @@ fn avx512_rgbf16_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf16_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -231,9 +231,9 @@ fn avx512_rgbf16_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -257,9 +257,9 @@ fn avx512_rgbf16_to_rgb_f16_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_simd = std::vec![half::f16::ZERO; w * 3]; - scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_f16_row::(&input, &mut out_scalar, 
w); unsafe { - rgbf16_to_rgb_f16_row(&input, &mut out_simd, w); + rgbf16_to_rgb_f16_row::(&input, &mut out_simd, w); } assert_eq!( out_scalar, out_simd, @@ -268,3 +268,297 @@ fn avx512_rgbf16_to_rgb_f16_matches_scalar() { assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); } } + +// ---- BE parity tests — AVX-512 Rgbf32 ---------------------------------------- + +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX-512 rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "AVX-512 rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 rgbf32_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgb_f32_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("avx512f") { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = 
be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512 rgbf32_to_rgb_f32 BE parity width {w}" + ); + } +} + +// ---- BE parity tests — AVX-512 + F16C Rgbf16 --------------------------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgb BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgba BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_f32_be_matches_le() { + if 
!std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgb_f32 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_f16_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("avx512f") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "AVX-512+F16C rgbf16_to_rgb_f16 BE parity width {w}" + ); + } +} diff --git a/src/row/arch/x86_sse41/packed_rgb_float.rs b/src/row/arch/x86_sse41/packed_rgb_float.rs index b4aec862..a2e61611 100644 --- a/src/row/arch/x86_sse41/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/packed_rgb_float.rs @@ -8,13 +8,31 @@ //! round-to-nearest-even cast, and `_mm_packus_*` for the saturating //! narrow. //! +//! For `` kernels, each 4-lane f32 load is replaced by +//! `load_endian_u32x4::` (a `__m128i` with byte-swapped u32 lanes +//! for BE inputs) followed by `_mm_castsi128_ps` to reinterpret as f32. +//! //! Pixel-aligned chunks (4 pixels = 12 lanes per iter for the u8/u16 //! integer-output paths) keep the loop boundary on a pixel boundary //! so the scalar tail handles only the final 0–3 pixels. use core::arch::x86_64::*; -use super::scalar; +use super::{endian::load_endian_u32x4, scalar}; + +/// Load 4 f32 lanes from `ptr` in endian-aware fashion. +/// +/// # Safety +/// +/// SSE4.1 + SSSE3 must be available; `ptr` must be valid for 16 bytes. +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn load_f32x4(ptr: *const f32) -> __m128 { + unsafe { + let u = load_endian_u32x4::(ptr as *const u8); + _mm_castsi128_ps(u) + } +} #[inline(always)] unsafe fn clamp_scale_to_u32(v: __m128, zero: __m128, one: __m128, scale: __m128) -> __m128i { @@ -34,6 +52,8 @@ unsafe fn clamp_scale_to_u32(v: __m128, zero: __m128, one: __m128, scale: __m128 /// f32 RGB → u8 RGB. Clamp `[0, 1]` × 255, saturating cast. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -41,7 +61,11 @@ unsafe fn clamp_scale_to_u32(v: __m128, zero: __m128, one: __m128, scale: __m128 /// 3. `rgb_in` / `rgb_out` must not alias. 
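// Editorial sketch, not part of this patch: the lane-wise semantics that the
// endian-aware load above (`load_endian_u32x4::<BE>` followed by
// `_mm_castsi128_ps`) is meant to have, written as safe scalar Rust.
// `BE = true` reads each 4-byte group as a big-endian u32 before the f32
// bit-cast; `BE = false` reads it in little-endian (host) order. The function
// name here is hypothetical.
fn load_f32x4_scalar<const BE: bool>(bytes: &[u8; 16]) -> [f32; 4] {
    let mut out = [0.0f32; 4];
    for (lane, chunk) in bytes.chunks_exact(4).enumerate() {
        let raw: [u8; 4] = chunk.try_into().unwrap();
        let bits = if BE {
            u32::from_be_bytes(raw)
        } else {
            u32::from_le_bytes(raw)
        };
        out[lane] = f32::from_bits(bits);
    }
    out
}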
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -53,9 +77,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width let total_lanes = width * 3; let mut lane = 0usize; while lane + 12 <= total_lanes { - let v0 = _mm_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 4)); - let v2 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_u32(v0, zero, one, scale); let i1 = clamp_scale_to_u32(v1, zero, one, scale); @@ -78,7 +102,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_row( + scalar::rgbf32_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -89,12 +113,18 @@ pub(crate) unsafe fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width /// f32 RGB → u8 RGBA (alpha forced to `0xFF`). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -111,9 +141,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid // R, G, B, R, G, B, … layout, so we widen the 12 bytes to 16 by // inserting alpha at the trailing position of each 4-byte group). while lane + 12 <= total_lanes { - let v0 = _mm_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 4)); - let v2 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_u32(v0, zero, one, scale); let i1 = clamp_scale_to_u32(v1, zero, one, scale); @@ -139,7 +169,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid pix += 4; } if pix < width { - scalar::rgbf32_to_rgba_row( + scalar::rgbf32_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -150,13 +180,19 @@ pub(crate) unsafe fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], wid /// f32 RGB → u16 RGB. Clamp `[0, 1]` × 65535, saturating cast. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. 
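// Editorial sketch, not part of this patch: the per-lane quantization the
// f32 → u8 kernels above perform (the u16 variants below do the same with
// 65535), written as plain scalar Rust. One caveat: `f32::round` is
// round-half-away-from-zero, whereas the SIMD path converts with the CPU's
// current rounding mode (round-to-nearest-even under the default MXCSR), so
// exact halves can differ from this sketch.
fn quantize_u8(v: f32) -> u8 {
    let clamped = v.max(0.0).min(1.0); // max/min also pull NaN to 0.0
    (clamped * 255.0).round() as u8
}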
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short"); @@ -168,9 +204,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], let total_lanes = width * 3; let mut lane = 0usize; while lane + 12 <= total_lanes { - let v0 = _mm_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 4)); - let v2 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_u32(v0, zero, one, scale); let i1 = clamp_scale_to_u32(v1, zero, one, scale); @@ -189,7 +225,7 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf32_to_rgb_u16_row( + scalar::rgbf32_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -200,13 +236,19 @@ pub(crate) unsafe fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], /// f32 RGB → u16 RGBA (alpha forced to `0xFFFF`). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_u16_row`] but the output is `&mut [u16]` /// with `len() >= 4 * width` u16 elements. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short"); @@ -219,9 +261,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] let mut lane = 0usize; let mut pix = 0usize; while lane + 12 <= total_lanes { - let v0 = _mm_loadu_ps(rgb_in.as_ptr().add(lane)); - let v1 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 4)); - let v2 = _mm_loadu_ps(rgb_in.as_ptr().add(lane + 8)); + let v0 = load_f32x4::(rgb_in.as_ptr().add(lane)); + let v1 = load_f32x4::(rgb_in.as_ptr().add(lane + 4)); + let v2 = load_f32x4::(rgb_in.as_ptr().add(lane + 8)); let i0 = clamp_scale_to_u32(v0, zero, one, scale); let i1 = clamp_scale_to_u32(v1, zero, one, scale); @@ -246,7 +288,7 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] pix += 4; } if pix < width { - scalar::rgbf32_to_rgba_u16_row( + scalar::rgbf32_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -257,27 +299,47 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16] /// f32 RGB → f32 RGB lossless pass-through. /// +/// When `BE = true` the input values are byte-swapped to host-native +/// before being written. +/// /// # Safety /// /// Same as [`rgbf32_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. 
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) {
+pub(crate) unsafe fn rgbf32_to_rgb_f32_row<const BE: bool>(
+    rgb_in: &[f32],
+    rgb_out: &mut [f32],
+    width: usize,
+) {
     debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
 
     unsafe {
         let total = width * 3;
         let mut i = 0usize;
-        while i + 4 <= total {
-            let v = _mm_loadu_ps(rgb_in.as_ptr().add(i));
-            _mm_storeu_ps(rgb_out.as_mut_ptr().add(i), v);
-            i += 4;
-        }
-        while i < total {
-            *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i);
-            i += 1;
+        if BE {
+            while i + 4 <= total {
+                let v = load_f32x4::<BE>(rgb_in.as_ptr().add(i));
+                _mm_storeu_ps(rgb_out.as_mut_ptr().add(i), v);
+                i += 4;
+            }
+            while i < total {
+                let bits = (*rgb_in.get_unchecked(i)).to_bits().swap_bytes();
+                *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits);
+                i += 1;
+            }
+        } else {
+            while i + 4 <= total {
+                let v = _mm_loadu_ps(rgb_in.as_ptr().add(i));
+                _mm_storeu_ps(rgb_out.as_mut_ptr().add(i), v);
+                i += 4;
+            }
+            while i < total {
+                *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i);
+                i += 1;
+            }
         }
     }
 }
@@ -288,31 +350,44 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32],
 // of a __m128i) to 4 × f32 in a __m128. We load 8 bytes (4 f16 values) via
 // `_mm_loadl_epi64` (64-bit load into the low half of __m128i).
 //
-// Downstream: after widening a 12-lane chunk (= 4 pixels) to f32, we call the
-// existing SSE4.1 Rgbf32 kernels. The scalar tail uses
-// `crate::row::scalar::rgbf16_to_*_row`.
+// For BE: load via `load_endian_u16x8::<BE>`, which byte-swaps each u16 for
+// big-endian inputs, then call `_mm_cvtph_ps` on the result.
 //
 // `#[target_feature(enable = "sse4.1,f16c")]` ensures both features are active
 // in the body even though F16C is an independent feature bit.
 
+use super::endian::load_endian_u16x8;
+
 /// Widen 4 × f16 (at `ptr`, 8 bytes) to 4 × f32 (returned as `__m128`).
 ///
+/// For `BE = true` the f16 values are stored big-endian; bytes are swapped
+/// before the F16C widening conversion.
+///
 /// # Safety
 ///
 /// * SSE4.1 + F16C must be available.
 /// * `ptr` must be valid for 8 bytes (4 × u16 / f16).
 #[inline]
 #[target_feature(enable = "sse4.1,f16c")]
-unsafe fn widen_f16x4_sse(ptr: *const half::f16) -> __m128 {
+unsafe fn widen_f16x4_sse<const BE: bool>(ptr: *const half::f16) -> __m128 {
     unsafe {
-        // _mm_loadl_epi64: 64-bit load into the low half of __m128i.
-        let raw = _mm_loadl_epi64(ptr as *const __m128i);
-        _mm_cvtph_ps(raw)
+        if BE {
+            // Load 16 bytes (8 × u16) with byte-swap; the low 4 u16 are our
+            // 4 f16 values byte-swapped to host-native. Use the low 64 bits.
+            let raw = load_endian_u16x8::<BE>(ptr as *const u8);
+            _mm_cvtph_ps(raw)
+        } else {
+            // _mm_loadl_epi64: 64-bit load into the low half of __m128i.
+            let raw = _mm_loadl_epi64(ptr as *const __m128i);
+            _mm_cvtph_ps(raw)
+        }
     }
 }
 
 /// f16 RGB → u8 RGB (SSE4.1 + F16C).
 ///
+/// When `BE = true` the input `half::f16` values are big-endian encoded.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 and F16C must be available.
@@ -320,7 +395,11 @@ unsafe fn widen_f16x4_sse(ptr: *const half::f16) -> __m128 {
 /// 3. `rgb_in` / `rgb_out` must not alias.
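// Editorial sketch, not part of this patch: the scalar meaning of a "BE" f16
// lane as used above, namely swap the 16-bit pattern to host order, then
// widen. This mirrors what the BE branch of `widen_f16x4_sse` does with
// `load_endian_u16x8::<BE>`, and what the scalar tail helper later in this
// file does one value at a time.
fn widen_be_f16(stored: half::f16) -> f32 {
    half::f16::from_bits(stored.to_bits().swap_bytes()).to_f32()
}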
#[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -330,19 +409,19 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - let f0 = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 4)); - let f2 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 8)); + let f0 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 4)); + let f2 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 8)); _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgb_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_row( + scalar::rgbf16_to_rgb_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -352,12 +431,18 @@ pub(crate) unsafe fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], /// f16 RGB → u8 RGBA (alpha `0xFF`) (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -367,19 +452,19 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - let f0 = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 4)); - let f2 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 8)); + let f0 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 4)); + let f2 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 8)); _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgba_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_row( + scalar::rgbf16_to_rgba_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -389,13 +474,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8 /// f16 RGB → u16 RGB (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [u16]` with /// `len() >= 3 * width` u16 elements. 
#[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_u16_row( +pub(crate) unsafe fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -408,19 +495,19 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - let f0 = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 4)); - let f2 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 8)); + let f0 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 4)); + let f2 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 8)); _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgb_u16_row(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } let pix_done = lane / 3; if pix_done < width { - scalar::rgbf16_to_rgb_u16_row( + scalar::rgbf16_to_rgb_u16_row::( &rgb_in[pix_done * 3..width * 3], &mut rgb_out[pix_done * 3..width * 3], width - pix_done, @@ -430,12 +517,14 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( /// f16 RGB → u16 RGBA (alpha `0xFFFF`) (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgba_u16_row( +pub(crate) unsafe fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -449,19 +538,19 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( while lane + 12 <= total_lanes { let mut buf = [0.0f32; 12]; unsafe { - let f0 = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); - let f1 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 4)); - let f2 = widen_f16x4_sse(rgb_in.as_ptr().add(lane + 8)); + let f0 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); + let f1 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 4)); + let f2 = widen_f16x4_sse::(rgb_in.as_ptr().add(lane + 8)); _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgba_u16_row(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); } lane += 12; pix += 4; } if pix < width { - scalar::rgbf16_to_rgba_u16_row( + scalar::rgbf16_to_rgba_u16_row::( &rgb_in[pix * 3..width * 3], &mut rgba_out[pix * 4..width * 4], width - pix, @@ -471,13 +560,15 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( /// f16 RGB → f32 RGB (lossless widen) (SSE4.1 + F16C). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [f32]` with /// `len() >= 3 * width` f32 elements. 
#[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f32_row( +pub(crate) unsafe fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -489,7 +580,7 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( let mut lane = 0usize; while lane + 4 <= total_lanes { unsafe { - let f = widen_f16x4_sse(rgb_in.as_ptr().add(lane)); + let f = widen_f16x4_sse::(rgb_in.as_ptr().add(lane)); _mm_storeu_ps(rgb_out.as_mut_ptr().add(lane), f); } lane += 4; @@ -497,25 +588,35 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( // Scalar tail for the last 0-3 lanes. for i in lane..total_lanes { unsafe { - *rgb_out.get_unchecked_mut(i) = rgb_in.get_unchecked(i).to_f32(); + let v = load_f16_scalar::(rgb_in, i); + *rgb_out.get_unchecked_mut(i) = v.to_f32(); } } } /// f16 RGB → f16 RGB lossless pass-through (SSE4.1 + F16C). /// +/// When `BE = true` the input values are byte-swapped to host-native order. +/// /// # Safety /// /// Same as [`rgbf16_to_rgb_row`] but `rgb_out` is `&mut [half::f16]` with /// `len() >= 3 * width` f16 elements. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn rgbf16_to_rgb_f16_row( +pub(crate) unsafe fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); +} + +/// Scalar f16 load helper for tail loops (SSE4.1 module). +#[inline(always)] +fn load_f16_scalar(rgb_in: &[half::f16], i: usize) -> half::f16 { + let bits = rgb_in[i].to_bits(); + half::f16::from_bits(if BE { bits.swap_bytes() } else { bits }) } diff --git a/src/row/arch/x86_sse41/tests/packed_rgb_float.rs b/src/row/arch/x86_sse41/tests/packed_rgb_float.rs index 2b917752..02f5b9ef 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb_float.rs @@ -52,8 +52,8 @@ fn rgbf32_to_rgb_row_simd_matches_scalar_under_truncate_mxcsr() { let mut simd_out = std::vec![0u8; width * 3]; let mut scalar_out = std::vec![0u8; width * 3]; - unsafe { rgbf32_to_rgb_row(&rgb, &mut simd_out, width) }; - scalar::rgbf32_to_rgb_row(&rgb, &mut scalar_out, width); + unsafe { rgbf32_to_rgb_row::(&rgb, &mut simd_out, width) }; + scalar::rgbf32_to_rgb_row::(&rgb, &mut scalar_out, width); // Restore MXCSR before any assertion so panic formatting doesn't misfire. 
unsafe { write_mxcsr(saved) }; @@ -91,9 +91,9 @@ fn sse41_rgbf32_to_rgb_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf32_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_row(&input, &mut out_simd, w); + rgbf32_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgb width {w}"); } @@ -108,9 +108,9 @@ fn sse41_rgbf32_to_rgba_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf32_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_row(&input, &mut out_simd, w); + rgbf32_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgba width {w}"); } @@ -125,9 +125,9 @@ fn sse41_rgbf32_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::rgbf32_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgb_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgb_u16 width {w}"); } @@ -142,9 +142,9 @@ fn sse41_rgbf32_to_rgba_u16_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::rgbf32_to_rgba_u16_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgba_u16_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgba_u16_row(&input, &mut out_simd, w); + rgbf32_to_rgba_u16_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgba_u16 width {w}"); } @@ -159,9 +159,9 @@ fn sse41_rgbf32_to_rgb_f32_matches_scalar() { let input = pseudo_random_rgbf32(w); let mut out_scalar = std::vec![0.0f32; w * 3]; let mut out_simd = std::vec![0.0f32; w * 3]; - scalar::rgbf32_to_rgb_f32_row(&input, &mut out_scalar, w); + scalar::rgbf32_to_rgb_f32_row::(&input, &mut out_scalar, w); unsafe { - rgbf32_to_rgb_f32_row(&input, &mut out_simd, w); + rgbf32_to_rgb_f32_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1 rgbf32_to_rgb_f32 width {w}"); assert_eq!(out_simd, input[..w * 3], "lossless width {w}"); @@ -191,9 +191,9 @@ fn sse41_rgbf16_to_rgb_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::rgbf16_to_rgb_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgb_row(&input, &mut out_simd, w); + rgbf16_to_rgb_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1+F16C rgbf16_to_rgb width {w}"); } @@ -213,9 +213,9 @@ fn sse41_rgbf16_to_rgba_matches_scalar() { let input = pseudo_random_rgbf16(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::rgbf16_to_rgba_row(&input, &mut out_scalar, w); + scalar::rgbf16_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - rgbf16_to_rgba_row(&input, &mut out_simd, w); + rgbf16_to_rgba_row::(&input, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "SSE4.1+F16C rgbf16_to_rgba width {w}"); } @@ -235,9 
+235,9 @@ fn sse41_rgbf16_to_rgb_u16_matches_scalar() {
         let input = pseudo_random_rgbf16(w);
         let mut out_scalar = std::vec![0u16; w * 3];
         let mut out_simd = std::vec![0u16; w * 3];
-        scalar::rgbf16_to_rgb_u16_row(&input, &mut out_scalar, w);
+        scalar::rgbf16_to_rgb_u16_row::<false>(&input, &mut out_scalar, w);
         unsafe {
-            rgbf16_to_rgb_u16_row(&input, &mut out_simd, w);
+            rgbf16_to_rgb_u16_row::<false>(&input, &mut out_simd, w);
         }
         assert_eq!(
             out_scalar, out_simd,
@@ -260,9 +260,9 @@ fn sse41_rgbf16_to_rgba_u16_matches_scalar() {
         let input = pseudo_random_rgbf16(w);
         let mut out_scalar = std::vec![0u16; w * 4];
         let mut out_simd = std::vec![0u16; w * 4];
-        scalar::rgbf16_to_rgba_u16_row(&input, &mut out_scalar, w);
+        scalar::rgbf16_to_rgba_u16_row::<false>(&input, &mut out_scalar, w);
         unsafe {
-            rgbf16_to_rgba_u16_row(&input, &mut out_simd, w);
+            rgbf16_to_rgba_u16_row::<false>(&input, &mut out_simd, w);
         }
         assert_eq!(
             out_scalar, out_simd,
@@ -285,9 +285,9 @@ fn sse41_rgbf16_to_rgb_f32_matches_scalar() {
         let input = pseudo_random_rgbf16(w);
         let mut out_scalar = std::vec![0.0f32; w * 3];
         let mut out_simd = std::vec![0.0f32; w * 3];
-        scalar::rgbf16_to_rgb_f32_row(&input, &mut out_scalar, w);
+        scalar::rgbf16_to_rgb_f32_row::<false>(&input, &mut out_scalar, w);
         unsafe {
-            rgbf16_to_rgb_f32_row(&input, &mut out_simd, w);
+            rgbf16_to_rgb_f32_row::<false>(&input, &mut out_simd, w);
         }
         assert_eq!(
             out_scalar, out_simd,
@@ -310,9 +310,9 @@ fn sse41_rgbf16_to_rgb_f16_matches_scalar() {
         let input = pseudo_random_rgbf16(w);
         let mut out_scalar = std::vec![half::f16::ZERO; w * 3];
         let mut out_simd = std::vec![half::f16::ZERO; w * 3];
-        scalar::rgbf16_to_rgb_f16_row(&input, &mut out_scalar, w);
+        scalar::rgbf16_to_rgb_f16_row::<false>(&input, &mut out_scalar, w);
         unsafe {
-            rgbf16_to_rgb_f16_row(&input, &mut out_simd, w);
+            rgbf16_to_rgb_f16_row::<false>(&input, &mut out_simd, w);
         }
         assert_eq!(
             out_scalar, out_simd,
@@ -321,3 +321,283 @@ fn sse41_rgbf16_to_rgb_f16_matches_scalar() {
         assert_eq!(out_simd, input[..w * 3], "lossless width {w}");
     }
 }
+
+// ---- BE parity tests — SSE4.1 Rgbf32 ----------------------------------------
+//
+// For each kernel: byte-swap the LE f32 inputs into a BE buffer, call the
+// kernel with `BE=true`, and assert the output matches the LE run (`BE=false`).
+// x86 feature detection guards are required, as in the tests above.
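// Editorial sketch, not part of this patch: why the `be_rgbf32` helper defined
// just below is a faithful model of big-endian input on a little-endian host.
// Emitting a value's big-endian bytes and reading them back in little-endian
// order produces exactly the bit pattern `v.to_bits().swap_bytes()`.
fn be_encode_then_reinterpret(v: f32) -> f32 {
    let be_bytes = v.to_be_bytes(); // the bytes a big-endian writer would store
    f32::from_bits(u32::from_le_bytes(be_bytes)) // reinterpreted in LE order
}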
+ +fn be_rgbf32(le: &[f32]) -> std::vec::Vec { + le.iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +fn be_rgbf16(le: &[half::f16]) -> std::vec::Vec { + le.iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgb_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf32_to_rgb_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "SSE4.1 rgbf32_to_rgb BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!(out_le, out_be, "SSE4.1 rgbf32_to_rgba BE parity width {w}"); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf32_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 rgbf32_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf32_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 rgbf32_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgb_f32_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf32(w); + let be_in = be_rgbf32(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf32_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1 rgbf32_to_rgb_f32 BE parity width {w}" + ); + } +} + +// ---- BE parity tests — SSE4.1 + F16C Rgbf16 ---------------------------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgb_be_matches_le() { + if 
!std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgb BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgba_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgba BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgb_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgb_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgba_u16_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgba_u16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgba_u16 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgb_f32_be_matches_le() { + if !std::arch::is_x86_feature_detected!("sse4.1") || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![0.0f32; w * 3]; + let mut out_be = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f32_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgb_f32 BE parity width {w}" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_rgbf16_to_rgb_f16_be_is_byteswap() { + if !std::arch::is_x86_feature_detected!("sse4.1") || 
!std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let le_in = pseudo_random_rgbf16(w); + let be_in = be_rgbf16(&le_in); + let mut out_le = std::vec![half::f16::ZERO; w * 3]; + let mut out_be = std::vec![half::f16::ZERO; w * 3]; + unsafe { + rgbf16_to_rgb_f16_row::(&le_in, &mut out_le, w); + rgbf16_to_rgb_f16_row::(&be_in, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "SSE4.1+F16C rgbf16_to_rgb_f16 BE parity width {w}" + ); + } +} diff --git a/src/row/dispatch/rgb_f16_ops.rs b/src/row/dispatch/rgb_f16_ops.rs index 8dce5df5..c96af5d5 100644 --- a/src/row/dispatch/rgb_f16_ops.rs +++ b/src/row/dispatch/rgb_f16_ops.rs @@ -44,7 +44,12 @@ use crate::row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, s /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgb_out_min = rgb_row_bytes(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf16 row too short"); @@ -55,38 +60,38 @@ pub fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize, target_arch = "aarch64" => { if neon_available() && fp16_available() { // SAFETY: `neon_available()` verified NEON is present. - unsafe { arch::neon::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { // SAFETY: AVX-512F + F16C verified. - unsafe { arch::x86_avx512::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } if avx2_available() && f16c_available() { // SAFETY: AVX2 + F16C verified. - unsafe { arch::x86_avx2::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } if sse41_available() && f16c_available() { // SAFETY: SSE4.1 + F16C verified. - unsafe { arch::x86_sse41::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::rgbf16_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgb_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_row::(rgb_in, rgb_out, width); } /// Converts packed `R, G, B` `half::f16` input to packed `R, G, B, A` `u8` @@ -94,7 +99,12 @@ pub fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize, /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgba_out_min = rgba_row_bytes(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf16 row too short"); @@ -104,34 +114,34 @@ pub fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usiz cfg_select! 
{ target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::neon::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgba_row(rgb_in, rgba_out, width); + scalar::rgbf16_to_rgba_row::(rgb_in, rgba_out, width); } /// Converts packed `R, G, B` `half::f16` input to packed `R, G, B` `u16` @@ -139,7 +149,7 @@ pub fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usiz /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgb_u16_row( +pub fn rgbf16_to_rgb_u16_row( rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize, @@ -154,34 +164,34 @@ pub fn rgbf16_to_rgb_u16_row( cfg_select! { target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgb_u16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_u16_row::(rgb_in, rgb_out, width); } /// Converts packed `R, G, B` `half::f16` input to packed `R, G, B, A` `u16` @@ -189,7 +199,7 @@ pub fn rgbf16_to_rgb_u16_row( /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgba_u16_row( +pub fn rgbf16_to_rgba_u16_row( rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize, @@ -204,34 +214,34 @@ pub fn rgbf16_to_rgba_u16_row( cfg_select! 
{ target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::neon::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgba_u16_row(rgb_in, rgba_out, width); + scalar::rgbf16_to_rgba_u16_row::(rgb_in, rgba_out, width); } /// **Lossless** half-float pass-through: copies packed `R, G, B` `half::f16` @@ -241,7 +251,7 @@ pub fn rgbf16_to_rgba_u16_row( /// `use_simd = false` forces the scalar reference path (which is also just /// `copy_from_slice` — the compiler will vectorize it regardless). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgb_f16_row( +pub fn rgbf16_to_rgb_f16_row( rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize, @@ -256,34 +266,34 @@ pub fn rgbf16_to_rgb_f16_row( cfg_select! { target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgb_f16_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f16_row::(rgb_in, rgb_out, width); } /// Lossless widening pass: converts packed `R, G, B` `half::f16` input to @@ -292,7 +302,7 @@ pub fn rgbf16_to_rgb_f16_row( /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf16_to_rgb_f32_row( +pub fn rgbf16_to_rgb_f32_row( rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize, @@ -307,32 +317,32 @@ pub fn rgbf16_to_rgb_f32_row( cfg_select! 
{ target_arch = "aarch64" => { if neon_available() && fp16_available() { - unsafe { arch::neon::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { - unsafe { arch::x86_avx512::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } if avx2_available() && f16c_available() { - unsafe { arch::x86_avx2::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } if sse41_available() && f16c_available() { - unsafe { arch::x86_sse41::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf16_to_rgb_f32_row(rgb_in, rgb_out, width); + scalar::rgbf16_to_rgb_f32_row::(rgb_in, rgb_out, width); } diff --git a/src/row/dispatch/rgb_float_ops.rs b/src/row/dispatch/rgb_float_ops.rs index 43baaf4c..5a238b91 100644 --- a/src/row/dispatch/rgb_float_ops.rs +++ b/src/row/dispatch/rgb_float_ops.rs @@ -30,7 +30,12 @@ use crate::row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, s /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgbf32_to_rgb_row( + rgb_in: &[f32], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgb_out_min = rgb_row_bytes(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short"); @@ -41,38 +46,38 @@ pub fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize, use_s target_arch = "aarch64" => { if neon_available() { // SAFETY: `neon_available()` verified NEON is present. - unsafe { arch::neon::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F verified. - unsafe { arch::x86_avx512::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::rgbf32_to_rgb_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf32_to_rgb_row(rgb_in, rgb_out, width); + scalar::rgbf32_to_rgb_row::(rgb_in, rgb_out, width); } /// Converts packed `R, G, B` `f32` input to packed `R, G, B, A` `u8` @@ -80,7 +85,12 @@ pub fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize, use_s /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgba_out_min = rgba_row_bytes(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short"); @@ -90,34 +100,34 @@ pub fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize, use cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::neon::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx512::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx2::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_sse41::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf32_to_rgba_row(rgb_in, rgba_out, width); } + unsafe { arch::wasm_simd128::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } return; } }, _ => {} } } - scalar::rgbf32_to_rgba_row(rgb_in, rgba_out, width); + scalar::rgbf32_to_rgba_row::(rgb_in, rgba_out, width); } /// Converts packed `R, G, B` `f32` input to packed `R, G, B` `u16` @@ -125,7 +135,12 @@ pub fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize, use /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgb_out_min = rgb_row_elems(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short"); @@ -135,34 +150,34 @@ pub fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf32_to_rgb_u16_row(rgb_in, rgb_out, width); + scalar::rgbf32_to_rgb_u16_row::(rgb_in, rgb_out, width); } /// Converts packed `R, G, B` `f32` input to packed `R, G, B, A` `u16` @@ -170,7 +185,12 @@ pub fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize, /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgba_out_min = rgba_row_elems(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short"); @@ -180,34 +200,34 @@ pub fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::neon::rgbf32_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx512::rgbf32_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_avx2::rgbf32_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::x86_sse41::rgbf32_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); } + unsafe { arch::wasm_simd128::rgbf32_to_rgba_u16_row::(rgb_in, rgba_out, width); } return; } }, _ => {} } } - scalar::rgbf32_to_rgba_u16_row(rgb_in, rgba_out, width); + scalar::rgbf32_to_rgba_u16_row::(rgb_in, rgba_out, width); } /// **Lossless** float pass-through: copies packed `R, G, B` `f32` @@ -216,7 +236,12 @@ pub fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize /// /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize, use_simd: bool) { +pub fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, + use_simd: bool, +) { let rgb_in_min = rgb_row_elems(width); let rgb_out_min = rgb_row_elems(width); assert!(rgb_in.len() >= rgb_in_min, "rgbf32 row too short"); @@ -226,32 +251,32 @@ pub fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize, cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::neon::rgbf32_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx512::rgbf32_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_avx2::rgbf32_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::x86_sse41::rgbf32_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); } + unsafe { arch::wasm_simd128::rgbf32_to_rgb_f32_row::(rgb_in, rgb_out, width); } return; } }, _ => {} } } - scalar::rgbf32_to_rgb_f32_row(rgb_in, rgb_out, width); + scalar::rgbf32_to_rgb_f32_row::(rgb_in, rgb_out, width); } diff --git a/src/row/scalar/packed_rgb_float.rs b/src/row/scalar/packed_rgb_float.rs index 58281e2c..e6d5e07b 100644 --- a/src/row/scalar/packed_rgb_float.rs +++ b/src/row/scalar/packed_rgb_float.rs @@ -66,42 +66,68 @@ pub(crate) fn f32_to_u16_clamped(v: f32) -> u16 { round_ties_even_nonneg(scaled) as u16 } +/// Read one f32 element from `rgb_in[i]`, byte-swapping the IEEE 754 bit +/// pattern if `BE` is `true`. This is the scalar endian-aware load for +/// big-endian Rgbf32 streams. +#[cfg_attr(not(tarpaulin), inline(always))] +fn load_f32(rgb_in: &[f32], i: usize) -> f32 { + let bits = rgb_in[i].to_bits(); + f32::from_bits(if BE { bits.swap_bytes() } else { bits }) +} + +/// Read one `half::f16` element from `rgb_in[i]`, byte-swapping the +/// bit pattern if `BE` is `true`. Scalar endian-aware load for Rgbf16. +#[cfg_attr(not(tarpaulin), inline(always))] +fn load_f16(rgb_in: &[half::f16], i: usize) -> half::f16 { + let bits = rgb_in[i].to_bits(); + half::f16::from_bits(if BE { bits.swap_bytes() } else { bits }) +} + /// Converts packed `R, G, B` `f32` input to packed `R, G, B` `u8` /// output. Each `f32` is clamped to `[0, 1]` and scaled by 255. /// +/// When `BE = true` the input `f32` values are encoded big-endian +/// (bytes swapped relative to the host's native little-endian layout). +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgb_out.len() < 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { +pub(crate) fn rgbf32_to_rgb_row(rgb_in: &[f32], rgb_out: &mut [u8], width: usize) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let i = x * 3; - rgb_out[i] = f32_to_u8_clamped(rgb_in[i]); - rgb_out[i + 1] = f32_to_u8_clamped(rgb_in[i + 1]); - rgb_out[i + 2] = f32_to_u8_clamped(rgb_in[i + 2]); + rgb_out[i] = f32_to_u8_clamped(load_f32::(rgb_in, i)); + rgb_out[i + 1] = f32_to_u8_clamped(load_f32::(rgb_in, i + 1)); + rgb_out[i + 2] = f32_to_u8_clamped(load_f32::(rgb_in, i + 2)); } } /// Converts packed `R, G, B` `f32` input to packed `R, G, B, A` `u8` /// output with `A = 0xFF` (the float source has no alpha). /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgba_out.len() < 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) fn rgbf32_to_rgba_row( + rgb_in: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let s = x * 3; let d = x * 4; - rgba_out[d] = f32_to_u8_clamped(rgb_in[s]); - rgba_out[d + 1] = f32_to_u8_clamped(rgb_in[s + 1]); - rgba_out[d + 2] = f32_to_u8_clamped(rgb_in[s + 2]); + rgba_out[d] = f32_to_u8_clamped(load_f32::(rgb_in, s)); + rgba_out[d + 1] = f32_to_u8_clamped(load_f32::(rgb_in, s + 1)); + rgba_out[d + 2] = f32_to_u8_clamped(load_f32::(rgb_in, s + 2)); rgba_out[d + 3] = 0xFF; } } @@ -109,39 +135,51 @@ pub(crate) fn rgbf32_to_rgba_row(rgb_in: &[f32], rgba_out: &mut [u8], width: usi /// Converts packed `R, G, B` `f32` input to packed `R, G, B` `u16` /// output. Each `f32` is clamped to `[0, 1]` and scaled by 65535. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgb_out.len() < 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf32_to_rgb_u16_row(rgb_in: &[f32], rgb_out: &mut [u16], width: usize) { +pub(crate) fn rgbf32_to_rgb_u16_row( + rgb_in: &[f32], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let i = x * 3; - rgb_out[i] = f32_to_u16_clamped(rgb_in[i]); - rgb_out[i + 1] = f32_to_u16_clamped(rgb_in[i + 1]); - rgb_out[i + 2] = f32_to_u16_clamped(rgb_in[i + 2]); + rgb_out[i] = f32_to_u16_clamped(load_f32::(rgb_in, i)); + rgb_out[i + 1] = f32_to_u16_clamped(load_f32::(rgb_in, i + 1)); + rgb_out[i + 2] = f32_to_u16_clamped(load_f32::(rgb_in, i + 2)); } } /// Converts packed `R, G, B` `f32` input to packed `R, G, B, A` `u16` /// output with `A = 0xFFFF`. /// +/// When `BE = true` the input `f32` values are big-endian encoded. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgba_out.len() < 4 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) fn rgbf32_to_rgba_u16_row( + rgb_in: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let s = x * 3; let d = x * 4; - rgba_out[d] = f32_to_u16_clamped(rgb_in[s]); - rgba_out[d + 1] = f32_to_u16_clamped(rgb_in[s + 1]); - rgba_out[d + 2] = f32_to_u16_clamped(rgb_in[s + 2]); + rgba_out[d] = f32_to_u16_clamped(load_f32::(rgb_in, s)); + rgba_out[d + 1] = f32_to_u16_clamped(load_f32::(rgb_in, s + 1)); + rgba_out[d + 2] = f32_to_u16_clamped(load_f32::(rgb_in, s + 2)); rgba_out[d + 3] = 0xFFFF; } } @@ -150,15 +188,32 @@ pub(crate) fn rgbf32_to_rgba_u16_row(rgb_in: &[f32], rgba_out: &mut [u16], width /// row into the output buffer without conversion. Source HDR values /// (> 1.0) and negatives are preserved bit-exact. /// +/// When `BE = true` the input is byte-swapped (big-endian → host-native) +/// so the output is always host-native `f32`. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgb_out.len() < 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: usize) { +pub(crate) fn rgbf32_to_rgb_f32_row( + rgb_in: &[f32], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); - rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]); + if BE { + for (dst, src) in rgb_out[..width * 3] + .iter_mut() + .zip(rgb_in[..width * 3].iter()) + { + let bits = src.to_bits().swap_bytes(); + *dst = f32::from_bits(bits); + } + } else { + rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]); + } } // ---- Tier 9 — Rgbf16 scalar row kernels -------------------------------- @@ -173,19 +228,25 @@ pub(crate) fn rgbf32_to_rgb_f32_row(rgb_in: &[f32], rgb_out: &mut [f32], width: /// `R, G, B` `u8` output. Each `half::f16` is widened to `f32`, then /// clamped to `[0, 1]` and scaled by 255. /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgb_out.len() < 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn rgbf16_to_rgb_row( + rgb_in: &[half::f16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let i = x * 3; - rgb_out[i] = f32_to_u8_clamped(rgb_in[i].to_f32()); - rgb_out[i + 1] = f32_to_u8_clamped(rgb_in[i + 1].to_f32()); - rgb_out[i + 2] = f32_to_u8_clamped(rgb_in[i + 2].to_f32()); + rgb_out[i] = f32_to_u8_clamped(load_f16::(rgb_in, i).to_f32()); + rgb_out[i + 1] = f32_to_u8_clamped(load_f16::(rgb_in, i + 1).to_f32()); + rgb_out[i + 2] = f32_to_u8_clamped(load_f16::(rgb_in, i + 2).to_f32()); } } @@ -193,20 +254,26 @@ pub(crate) fn rgbf16_to_rgb_row(rgb_in: &[half::f16], rgb_out: &mut [u8], width: /// `R, G, B, A` `u8` output with `A = 0xFF` (the half-float source has no /// alpha channel). /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. 
+/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgba_out.len() < 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn rgbf16_to_rgba_row( + rgb_in: &[half::f16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let s = x * 3; let d = x * 4; - rgba_out[d] = f32_to_u8_clamped(rgb_in[s].to_f32()); - rgba_out[d + 1] = f32_to_u8_clamped(rgb_in[s + 1].to_f32()); - rgba_out[d + 2] = f32_to_u8_clamped(rgb_in[s + 2].to_f32()); + rgba_out[d] = f32_to_u8_clamped(load_f16::(rgb_in, s).to_f32()); + rgba_out[d + 1] = f32_to_u8_clamped(load_f16::(rgb_in, s + 1).to_f32()); + rgba_out[d + 2] = f32_to_u8_clamped(load_f16::(rgb_in, s + 2).to_f32()); rgba_out[d + 3] = 0xFF; } } @@ -215,39 +282,51 @@ pub(crate) fn rgbf16_to_rgba_row(rgb_in: &[half::f16], rgba_out: &mut [u8], widt /// `R, G, B` `u16` output. Each `half::f16` is widened to `f32`, then /// clamped to `[0, 1]` and scaled by 65535. /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgb_out.len() < 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf16_to_rgb_u16_row(rgb_in: &[half::f16], rgb_out: &mut [u16], width: usize) { +pub(crate) fn rgbf16_to_rgb_u16_row( + rgb_in: &[half::f16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let i = x * 3; - rgb_out[i] = f32_to_u16_clamped(rgb_in[i].to_f32()); - rgb_out[i + 1] = f32_to_u16_clamped(rgb_in[i + 1].to_f32()); - rgb_out[i + 2] = f32_to_u16_clamped(rgb_in[i + 2].to_f32()); + rgb_out[i] = f32_to_u16_clamped(load_f16::(rgb_in, i).to_f32()); + rgb_out[i + 1] = f32_to_u16_clamped(load_f16::(rgb_in, i + 1).to_f32()); + rgb_out[i + 2] = f32_to_u16_clamped(load_f16::(rgb_in, i + 2).to_f32()); } } /// Converts packed `R, G, B` 16-bit half-precision float input to packed /// `R, G, B, A` `u16` output with `A = 0xFFFF`. /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgba_out.len() < 4 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf16_to_rgba_u16_row(rgb_in: &[half::f16], rgba_out: &mut [u16], width: usize) { +pub(crate) fn rgbf16_to_rgba_u16_row( + rgb_in: &[half::f16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let s = x * 3; let d = x * 4; - rgba_out[d] = f32_to_u16_clamped(rgb_in[s].to_f32()); - rgba_out[d + 1] = f32_to_u16_clamped(rgb_in[s + 1].to_f32()); - rgba_out[d + 2] = f32_to_u16_clamped(rgb_in[s + 2].to_f32()); + rgba_out[d] = f32_to_u16_clamped(load_f16::(rgb_in, s).to_f32()); + rgba_out[d + 1] = f32_to_u16_clamped(load_f16::(rgb_in, s + 1).to_f32()); + rgba_out[d + 2] = f32_to_u16_clamped(load_f16::(rgb_in, s + 2).to_f32()); rgba_out[d + 3] = 0xFFFF; } } @@ -256,16 +335,30 @@ pub(crate) fn rgbf16_to_rgba_u16_row(rgb_in: &[half::f16], rgba_out: &mut [u16], /// (> 1.0) and negatives bit-exactly through the widen step. Output /// is `f32`; no clamping is applied. /// +/// When `BE = true` the input `half::f16` values are big-endian encoded. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgb_out.len() < 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf16_to_rgb_f32_row(rgb_in: &[half::f16], rgb_out: &mut [f32], width: usize) { +pub(crate) fn rgbf16_to_rgb_f32_row( + rgb_in: &[half::f16], + rgb_out: &mut [f32], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); - for i in 0..width * 3 { - rgb_out[i] = rgb_in[i].to_f32(); + for (dst, src) in rgb_out[..width * 3] + .iter_mut() + .zip(rgb_in[..width * 3].iter()) + { + let bits = if BE { + src.to_bits().swap_bytes() + } else { + src.to_bits() + }; + *dst = half::f16::from_bits(bits).to_f32(); } } @@ -273,13 +366,29 @@ pub(crate) fn rgbf16_to_rgb_f32_row(rgb_in: &[half::f16], rgb_out: &mut [f32], w /// into the output buffer without any conversion. Source HDR values and /// negatives are preserved bit-exact. /// +/// When `BE = true` the input values are byte-swapped to host-native order +/// on output. +/// /// # Panics /// /// Panics (any build profile) if `rgb_in.len() < 3 * width` or /// `rgb_out.len() < 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgbf16_to_rgb_f16_row(rgb_in: &[half::f16], rgb_out: &mut [half::f16], width: usize) { +pub(crate) fn rgbf16_to_rgb_f16_row( + rgb_in: &[half::f16], + rgb_out: &mut [half::f16], + width: usize, +) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]); + if BE { + for (dst, src) in rgb_out[..width * 3] + .iter_mut() + .zip(rgb_in[..width * 3].iter()) + { + *dst = half::f16::from_bits(src.to_bits().swap_bytes()); + } + } else { + rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]); + } } diff --git a/src/row/scalar/tests.rs b/src/row/scalar/tests.rs index 61a2857b..710f34bc 100644 --- a/src/row/scalar/tests.rs +++ b/src/row/scalar/tests.rs @@ -664,8 +664,8 @@ fn rgbf16_scalar_rgb_matches_widen_then_rgbf32() { let (rgb_in, widened, width) = rgbf16_test_inputs(); let mut out_f16 = std::vec![0u8; width * 3]; let mut out_via_f32 = std::vec![0u8; width * 3]; - rgbf16_to_rgb_row(&rgb_in, &mut out_f16, width); - rgbf32_to_rgb_row(&widened, &mut out_via_f32, width); + rgbf16_to_rgb_row::(&rgb_in, &mut out_f16, width); + rgbf32_to_rgb_row::(&widened, &mut out_via_f32, width); assert_eq!(out_f16, out_via_f32, "rgbf16_to_rgb scalar parity"); } @@ -678,8 +678,8 @@ fn rgbf16_scalar_rgba_matches_widen_then_rgbf32() { let (rgb_in, widened, width) = rgbf16_test_inputs(); let mut out_f16 = std::vec![0u8; width * 4]; let mut out_via_f32 = std::vec![0u8; width * 4]; - rgbf16_to_rgba_row(&rgb_in, &mut out_f16, width); - rgbf32_to_rgba_row(&widened, &mut out_via_f32, width); + rgbf16_to_rgba_row::(&rgb_in, &mut out_f16, width); + rgbf32_to_rgba_row::(&widened, &mut out_via_f32, width); assert_eq!(out_f16, out_via_f32, "rgbf16_to_rgba scalar parity"); } @@ -692,8 +692,8 @@ fn rgbf16_scalar_rgb_u16_matches_widen_then_rgbf32() { let (rgb_in, widened, width) = rgbf16_test_inputs(); let mut out_f16 = std::vec![0u16; width * 3]; let mut out_via_f32 = std::vec![0u16; width * 3]; - rgbf16_to_rgb_u16_row(&rgb_in, &mut out_f16, width); - rgbf32_to_rgb_u16_row(&widened, &mut out_via_f32, width); + rgbf16_to_rgb_u16_row::(&rgb_in, &mut out_f16, width); + rgbf32_to_rgb_u16_row::(&widened, &mut out_via_f32, width); assert_eq!(out_f16, out_via_f32, "rgbf16_to_rgb_u16 scalar parity"); } @@ -706,8 +706,8 @@ fn rgbf16_scalar_rgba_u16_matches_widen_then_rgbf32() { let (rgb_in, widened, width) = rgbf16_test_inputs(); let mut out_f16 = std::vec![0u16; width * 4]; let mut out_via_f32 = std::vec![0u16; width * 4]; - rgbf16_to_rgba_u16_row(&rgb_in, &mut out_f16, width); - rgbf32_to_rgba_u16_row(&widened, &mut out_via_f32, width); + rgbf16_to_rgba_u16_row::(&rgb_in, &mut out_f16, width); + rgbf32_to_rgba_u16_row::(&widened, &mut out_via_f32, width); assert_eq!(out_f16, out_via_f32, "rgbf16_to_rgba_u16 scalar parity"); } @@ -719,7 +719,7 @@ fn rgbf16_scalar_rgba_u16_matches_widen_then_rgbf32() { fn rgbf16_scalar_rgb_f32_matches_element_wise_widen() { let (rgb_in, widened, width) = rgbf16_test_inputs(); let mut out = std::vec![0.0f32; width * 3]; - rgbf16_to_rgb_f32_row(&rgb_in, &mut out, width); + rgbf16_to_rgb_f32_row::(&rgb_in, &mut out, width); // Each output must equal the bit-exact widening of the input f16. 
assert_eq!( out, widened, @@ -735,7 +735,7 @@ fn rgbf16_scalar_rgb_f32_matches_element_wise_widen() { fn rgbf16_scalar_rgb_f16_is_copy() { let (rgb_in, _widened, width) = rgbf16_test_inputs(); let mut out = std::vec![half::f16::ZERO; width * 3]; - rgbf16_to_rgb_f16_row(&rgb_in, &mut out, width); + rgbf16_to_rgb_f16_row::(&rgb_in, &mut out, width); assert_eq!( out, rgb_in, "rgbf16_to_rgb_f16 must be a byte-identical copy" diff --git a/src/sinker/mixed/packed_rgb_f16.rs b/src/sinker/mixed/packed_rgb_f16.rs index 38e07d15..e349f130 100644 --- a/src/sinker/mixed/packed_rgb_f16.rs +++ b/src/sinker/mixed/packed_rgb_f16.rs @@ -234,27 +234,27 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if let Some(buf) = rgb_f16.as_deref_mut() { let f16_start = one_plane_start * 3; let f16_end = one_plane_end * 3; - rgbf16_to_rgb_f16_row(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); + rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); } // Lossless f32 widen — also independent of integer conversion paths. if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf16_to_rgb_f32_row(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct half-float → u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf16_to_rgb_u16_row(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct half-float → u16 conversion (no staging). if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_u16_row(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float → u8 when no RGB / luma / @@ -269,7 +269,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -288,7 +288,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { w, h, )?; - rgbf16_to_rgb_row(rgb_in, rgb_row, w, use_simd); + rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -328,7 +328,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { // over `rgb_row` via `expand_rgb_to_rgba_row`. 
if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_float.rs b/src/sinker/mixed/packed_rgb_float.rs index cc63c9b2..e1c17a39 100644 --- a/src/sinker/mixed/packed_rgb_float.rs +++ b/src/sinker/mixed/packed_rgb_float.rs @@ -209,20 +209,20 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf32_to_rgb_f32_row(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct float→u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf32_to_rgb_u16_row(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct float→u16 conversion (no staging). if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_u16_row(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float→u8 conversion when @@ -237,7 +237,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -257,7 +257,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { w, h, )?; - rgbf32_to_rgb_row(rgb_in, rgb_row, w, use_simd); + rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -299,7 +299,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { // less memory pass for combined `with_rgb + with_rgba` callers. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) From 8a086df94646c0a05fbbe3910fdbdbe310082b85 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 00:35:14 +1200 Subject: [PATCH 02/10] fix(be-tier9): widen_f16x4_sse BE path tail-overread in rgbf16 kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `widen_f16x4_sse` was calling `load_endian_u16x8` (which reads 16 bytes via `_mm_loadu_si128`) but the kernel only guarantees 8 readable bytes per call (4 × f16). The third widen call per loop iteration reads [lane*2+16, lane*2+32) while the buffer ends at lane*2+24 when `lane+12 == total_lanes` — an 8-byte tail-overread that ASan caught on PR #83's CI sanitizer job. Add `load_endian_u16x4` (8-byte load via `_mm_loadl_epi64` + low-half byte-swap; upper half zeroed). The fix is correct because `_mm_cvtph_ps` only reads the low 64 bits (4 × f16) of its `__m128i` operand, so the zeroed upper half is harmless. 
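
For concreteness, the out-of-bounds window at the smallest affected row (width 4 → 12 f16 lanes → 24 readable bytes, so the loop's only iteration has `lane + 12 == total_lanes`) can be reproduced with plain integer math. The sketch below is illustrative only — it mirrors the arithmetic in this message, not the kernel code:

    // Bounds arithmetic only; constants follow the commit message above.
    fn main() {
        let width = 4usize;              // smallest width where the 12-lane loop runs
        let total_lanes = width * 3;     // 12 f16 lanes
        let buf_bytes = total_lanes * 2; // 24 readable bytes

        let lane = 0usize;               // only iteration: lane + 12 <= total_lanes
        let third_call_start = (lane + 8) * 2; // byte offset of the third widen call

        let wide_end = third_call_start + 16;  // load_endian_u16x8 (_mm_loadu_si128)
        let narrow_end = third_call_start + 8; // load_endian_u16x4 (_mm_loadl_epi64)

        assert!(wide_end > buf_bytes);    // 32 > 24: the 8-byte tail-overread
        assert!(narrow_end <= buf_bytes); // 24 <= 24: the fix stays in bounds
    }
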
AVX2 (`widen_f16x8_avx`) and AVX-512 (`widen_f16x16_avx512`) need a full 16/32-byte u16 region per call so they keep using `load_endian_u16x8` / `load_endian_u16x16`. Verified locally with `RUSTFLAGS=-Zsanitizer=address cargo +nightly test --target x86_64-apple-darwin` (2862 tests pass). --- src/row/arch/x86_sse41/endian.rs | 56 ++++++++++++++++++++++ src/row/arch/x86_sse41/packed_rgb_float.rs | 23 +++++---- 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/src/row/arch/x86_sse41/endian.rs b/src/row/arch/x86_sse41/endian.rs index f7dc1d38..992ca30e 100644 --- a/src/row/arch/x86_sse41/endian.rs +++ b/src/row/arch/x86_sse41/endian.rs @@ -119,3 +119,59 @@ pub(crate) unsafe fn load_endian_u32x4(ptr: *const u8) -> __m128 unsafe { load_le_u32x4(ptr) } } } + +// ---- u16x4 loaders (8-byte half-vector) ------------------------------------ +// +// These load only 8 bytes (4 × u16) into the low half of an `__m128i` and +// zero the upper half. Used by Rgbf16 widen kernels (`_mm_cvtph_ps` reads +// the low 64 bits = 4 × f16) when the caller can only guarantee 8 readable +// bytes — using the 16-byte `load_endian_u16x8` would tail-overread. + +/// Loads 4 × u16 from `ptr` (LE-encoded on disk/wire) into the low half of +/// an `__m128i` in host-native order; the upper half is zero. +/// +/// # Safety +/// +/// `ptr` must point to at least 8 readable bytes. Caller must have SSE4.1 +/// (and SSSE3) enabled. +#[inline(always)] +pub(crate) unsafe fn load_le_u16x4(ptr: *const u8) -> __m128i { + let v = unsafe { _mm_loadl_epi64(ptr.cast()) }; + // On LE hosts the on-disk LE bytes already match host-native; on BE hosts + // we'd need to byte-swap, but the shuffle mask references only source + // bytes [0..8) which are the loaded bytes (upper half is zero from + // `_mm_loadl_epi64`), so the byte-swap is correct. + #[cfg(target_endian = "big")] + let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16) }; + v +} + +/// Loads 4 × u16 from `ptr` (BE-encoded on disk/wire) into the low half of +/// an `__m128i` in host-native order; the upper half is zero. +/// +/// # Safety +/// +/// `ptr` must point to at least 8 readable bytes. Caller must have SSE4.1 +/// (and SSSE3) enabled. +#[inline(always)] +pub(crate) unsafe fn load_be_u16x4(ptr: *const u8) -> __m128i { + let v = unsafe { _mm_loadl_epi64(ptr.cast()) }; + #[cfg(target_endian = "little")] + let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16) }; + v +} + +/// Generic dispatcher: routes to `load_le_u16x4` or `load_be_u16x4` based on +/// the compile-time `BE` const parameter. Reads exactly 8 bytes. +/// +/// # Safety +/// +/// Same as `load_le_u16x4` / `load_be_u16x4`. +#[inline(always)] +pub(crate) unsafe fn load_endian_u16x4(ptr: *const u8) -> __m128i { + if BE { + unsafe { load_be_u16x4(ptr) } + } else { + unsafe { load_le_u16x4(ptr) } + } +} diff --git a/src/row/arch/x86_sse41/packed_rgb_float.rs b/src/row/arch/x86_sse41/packed_rgb_float.rs index a2e61611..4c6a479b 100644 --- a/src/row/arch/x86_sse41/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/packed_rgb_float.rs @@ -356,12 +356,15 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( // `#[target_feature(enable = "sse4.1,f16c")]` ensures both features are active // in the body even though F16C is an independent feature bit. -use super::endian::load_endian_u16x8; +use super::endian::load_endian_u16x4; /// Widen 4 × f16 (at `ptr`, 8 bytes) to 4 × f32 (returned as `__m128`). 
/// /// For `BE = true` the f16 values are stored big-endian; bytes are swapped -/// before the F16C widening conversion. +/// before the F16C widening conversion. The loader reads exactly 8 bytes +/// regardless of `BE` so the caller's `ptr` only needs 8 readable bytes +/// (a 16-byte load via `load_endian_u16x8` would tail-overread the 4 × f16 +/// region the kernel actually owns). /// /// # Safety /// @@ -371,16 +374,12 @@ use super::endian::load_endian_u16x8; #[target_feature(enable = "sse4.1,f16c")] unsafe fn widen_f16x4_sse(ptr: *const half::f16) -> __m128 { unsafe { - if BE { - // Load 16 bytes (8 × u16) with byte-swap; the low 4 u16 are our - // 4 f16 values byte-swapped to host-native. Use the low 64 bits. - let raw = load_endian_u16x8::(ptr as *const u8); - _mm_cvtph_ps(raw) - } else { - // _mm_loadl_epi64: 64-bit load into the low half of __m128i. - let raw = _mm_loadl_epi64(ptr as *const __m128i); - _mm_cvtph_ps(raw) - } + // 8-byte load (low 64 bits of __m128i, upper half zero). For `BE = true` + // the loader byte-swaps each u16 in place; for `BE = false` it's a plain + // load. `_mm_cvtph_ps` reads only the low 4 × f16 (low 64 bits), so the + // upper half being zero is harmless. + let raw = load_endian_u16x4::(ptr as *const u8); + _mm_cvtph_ps(raw) } } From 115a85e69780a704bad3cb3a16fda4c071efb6e0 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 01:10:06 +1200 Subject: [PATCH 03/10] fix(be-tier9): make scalar BE conversion target-endian aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier 9 packed-float-RGB scalar BE conversion used unconditional `x.swap_bytes()`, which always swaps regardless of host endianness. On big-endian hosts (powerpc64, s390x) the source bytes are already in host-native order, so an extra swap corrupts every BE row. The SIMD `load_endian_*::` helpers shipped with feat/be-infra are already target-endian aware (no-op on a matching host), so the scalar and per-arch tail paths produced wrong output relative to the SIMD body on a hypothetical s390x runner. Replaced every `bits.swap_bytes()` / `to_bits().swap_bytes()` site in the source files with `u32::from_be` / `u32::from_le` (for f32) or `u16::from_be` / `u16::from_le` (for f16): - `if BE { x.swap_bytes() } else { x }` → `if BE { u32::from_be(x) } else { u32::from_le(x) }` - `f32::from_bits(raw.to_bits().swap_bytes())` (BE-only) → `f32::from_bits(if BE { u32::from_be(raw.to_bits()) } else { u32::from_le(raw.to_bits()) })` `from_be` / `from_le` is a no-op when the encoded byte order matches the host, a byte-swap when they differ — exactly mirroring the SIMD helper semantics so LE and BE hosts now produce bit-identical output. Special note for the f32 / f16 pass-through kernels: previously the `else` branch fell back to `copy_from_slice`, which is a byte-level copy. On a BE host that copies LE-encoded bytes into f32 / f16 lanes verbatim, leaving the destination in non-host-native order — the docstring claims "output is always host-native". The fix routes both branches through `from_bits(from_be/from_le(to_bits()))`, which is a no-op on LE host (correct, byte order matches) and a swap on BE host (correct, since the data is LE-encoded). Source-file call sites fixed: - u32 (f32 → bits, target-endian decoded): 7 — scalar `load_f32`, scalar `rgbf32_to_rgb_f32_row`, neon / x86_sse41 / x86_avx2 / x86_avx512 / wasm_simd128 `rgbf32_to_rgb_f32_row` BE tails. 
- u16 (f16 → bits, target-endian decoded): 12 — scalar `load_f16`, scalar `rgbf16_to_rgb_f32_row`, scalar `rgbf16_to_rgb_f16_row`, neon `widen_f16_tail`, x86_sse41 `load_f16_scalar`, x86_avx2 / x86_avx512 `rgbf16_to_rgb_f32_row` f16 widen tails, wasm_simd128 five f16 widen lanes (`rgbf16_to_rgb_row`, `rgbf16_to_rgba_row`, `rgbf16_to_rgb_u16_row`, `rgbf16_to_rgba_u16_row`, `rgbf16_to_rgb_f32_row`). - f32 special case: covered by the u32 sites (scalar `rgbf32_to_rgb_f32_row` and the per-arch BE tails go through `f32::from_bits(u32::from_be/le(to_bits()))`). - f16 special case: covered by the u16 sites (scalar `rgbf16_to_rgb_f16_row` and `rgbf16_to_rgb_f32_row` go through `half::f16::from_bits(u16::from_be/le(to_bits()))`). Test helpers (`be_rgbf32` / `be_rgbf16` in `tests/packed_rgb_float.rs` across all arch backends) intentionally still use `swap_bytes()` because they synthesize a BE-encoded buffer from an LE host input — the unconditional swap is correct there and per-instructions remains unchanged. The neon `widen_f16_tail` helper additionally became `` (was previously calling `to_f32()` directly on host-native bits, producing garbage when fed BE-encoded f16 — the test `neon_rgbf16_to_rgb_f32_be_matches_le` failed at widths where the 4-lane SIMD body left a non-zero tail). Verified: 2170 lib tests pass on `aarch64-apple-darwin`; `cargo build --target x86_64-apple-darwin --tests` clean; `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests` clean (warnings pre-existing); `cargo build --no-default-features` clean; `cargo fmt --check` clean; `cargo clippy --all-targets --all-features -- -D warnings` clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/packed_rgb_float.rs | 22 +++-- src/row/arch/wasm_simd128/packed_rgb_float.rs | 29 ++++--- src/row/arch/x86_avx2/packed_rgb_float.rs | 10 ++- src/row/arch/x86_avx512/packed_rgb_float.rs | 10 ++- src/row/arch/x86_sse41/packed_rgb_float.rs | 10 ++- src/row/scalar/packed_rgb_float.rs | 84 ++++++++++++------- 6 files changed, 109 insertions(+), 56 deletions(-) diff --git a/src/row/arch/neon/packed_rgb_float.rs b/src/row/arch/neon/packed_rgb_float.rs index d775e016..524532d9 100644 --- a/src/row/arch/neon/packed_rgb_float.rs +++ b/src/row/arch/neon/packed_rgb_float.rs @@ -419,8 +419,8 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( i += 4; } while i < total { - let bits = (*rgb_in.get_unchecked(i)).to_bits().swap_bytes(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); i += 1; } } else { @@ -487,11 +487,23 @@ unsafe fn widen_f16x4(ptr: *const half::f16, out: *mut f32) { /// `n` must be in `[0, 4]` — `n == 0` is a no-op (the caller passes /// `total_lanes - lane`, which is `0` when `total_lanes` is a multiple of 4 /// and the SIMD loop consumed the whole row). +/// +/// For `BE = true` the source f16 bits are decoded from big-endian to +/// host-native before widening; for `BE = false` they are read as host- +/// native (identical to a plain LE load on every shipping target). This +/// matches the SIMD body's `widen_f16x4::` semantics so partial-pixel +/// tail bytes round-trip identically to the full-vector path. 
#[inline(always)] -unsafe fn widen_f16_tail(src: &[half::f16], dst: &mut [f32], n: usize) { +unsafe fn widen_f16_tail(src: &[half::f16], dst: &mut [f32], n: usize) { for i in 0..n { unsafe { - *dst.get_unchecked_mut(i) = src.get_unchecked(i).to_f32(); + let raw = src.get_unchecked(i).to_bits(); + let host_bits = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + *dst.get_unchecked_mut(i) = half::f16::from_bits(host_bits).to_f32(); } } } @@ -687,7 +699,7 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( } // Scalar tail for the last 0-3 lanes (partial pixel at most). unsafe { - widen_f16_tail( + widen_f16_tail::( rgb_in.get_unchecked(lane..), rgb_out.get_unchecked_mut(lane..), total_lanes - lane, diff --git a/src/row/arch/wasm_simd128/packed_rgb_float.rs b/src/row/arch/wasm_simd128/packed_rgb_float.rs index 06e7b2f7..32e3a37a 100644 --- a/src/row/arch/wasm_simd128/packed_rgb_float.rs +++ b/src/row/arch/wasm_simd128/packed_rgb_float.rs @@ -304,8 +304,8 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( } while i < total { unsafe { - let bits = rgb_in.get_unchecked(i).to_bits().swap_bytes(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + let bits = rgb_in.get_unchecked(i).to_bits(); + *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); } i += 1; } @@ -349,10 +349,11 @@ pub(crate) unsafe fn rgbf16_to_rgb_row( let mut buf = [0.0f32; 12]; for k in 0..12 { let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let raw = f.to_bits(); let bits = if BE { - f.to_bits().swap_bytes() + u16::from_be(raw) } else { - f.to_bits() + u16::from_le(raw) }; buf[k] = half::f16::from_bits(bits).to_f32(); } @@ -394,10 +395,11 @@ pub(crate) unsafe fn rgbf16_to_rgba_row( let mut buf = [0.0f32; 12]; for k in 0..12 { let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let raw = f.to_bits(); let bits = if BE { - f.to_bits().swap_bytes() + u16::from_be(raw) } else { - f.to_bits() + u16::from_le(raw) }; buf[k] = half::f16::from_bits(bits).to_f32(); } @@ -438,10 +440,11 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( let mut buf = [0.0f32; 12]; for k in 0..12 { let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let raw = f.to_bits(); let bits = if BE { - f.to_bits().swap_bytes() + u16::from_be(raw) } else { - f.to_bits() + u16::from_le(raw) }; buf[k] = half::f16::from_bits(bits).to_f32(); } @@ -482,10 +485,11 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( let mut buf = [0.0f32; 12]; for k in 0..12 { let f = unsafe { rgb_in.get_unchecked(lane + k) }; + let raw = f.to_bits(); let bits = if BE { - f.to_bits().swap_bytes() + u16::from_be(raw) } else { - f.to_bits() + u16::from_le(raw) }; buf[k] = half::f16::from_bits(bits).to_f32(); } @@ -525,10 +529,11 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( for i in 0..total_lanes { unsafe { let f = rgb_in.get_unchecked(i); + let raw = f.to_bits(); let bits = if BE { - f.to_bits().swap_bytes() + u16::from_be(raw) } else { - f.to_bits() + u16::from_le(raw) }; *rgb_out.get_unchecked_mut(i) = half::f16::from_bits(bits).to_f32(); } diff --git a/src/row/arch/x86_avx2/packed_rgb_float.rs b/src/row/arch/x86_avx2/packed_rgb_float.rs index 80712abf..86aacd31 100644 --- a/src/row/arch/x86_avx2/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/packed_rgb_float.rs @@ -328,8 +328,8 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( i += 8; } while i < total { - let bits = (*rgb_in.get_unchecked(i)).to_bits().swap_bytes(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + *rgb_out.get_unchecked_mut(i) = 
f32::from_bits(u32::from_be(bits)); i += 1; } } else { @@ -583,7 +583,11 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( #[allow(clippy::needless_range_loop)] for i in lane..total_lanes { let bits = rgb_in[i].to_bits(); - let h = half::f16::from_bits(if BE { bits.swap_bytes() } else { bits }); + let h = half::f16::from_bits(if BE { + u16::from_be(bits) + } else { + u16::from_le(bits) + }); unsafe { *rgb_out.get_unchecked_mut(i) = h.to_f32(); } diff --git a/src/row/arch/x86_avx512/packed_rgb_float.rs b/src/row/arch/x86_avx512/packed_rgb_float.rs index 4db3e62d..ff505f6d 100644 --- a/src/row/arch/x86_avx512/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/packed_rgb_float.rs @@ -299,8 +299,8 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( i += 16; } while i < total { - let bits = (*rgb_in.get_unchecked(i)).to_bits().swap_bytes(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); i += 1; } } else { @@ -556,7 +556,11 @@ pub(crate) unsafe fn rgbf16_to_rgb_f32_row( #[allow(clippy::needless_range_loop)] for i in lane..total_lanes { let bits = rgb_in[i].to_bits(); - let h = half::f16::from_bits(if BE { bits.swap_bytes() } else { bits }); + let h = half::f16::from_bits(if BE { + u16::from_be(bits) + } else { + u16::from_le(bits) + }); unsafe { *rgb_out.get_unchecked_mut(i) = h.to_f32(); } diff --git a/src/row/arch/x86_sse41/packed_rgb_float.rs b/src/row/arch/x86_sse41/packed_rgb_float.rs index 4c6a479b..f8272169 100644 --- a/src/row/arch/x86_sse41/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/packed_rgb_float.rs @@ -326,8 +326,8 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( i += 4; } while i < total { - let bits = (*rgb_in.get_unchecked(i)).to_bits().swap_bytes(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(bits); + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); i += 1; } } else { @@ -617,5 +617,9 @@ pub(crate) unsafe fn rgbf16_to_rgb_f16_row( #[inline(always)] fn load_f16_scalar(rgb_in: &[half::f16], i: usize) -> half::f16 { let bits = rgb_in[i].to_bits(); - half::f16::from_bits(if BE { bits.swap_bytes() } else { bits }) + half::f16::from_bits(if BE { + u16::from_be(bits) + } else { + u16::from_le(bits) + }) } diff --git a/src/row/scalar/packed_rgb_float.rs b/src/row/scalar/packed_rgb_float.rs index e6d5e07b..6ff1e2fb 100644 --- a/src/row/scalar/packed_rgb_float.rs +++ b/src/row/scalar/packed_rgb_float.rs @@ -66,21 +66,35 @@ pub(crate) fn f32_to_u16_clamped(v: f32) -> u16 { round_ties_even_nonneg(scaled) as u16 } -/// Read one f32 element from `rgb_in[i]`, byte-swapping the IEEE 754 bit -/// pattern if `BE` is `true`. This is the scalar endian-aware load for -/// big-endian Rgbf32 streams. +/// Read one f32 element from `rgb_in[i]`, decoding the IEEE 754 bit +/// pattern from `BE` byte order to host-native byte order. Scalar +/// endian-aware load for Rgbf32 streams. +/// +/// `from_be` / `from_le` are target-endian aware: a no-op when the +/// stored byte order matches the host, a byte-swap when they differ. +/// Mirrors the SIMD `load_endian_*::` helpers' semantics so LE and +/// BE hosts produce identical decoded values. 
#[cfg_attr(not(tarpaulin), inline(always))] fn load_f32(rgb_in: &[f32], i: usize) -> f32 { let bits = rgb_in[i].to_bits(); - f32::from_bits(if BE { bits.swap_bytes() } else { bits }) + f32::from_bits(if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }) } -/// Read one `half::f16` element from `rgb_in[i]`, byte-swapping the -/// bit pattern if `BE` is `true`. Scalar endian-aware load for Rgbf16. +/// Read one `half::f16` element from `rgb_in[i]`, decoding the bit +/// pattern from `BE` byte order to host-native. Scalar endian-aware +/// load for Rgbf16 streams. #[cfg_attr(not(tarpaulin), inline(always))] fn load_f16(rgb_in: &[half::f16], i: usize) -> half::f16 { let bits = rgb_in[i].to_bits(); - half::f16::from_bits(if BE { bits.swap_bytes() } else { bits }) + half::f16::from_bits(if BE { + u16::from_be(bits) + } else { + u16::from_le(bits) + }) } /// Converts packed `R, G, B` `f32` input to packed `R, G, B` `u8` @@ -203,16 +217,20 @@ pub(crate) fn rgbf32_to_rgb_f32_row( ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); - if BE { - for (dst, src) in rgb_out[..width * 3] - .iter_mut() - .zip(rgb_in[..width * 3].iter()) - { - let bits = src.to_bits().swap_bytes(); - *dst = f32::from_bits(bits); - } - } else { - rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]); + // Decode each source f32 from `BE` byte order to host-native. + // `u32::from_be` / `u32::from_le` is target-endian aware: a no-op + // when encoded byte order matches the host, a byte-swap when they + // differ. Output is always host-native f32 on every target. + for (dst, src) in rgb_out[..width * 3] + .iter_mut() + .zip(rgb_in[..width * 3].iter()) + { + let bits = src.to_bits(); + *dst = f32::from_bits(if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }); } } @@ -353,12 +371,13 @@ pub(crate) fn rgbf16_to_rgb_f32_row( .iter_mut() .zip(rgb_in[..width * 3].iter()) { - let bits = if BE { - src.to_bits().swap_bytes() + let bits = src.to_bits(); + let host_bits = if BE { + u16::from_be(bits) } else { - src.to_bits() + u16::from_le(bits) }; - *dst = half::f16::from_bits(bits).to_f32(); + *dst = half::f16::from_bits(host_bits).to_f32(); } } @@ -381,14 +400,19 @@ pub(crate) fn rgbf16_to_rgb_f16_row( ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - if BE { - for (dst, src) in rgb_out[..width * 3] - .iter_mut() - .zip(rgb_in[..width * 3].iter()) - { - *dst = half::f16::from_bits(src.to_bits().swap_bytes()); - } - } else { - rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]); + // Decode each source f16 from `BE` byte order to host-native, mirror + // of `rgbf32_to_rgb_f32_row`. `u16::from_be` / `u16::from_le` is + // target-endian aware: no-op when encoded byte order matches the + // host, swap when they differ. Output is always host-native f16. 
+ for (dst, src) in rgb_out[..width * 3] + .iter_mut() + .zip(rgb_in[..width * 3].iter()) + { + let bits = src.to_bits(); + *dst = half::f16::from_bits(if BE { + u16::from_be(bits) + } else { + u16::from_le(bits) + }); } } From c3a6478483b83dfbc2f6e19dcd5be3ef4654c0a4 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 12:01:49 +1200 Subject: [PATCH 04/10] fix(be-tier9): NEON f16 widen over-read + host-native f32 routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review of PR #83 found two NEON-specific kernel correctness bugs in the Tier 9 Rgbf16 BE path. Both are fixed here together. Finding 1 — NEON f16 widening over-reads past row. `widen_f16x4::` was calling `load_endian_u16x8` (16-byte load via `vld1q_u16`) but the kernel only guarantees 8 readable bytes per call (4 × f16). The third widen call per 12-lane chunk reads bytes [(lane+8)*2 .. (lane+16)*2) while the row ends at total_lanes*2 when lane+12 == total_lanes — an 8-byte tail-overread that ASan/Miri catch on guarded pages. Mirrors the SSE4.1 fix in 5967967. The fix adds `load_endian_u16x4` to `src/row/arch/neon/endian.rs` (8-byte load via `vld1_u16` + `vrev16_u8` byte-swap when needed) and uses it in both the `BE=true` and `BE=false` arms of `widen_f16x4`. The downstream `vcvt_f32_f16` already takes `uint16x4_t` so no further plumbing is needed. Finding 2 — f16→f32 widen-then-convert paths treat host-native f32 as LE-encoded. After widening f16 → f32 via `vcvt_f32_f16` (NEON), `_mm_cvtph_ps` / `_mm256_cvtph_ps` / `_mm512_cvtph_ps` (x86), or scalar `to_f32()` (wasm), the stack buffer carries host-native f32 values. The kernels then called `rgbf32_to_*::` to convert the buffer. With BE-aware kernel semantics (BE=false means LE-encoded input, NOT host-native), the f32 loaders inside `rgbf32_to_*` would byte-swap the already-decoded host- native buffer on a BE host — corrupting it. The fix introduces a per-backend const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); and routes the post-widen conversion as `rgbf32_to_*::`. This is a no-op byte-swap on both LE and BE hosts: • LE host: HOST_NATIVE_BE = false → `from_le` (no-op on LE) → correct. • BE host: HOST_NATIVE_BE = true → `from_be` (no-op on BE) → correct. Applied to all 5 backends (NEON / SSE4.1 / AVX2 / AVX-512 / wasm-simd128) across 20 call sites total. Each `rgbf32_to_rgb_row::` / `rgbf32_to_rgba_row::` / `rgbf32_to_rgb_u16_row::` / `rgbf32_to_rgba_u16_row::` after a SIMD widen now uses `HOST_NATIVE_BE`. The lossless f16→f32 paths (`rgbf16_to_rgb_f32_row`) write directly to the f32 output without a downstream convert and are already correct. Audit notes (other backends): • SSE4.1: only Finding 2 applied here; Finding 1 was fixed in 5967967. • AVX2: `widen_f16x8_avx` correctly loads 16 bytes (8 × f16 = 16) so no Finding 1; Finding 2 fix applied. • AVX-512: `widen_f16x16_avx512` correctly loads 32 bytes (16 × f16 = 32) so no Finding 1; Finding 2 fix applied. • wasm-simd128: scalar widen, no SIMD u16 load to over-read; Finding 2 fix applied (path was explicitly documented "call LE downstream", making the wasm32-LE-only assumption visible — fix is endian-agnostic so it survives any future BE wasm target). Tests: Width set for all NEON Rgbf16 BE-parity tests extended to include `5` (was `[1, 4, 7, 16, 33, 1920, 1921]`) to cover the "lane+12 == total_lanes via 1-pixel scalar tail" boundary. 
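
The parity property those widths exercise can also be shown standalone, independent of the kernels. The sketch below uses `f32` and `u32::from_be` / `u32::from_le` only, so it needs neither the crate nor the `half` dependency; the helper names are illustrative — the real parity tests drive the NEON kernels through the `be_rgbf16` / `be_rgbf32` helpers rather than this plain-scalar stand-in:

    // Synthesize a "BE-encoded" row with an unconditional byte swap (as the
    // test helpers do on an LE host) and check BE/LE decode parity.
    fn encode_be(src: &[f32]) -> Vec<f32> {
        src.iter()
            .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
            .collect()
    }

    // Target-endian-aware decode, mirroring the scalar load_f32 semantics.
    fn decode<const BE: bool>(src: &[f32]) -> Vec<f32> {
        src.iter()
            .map(|v| {
                let bits = v.to_bits();
                f32::from_bits(if BE { u32::from_be(bits) } else { u32::from_le(bits) })
            })
            .collect()
    }

    fn main() {
        let row = [0.0f32, 0.25, 0.5, 1.0, 2.5, -1.0]; // width 2, packed R,G,B
        let be_row = encode_be(&row);
        // The BE decode of the BE-encoded row must match the LE decode of the
        // original row — the same parity every BE kernel test asserts.
        assert_eq!(decode::<true>(&be_row), decode::<false>(&row));
    }
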
Added a dedicated `neon_rgbf16_be_tail_overread_widths_4_5_16_33` regression test that calls each kernel at exactly the over-read-prone widths through exact-sized allocations. Verified locally: • cargo test --target aarch64-apple-darwin --lib → 2200 pass • cargo test --target x86_64-apple-darwin --lib → 2915 pass • cargo test --no-default-features --lib → 35 pass • RUSTFLAGS=-Zsanitizer=address cargo +nightly test --target aarch64-apple-darwin --lib row::arch::neon::tests::packed_rgb_float -Zbuild-std → 23 pass • RUSTFLAGS=-Zsanitizer=address cargo +nightly test --target x86_64-apple-darwin --lib rgbf16 -Zbuild-std → 60 pass • cargo build --target x86_64-apple-darwin --tests → 0 warnings (new) • RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests → 0 warnings (new) • cargo build --no-default-features → ok • cargo fmt --check → clean • cargo clippy --all-targets --all-features (both targets) -D warnings → clean Out of scope: • Finding 3 (sinker hardcodes ::) — Phase 4 deferred per PR scope. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/endian.rs | 50 ++++++++++++++ src/row/arch/neon/packed_rgb_float.rs | 68 ++++++++++++------- src/row/arch/neon/tests/packed_rgb_float.rs | 64 +++++++++++++++-- src/row/arch/wasm_simd128/packed_rgb_float.rs | 35 ++++++++-- src/row/arch/x86_avx2/packed_rgb_float.rs | 30 ++++++-- src/row/arch/x86_avx512/packed_rgb_float.rs | 30 ++++++-- src/row/arch/x86_sse41/packed_rgb_float.rs | 29 ++++++-- 7 files changed, 257 insertions(+), 49 deletions(-) diff --git a/src/row/arch/neon/endian.rs b/src/row/arch/neon/endian.rs index 6f800f7b..55ac4ad5 100644 --- a/src/row/arch/neon/endian.rs +++ b/src/row/arch/neon/endian.rs @@ -107,3 +107,53 @@ pub(crate) unsafe fn load_endian_u32x4(ptr: *const u8) -> uint32 unsafe { load_le_u32x4(ptr) } } } + +// ---- u16x4 loaders (8-byte half-vector) ------------------------------------ +// +// These load only 8 bytes (4 × u16) into a `uint16x4_t` in host-native order. +// Used by Rgbf16 widen kernels (`vcvt_f32_f16` reads 4 × f16 from a +// `uint16x4_t`) when the caller can only guarantee 8 readable bytes — using +// the 16-byte `load_endian_u16x8` would tail-overread. + +/// Loads 4 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order. +/// +/// # Safety +/// +/// `ptr` must point to at least 8 readable bytes, aligned to at least 1 byte. +/// Caller must have NEON enabled. +#[inline(always)] +pub(crate) unsafe fn load_le_u16x4(ptr: *const u8) -> uint16x4_t { + let v = unsafe { vld1_u16(ptr.cast()) }; + #[cfg(target_endian = "big")] + let v = unsafe { vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(v))) }; + v +} + +/// Loads 4 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order. +/// +/// # Safety +/// +/// `ptr` must point to at least 8 readable bytes, aligned to at least 1 byte. +/// Caller must have NEON enabled. +#[inline(always)] +pub(crate) unsafe fn load_be_u16x4(ptr: *const u8) -> uint16x4_t { + let v = unsafe { vld1_u16(ptr.cast()) }; + #[cfg(target_endian = "little")] + let v = unsafe { vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(v))) }; + v +} + +/// Generic dispatcher: routes to `load_le_u16x4` or `load_be_u16x4` based on +/// the compile-time `BE` const parameter. Reads exactly 8 bytes. +/// +/// # Safety +/// +/// Same as `load_le_u16x4` / `load_be_u16x4`. 
+#[inline(always)] +pub(crate) unsafe fn load_endian_u16x4(ptr: *const u8) -> uint16x4_t { + if BE { + unsafe { load_be_u16x4(ptr) } + } else { + unsafe { load_le_u16x4(ptr) } + } +} diff --git a/src/row/arch/neon/packed_rgb_float.rs b/src/row/arch/neon/packed_rgb_float.rs index 524532d9..f96f02b1 100644 --- a/src/row/arch/neon/packed_rgb_float.rs +++ b/src/row/arch/neon/packed_rgb_float.rs @@ -445,17 +445,30 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( // // `vcvt_f32_f16` widens 4 × f16 to 4 × f32 in a single FCVT instruction. // -// For BE: we load the u16 bits via `load_endian_u16x8::` (loads 8 u16 -// with byte-swap for BE), extract the low 4 lanes into a `uint16x4_t`, then -// reinterpret as `float16x4_t` before widening with `vcvt_f32_f16`. - -use super::endian::load_endian_u16x8; +// For BE: we load the u16 bits via `load_endian_u16x4::` (loads 4 u16 with +// byte-swap for BE) into a `uint16x4_t`, then reinterpret as `float16x4_t` +// before widening with `vcvt_f32_f16`. `load_endian_u16x4` reads exactly +// 8 bytes regardless of `BE`, matching the 4 × f16 region the kernel owns +// (a 16-byte load via `load_endian_u16x8` would tail-overread). + +use super::endian::load_endian_u16x4; + +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `vcvt_f32_f16`. On a LE target, host- +/// native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); /// Widen 4 half-precision floats (`f16x4`, i.e. 8 bytes starting at `ptr`) /// to 4 single-precision floats into `out[0..4]`. /// /// For `BE = true` the f16 values are stored big-endian (bytes swapped); -/// the byte-swap is applied before the widening conversion. +/// the byte-swap is applied before the widening conversion. The loader reads +/// exactly 8 bytes regardless of `BE` so the caller's `ptr` only needs 8 +/// readable bytes (a 16-byte load via `load_endian_u16x8` would tail-overread +/// the 4 × f16 region the kernel actually owns). /// /// # Safety /// @@ -465,21 +478,13 @@ use super::endian::load_endian_u16x8; #[inline(always)] unsafe fn widen_f16x4(ptr: *const half::f16, out: *mut f32) { unsafe { - if BE { - // Load 8 bytes as u16x8, byte-swap each u16, take low 4. - let u8_ptr = ptr as *const u8; - let u16x8 = load_endian_u16x8::(u8_ptr); - // Extract low 4 lanes (the ones we need for 4 f16 values). - let u16x4 = vget_low_u16(u16x8); - let f16x4 = vreinterpret_f16_u16(u16x4); - let f32x4 = vcvt_f32_f16(f16x4); - vst1q_f32(out, f32x4); - } else { - let u16s = vld1_u16(ptr as *const u16); - let f16s = vreinterpret_f16_u16(u16s); - let f32s = vcvt_f32_f16(f16s); - vst1q_f32(out, f32s); - } + // 8-byte load (4 × u16), byte-swapped per-lane when BE = true so the + // resulting `uint16x4_t` carries host-native f16 bit patterns ready for + // `vcvt_f32_f16`. 
+ let u16x4 = load_endian_u16x4::(ptr as *const u8); + let f16x4 = vreinterpret_f16_u16(u16x4); + let f32x4 = vcvt_f32_f16(f16x4); + vst1q_f32(out, f32x4); } } @@ -536,7 +541,9 @@ pub(crate) unsafe fn rgbf16_to_rgb_row( widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is host-native f32 after vcvt_f32_f16; route through the f32 + // kernel with HOST_NATIVE_BE so its loaders perform a no-op swap. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } @@ -576,7 +583,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_row( widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE (see widen_f16x4). + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; @@ -616,7 +628,8 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE (see widen_f16x4). + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } @@ -657,7 +670,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( widen_f16x4::(rgb_in.as_ptr().add(lane), buf.as_mut_ptr()); widen_f16x4::(rgb_in.as_ptr().add(lane + 4), buf.as_mut_ptr().add(4)); widen_f16x4::(rgb_in.as_ptr().add(lane + 8), buf.as_mut_ptr().add(8)); - rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE (see widen_f16x4). + rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; diff --git a/src/row/arch/neon/tests/packed_rgb_float.rs b/src/row/arch/neon/tests/packed_rgb_float.rs index 6f0dadbc..259f0757 100644 --- a/src/row/arch/neon/tests/packed_rgb_float.rs +++ b/src/row/arch/neon/tests/packed_rgb_float.rs @@ -357,7 +357,13 @@ fn neon_rgbf16_to_rgb_be_matches_le() { if !std::arch::is_aarch64_feature_detected!("fp16") { return; } - for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + // Widths 4, 5, 16, 33 specifically exercise the f16 widen-tail boundary: + // the BE branch used to over-read past the row via load_endian_u16x8 (16 + // bytes) when only 8 bytes of f16 data are guaranteed at the third + // widen_f16x4 call per 12-lane chunk. With load_endian_u16x4 (8 bytes) + // the read stays within bounds; ASan / Miri pages would have caught the + // over-read as a guarded-page UB. 
+ for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { let le_in = pseudo_random_rgbf16(w); let be_in = be_rgbf16(&le_in); let mut out_le = std::vec![0u8; w * 3]; @@ -379,7 +385,7 @@ fn neon_rgbf16_to_rgba_be_matches_le() { if !std::arch::is_aarch64_feature_detected!("fp16") { return; } - for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { let le_in = pseudo_random_rgbf16(w); let be_in = be_rgbf16(&le_in); let mut out_le = std::vec![0u8; w * 4]; @@ -401,7 +407,7 @@ fn neon_rgbf16_to_rgb_u16_be_matches_le() { if !std::arch::is_aarch64_feature_detected!("fp16") { return; } - for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { let le_in = pseudo_random_rgbf16(w); let be_in = be_rgbf16(&le_in); let mut out_le = std::vec![0u16; w * 3]; @@ -423,7 +429,7 @@ fn neon_rgbf16_to_rgba_u16_be_matches_le() { if !std::arch::is_aarch64_feature_detected!("fp16") { return; } - for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { let le_in = pseudo_random_rgbf16(w); let be_in = be_rgbf16(&le_in); let mut out_le = std::vec![0u16; w * 4]; @@ -448,7 +454,7 @@ fn neon_rgbf16_to_rgb_f32_be_matches_le() { if !std::arch::is_aarch64_feature_detected!("fp16") { return; } - for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { let le_in = pseudo_random_rgbf16(w); let be_in = be_rgbf16(&le_in); let mut out_le = std::vec![0.0f32; w * 3]; @@ -467,7 +473,7 @@ fn neon_rgbf16_to_rgb_f32_be_matches_le() { ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] fn neon_rgbf16_to_rgb_f16_be_is_byteswap() { - for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + for w in [1usize, 4, 5, 7, 16, 33, 1920, 1921] { let le_in = pseudo_random_rgbf16(w); let be_in = be_rgbf16(&le_in); let mut out_le = std::vec![half::f16::ZERO; w * 3]; @@ -480,3 +486,49 @@ fn neon_rgbf16_to_rgb_f16_be_is_byteswap() { assert_eq!(out_le, out_be, "NEON rgbf16_to_rgb_f16 BE parity width {w}"); } } + +// ---- Tail-overread regression tests (NEON BE Rgbf16) ------------------------ +// +// These specifically place the BE f16 input at the end of an exact-sized +// allocation so any read past `3 * width * 2` bytes lands on the next +// allocation (or, with ASan / Miri / guarded pages, an unmapped page). The +// previous bug — `widen_f16x4::` calling `load_endian_u16x8` (16-byte +// load) when only 8 bytes were guaranteed — would over-read 8 bytes past the +// row at widths where lane+12 reaches total_lanes (multiples of 4 pixels). +// +// Widths 4, 16, 32 are exact multiples of 4 pixels (the SIMD chunk size) so +// the tail-overread happens on the LAST chunk; widths 5 and 33 leave 1 pixel +// of scalar tail, so the over-read happens on the SECOND-TO-LAST chunk. + +fn assert_rgbf16_tail_overread_safe(width: usize, kernel: F) +where + F: Fn(&[half::f16], &mut [u8], usize), +{ + // Allocate exact-sized input/output so any over-read lands outside the + // Vec's allocation. The exact-fit Vec mostly catches over-reads that go + // outside the page boundary — pair with ASan in CI for the strict case. 
+ let width_lanes = width * 3; + let mut le_in = std::vec::Vec::::with_capacity(width_lanes); + for v in pseudo_random_rgbf16(width) { + le_in.push(v); + } + let raw: std::vec::Vec = if BE { be_rgbf16(&le_in) } else { le_in.clone() }; + let mut out = std::vec![0u8; width * 3]; + kernel(&raw, &mut out, width); +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn neon_rgbf16_be_tail_overread_widths_4_5_16_33() { + if !std::arch::is_aarch64_feature_detected!("fp16") { + return; + } + for &w in &[4usize, 5, 16, 33] { + assert_rgbf16_tail_overread_safe::(w, |inp, out, w| unsafe { + rgbf16_to_rgb_row::(inp, out, w); + }); + } +} diff --git a/src/row/arch/wasm_simd128/packed_rgb_float.rs b/src/row/arch/wasm_simd128/packed_rgb_float.rs index 32e3a37a..414deb11 100644 --- a/src/row/arch/wasm_simd128/packed_rgb_float.rs +++ b/src/row/arch/wasm_simd128/packed_rgb_float.rs @@ -12,6 +12,16 @@ use core::arch::wasm32::*; use super::{endian::load_endian_u32x4, scalar}; +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `half::f16::to_f32()`. On a LE target, +/// host-native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// (`wasm32-*` is LE today, but keeping the routing endian-agnostic future- +/// proofs against any BE wasm target.) +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- helpers ------------------------------------------------------------------ #[inline(always)] @@ -321,7 +331,8 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( // // For BE inputs the byte-swap is applied before widening so the widened f32 // buffer is already host-native; downstream f32 kernels are called with -// `BE=false` to avoid a second swap. +// `HOST_NATIVE_BE` so their loaders perform a no-op byte-swap (correct on +// both LE and BE hosts). // // CHUNK_PIXELS = 4 (= 12 f32 lanes), matching the simd128 Rgbf32 loop stride. @@ -358,8 +369,9 @@ pub(crate) unsafe fn rgbf16_to_rgb_row( buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - // Buffer is now host-native f32; call LE downstream. - rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is now host-native f32; route via HOST_NATIVE_BE so the f32 + // loaders perform a no-op byte-swap on both LE and BE hosts. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } @@ -404,7 +416,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_row( buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; @@ -449,7 +466,8 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. 
+ rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } @@ -494,7 +512,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( buf[k] = half::f16::from_bits(bits).to_f32(); } unsafe { - rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; diff --git a/src/row/arch/x86_avx2/packed_rgb_float.rs b/src/row/arch/x86_avx2/packed_rgb_float.rs index 86aacd31..75cfdb04 100644 --- a/src/row/arch/x86_avx2/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/packed_rgb_float.rs @@ -20,6 +20,14 @@ use super::endian::load_endian_u32x8; use super::scalar; use crate::row::arch::x86_sse41::endian::load_endian_u16x8; +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `_mm256_cvtph_ps`. On a LE target, +/// host-native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Load 8 f32 lanes from `ptr` in endian-aware fashion. /// /// # Safety @@ -409,7 +417,10 @@ pub(crate) unsafe fn rgbf16_to_rgb_row( _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); + // Buffer is host-native f32 after _mm256_cvtph_ps; route via + // HOST_NATIVE_BE so the f32 loaders perform a no-op byte-swap on + // both LE and BE hosts. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); } lane += 24; } @@ -452,7 +463,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_row( _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), 8); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), + 8, + ); } lane += 24; pix += 8; @@ -495,7 +511,8 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 24), 8); } lane += 24; } @@ -538,7 +555,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( _mm256_storeu_ps(buf.as_mut_ptr(), f0); _mm256_storeu_ps(buf.as_mut_ptr().add(8), f1); _mm256_storeu_ps(buf.as_mut_ptr().add(16), f2); - rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), 8); + // Buffer is host-native f32; route via HOST_NATIVE_BE. 
+ rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 32), + 8, + ); } lane += 24; pix += 8; diff --git a/src/row/arch/x86_avx512/packed_rgb_float.rs b/src/row/arch/x86_avx512/packed_rgb_float.rs index ff505f6d..783dcdab 100644 --- a/src/row/arch/x86_avx512/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/packed_rgb_float.rs @@ -17,6 +17,14 @@ use super::endian::load_endian_u32x16; use super::scalar; use crate::row::arch::x86_avx2::endian::load_endian_u16x16; +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `_mm512_cvtph_ps`. On a LE target, +/// host-native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Load 16 f32 lanes from `ptr` in endian-aware fashion. /// /// # Safety @@ -382,7 +390,10 @@ pub(crate) unsafe fn rgbf16_to_rgb_row( _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); + // Buffer is host-native f32 after _mm512_cvtph_ps; route via + // HOST_NATIVE_BE so the f32 loaders perform a no-op byte-swap on + // both LE and BE hosts. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); } lane += 48; } @@ -425,7 +436,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_row( _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), 16); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), + 16, + ); } lane += 48; pix += 16; @@ -468,7 +484,8 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 48), 16); } lane += 48; } @@ -511,7 +528,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( _mm512_storeu_ps(buf.as_mut_ptr(), f0); _mm512_storeu_ps(buf.as_mut_ptr().add(16), f1); _mm512_storeu_ps(buf.as_mut_ptr().add(32), f2); - rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), 16); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 64), + 16, + ); } lane += 48; pix += 16; diff --git a/src/row/arch/x86_sse41/packed_rgb_float.rs b/src/row/arch/x86_sse41/packed_rgb_float.rs index f8272169..85513f71 100644 --- a/src/row/arch/x86_sse41/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/packed_rgb_float.rs @@ -358,6 +358,14 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( use super::endian::load_endian_u16x4; +/// `BE` value that makes the f32 row loaders treat their input as host-native +/// (a no-op byte-swap). Used by f16→f32 widen-then-convert paths whose stack +/// buffer is already host-native after `_mm_cvtph_ps`. 
On a LE target, host- +/// native == LE so `BE = false`; on a BE target, host-native == BE so +/// `BE = true`. Without this routing the downstream `rgbf32_to_*::` +/// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Widen 4 × f16 (at `ptr`, 8 bytes) to 4 × f32 (returned as `__m128`). /// /// For `BE = true` the f16 values are stored big-endian; bytes are swapped @@ -414,7 +422,9 @@ pub(crate) unsafe fn rgbf16_to_rgb_row( _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE so the f32 loaders + // perform a no-op swap on both LE and BE hosts. + rgbf32_to_rgb_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } @@ -457,7 +467,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_row( _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgba_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; @@ -500,7 +515,8 @@ pub(crate) unsafe fn rgbf16_to_rgb_u16_row( _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgb_u16_row::(&buf, rgb_out.get_unchecked_mut(lane..lane + 12), 4); } lane += 12; } @@ -543,7 +559,12 @@ pub(crate) unsafe fn rgbf16_to_rgba_u16_row( _mm_storeu_ps(buf.as_mut_ptr(), f0); _mm_storeu_ps(buf.as_mut_ptr().add(4), f1); _mm_storeu_ps(buf.as_mut_ptr().add(8), f2); - rgbf32_to_rgba_u16_row::(&buf, rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), 4); + // Buffer is host-native f32; route via HOST_NATIVE_BE. + rgbf32_to_rgba_u16_row::( + &buf, + rgba_out.get_unchecked_mut(pix * 4..pix * 4 + 16), + 4, + ); } lane += 12; pix += 4; From dcf40a314c35308b76980252fa02528b95cef939 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 12:59:29 +1200 Subject: [PATCH 05/10] fix(be-tier9): route Rgbf32/Rgbf16 sinker through HOST_NATIVE_BE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 2nd-pass review of PR #83 found a third high-severity bug at the **sinker** layer (one level above the SIMD-internal HOST_NATIVE_BE fix that landed in c3a6478): the Rgbf32 and Rgbf16 `MixedSinker` `PixelSink::process` impls hardcoded `::` on every row dispatcher call. With the new BE-aware kernel semantics, `BE = false` means "decode LE-encoded input" (`u32::from_le` / `u16::from_le` / SIMD `load_endian_*` LE arms). But `Rgbf32Frame` exposes a host-native `&[f32]` row and `Rgbf16Frame` exposes a host-native `&[half::f16]` row — the public API contract is "caller passes host-native floats". On a BE host, `::` would byte-swap the already-decoded host-native values inside the loaders, corrupting the lossless `with_rgb_f32` / `with_rgb_f16` pass-throughs **and** every downstream u8/u16/luma/HSV output that flows through the same row. 
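Sketch of the call-site change (the dispatcher name and argument list match the
diff below; the surrounding sinker context is elided):

    const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

    // before: "input is LE-encoded" — wrong for a host-native frame row on a BE host
    rgbf32_to_rgb_row::<false>(rgb_in, rgb_row, w, use_simd);
    // after: "input is host-native" — no-op byte-swap on either host
    rgbf32_to_rgb_row::<HOST_NATIVE_BE>(rgb_in, rgb_row, w, use_simd);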
Direct backend `::` BE-parity tests (the body of c3a6478 and prior commits) don't catch this because they bypass the sinker entirely — they hand the kernel BE-encoded bytes and assert against the LE- encoded counterpart, exercising only the kernel decode boundary, not the sinker-to-kernel routing. The fix is the **sinker-layer** complement of the SIMD-backend-internal HOST_NATIVE_BE introduced in c3a6478: const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); Defined at module scope in both `src/sinker/mixed/packed_rgb_float.rs` and `src/sinker/mixed/packed_rgb_f16.rs`. Every hardcoded `::` in each sinker's `process` is replaced with `::`. Same truth table as the SIMD fix, different layer: • LE host: HOST_NATIVE_BE = false → `from_le` (no-op on LE) → correct. • BE host: HOST_NATIVE_BE = true → `from_be` (no-op on BE) → correct. Distinction from Phase 4 (out of scope here): This is **host-native correctness** — the contract that `Rgbf32Frame` / `Rgbf16Frame` exposes already-decoded floats. It is **NOT** the Phase 4 BE-source-frame work, which would let the Frame type itself carry an encoding tag (LE-encoded bytes vs BE-encoded bytes vs host- native) and thread that through the walker / row / sinker stack. The Yuva / Gbrap / packed-RGB-u16 sinkers that are still on `::` fall under Phase 4 because their Frames hold `&[u16]` plane buffers whose interpretation depends on whether the caller passed an LE- encoded or BE-encoded byte stream — that's a Frame-API design question, not a host-native routing bug. This commit touches only the float sinkers whose Frame types unambiguously specify host- native element semantics. Call sites changed: • src/sinker/mixed/packed_rgb_float.rs — 6 sites (rgbf32_to_rgb_row, rgbf32_to_rgba_row ×2, rgbf32_to_rgb_u16_row, rgbf32_to_rgba_u16_row, rgbf32_to_rgb_f32_row). • src/sinker/mixed/packed_rgb_f16.rs — 7 sites (rgbf16_to_rgb_row, rgbf16_to_rgba_row ×2, rgbf16_to_rgb_u16_row, rgbf16_to_rgba_u16_row, rgbf16_to_rgb_f16_row, rgbf16_to_rgb_f32_row). Out of scope (Phase 4 territory, not touched here): • Frame types • Walker types • Other sinkers (Yuva, Gbrap, mono1bit u16, etc.) — their Frames carry plane bytes whose encoding semantics need explicit Phase 4 plumbing. • Row dispatchers themselves (already BE-aware via const generic). Tests: Added 4 sinker-level regression tests (one kernel-equivalence test + one public-API contract test for each of Rgbf32 / Rgbf16): • `rgbf32_kernel_host_native_be_matches_false_on_le_host` and `rgbf16_kernel_host_native_be_matches_false_on_le_host` — call each `rgbf32_to_*` / `rgbf16_to_*` dispatcher with both `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`), asserting outputs are byte-equal on the active host. On a LE host both are no-op so this documents the routing equivalence; on a BE host the same equivalence holds for the **fixed** sinker but would fail for the broken one. Width 33 covers SIMD main loop + scalar tail across every backend. • `rgbf32_sinker_host_native_contract_lossless_passthrough` and `rgbf16_sinker_host_native_contract_lossless_passthrough` — feed `Rgbf32Frame` / `Rgbf16Frame` through the public sinker API and assert `with_rgb_f32` / `with_rgb_f16` round-trips host-native input bit-exact. Pairs with the kernel-level test to cover both the dispatch boundary and the public sinker boundary. Comment in each docstring notes that full BE-host coverage requires QEMU s390x (Phase 3) — these tests document the contract on LE and would catch the bug on BE. 
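Why width 33 hits both the SIMD main loop and the scalar tail on every backend
(per-iteration chunk sizes as used by the Rgbf16 widen loops in this PR;
counts in pixels):

    NEON / SSE4.1 / wasm-simd128:  4-px chunks → 8 chunks (32 px) + 1-px tail
    AVX2:                          8-px chunks → 4 chunks (32 px) + 1-px tail
    AVX-512:                      16-px chunks → 2 chunks (32 px) + 1-px tail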
Verified locally: • cargo test --target aarch64-apple-darwin --lib → 2204 pass (was 2200; +4 new tests). • cargo test --target x86_64-apple-darwin --lib → 2919 pass (was 2915; +4 new tests). • cargo test --no-default-features --lib → 35 pass. • cargo build --target x86_64-apple-darwin --tests → 0 warnings. • RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests → only 3 pre-existing `unused imports` warnings in unrelated `wasm_simd128/tests/*` files, not introduced by this change (confirmed by stashing the diff and rebuilding on c3a6478). • cargo build --no-default-features → ok. • cargo fmt --check → clean. • cargo clippy --all-targets --all-features --target {aarch64,x86_64}-apple-darwin -- -D warnings → clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/packed_rgb_f16.rs | 33 ++++-- src/sinker/mixed/packed_rgb_float.rs | 31 ++++- src/sinker/mixed/tests/packed_rgb_f16.rs | 120 +++++++++++++++++++ src/sinker/mixed/tests/packed_rgb_float.rs | 130 +++++++++++++++++++++ 4 files changed, 301 insertions(+), 13 deletions(-) diff --git a/src/sinker/mixed/packed_rgb_f16.rs b/src/sinker/mixed/packed_rgb_f16.rs index e349f130..62ab1cc8 100644 --- a/src/sinker/mixed/packed_rgb_f16.rs +++ b/src/sinker/mixed/packed_rgb_f16.rs @@ -34,6 +34,25 @@ use crate::{ yuv::{Rgbf16, Rgbf16Row, Rgbf16Sink}, }; +/// `BE` value that makes the `rgbf16_to_*` row dispatchers treat their input as +/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf16Frame`] +/// exposes a `&[half::f16]` row in **host-native** layout — the API contract is that the +/// caller hands us already-decoded half-floats. The kernel `BE` parameter, +/// however, names the **encoded** byte order (so `BE = false` means "decode +/// LE-encoded bytes" via `u16::from_le`). On a LE host the host-native layout +/// is LE, so `BE = false` is correct; on a BE host the host-native layout is +/// BE, so we must request `BE = true` to make `u16::from_be` no-op the swap. +/// Without this routing the loaders would byte-swap an already-decoded host- +/// native `f16` on BE hosts, corrupting every output path. +/// +/// This is the **sinker-layer** complement to the SIMD-backend-internal +/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in +/// `c3a6478` — same truth table, different layer: +/// +/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. +/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- Rgbf16 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf16> { @@ -234,27 +253,27 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if let Some(buf) = rgb_f16.as_deref_mut() { let f16_start = one_plane_start * 3; let f16_end = one_plane_end * 3; - rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); + rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); } // Lossless f32 widen — also independent of integer conversion paths. if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct half-float → u16 conversion (no staging). 
if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct half-float → u16 conversion (no staging). if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float → u8 when no RGB / luma / @@ -269,7 +288,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -288,7 +307,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { w, h, )?; - rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -328,7 +347,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { // over `rgb_row` via `expand_rgb_to_rgba_row`. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_float.rs b/src/sinker/mixed/packed_rgb_float.rs index e1c17a39..f189e5ab 100644 --- a/src/sinker/mixed/packed_rgb_float.rs +++ b/src/sinker/mixed/packed_rgb_float.rs @@ -31,6 +31,25 @@ use crate::{ yuv::{Rgbf32, Rgbf32Row, Rgbf32Sink}, }; +/// `BE` value that makes the `rgbf32_to_*` row dispatchers treat their input as +/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf32Frame`] +/// exposes a `&[f32]` row in **host-native** layout — the API contract is that the caller +/// hands us already-decoded floats. The kernel `BE` parameter, however, names +/// the **encoded** byte order (so `BE = false` means "decode LE-encoded bytes" +/// via `u32::from_le`). On a LE host the host-native layout is LE, so +/// `BE = false` is correct; on a BE host the host-native layout is BE, so we +/// must request `BE = true` to make `u32::from_be` no-op the swap. Without this +/// routing the loaders would byte-swap an already-decoded host-native `f32` on +/// BE hosts, corrupting every output path. +/// +/// This is the **sinker-layer** complement to the SIMD-backend-internal +/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in +/// `c3a6478` — same truth table, different layer: +/// +/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. +/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. 
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- Rgbf32 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf32> { @@ -209,20 +228,20 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct float→u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct float→u16 conversion (no staging). if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float→u8 conversion when @@ -237,7 +256,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -257,7 +276,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { w, h, )?; - rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -299,7 +318,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { // less memory pass for combined `with_rgb + with_rgba` callers. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index a30bbe8a..78aa3c37 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -310,3 +310,123 @@ fn rgbf16_simd_matches_scalar_with_random_input() { assert_eq!(luma_u16_simd, luma_u16_scalar, "Luma u16 output diverges"); assert_eq!(rgb_f16_simd, pix, "RGB f16 output is not lossless"); } + +/// Sinker-layer host-native-`f16` regression for the bug fixed alongside +/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf16`] sinker used to +/// hardcode `::` when calling the row dispatchers, telling them to +/// "decode LE-encoded input". Because [`Rgbf16Frame`] hands us a host-native +/// `&[half::f16]` row, that routing was a no-op on LE hosts but corrupted +/// every output path on BE hosts (the `u16` loaders would byte-swap an +/// already-decoded f16 bit-pattern). The fix replaces those `::` with +/// `::`, which is `false` on LE and `true` on BE — a no-op +/// byte-swap on either host. +/// +/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run), +/// `HOST_NATIVE_BE = false` and `::` is byte-for-byte +/// identical to `::`, so this test cannot distinguish the broken vs +/// fixed code on LE. 
It instead documents the equivalence at the **kernel +/// dispatch** layer — calling each `rgbf16_to_*` dispatcher with both +/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) +/// must produce identical output on the active host. On a hypothetical BE +/// host (full QEMU s390x coverage is Phase 3), the same equivalence holds +/// for the **fixed** sinker but would fail for the broken one. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn rgbf16_kernel_host_native_be_matches_false_on_le_host() { + use crate::row::{ + rgbf16_to_rgb_f16_row, rgbf16_to_rgb_f32_row, rgbf16_to_rgb_row, rgbf16_to_rgb_u16_row, + rgbf16_to_rgba_row, rgbf16_to_rgba_u16_row, + }; + + // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. + // Compute it locally so the test asserts the same condition without taking + // a dependency on a private const. + const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + + // Width 33 covers SIMD main loop + scalar tail across every backend. + let w = 33usize; + let f32_inputs = [0.0f32, 0.5, 1.0, 1.75, -0.25]; + let pix: std::vec::Vec = (0..w * 3) + .map(|i| half::f16::from_f32(f32_inputs[i % f32_inputs.len()])) + .collect(); + + // u8 RGB. + let mut rgb_false = std::vec![0u8; w * 3]; + let mut rgb_host = std::vec![0u8; w * 3]; + rgbf16_to_rgb_row::(&pix, &mut rgb_false, w, true); + rgbf16_to_rgb_row::(&pix, &mut rgb_host, w, true); + assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); + + // u8 RGBA. + let mut rgba_false = std::vec![0u8; w * 4]; + let mut rgba_host = std::vec![0u8; w * 4]; + rgbf16_to_rgba_row::(&pix, &mut rgba_false, w, true); + rgbf16_to_rgba_row::(&pix, &mut rgba_host, w, true); + assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); + + // u16 RGB. + let mut rgb_u16_false = std::vec![0u16; w * 3]; + let mut rgb_u16_host = std::vec![0u16; w * 3]; + rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); + rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); + assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); + + // u16 RGBA. + let mut rgba_u16_false = std::vec![0u16; w * 4]; + let mut rgba_u16_host = std::vec![0u16; w * 4]; + rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); + rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); + assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); + + // f16 lossless pass-through. + let mut f16_false = std::vec![half::f16::ZERO; w * 3]; + let mut f16_host = std::vec![half::f16::ZERO; w * 3]; + rgbf16_to_rgb_f16_row::(&pix, &mut f16_false, w, true); + rgbf16_to_rgb_f16_row::(&pix, &mut f16_host, w, true); + assert_eq!(f16_false, f16_host, "f16 RGB diverges"); + if !HOST_NATIVE_BE { + assert_eq!( + f16_host, pix, + "f16 lossless pass-through corrupted on LE host" + ); + } + + // f32 lossless widen. + let mut f32_false = std::vec![0.0f32; w * 3]; + let mut f32_host = std::vec![0.0f32; w * 3]; + rgbf16_to_rgb_f32_row::(&pix, &mut f32_false, w, true); + rgbf16_to_rgb_f32_row::(&pix, &mut f32_host, w, true); + assert_eq!(f32_false, f32_host, "f32 widen diverges"); +} + +/// End-to-end sinker contract test: feeding host-native `half::f16` through +/// [`MixedSinker`] must round-trip the f16 input bit-exact via +/// `with_rgb_f16` on every host. Documents the public-API contract that the +/// [`HOST_NATIVE_BE`] routing fix preserves. Pairs with the kernel-level +/// test above; together they cover both the dispatch boundary and the public +/// sinker boundary. 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn rgbf16_sinker_host_native_contract_lossless_passthrough() { + let vals_f32 = [0.5f32, 1.5, -0.25, 100.0]; + let pix: std::vec::Vec = (0..16 * 4 * 3) + .map(|i| half::f16::from_f32(vals_f32[i % vals_f32.len()])) + .collect(); + let src = Rgbf16Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); + + let mut rgb_f16_out = std::vec![half::f16::ZERO; 16 * 4 * 3]; + let mut sink = MixedSinker::::new(16, 4) + .with_rgb_f16(&mut rgb_f16_out) + .unwrap(); + rgbf16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + // Bit-exact pass-through on every host — broken `::` routing + // would byte-swap on a BE host; the fixed routing keeps the f16 intact. + assert_eq!(rgb_f16_out, pix, "Rgbf16 sinker f16 pass-through corrupted"); +} diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs index 47cd3be4..64c2ab6f 100644 --- a/src/sinker/mixed/tests/packed_rgb_float.rs +++ b/src/sinker/mixed/tests/packed_rgb_float.rs @@ -245,3 +245,133 @@ fn rgbf32_simd_matches_scalar_with_random_input() { assert_eq!(luma_u16_simd, luma_u16_scalar, "Luma u16 output diverges"); assert_eq!(rgb_f32_simd, pix, "RGB f32 output is not lossless"); } + +/// Sinker-layer host-native-`f32` regression for the bug fixed alongside +/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf32`] sinker used to +/// hardcode `::` when calling the row dispatchers, telling them to +/// "decode LE-encoded input". Because [`Rgbf32Frame`] hands us a host-native +/// `&[f32]` row, that routing was a no-op on LE hosts but corrupted every +/// output path on BE hosts (the loaders would byte-swap an already-decoded +/// f32). The fix replaces those `::` with `::`, which +/// is `false` on LE and `true` on BE — a no-op byte-swap on either host. +/// +/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run), +/// `HOST_NATIVE_BE = false` and `::` is byte-for-byte +/// identical to `::`, so this test cannot distinguish the broken vs +/// fixed code on LE. It instead documents the equivalence at the **kernel +/// dispatch** layer — calling each `rgbf32_to_*` dispatcher with both +/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) +/// must produce identical output on the active host. On a hypothetical BE +/// host (full QEMU s390x coverage is Phase 3), the same equivalence holds +/// for the **fixed** sinker but would fail for the broken one — making this +/// the natural regression test for the routing change. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn rgbf32_kernel_host_native_be_matches_false_on_le_host() { + use crate::row::{ + rgbf32_to_rgb_f32_row, rgbf32_to_rgb_row, rgbf32_to_rgb_u16_row, rgbf32_to_rgba_row, + rgbf32_to_rgba_u16_row, + }; + + // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. + // Compute it locally so the test asserts the same condition without taking + // a dependency on a private const. + const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + + // Width 33 covers SIMD main loop + scalar tail across every backend. + let w = 33usize; + let mut pix = std::vec![0.0f32; w * 3]; + for (i, v) in pix.iter_mut().enumerate() { + // Mix in-range, HDR, and negative values to exercise every clamp branch. + *v = match i % 5 { + 0 => 0.0, + 1 => 0.5, + 2 => 1.0, + 3 => 1.75, + _ => -0.25, + }; + } + + // u8 RGB. 
+ let mut rgb_false = std::vec![0u8; w * 3]; + let mut rgb_host = std::vec![0u8; w * 3]; + rgbf32_to_rgb_row::(&pix, &mut rgb_false, w, true); + rgbf32_to_rgb_row::(&pix, &mut rgb_host, w, true); + assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); + + // u8 RGBA. + let mut rgba_false = std::vec![0u8; w * 4]; + let mut rgba_host = std::vec![0u8; w * 4]; + rgbf32_to_rgba_row::(&pix, &mut rgba_false, w, true); + rgbf32_to_rgba_row::(&pix, &mut rgba_host, w, true); + assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); + + // u16 RGB. + let mut rgb_u16_false = std::vec![0u16; w * 3]; + let mut rgb_u16_host = std::vec![0u16; w * 3]; + rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); + rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); + assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); + + // u16 RGBA. + let mut rgba_u16_false = std::vec![0u16; w * 4]; + let mut rgba_u16_host = std::vec![0u16; w * 4]; + rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); + rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); + assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); + + // f32 lossless pass-through. + let mut f32_false = std::vec![0.0f32; w * 3]; + let mut f32_host = std::vec![0.0f32; w * 3]; + rgbf32_to_rgb_f32_row::(&pix, &mut f32_false, w, true); + rgbf32_to_rgb_f32_row::(&pix, &mut f32_host, w, true); + assert_eq!(f32_false, f32_host, "f32 RGB diverges"); + // And on the host (LE on every CI runner) both must equal `pix` bit-exact. + if !HOST_NATIVE_BE { + assert_eq!( + f32_host, pix, + "f32 lossless pass-through corrupted on LE host" + ); + } +} + +/// End-to-end sinker contract test: feeding host-native `f32` through +/// [`MixedSinker`] must produce the same output every other sinker +/// would expect from a host-native source — specifically, `with_rgb_f32` +/// must be bit-exact identical to the input on every host. Documents the +/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. +/// Pairs with the kernel-level test above; together they cover both the +/// dispatch boundary and the public sinker boundary. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn rgbf32_sinker_host_native_contract_lossless_passthrough() { + // Mix HDR, in-range, and negative values — the f32 lossless path must + // round-trip them bit-exact on every host. + let mut pix = std::vec![0.0f32; 16 * 4 * 3]; + for (i, v) in pix.iter_mut().enumerate() { + *v = match i % 4 { + 0 => 0.5, + 1 => 1.5, + 2 => -0.25, + _ => 100.0, + }; + } + let src = Rgbf32Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); + + let mut rgb_f32_out = std::vec![0.0f32; 16 * 4 * 3]; + let mut sink = MixedSinker::::new(16, 4) + .with_rgb_f32(&mut rgb_f32_out) + .unwrap(); + rgbf32_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + // Bit-exact pass-through on every host. On the buggy `::` routing + // a BE host would see byte-swapped output here; on the fixed routing the + // assertion holds on both LE and BE. 
+ assert_eq!(rgb_f32_out, pix, "Rgbf32 sinker f32 pass-through corrupted"); +} From f1161d7397669f086f769e2987736268b059afee Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 13:19:49 +1200 Subject: [PATCH 06/10] fix(be-tier9): NEON f32 pass-through respects host endian; gate dispatcher-equivalence tests on LE host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 3rd-pass review of PR #83 found that `rgbf32_to_rgb_f32_row::` had two endian-correctness defects: Finding 1 (high) — NEON `BE = false` branch used a raw `vld1q_f32` / `vst1q_f32` copy. That preserves on-disk byte order: it's correct when the input encoding matches host-native (the historical assumption: LE-encoded input on a LE host) but corrupts the lossless f32 output on a BE host where the request `BE = false` should mean "decode LE-encoded input to host-native" — the kernel must byte-swap, not pass through. Audit revealed identical defects in the SSE4.1, AVX2, AVX-512, and wasm-simd128 backends. Findings 2 + 3 (medium) — the Rgbf32 / Rgbf16 sinker dispatcher-equivalence tests asserted `::` ≡ `::` while feeding host-native fixtures. On LE hosts both calls are byte-for-byte identical (the test's intent), but on BE hosts `::` decodes the host-native fixture as if it were LE-encoded (byte-swap) while `:: == ::` decodes as BE (no swap), so the outputs diverge by design — the equivalence claim is specifically about the LE host-routing pattern. Fix: * Replace the raw-copy fast path in every `rgbf32_to_rgb_f32_row::` backend (NEON, SSE4.1, AVX2, AVX-512, wasm-simd128) with a host-endian gate: `if BE == HOST_NATIVE_BE { raw copy } else { endian-aware load }`. When the requested encoding matches host-native the bytes can be copied verbatim (perf-equivalent to the old fast path on the only shipping target — LE); otherwise the kernel falls through to the existing `load_f32x{4,8,16}::` slow path which byte-swaps via the `_endian_*` loaders. Tail loop now uses the endian-aware `if BE { from_be } else { from_le }` decode (matches scalar reference at `src/row/scalar/packed_rgb_float.rs:213`). * Add five new BE-target regression tests (`{neon,sse41,avx2,avx512,wasm}_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host`). Each constructs an LE-encoded f32 byte fixture (host-native bits passed through `f32::from_bits(u32::from_le(_))`) and feeds it through `::`, asserting the output matches the original host-native expected values. On LE hosts this is a vacuous identity check; on BE hosts (full QEMU s390x coverage is Phase 3) it would have caught the original bug. * Gate the two sinker dispatcher-equivalence tests (`rgbf32_kernel_host_native_be_matches_false_on_le_host` and `rgbf16_kernel_host_native_be_matches_false_on_le_host`) on `#[cfg(target_endian = "little")]`. BE-host correctness of the routing change is verified instead by `*_sinker_host_native_contract_lossless_passthrough` (sinker public-API contract) and the row-kernel BE parity tests. 
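The gate applied in every backend reduces to the following shape (scalar
sketch assuming `BE`, `total`, `rgb_in`, and `rgb_out` are in scope; the real
kernels use the backend's vector loads/stores for the main loop):

    if BE == HOST_NATIVE_BE {
        // requested encoding already matches the host layout: copy verbatim
        rgb_out[..total].copy_from_slice(&rgb_in[..total]);
    } else {
        // otherwise decode each lane to host-native before storing
        for (dst, src) in rgb_out[..total].iter_mut().zip(&rgb_in[..total]) {
            let bits = src.to_bits();
            *dst = f32::from_bits(if BE { u32::from_be(bits) } else { u32::from_le(bits) });
        }
    }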
Audit results — every backend with a `rgbf32_to_rgb_f32_row::` kernel that used a raw passthrough on `BE = false` had the same defect: src/row/arch/neon/packed_rgb_float.rs:414 — fixed src/row/arch/x86_sse41/packed_rgb_float.rs:322 — fixed src/row/arch/x86_avx2/packed_rgb_float.rs:332 — fixed src/row/arch/x86_avx512/packed_rgb_float.rs:303 — fixed src/row/arch/wasm_simd128/packed_rgb_float.rs:287 — fixed Five call sites changed (all five vector backends), five new regression tests, two existing sinker dispatcher-equivalence tests cfg-gated on `target_endian = "little"`. All `cargo test` / `cargo build` / `cargo fmt --check` / `cargo clippy --all-targets --all-features -- -D warnings` checks pass on aarch64-apple-darwin and x86_64-apple-darwin (LE hosts). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/packed_rgb_float.rs | 29 +++++++++++---- src/row/arch/neon/tests/packed_rgb_float.rs | 32 +++++++++++++++++ src/row/arch/wasm_simd128/packed_rgb_float.rs | 28 +++++++++++---- .../wasm_simd128/tests/packed_rgb_float.rs | 33 +++++++++++++++++ src/row/arch/x86_avx2/packed_rgb_float.rs | 27 ++++++++++---- .../arch/x86_avx2/tests/packed_rgb_float.rs | 36 +++++++++++++++++++ src/row/arch/x86_avx512/packed_rgb_float.rs | 27 ++++++++++---- .../arch/x86_avx512/tests/packed_rgb_float.rs | 36 +++++++++++++++++++ src/row/arch/x86_sse41/packed_rgb_float.rs | 27 ++++++++++---- .../arch/x86_sse41/tests/packed_rgb_float.rs | 36 +++++++++++++++++++ src/sinker/mixed/tests/packed_rgb_f16.rs | 15 ++++++-- src/sinker/mixed/tests/packed_rgb_float.rs | 16 ++++++--- 12 files changed, 304 insertions(+), 38 deletions(-) diff --git a/src/row/arch/neon/packed_rgb_float.rs b/src/row/arch/neon/packed_rgb_float.rs index f96f02b1..a612dcef 100644 --- a/src/row/arch/neon/packed_rgb_float.rs +++ b/src/row/arch/neon/packed_rgb_float.rs @@ -411,26 +411,36 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( unsafe { let total = width * 3; let mut i = 0usize; - if BE { - // For BE pass-through: load as u32 with byte-swap, store as f32. + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `vld1q_f32` reads host-native + // bytes which is exactly what we need to emit. Otherwise we must decode + // through `load_f32x4::` (which byte-swaps when BE != host-native) so + // the stored host-native f32 round-trips back to the same value. + if BE == HOST_NATIVE_BE { while i + 4 <= total { - let v = load_f32x4::(rgb_in.as_ptr().add(i)); + let v = vld1q_f32(rgb_in.as_ptr().add(i)); vst1q_f32(rgb_out.as_mut_ptr().add(i), v); i += 4; } while i < total { - let bits = (*rgb_in.get_unchecked(i)).to_bits(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); i += 1; } } else { + // Encoding doesn't match host: decode each lane to host-native. while i + 4 <= total { - let v = vld1q_f32(rgb_in.as_ptr().add(i)); + let v = load_f32x4::(rgb_in.as_ptr().add(i)); vst1q_f32(rgb_out.as_mut_ptr().add(i), v); i += 4; } while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); i += 1; } } @@ -459,6 +469,11 @@ use super::endian::load_endian_u16x4; /// native == LE so `BE = false`; on a BE target, host-native == BE so /// `BE = true`. 
Without this routing the downstream `rgbf32_to_*::` /// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `vld1q_f32`/`vst1q_f32` copy is byte-correct only when the source encoding +/// (`BE`) matches the host's native endian, so the kernel falls through to +/// the endian-aware `load_f32x4::` slow path otherwise. const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); /// Widen 4 half-precision floats (`f16x4`, i.e. 8 bytes starting at `ptr`) diff --git a/src/row/arch/neon/tests/packed_rgb_float.rs b/src/row/arch/neon/tests/packed_rgb_float.rs index 259f0757..ab343747 100644 --- a/src/row/arch/neon/tests/packed_rgb_float.rs +++ b/src/row/arch/neon/tests/packed_rgb_float.rs @@ -346,6 +346,38 @@ fn neon_rgbf32_to_rgb_f32_be_is_byteswap() { } } +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `vld1q_f32`/`vst1q_f32` copy in the `BE = false` branch, which preserved the +/// LE byte order on store and produced corrupted (byte-swapped) host f32s. +/// The current kernel falls through to the endian-aware `load_f32x4::` +/// slow path on BE hosts (`HOST_NATIVE_BE != BE`) so this test passes on both. +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. + let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "NEON rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + // ---- BE parity tests — Rgbf16 ----------------------------------------------- #[test] diff --git a/src/row/arch/wasm_simd128/packed_rgb_float.rs b/src/row/arch/wasm_simd128/packed_rgb_float.rs index 414deb11..1cfedf3b 100644 --- a/src/row/arch/wasm_simd128/packed_rgb_float.rs +++ b/src/row/arch/wasm_simd128/packed_rgb_float.rs @@ -20,6 +20,11 @@ use super::{endian::load_endian_u32x4, scalar}; /// would byte-swap an already-decoded host-native f32 buffer on BE hosts. /// (`wasm32-*` is LE today, but keeping the routing endian-agnostic future- /// proofs against any BE wasm target.) +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `v128_load`/`v128_store` copy is byte-correct only when the source encoding +/// (`BE`) matches the host's native endian, so the kernel falls through to +/// the endian-aware `load_f32x4::` slow path otherwise. const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); // ---- helpers ------------------------------------------------------------------ @@ -272,8 +277,9 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row( /// f32 RGB → f32 RGB lossless pass-through / byte-swap. /// -/// - `BE = false`: fast `v128_load` → `v128_store` copy (no math). 
-/// - `BE = true`: load each element as u32, byte-swap, store as f32. +/// - `BE == HOST_NATIVE_BE`: fast `v128_load` → `v128_store` copy (no math). +/// - otherwise: load each element through endian-aware `load_f32x4::` +/// (byte-swap to host-native), store as f32. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn rgbf32_to_rgb_f32_row( @@ -284,7 +290,12 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); - if !BE { + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `v128_load` reads host-native + // bytes which is exactly what we need to emit. Otherwise we must decode + // through `load_f32x4::` (which byte-swaps when BE differs from + // host-native) so the stored host-native f32 round-trips to the same value. + if BE == HOST_NATIVE_BE { let total = width * 3; let mut i = 0usize; while i + 4 <= total { @@ -301,12 +312,12 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( i += 1; } } else { - // BE: byte-swap each f32 element via u32 lane reinterpretation. + // Encoding doesn't match host: decode each lane to host-native via the + // endian-aware loader (`load_endian_u32x4::` byte-swaps each lane). let total = width * 3; let mut i = 0usize; while i + 4 <= total { unsafe { - // load_endian_u32x4:: byte-swaps each 32-bit lane. let swapped = load_f32x4::(rgb_in.as_ptr().add(i)); v128_store(rgb_out.as_mut_ptr().add(i) as *mut v128, swapped); } @@ -315,7 +326,12 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( while i < total { unsafe { let bits = rgb_in.get_unchecked(i).to_bits(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); } i += 1; } diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs b/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs index 6a13b394..b4e13e50 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs @@ -301,6 +301,39 @@ fn wasm_rgbf32_to_rgb_f32_be_is_byteswap() { } } +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `v128_load`/`v128_store` copy in the `BE = false` branch, which preserved +/// the LE byte order on store and produced corrupted (byte-swapped) host f32s. +/// The current kernel falls through to the endian-aware `load_f32x4::` +/// slow path on BE hosts (`HOST_NATIVE_BE != BE`) so this test passes on both. +/// (`wasm32-*` is LE today, but the routing is endian-agnostic for any future +/// BE wasm target.) +#[test] +fn wasm_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. 
+ let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "wasm rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + // ---- BE parity tests — wasm-simd128 Rgbf16 ----------------------------------- #[test] diff --git a/src/row/arch/x86_avx2/packed_rgb_float.rs b/src/row/arch/x86_avx2/packed_rgb_float.rs index 75cfdb04..c859b1b7 100644 --- a/src/row/arch/x86_avx2/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/packed_rgb_float.rs @@ -26,6 +26,11 @@ use crate::row::arch::x86_sse41::endian::load_endian_u16x8; /// host-native == LE so `BE = false`; on a BE target, host-native == BE so /// `BE = true`. Without this routing the downstream `rgbf32_to_*::` /// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `_mm256_loadu_ps`/`_mm256_storeu_ps` copy is byte-correct only when the +/// source encoding (`BE`) matches the host's native endian, so the kernel +/// falls through to the endian-aware `load_f32x8::` slow path otherwise. const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); /// Load 8 f32 lanes from `ptr` in endian-aware fashion. @@ -329,25 +334,35 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( unsafe { let total = width * 3; let mut i = 0usize; - if BE { + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `_mm256_loadu_ps` reads host- + // native bytes which is exactly what we need to emit. Otherwise we must + // decode through `load_f32x8::` (which byte-swaps when BE differs from + // host-native) so the stored host-native f32 round-trips to the same value. + if BE == HOST_NATIVE_BE { while i + 8 <= total { - let v = load_f32x8::(rgb_in.as_ptr().add(i)); + let v = _mm256_loadu_ps(rgb_in.as_ptr().add(i)); _mm256_storeu_ps(rgb_out.as_mut_ptr().add(i), v); i += 8; } while i < total { - let bits = (*rgb_in.get_unchecked(i)).to_bits(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); i += 1; } } else { while i + 8 <= total { - let v = _mm256_loadu_ps(rgb_in.as_ptr().add(i)); + let v = load_f32x8::(rgb_in.as_ptr().add(i)); _mm256_storeu_ps(rgb_out.as_mut_ptr().add(i), v); i += 8; } while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); i += 1; } } diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs index 9fde3182..cdb6f0f8 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs @@ -366,6 +366,42 @@ fn avx2_rgbf32_to_rgb_f32_be_is_byteswap() { } } +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. 
+/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `_mm256_loadu_ps`/`_mm256_storeu_ps` copy in the `BE = false` branch, which +/// preserved the LE byte order on store and produced corrupted (byte-swapped) +/// host f32s. The current kernel falls through to the endian-aware +/// `load_f32x8::` slow path on BE hosts (`HOST_NATIVE_BE != BE`) so this +/// test passes on both. +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx2_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [1usize, 4, 8, 17, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. + let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "AVX2 rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + // ---- BE parity tests — AVX2 + F16C Rgbf16 ------------------------------------ #[test] diff --git a/src/row/arch/x86_avx512/packed_rgb_float.rs b/src/row/arch/x86_avx512/packed_rgb_float.rs index 783dcdab..a80c5128 100644 --- a/src/row/arch/x86_avx512/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/packed_rgb_float.rs @@ -23,6 +23,11 @@ use crate::row::arch::x86_avx2::endian::load_endian_u16x16; /// host-native == LE so `BE = false`; on a BE target, host-native == BE so /// `BE = true`. Without this routing the downstream `rgbf32_to_*::` /// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `_mm512_loadu_ps`/`_mm512_storeu_ps` copy is byte-correct only when the +/// source encoding (`BE`) matches the host's native endian, so the kernel +/// falls through to the endian-aware `load_f32x16::` slow path otherwise. const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); /// Load 16 f32 lanes from `ptr` in endian-aware fashion. @@ -300,25 +305,35 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( unsafe { let total = width * 3; let mut i = 0usize; - if BE { + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `_mm512_loadu_ps` reads host- + // native bytes which is exactly what we need to emit. Otherwise we must + // decode through `load_f32x16::` (which byte-swaps when BE differs from + // host-native) so the stored host-native f32 round-trips to the same value. 
+ if BE == HOST_NATIVE_BE { while i + 16 <= total { - let v = load_f32x16::(rgb_in.as_ptr().add(i)); + let v = _mm512_loadu_ps(rgb_in.as_ptr().add(i)); _mm512_storeu_ps(rgb_out.as_mut_ptr().add(i), v); i += 16; } while i < total { - let bits = (*rgb_in.get_unchecked(i)).to_bits(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); i += 1; } } else { while i + 16 <= total { - let v = _mm512_loadu_ps(rgb_in.as_ptr().add(i)); + let v = load_f32x16::(rgb_in.as_ptr().add(i)); _mm512_storeu_ps(rgb_out.as_mut_ptr().add(i), v); i += 16; } while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); i += 1; } } diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs index 493f07e0..e74cc5b2 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs @@ -395,6 +395,42 @@ fn avx512_rgbf32_to_rgb_f32_be_is_byteswap() { } } +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `_mm512_loadu_ps`/`_mm512_storeu_ps` copy in the `BE = false` branch, which +/// preserved the LE byte order on store and produced corrupted (byte-swapped) +/// host f32s. The current kernel falls through to the endian-aware +/// `load_f32x16::` slow path on BE hosts (`HOST_NATIVE_BE != BE`) so +/// this test passes on both. +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn avx512_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512f") { + return; + } + for w in [1usize, 4, 16, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. + let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "AVX-512 rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + // ---- BE parity tests — AVX-512 + F16C Rgbf16 --------------------------------- #[test] diff --git a/src/row/arch/x86_sse41/packed_rgb_float.rs b/src/row/arch/x86_sse41/packed_rgb_float.rs index 85513f71..9b68dee9 100644 --- a/src/row/arch/x86_sse41/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/packed_rgb_float.rs @@ -319,25 +319,35 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( unsafe { let total = width * 3; let mut i = 0usize; - if BE { + // Fast path: when the requested encoding (BE) matches the host's native + // endian, the bytes can be copied verbatim — `_mm_loadu_ps` reads host- + // native bytes which is exactly what we need to emit. 
Otherwise we must + // decode through `load_f32x4::` (which byte-swaps when BE differs from + // host-native) so the stored host-native f32 round-trips to the same value. + if BE == HOST_NATIVE_BE { while i + 4 <= total { - let v = load_f32x4::(rgb_in.as_ptr().add(i)); + let v = _mm_loadu_ps(rgb_in.as_ptr().add(i)); _mm_storeu_ps(rgb_out.as_mut_ptr().add(i), v); i += 4; } while i < total { - let bits = (*rgb_in.get_unchecked(i)).to_bits(); - *rgb_out.get_unchecked_mut(i) = f32::from_bits(u32::from_be(bits)); + *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); i += 1; } } else { while i + 4 <= total { - let v = _mm_loadu_ps(rgb_in.as_ptr().add(i)); + let v = load_f32x4::(rgb_in.as_ptr().add(i)); _mm_storeu_ps(rgb_out.as_mut_ptr().add(i), v); i += 4; } while i < total { - *rgb_out.get_unchecked_mut(i) = *rgb_in.get_unchecked(i); + let bits = (*rgb_in.get_unchecked(i)).to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + *rgb_out.get_unchecked_mut(i) = f32::from_bits(host_bits); i += 1; } } @@ -364,6 +374,11 @@ use super::endian::load_endian_u16x4; /// native == LE so `BE = false`; on a BE target, host-native == BE so /// `BE = true`. Without this routing the downstream `rgbf32_to_*::` /// would byte-swap an already-decoded host-native f32 buffer on BE hosts. +/// +/// Also used by the `rgbf32_to_rgb_f32_row` pass-through fast path: the raw +/// `_mm_loadu_ps`/`_mm_storeu_ps` copy is byte-correct only when the source +/// encoding (`BE`) matches the host's native endian, so the kernel falls +/// through to the endian-aware `load_f32x4::` slow path otherwise. const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); /// Widen 4 × f16 (at `ptr`, 8 bytes) to 4 × f32 (returned as `__m128`). diff --git a/src/row/arch/x86_sse41/tests/packed_rgb_float.rs b/src/row/arch/x86_sse41/tests/packed_rgb_float.rs index 02f5b9ef..81aded28 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb_float.rs @@ -444,6 +444,42 @@ fn sse41_rgbf32_to_rgb_f32_be_is_byteswap() { } } +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgb_f32_row::` +/// and asserts it decodes to the host-native expected values. +/// +/// On LE hosts this is a vacuous sanity check (LE-encoded == host-native), but +/// on BE hosts it guards against the historical bug where the kernel used a raw +/// `_mm_loadu_ps`/`_mm_storeu_ps` copy in the `BE = false` branch, which +/// preserved the LE byte order on store and produced corrupted (byte-swapped) +/// host f32s. The current kernel falls through to the endian-aware +/// `load_f32x4::` slow path on BE hosts (`HOST_NATIVE_BE != BE`) so this +/// test passes on both. +#[test] +#[cfg_attr(miri, ignore = "SIMD intrinsics unsupported by Miri")] +fn sse41_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 4, 7, 16, 33, 1920, 1921] { + let expected = pseudo_random_rgbf32(w); // host-native f32 values + // Build LE-encoded input: each lane's bits, written as if LE on disk, then + // reinterpreted as host-native f32. On LE hosts this is identical to + // `expected`; on BE hosts each lane is byte-swapped. 
+ let le_in: std::vec::Vec = expected + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out = std::vec![0.0f32; w * 3]; + unsafe { + rgbf32_to_rgb_f32_row::(&le_in, &mut out, w); + } + assert_eq!( + out, expected, + "SSE4.1 rgbf32_to_rgb_f32_row:: must decode LE input to host-native (width {w})" + ); + } +} + // ---- BE parity tests — SSE4.1 + F16C Rgbf16 ---------------------------------- #[test] diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 78aa3c37..1cd09a1f 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -327,10 +327,19 @@ fn rgbf16_simd_matches_scalar_with_random_input() { /// fixed code on LE. It instead documents the equivalence at the **kernel /// dispatch** layer — calling each `rgbf16_to_*` dispatcher with both /// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) -/// must produce identical output on the active host. On a hypothetical BE -/// host (full QEMU s390x coverage is Phase 3), the same equivalence holds -/// for the **fixed** sinker but would fail for the broken one. +/// must produce identical output on the active host. +/// +/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the +/// equality `::` ≡ `::` is _false_ — `::` +/// decodes the host-native fixture as if it were LE-encoded (byte-swap), +/// while `:: == ::` decodes as BE (no swap), so the +/// outputs diverge by design. The dispatch-equivalence claim is specifically +/// about the LE host-routing pattern; the BE-host correctness of the routing +/// change is verified instead by +/// [`rgbf16_sinker_host_native_contract_lossless_passthrough`] and the +/// row-kernel BE parity tests in `src/row/arch/*/tests/`. #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs index 64c2ab6f..fd4df2da 100644 --- a/src/sinker/mixed/tests/packed_rgb_float.rs +++ b/src/sinker/mixed/tests/packed_rgb_float.rs @@ -261,11 +261,19 @@ fn rgbf32_simd_matches_scalar_with_random_input() { /// fixed code on LE. It instead documents the equivalence at the **kernel /// dispatch** layer — calling each `rgbf32_to_*` dispatcher with both /// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) -/// must produce identical output on the active host. On a hypothetical BE -/// host (full QEMU s390x coverage is Phase 3), the same equivalence holds -/// for the **fixed** sinker but would fail for the broken one — making this -/// the natural regression test for the routing change. +/// must produce identical output on the active host. +/// +/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the +/// equality `::` ≡ `::` is _false_ — `::` +/// decodes the host-native fixture as if it were LE-encoded (byte-swap), +/// while `:: == ::` decodes as BE (no swap), so the +/// outputs diverge by design. The dispatch-equivalence claim is specifically +/// about the LE host-routing pattern; the BE-host correctness of the routing +/// change is verified instead by +/// [`rgbf32_sinker_host_native_contract_lossless_passthrough`] and the +/// row-kernel BE parity tests in `src/row/arch/*/tests/`. 
#[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" From 63fdf8f2d974b7c3fcdb22e9efb8877a3b4293cd Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 13:46:08 +1200 Subject: [PATCH 07/10] fix(be-tier9): RGBA raw-load fast paths respect host endian (audit follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 4th-pass review of PR #83 found that the previous `f1161d7` audit, which fixed the `BE = false` raw-load fast path in `rgbf32_to_rgb_f32_row` per backend, missed three other kernels with the same broken pattern: NEON's `rgbf32_to_rgba_row` / `rgbf32_to_rgba_u16_row` (`vld3q_f32` deinterleave hardcoded for the LE-host quadrant) and the AVX2 / AVX-512 `widen_f16x*` helpers used by every Rgbf16 SIMD entry point (raw `_mm_loadu_si128` / `_mm256_loadu_si256` for `BE = false`). On a big-endian AArch64 or x86 host with LE-encoded input, each of these reads host-native (BE) bytes from an LE buffer and mis-decodes the f32 / f16 lanes downstream — same defect class as the original f32 pass-through bug, just in different kernels. Fix: * NEON `rgbf32_to_rgba_row` / `rgbf32_to_rgba_u16_row`: replace the hardcoded `if BE { endian-aware } else { vld3q_f32 }` deinterleave gate with the `BE == HOST_NATIVE_BE` host-endian gate (same pattern f1161d7 established for `rgbf32_to_rgb_f32_row`). Fast path uses `vld3q_f32` when on-disk encoding matches host-native; otherwise falls through to the existing endian-aware `load_f32x4::` slow path with manual deinterleave. Two call sites changed. * AVX2 `widen_f16x8_avx`: drop the `if BE { load_endian_u16x8:: } else { _mm_loadu_si128 }` conditional in favor of unconditionally routing through `load_endian_u16x8::`. The endian-aware loader monomorphizes to a no-op `_mm_loadu_si128` when `BE` matches host-native and to a byte-swap shuffle otherwise — correct on both LE and BE hosts. One call site changed; transitively fixes 5 entry points (`rgbf16_to_rgb_row`, `rgbf16_to_rgba_row`, `rgbf16_to_rgb_u16_row`, `rgbf16_to_rgba_u16_row`, `rgbf16_to_rgb_f32_row`). * AVX-512 `widen_f16x16_avx512`: same fix using `load_endian_u16x16::`. One call site changed; transitively fixes the same 5 entry points. Audit results — only kernels with a `` parameter that gated a raw-load fast path on the bare `BE` flag had this defect: src/row/arch/neon/packed_rgb_float.rs:162 rgbf32_to_rgba_row — fixed src/row/arch/neon/packed_rgb_float.rs:341 rgbf32_to_rgba_u16_row — fixed src/row/arch/x86_avx2/packed_rgb_float.rs:393 widen_f16x8_avx — fixed src/row/arch/x86_avx512/packed_rgb_float.rs:367 widen_f16x16_avx512 — fixed Backends checked clean: SSE4.1 (RGBA paths already use `load_f32x4::`, f16 widen already uses `load_endian_u16x4::`); WASM SIMD128 (RGBA paths use `load_f32x4::`, f16 widen is scalar with explicit endian decode); NEON `widen_f16x4` (already uses `load_endian_u16x4::`). 
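For reviewers skimming the diffs, the shape of the fixed gate can be modelled
in scalar form (illustrative sketch only; `decode_f32` is a made-up name and
the real kernels operate on whole SIMD lanes, not per-element):

    const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

    /// Scalar model of the per-kernel gate. `encoded` stands for the value a
    /// raw host-native load would produce from the stored bytes: it is already
    /// correct only when the on-disk encoding (`BE`) matches the host's own
    /// byte order; otherwise the lane must be byte-swapped before use.
    fn decode_f32<const BE: bool>(encoded: f32) -> f32 {
        if BE == HOST_NATIVE_BE {
            encoded // fast path: stored bytes already carry host-native f32s
        } else {
            f32::from_bits(encoded.to_bits().swap_bytes()) // slow path: per-lane swap
        }
    }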
Regression tests — twelve new LE-decode tests using the established `*_le_input_decodes_correctly_on_any_host` pattern: neon_rgbf32_to_rgba_row_le_input_decodes_correctly_on_any_host neon_rgbf32_to_rgba_u16_row_le_input_decodes_correctly_on_any_host avx2_rgbf16_to_rgb_row_le_input_decodes_correctly_on_any_host avx2_rgbf16_to_rgba_row_le_input_decodes_correctly_on_any_host avx2_rgbf16_to_rgb_u16_row_le_input_decodes_correctly_on_any_host avx2_rgbf16_to_rgba_u16_row_le_input_decodes_correctly_on_any_host avx2_rgbf16_to_rgb_f32_row_le_input_decodes_correctly_on_any_host avx512_rgbf16_to_rgb_row_le_input_decodes_correctly_on_any_host avx512_rgbf16_to_rgba_row_le_input_decodes_correctly_on_any_host avx512_rgbf16_to_rgb_u16_row_le_input_decodes_correctly_on_any_host avx512_rgbf16_to_rgba_u16_row_le_input_decodes_correctly_on_any_host avx512_rgbf16_to_rgb_f32_row_le_input_decodes_correctly_on_any_host Each constructs an LE-encoded fixture (host-native bits passed through `from_bits(u_n::from_le(_))`) and feeds it through `::`, asserting the SIMD output matches the scalar reference (which is endian-correct via `load_f32::` / `load_f16::`). On LE hosts these are vacuous identity checks; on BE hosts (full QEMU s390x coverage is Phase 3) each would have caught the original bug. Aarch64 LE-host test count: 2205 → 2207 (+2 NEON RGBA tests). x86_64 LE-host test count: 2922 → 2932 (+10 = 5 AVX2 + 5 AVX-512). All `cargo test` / `cargo build` (LE x86 + aarch64 + WASM) / `cargo build --no-default-features` / `cargo fmt --check` / `cargo clippy --all-targets --all-features -- -D warnings` checks pass on aarch64-apple-darwin and x86_64-apple-darwin. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/packed_rgb_float.rs | 59 +++---- src/row/arch/neon/tests/packed_rgb_float.rs | 61 +++++++ src/row/arch/x86_avx2/packed_rgb_float.rs | 20 +-- .../arch/x86_avx2/tests/packed_rgb_float.rs | 145 +++++++++++++++++ src/row/arch/x86_avx512/packed_rgb_float.rs | 20 +-- .../arch/x86_avx512/tests/packed_rgb_float.rs | 153 ++++++++++++++++++ 6 files changed, 409 insertions(+), 49 deletions(-) diff --git a/src/row/arch/neon/packed_rgb_float.rs b/src/row/arch/neon/packed_rgb_float.rs index a612dcef..1ea6331a 100644 --- a/src/row/arch/neon/packed_rgb_float.rs +++ b/src/row/arch/neon/packed_rgb_float.rs @@ -159,35 +159,26 @@ pub(crate) unsafe fn rgbf32_to_rgba_row( // — the f32→u8 cast itself is the cost, not the gather. for sub in 0..4 { let base = (x + sub * 4) * 3; - let v_rgb = if BE { - // For BE we cannot use vld3q_f32 directly (it always loads - // native-endian bytes). Load each f32 vector individually - // via the endian-aware helper, then manually deinterleave. - // Load 12 f32 values as 3 × float32x4_t, then deinterleave - // the R/G/B channels using vtrnq / vuzpq. + // Fast path: on-disk encoding (`BE`) matches host-native, so the raw + // `vld3q_f32` reads host-native bytes that already carry the right f32 + // values. Slow path (`BE != HOST_NATIVE_BE`): each f32 must be byte- + // swapped before deinterleave; load via the endian-aware helper into + // a contiguous buffer, then unstride into per-channel vectors. 
+ let (r_v, g_v, b_v) = if BE == HOST_NATIVE_BE { + let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); + (v_rgb.0, v_rgb.1, v_rgb.2) + } else { let raw0 = load_f32x4::(rgb_in.as_ptr().add(base)); let raw1 = load_f32x4::(rgb_in.as_ptr().add(base + 4)); let raw2 = load_f32x4::(rgb_in.as_ptr().add(base + 8)); - // raw0 = [R0,G0,B0,R1], raw1 = [G1,B1,R2,G2], raw2 = [B2,R3,G3,B3] - // Deinterleave into per-channel vectors via vuzpq: - // r = [R0,B0,G1,R2, R1,B1,…] — need proper deinterleave. - // Use the same scalar path for the BE deinterleave case to - // keep correctness simple. - float32x4x3_t(raw0, raw1, raw2) - } else { - vld3q_f32(rgb_in.as_ptr().add(base)) - }; - - let (r_v, g_v, b_v) = if BE { - // Manual deinterleave: raw interleaved [R0,G0,B0,R1,G1,B1,R2,G2,B2,R3,G3,B3] - // split into three 4-element f32 arrays via temporary scalar approach. + // Manual deinterleave: contiguous host-native f32 layout is + // [R0,G0,B0,R1, G1,B1,R2,G2, B2,R3,G3,B3] across raw{0,1,2}. let mut r_arr = [0.0f32; 4]; let mut g_arr = [0.0f32; 4]; let mut b_arr = [0.0f32; 4]; - vst1q_f32(r_arr.as_mut_ptr(), v_rgb.0); - vst1q_f32(g_arr.as_mut_ptr(), v_rgb.1); - vst1q_f32(b_arr.as_mut_ptr(), v_rgb.2); - // r_arr = [R0,G0,B0,R1], g_arr = [G1,B1,R2,G2], b_arr = [B2,R3,G3,B3] + vst1q_f32(r_arr.as_mut_ptr(), raw0); + vst1q_f32(g_arr.as_mut_ptr(), raw1); + vst1q_f32(b_arr.as_mut_ptr(), raw2); let r_deint = [r_arr[0], r_arr[3], g_arr[2], b_arr[1]]; let g_deint = [r_arr[1], g_arr[0], g_arr[3], b_arr[2]]; let b_deint = [r_arr[2], g_arr[1], b_arr[0], b_arr[3]]; @@ -196,8 +187,6 @@ pub(crate) unsafe fn rgbf32_to_rgba_row( vld1q_f32(g_deint.as_ptr()), vld1q_f32(b_deint.as_ptr()), ) - } else { - (v_rgb.0, v_rgb.1, v_rgb.2) }; let r_clamped = vmulq_f32(vminq_f32(vmaxq_f32(r_v, zero), one), scale); @@ -338,7 +327,16 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row( let mut b_h = [0u16; 8]; for sub in 0..2 { let base = (x + sub * 4) * 3; - let (r_v, g_v, b_v) = if BE { + // Fast path: `BE == HOST_NATIVE_BE` means on-disk encoding matches the + // host's native byte order, so `vld3q_f32` (which always reads host- + // native bytes) decodes the encoded f32s correctly. Slow path: the + // encoded bytes are foreign — load each f32 through the endian-aware + // helper (which byte-swaps when `BE != HOST_NATIVE_BE`) into a + // contiguous buffer, then deinterleave into per-channel vectors. + let (r_v, g_v, b_v) = if BE == HOST_NATIVE_BE { + let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); + (v_rgb.0, v_rgb.1, v_rgb.2) + } else { let raw0 = load_f32x4::(rgb_in.as_ptr().add(base)); let raw1 = load_f32x4::(rgb_in.as_ptr().add(base + 4)); let raw2 = load_f32x4::(rgb_in.as_ptr().add(base + 8)); @@ -356,9 +354,6 @@ pub(crate) unsafe fn rgbf32_to_rgba_u16_row( vld1q_f32(g_deint.as_ptr()), vld1q_f32(b_deint.as_ptr()), ) - } else { - let v_rgb = vld3q_f32(rgb_in.as_ptr().add(base)); - (v_rgb.0, v_rgb.1, v_rgb.2) }; let r_s = vmulq_f32(vminq_f32(vmaxq_f32(r_v, zero), one), scale); @@ -474,6 +469,12 @@ use super::endian::load_endian_u16x4; /// `vld1q_f32`/`vst1q_f32` copy is byte-correct only when the source encoding /// (`BE`) matches the host's native endian, so the kernel falls through to /// the endian-aware `load_f32x4::` slow path otherwise. +/// +/// Same gate applies to the `rgbf32_to_rgba_row` / `rgbf32_to_rgba_u16_row` +/// `vld3q_f32` deinterleave fast path: `vld3q_f32` reads host-native bytes, +/// so it's only correct when the on-disk encoding matches host-native. 
+/// Otherwise the kernel falls through to the endian-aware `load_f32x4::` +/// path with a manual deinterleave. const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); /// Widen 4 half-precision floats (`f16x4`, i.e. 8 bytes starting at `ptr`) diff --git a/src/row/arch/neon/tests/packed_rgb_float.rs b/src/row/arch/neon/tests/packed_rgb_float.rs index ab343747..4c65a5c4 100644 --- a/src/row/arch/neon/tests/packed_rgb_float.rs +++ b/src/row/arch/neon/tests/packed_rgb_float.rs @@ -378,6 +378,67 @@ fn neon_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { } } +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgba_row::` +/// and asserts it produces the same output as the scalar reference run with +/// the same LE-encoded input via ``. +/// +/// On LE hosts this is vacuous (LE-encoded == host-native, both paths agree), +/// but on BE hosts it guards against the historical bug where the kernel used +/// a raw `vld3q_f32` deinterleave in the `BE = false` branch — that reads +/// host-native (BE) bytes from an LE-encoded buffer, mis-decoding the f32s. +/// The fixed kernel uses the `BE == HOST_NATIVE_BE` gate to choose between +/// the raw-load fast path and the endian-aware deinterleave, so it's correct +/// on both LE and BE hosts. +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgba_row_le_input_decodes_correctly_on_any_host() { + for w in [1usize, 4, 7, 15, 16, 17, 31, 33, 1920, 1921] { + let host_native = pseudo_random_rgbf32(w); + let le_in: std::vec::Vec = host_native + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out_simd = std::vec![0u8; w * 4]; + let mut out_scalar = std::vec![0u8; w * 4]; + unsafe { + rgbf32_to_rgba_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf32_to_rgba_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "NEON rgbf32_to_rgba_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +/// Feeds an explicitly LE-encoded fixture through `rgbf32_to_rgba_u16_row::` +/// and asserts it produces the same output as the scalar reference. +/// +/// Same rationale as `neon_rgbf32_to_rgba_row_le_input_decodes_correctly_on_any_host`: +/// the fast `vld3q_f32` deinterleave is only safe when on-disk encoding matches +/// host-native, so the kernel now gates it on `BE == HOST_NATIVE_BE`. 
+#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_rgbf32_to_rgba_u16_row_le_input_decodes_correctly_on_any_host() { + for w in [1usize, 4, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { + let host_native = pseudo_random_rgbf32(w); + let le_in: std::vec::Vec = host_native + .iter() + .map(|v| f32::from_bits(u32::from_le(v.to_bits()))) + .collect(); + let mut out_simd = std::vec![0u16; w * 4]; + let mut out_scalar = std::vec![0u16; w * 4]; + unsafe { + rgbf32_to_rgba_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf32_to_rgba_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "NEON rgbf32_to_rgba_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + // ---- BE parity tests — Rgbf16 ----------------------------------------------- #[test] diff --git a/src/row/arch/x86_avx2/packed_rgb_float.rs b/src/row/arch/x86_avx2/packed_rgb_float.rs index c859b1b7..32daae3f 100644 --- a/src/row/arch/x86_avx2/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/packed_rgb_float.rs @@ -373,14 +373,20 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( // // `_mm256_cvtph_ps` (F16C) widens 8 × f16 (stored as 8 × i16 in a __m128i) // to 8 × f32 in a __m256. We load 16 bytes (8 f16 values) via -// `_mm_loadu_si128` (LE) or `load_endian_u16x8::` (with byte-swap for BE). +// `load_endian_u16x8::` which routes to a host-native pass-through when +// the on-disk encoding matches host-native and to a byte-swap shuffle +// otherwise — correct on both LE and BE hosts. // // `#[target_feature(enable = "avx2,f16c")]` ensures both features are active. /// Widen 8 × f16 (at `ptr`, 16 bytes) to 8 × f32 (returned as `__m256`). /// /// For `BE = true` the f16 values are stored big-endian; bytes are swapped -/// before the F16C widening conversion. +/// before the F16C widening conversion. The historical `BE = false` branch +/// used a raw `_mm_loadu_si128` which assumed LE-encoded input on a LE host; +/// `load_endian_u16x8::` is correct on both LE and BE hosts because it +/// monomorphizes to a no-op load when on-disk encoding matches host-native +/// and to a byte-swap shuffle otherwise. /// /// # Safety /// @@ -390,14 +396,8 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( #[target_feature(enable = "avx2,f16c")] unsafe fn widen_f16x8_avx(ptr: *const half::f16) -> __m256 { unsafe { - if BE { - // Load 16 bytes as u16x8 with byte-swap, then widen to f32x8. - let raw = load_endian_u16x8::(ptr as *const u8); - _mm256_cvtph_ps(raw) - } else { - let raw = _mm_loadu_si128(ptr as *const __m128i); - _mm256_cvtph_ps(raw) - } + let raw = load_endian_u16x8::(ptr as *const u8); + _mm256_cvtph_ps(raw) } } diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs index cdb6f0f8..b4f19774 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs @@ -553,3 +553,148 @@ fn avx2_rgbf16_to_rgb_f16_be_is_byteswap() { ); } } + +// ---- LE-decode regression tests — AVX2 + F16C `widen_f16x8_avx` ------------- +// +// These guard against the historical bug where `widen_f16x8_avx::` used +// a raw `_mm_loadu_si128` to read 8 × f16 — that's a host-native u16 load, so +// on a BE host with LE-encoded input the lanes were mis-decoded as host-BE +// before the F16C widening conversion. 
The fixed helper always routes through +// `load_endian_u16x8::` which monomorphizes to a pass-through on the +// matching host-encoding axis and to a byte-swap shuffle otherwise, so each +// kernel decodes correctly on both LE and BE hosts. + +/// Build an LE-encoded f16 fixture from a host-native one. On LE hosts this is +/// identity; on BE hosts each lane is byte-swapped (so an LE-encoded buffer +/// looks byte-swapped relative to host-native bits). +fn le_rgbf16_from_host(host: &[half::f16]) -> std::vec::Vec { + host + .iter() + .map(|v| half::f16::from_bits(u16::from_le(v.to_bits()))) + .collect() +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u8; w * 3]; + let mut out_scalar = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgb_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgba_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u8; w * 4]; + let mut out_scalar = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgba_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgba_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_u16_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u16; w * 3]; + let mut out_scalar = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgb_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgba_u16_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u16; w * 4]; + let mut out_scalar = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_simd, w); + } + 
scalar::rgbf16_to_rgba_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgba_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_rgbf16_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx2") || !std::arch::is_x86_feature_detected!("f16c") { + return; + } + for w in [1usize, 4, 8, 15, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0.0f32; w * 3]; + let mut out_scalar = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_f32_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX2+F16C rgbf16_to_rgb_f32_row:: must decode LE input to match scalar (width {w})" + ); + } +} diff --git a/src/row/arch/x86_avx512/packed_rgb_float.rs b/src/row/arch/x86_avx512/packed_rgb_float.rs index a80c5128..d4d80802 100644 --- a/src/row/arch/x86_avx512/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/packed_rgb_float.rs @@ -345,8 +345,9 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( // `_mm512_cvtph_ps` (F16C + AVX-512F) widens 16 × f16 (stored as 16 × i16 in // a __m256i) to 16 × f32 in a __m512. // -// For BE: load 32 bytes as __m256i via `load_endian_u16x16::` (which -// byte-swaps each u16 for big-endian inputs), then call `_mm512_cvtph_ps`. +// Load 32 bytes as __m256i via `load_endian_u16x16::` which byte-swaps +// each u16 when the on-disk encoding doesn't match host-native, then call +// `_mm512_cvtph_ps`. // // `#[target_feature(enable = "avx512f,f16c")]` — `f16c` is the half↔single // narrowing/widening extension. @@ -354,7 +355,11 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( /// Widen 16 × f16 (at `ptr`, 32 bytes) to 16 × f32 (returned as `__m512`). /// /// For `BE = true` the f16 values are stored big-endian; bytes are swapped -/// before the F16C widening conversion. +/// before the F16C widening conversion. The historical `BE = false` branch +/// used a raw `_mm256_loadu_si256` which assumed LE-encoded input on a LE +/// host; `load_endian_u16x16::` is correct on both LE and BE hosts because +/// it monomorphizes to a no-op load when on-disk encoding matches host-native +/// and to a byte-swap shuffle otherwise. 
/// /// # Safety /// @@ -364,13 +369,8 @@ pub(crate) unsafe fn rgbf32_to_rgb_f32_row( #[target_feature(enable = "avx512f,f16c")] unsafe fn widen_f16x16_avx512(ptr: *const half::f16) -> __m512 { unsafe { - if BE { - let raw = load_endian_u16x16::(ptr as *const u8); - _mm512_cvtph_ps(raw) - } else { - let raw = _mm256_loadu_si256(ptr as *const __m256i); - _mm512_cvtph_ps(raw) - } + let raw = load_endian_u16x16::(ptr as *const u8); + _mm512_cvtph_ps(raw) } } diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs index e74cc5b2..97172d70 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs @@ -598,3 +598,156 @@ fn avx512_rgbf16_to_rgb_f16_be_is_byteswap() { ); } } + +// ---- LE-decode regression tests — AVX-512 + F16C `widen_f16x16_avx512` ------ +// +// These guard against the historical bug where `widen_f16x16_avx512::` +// used a raw `_mm256_loadu_si256` to read 16 × f16 — that's a host-native u16 +// load, so on a BE host with LE-encoded input the lanes were mis-decoded as +// host-BE before the F16C widening conversion. The fixed helper always routes +// through `load_endian_u16x16::` which monomorphizes to a pass-through on +// the matching host-encoding axis and to a byte-swap shuffle otherwise, so +// each kernel decodes correctly on both LE and BE hosts. + +/// Build an LE-encoded f16 fixture from a host-native one. +fn le_rgbf16_from_host(host: &[half::f16]) -> std::vec::Vec { + host + .iter() + .map(|v| half::f16::from_bits(u16::from_le(v.to_bits()))) + .collect() +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u8; w * 3]; + let mut out_scalar = std::vec![0u8; w * 3]; + unsafe { + rgbf16_to_rgb_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgb_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgba_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u8; w * 4]; + let mut out_scalar = std::vec![0u8; w * 4]; + unsafe { + rgbf16_to_rgba_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgba_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgba_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_u16_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] 
{ + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u16; w * 3]; + let mut out_scalar = std::vec![0u16; w * 3]; + unsafe { + rgbf16_to_rgb_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgb_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgba_u16_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0u16; w * 4]; + let mut out_scalar = std::vec![0u16; w * 4]; + unsafe { + rgbf16_to_rgba_u16_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgba_u16_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgba_u16_row:: must decode LE input to match scalar (width {w})" + ); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_rgbf16_to_rgb_f32_row_le_input_decodes_correctly_on_any_host() { + if !std::arch::is_x86_feature_detected!("avx512bw") + || !std::arch::is_x86_feature_detected!("f16c") + { + return; + } + for w in [1usize, 4, 16, 17, 33, 1920, 1921] { + let host = pseudo_random_rgbf16(w); + let le_in = le_rgbf16_from_host(&host); + let mut out_simd = std::vec![0.0f32; w * 3]; + let mut out_scalar = std::vec![0.0f32; w * 3]; + unsafe { + rgbf16_to_rgb_f32_row::(&le_in, &mut out_simd, w); + } + scalar::rgbf16_to_rgb_f32_row::(&le_in, &mut out_scalar, w); + assert_eq!( + out_simd, out_scalar, + "AVX-512+F16C rgbf16_to_rgb_f32_row:: must decode LE input to match scalar (width {w})" + ); + } +} From 56342c0cc25f25b23da85fdc48052f614de4067e Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 14:09:20 +1200 Subject: [PATCH 08/10] test(be-tier9): gate LE-fixture scalar tests on cfg(target_endian = "little") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 5th-pass review of PR #83 found a medium finding in src/row/scalar/tests.rs: the six Rgbf16 scalar row kernel parity / widen / copy tests at lines 658-743 use host-native fixtures (half::f16::from_f32 / Vec) and call the kernels with `::` (LE-encoded path). On a big-endian host (e.g. miri-sb-s390x), the kernel's `u16::from_le` / `u32::from_le` byte-swap correctly reinterprets the host-native fixture as if it were LE-encoded — producing different bits than the test asserts. Same class as the PR #82 alpha_extract / planar_gbr_high_bit fix in 8f2e329. Fix: add `#[cfg(target_endian = "little")]` alongside `#[test]` on the six tests, plus on the shared `rgbf16_test_inputs` helper to avoid an unused-fn warning on BE hosts. Add a section-header comment block explaining why these tests fail on BE, why the kernel itself remains correct (locked down by the dedicated BE-parity tests in the per-backend `tests/packed_rgb_float.rs` modules that build LE-encoded fixtures via `f32::from_bits(u32::from_le(_))` / `half::f16::from_bits(u16::from_le(_))`), and why byte-symmetric value tests are intentionally NOT gated. 
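To make the failure mode concrete, a worked example with numbers (illustrative
only, not part of the gated tests):

    // 0.5 narrows to the f16 bit pattern 0x3800 on every host.
    let host_native: u16 = half::f16::from_f32(0.5).to_bits(); // 0x3800
    // A kernel instantiated with `BE = false` decodes its input as LE-encoded:
    let decoded = u16::from_le(host_native);
    // LE host: `from_le` is a no-op  -> decoded == 0x3800, the assertion holds.
    // BE host: `from_le` byte-swaps  -> decoded == 0x0038, a different f16.
    // The kernel is behaving correctly; the host-native fixture was simply
    // never LE-encoded to begin with.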
Audit: - The only `tests.rs` file under `src/row/scalar/` is the file in this commit; no other dedicated test files in that directory. - Inline `mod tests` blocks in other scalar source files were audited in PR #82 (`8f2e329`); no new occurrences of the host-native + `` pattern landed in PR #83. - YUV planar high-bit-depth tests (e.g. yuv420p10_*) use host-native u16 too, but their kernels read u16 directly without `from_le` — the byte-format-agnostic contract is documented in `src/row/scalar/yuv_planar_high_bit.rs`. Those tests are correctly NOT gated. Tests gated: 6 (rgbf16_scalar_{rgb,rgba,rgb_u16,rgba_u16}_matches_widen_then_rgbf32, rgbf16_scalar_rgb_f32_matches_element_wise_widen, rgbf16_scalar_rgb_f16_is_copy) + the rgbf16_test_inputs helper. LE-host test count unchanged (gates are no-ops): Before: 2207 passed After: 2207 passed (cargo test --target aarch64-apple-darwin --lib) cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, cargo build --target x86_64-apple-darwin --tests, RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests, cargo build --no-default-features all pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/tests.rs | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/row/scalar/tests.rs b/src/row/scalar/tests.rs index 710f34bc..2bf38a79 100644 --- a/src/row/scalar/tests.rs +++ b/src/row/scalar/tests.rs @@ -641,10 +641,45 @@ fn p416_rgba_u16_gray_alpha_is_ffff() { // values, calls the `rgbf16_to_*_row` kernel, and then calls the matching // `rgbf32_to_*_row` kernel with the widened f32 slice. The outputs must // be identical, proving that widening is the only difference. +// +// LE-host gating rationale (codex 5th-pass review of PR #83): +// +// The fixture builder `rgbf16_test_inputs` produces host-native `half::f16` +// (and widened host-native `f32`) values via `half::f16::from_f32` / +// `to_f32`. The tests then call the kernels with `::`, which means +// "input is LE-encoded — decode to host-native by applying `from_le`". +// +// On a little-endian host, host-native bits and LE-encoded bits are the +// same byte sequence, so `u16::from_le` / `u32::from_le` is a no-op and +// the assertion holds. +// +// On a big-endian host, host-native `f16`/`f32` bits do NOT lay out +// little-endian, so the kernel's `from_le` byte-swap correctly +// reinterprets the host-native fixture as if it were an LE-encoded +// payload — producing a different (corrupted) value than the test +// expects. The kernel itself is correct; this is purely a +// fixture-vs-kernel byte-order mismatch on BE hosts (same class as the +// PR #82 alpha_extract / planar_gbr_high_bit gates in `8f2e329`). +// +// Kernel BE-host correctness is locked down separately by the dedicated +// BE-parity tests in the per-backend `tests/packed_rgb_float.rs` +// modules, which build LE-encoded fixtures via +// `f32::from_bits(u32::from_le(_))` / `half::f16::from_bits(u16::from_le(_))` +// and assert the kernel output matches the original host-native values +// on every host. Those tests are intentionally NOT gated. +// +// The fixture set `[0.0, 1.0, 0.5, 65504.0, 1e-5, -0.5, 2.5, 0.999, 0.001]` +// includes only one byte-symmetric value (`0.0` → `0x00..00`); every +// other value has distinct LE/BE byte layouts, so the parity assertions +// would fail on BE without gating. /// 9 representative half-float inputs: normal [0,1] range, HDR, subnormal- /// ish, negative, and over-range. 
Replicated across 9 pixels × 3 channels /// so that every channel position sees every value at some pixel. +/// +/// LE-only: the six parity / widen / copy tests below are all gated on +/// `target_endian = "little"`, so this helper is unused on BE hosts. +#[cfg(target_endian = "little")] fn rgbf16_test_inputs() -> (Vec, Vec, usize) { let inputs_f32: [f32; 9] = [0.0, 1.0, 0.5, 65504.0, 1e-5, -0.5, 2.5, 0.999, 0.001]; let width = inputs_f32.len(); @@ -656,6 +691,7 @@ fn rgbf16_test_inputs() -> (Vec, Vec, usize) { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" @@ -670,6 +706,7 @@ fn rgbf16_scalar_rgb_matches_widen_then_rgbf32() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" @@ -684,6 +721,7 @@ fn rgbf16_scalar_rgba_matches_widen_then_rgbf32() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" @@ -698,6 +736,7 @@ fn rgbf16_scalar_rgb_u16_matches_widen_then_rgbf32() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" @@ -712,6 +751,7 @@ fn rgbf16_scalar_rgba_u16_matches_widen_then_rgbf32() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" @@ -728,6 +768,7 @@ fn rgbf16_scalar_rgb_f32_matches_element_wise_widen() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" From 4340b152b2f8ca0a93df422eb328ed5d5d6c917a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 14:35:46 +1200 Subject: [PATCH 09/10] test(be-tier9): gate NEON LE-fixture parity tests on cfg(target_endian = "little") (audit follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 6th-pass review of PR #83 found a medium finding in src/row/arch/neon/tests/packed_rgb_float.rs (lines 32-242): the eleven Rgbf32 / Rgbf16 SIMD-vs-scalar parity tests use host-native fixtures (`pseudo_random_rgbf32` / `pseudo_random_rgbf16`) and call the kernels with `::` (LE-encoded path). On a big-endian host (e.g. aarch64-be-linux-gnu, miri-sb-s390x), the kernel's `u32::from_le` / `u16::from_le` byte-swap correctly reinterprets the host-native fixture as if it were LE-encoded — producing different bits than the test asserts. Same class as the PR #82 alpha_extract / planar_gbr_high_bit fix in 8f2e329 and the PR #83 5th-pass scalar gate in 56342c0. For the SIMD-vs-scalar parity assertions (`assert_eq!(out_scalar, out_simd)`), parity holds vacuously on BE because both paths apply the same `from_le` byte-swap to the host-native fixture and produce the same (corrupted) decoded f32/f16. For the two `lossless` host-native equality assertions (`assert_eq!(out_neon, input[..w * 3])` for `rgbf32_to_rgb_f32_row` and `rgbf16_to_rgb_f16_row`), the assertion fails outright on BE since the kernel decodes through `load_f32x4::` / scalar `from_le` to produce a byte-swapped (relative to host-native) result. The kernel itself is correct on BE; this is purely a fixture-vs- kernel byte-order mismatch. 
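For contrast, the LE-encoded fixtures used by the ungated tests follow this
pattern (sketch of the existing helper shape, with a made-up name):

    /// Re-express host-native f32 values so their in-memory bytes are the
    /// little-endian encoding of the originals: identity on LE hosts, a
    /// per-lane byte-swap on BE hosts (`u32::from_le` swaps only on BE).
    fn le_encoded(host_native: &[f32]) -> Vec<f32> {
        host_native
            .iter()
            .map(|v| f32::from_bits(u32::from_le(v.to_bits())))
            .collect()
    }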
NEON BE-host correctness is locked down separately by the dedicated BE-parity tests in this same module (which build LE-encoded fixtures via `byte_swap` helpers and assert ``/`` parity on every host) and by the LE-decode regression tests added in commits c3a6478, dcf40a3, f1161d7, 63fdf8f. Those tests are intentionally NOT gated. Fix: add `#[cfg(target_endian = "little")]` alongside `#[test]` on the eleven NEON parity tests, plus a section-header comment block explaining why these tests fail on BE, why the kernel itself remains correct, and why byte-swap-helper / LE-decode regression tests are intentionally NOT gated. The shared `pseudo_random_rgbf32` / `pseudo_random_rgbf16` helpers are NOT gated because they're also used by the BE-parity / LE-decode tests that compile on every host. Audit of other backend test files (`packed_rgb_float.rs` under x86_sse41, x86_avx2, x86_avx512, wasm_simd128): - SSE4.1: 12 tests with same pattern (1 MXCSR regression + 5 Rgbf32 + 6 Rgbf16). Gated for structural consistency. Already only compiled on `target_arch = "x86_64"` which always implies `target_endian = "little"`, so the gate is functionally a no-op on every supported configuration — but it documents the assumption and matches the audit pattern. - AVX2: 11 tests (5 Rgbf32 + 6 Rgbf16). Same rationale. - AVX-512: 11 tests (5 Rgbf32 + 6 Rgbf16). Same rationale. - wasm_simd128: 11 tests (5 Rgbf32 + 6 Rgbf16). `target_arch = "wasm32"` is LE by spec; gate added for consistency / future-proofing against hypothetical BE wasm. Total tests gated: 56 (11 NEON + 12 SSE4.1 + 11 AVX2 + 11 AVX-512 + 11 wasm_simd128). LE-host test count unchanged (gates are no-ops): Before: 2207 passed After: 2207 passed (cargo test --target aarch64-apple-darwin --lib) cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, cargo build --target x86_64-apple-darwin --tests (0 warnings), RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests (3 pre-existing warnings, not from this commit), cargo build --no-default-features, and cargo check --target s390x-unknown-linux-gnu --lib (BE-host smoke check) all pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/tests/packed_rgb_float.rs | 52 +++++++++++++++++++ .../wasm_simd128/tests/packed_rgb_float.rs | 27 ++++++++++ .../arch/x86_avx2/tests/packed_rgb_float.rs | 28 ++++++++++ .../arch/x86_avx512/tests/packed_rgb_float.rs | 28 ++++++++++ .../arch/x86_sse41/tests/packed_rgb_float.rs | 34 ++++++++++++ 5 files changed, 169 insertions(+) diff --git a/src/row/arch/neon/tests/packed_rgb_float.rs b/src/row/arch/neon/tests/packed_rgb_float.rs index 4c65a5c4..2f2ce7f8 100644 --- a/src/row/arch/neon/tests/packed_rgb_float.rs +++ b/src/row/arch/neon/tests/packed_rgb_float.rs @@ -1,6 +1,47 @@ use super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The five Rgbf32 SIMD-vs-scalar parity tests below (and the six Rgbf16 +// parity tests further down) build their fixtures via +// `pseudo_random_rgbf32` / `pseudo_random_rgbf16`, which produce +// host-native `f32` / `half::f16` values. They then call the kernels +// with `::`, which means "input is LE-encoded — decode to +// host-native by applying `u32::from_le` / `u16::from_le`". +// +// On a little-endian host (e.g. aarch64-apple-darwin), host-native bits +// and LE-encoded bits are the same byte sequence, so `from_le` is a +// no-op and the assertions hold. 
+// +// On a big-endian host (e.g. aarch64-be-linux-gnu), host-native f32 / +// f16 bits do NOT lay out little-endian, so the kernel's `from_le` +// byte-swap correctly reinterprets the host-native fixture as if it +// were an LE-encoded payload — producing a different (corrupted) value +// than the test expects. Because both the scalar and NEON kernels +// apply the same `from_le` byte-swap, the SIMD-vs-scalar parity +// assertions still hold on BE — but they're vacuously testing +// "scalar and SIMD are identically wrong", not kernel correctness. +// The two `lossless` host-native equality assertions +// (`assert_eq!(out_neon, input[..w * 3])` for `rgbf32_to_rgb_f32_row` +// and `rgbf16_to_rgb_f16_row`) would fail outright on BE since the +// kernel decodes through `load_f32x4::` / scalar `from_le` to +// produce a byte-swapped (relative to host-native) result. +// +// The kernel itself is correct on BE; this is purely a fixture-vs- +// kernel byte-order mismatch (same class as the scalar tests gated in +// `56342c0`, and the PR #82 alpha_extract / planar_gbr_high_bit gates +// in `8f2e329`). NEON BE-host correctness is locked down separately +// by the dedicated BE-parity tests in this same module (which build +// LE-encoded fixtures via `byte_swap` helpers and assert +// ``/`` parity on every host) and by the LE-decode +// regression tests added in commits c3a6478, dcf40a3, f1161d7, +// 63fdf8f (which build LE-encoded fixtures via +// `f32::from_bits(u32::from_le(_))` / +// `half::f16::from_bits(u16::from_le(_))` and assert kernel output +// matches scalar on every host). Those tests are intentionally NOT +// gated. /// Generates a row of pseudo-random `f32` RGB samples. Mix of in-range /// `[0, 1]` values, exact `0.5` (round-half-even tie), and HDR > 1.0 @@ -28,6 +69,7 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgb_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { @@ -43,6 +85,7 @@ fn rgbf32_to_rgb_neon_matches_scalar_widths() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgba_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { @@ -58,6 +101,7 @@ fn rgbf32_to_rgba_neon_matches_scalar_widths() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgb_u16_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { @@ -73,6 +117,7 @@ fn rgbf32_to_rgb_u16_neon_matches_scalar_widths() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgba_u16_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { @@ -88,6 +133,7 @@ fn rgbf32_to_rgba_u16_neon_matches_scalar_widths() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgb_f32_neon_matches_scalar_widths() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { @@ -119,6 +165,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -140,6 +187,7 @@ fn 
neon_rgbf16_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -161,6 +209,7 @@ fn neon_rgbf16_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -182,6 +231,7 @@ fn neon_rgbf16_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -203,6 +253,7 @@ fn neon_rgbf16_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -224,6 +275,7 @@ fn neon_rgbf16_to_rgb_f32_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs b/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs index b4e13e50..0b151c52 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb_float.rs @@ -1,6 +1,22 @@ use super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The five Rgbf32 / six Rgbf16 SIMD-vs-scalar parity tests below build +// fixtures via host-native f32 / f16 (`pseudo_random_rgbf32` / +// `pseudo_random_rgbf16`) and call the kernels with `::`. Same +// fixture-vs-kernel byte-order class as the scalar tests gated in +// `56342c0` and the NEON tests gated alongside this commit. +// +// `target_arch = "wasm32"` is little-endian by spec, so the +// `#[cfg(target_endian = "little")]` gate is functionally a no-op on +// every supported configuration. It's added here for structural +// consistency with the NEON / scalar gating pattern. The +// `wasm_rgbf32_to_rgb_f32_row_le_input_decodes_correctly_on_any_host` +// regression test is correctly vacuous on LE hosts (probe-the-bug on +// hypothetical BE wasm) and is intentionally NOT gated. 
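+//
+// A minimal sketch (not the actual test body) of the LE-encoded
+// fixture pattern such ungated LE-decode tests rely on:
+//
+//     // Encode a host-native value so its in-memory bytes are the LE
+//     // encoding on every host:
+//     let le_fixture = f32::from_bits(u32::from_le(0.5_f32.to_bits()));
+//     // LE host: le_fixture == 0.5, so the assertion is vacuous.
+//     // BE host: le_fixture carries the swapped bytes, which the
+//     // kernel's `::<false>` decode must map back to 0.5.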
fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { let n = width * 3; @@ -21,6 +37,7 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgb_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); @@ -35,6 +52,7 @@ fn wasm_rgbf32_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgba_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); @@ -49,6 +67,7 @@ fn wasm_rgbf32_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgb_u16_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); @@ -63,6 +82,7 @@ fn wasm_rgbf32_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgba_u16_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); @@ -77,6 +97,7 @@ fn wasm_rgbf32_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn wasm_rgbf32_to_rgb_f32_matches_scalar() { for w in [1usize, 3, 4, 5, 7, 8, 15, 16, 17, 31, 33, 1920, 1921] { let input = pseudo_random_rgbf32(w); @@ -101,6 +122,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -119,6 +141,7 @@ fn wasm_rgbf16_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -137,6 +160,7 @@ fn wasm_rgbf16_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -155,6 +179,7 @@ fn wasm_rgbf16_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -173,6 +198,7 @@ fn wasm_rgbf16_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -191,6 +217,7 @@ fn wasm_rgbf16_to_rgb_f32_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs index b4f19774..0da58e97 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_float.rs @@ -1,6 +1,23 @@ use super::super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The five Rgbf32 / six Rgbf16 SIMD-vs-scalar parity tests below build +// fixtures via host-native f32 / f16 (`pseudo_random_rgbf32` / +// `pseudo_random_rgbf16`) and call the kernels with `::`. Same +// fixture-vs-kernel byte-order class as the scalar tests gated in +// `56342c0` and the NEON tests gated alongside this commit. +// +// `target_arch = "x86_64"` always implies `target_endian = "little"`, +// so the `#[cfg(target_endian = "little")]` gate is functionally a +// no-op on every supported configuration. 
It's added here for +// structural consistency with the NEON / scalar gating pattern. The +// LE-decode regression tests further down build LE-encoded fixtures +// via `half::f16::from_bits(u16::from_le(_))` and are correctly +// vacuous on LE hosts (probe-the-bug on hypothetical BE), so they +// are intentionally NOT gated. fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { let n = width * 3; @@ -21,6 +38,7 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgb_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -38,6 +56,7 @@ fn avx2_rgbf32_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgba_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -55,6 +74,7 @@ fn avx2_rgbf32_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgb_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -72,6 +92,7 @@ fn avx2_rgbf32_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgba_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -89,6 +110,7 @@ fn avx2_rgbf32_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn avx2_rgbf32_to_rgb_f32_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx2") { return; @@ -116,6 +138,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -137,6 +160,7 @@ fn avx2_rgbf16_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -158,6 +182,7 @@ fn avx2_rgbf16_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -182,6 +207,7 @@ fn avx2_rgbf16_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -206,6 +232,7 @@ fn avx2_rgbf16_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -230,6 +257,7 @@ fn avx2_rgbf16_to_rgb_f32_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs index 97172d70..261ce54c 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_float.rs @@ -1,6 +1,23 @@ use super::super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The five Rgbf32 / six Rgbf16 SIMD-vs-scalar parity tests below build +// fixtures via host-native f32 / f16 (`pseudo_random_rgbf32` / +// `pseudo_random_rgbf16`) and call the kernels with `::`. Same +// fixture-vs-kernel byte-order class as the scalar tests gated in +// `56342c0` and the NEON tests gated alongside this commit. 
+// +// `target_arch = "x86_64"` always implies `target_endian = "little"`, +// so the `#[cfg(target_endian = "little")]` gate is functionally a +// no-op on every supported configuration. It's added here for +// structural consistency with the NEON / scalar gating pattern. The +// LE-decode regression tests further down build LE-encoded fixtures +// via `half::f16::from_bits(u16::from_le(_))` and are correctly +// vacuous on LE hosts (probe-the-bug on hypothetical BE), so they +// are intentionally NOT gated. fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { let n = width * 3; @@ -21,6 +38,7 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgb_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -38,6 +56,7 @@ fn avx512_rgbf32_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgba_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -55,6 +74,7 @@ fn avx512_rgbf32_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgb_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -72,6 +92,7 @@ fn avx512_rgbf32_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgba_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -89,6 +110,7 @@ fn avx512_rgbf32_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn avx512_rgbf32_to_rgb_f32_matches_scalar() { if !std::arch::is_x86_feature_detected!("avx512bw") { return; @@ -116,6 +138,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -139,6 +162,7 @@ fn avx512_rgbf16_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -165,6 +189,7 @@ fn avx512_rgbf16_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -191,6 +216,7 @@ fn avx512_rgbf16_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -217,6 +243,7 @@ fn avx512_rgbf16_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -243,6 +270,7 @@ fn avx512_rgbf16_to_rgb_f32_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" diff --git a/src/row/arch/x86_sse41/tests/packed_rgb_float.rs b/src/row/arch/x86_sse41/tests/packed_rgb_float.rs index 81aded28..3d6f59f1 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb_float.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb_float.rs @@ -1,6 +1,28 @@ use super::super::*; // ---- Tier 9 Rgbf32 SIMD-vs-scalar parity tests -------------------------- +// +// LE-host gating rationale (codex 6th-pass review of PR #83): +// +// The MXCSR regression test plus the five Rgbf32 / six Rgbf16 SIMD-vs- +// scalar parity tests below build their fixtures via host-native f32 +// (`vec![0.5_f32; ...]`, `pseudo_random_rgbf32`) or host-native 
f16 +// (`pseudo_random_rgbf16`) and call the kernels with `::`. Same +// fixture-vs-kernel byte-order class as the scalar tests gated in +// `56342c0` and the NEON tests gated alongside this commit. +// +// `target_arch = "x86_64"` always implies `target_endian = "little"`, +// so the `#[cfg(target_endian = "little")]` gate is functionally a +// no-op on every supported configuration. It's added here for +// structural consistency with the NEON / scalar gating pattern, so an +// audit of "tests that take host-native fixtures and call kernels with +// ``" returns a uniform answer across every backend. (If x86 +// ever adds BE support, the gates are already in place.) +// +// SSE4.1 BE-host correctness — when SSE4.1 is run on a hypothetical BE +// target — is locked down separately by the dedicated BE-parity tests +// in this same module (which build LE-encoded fixtures via byte-swap +// helpers and assert ``/`` parity on every host). // MXCSR access via inline asm. `_mm_getcsr` / `_mm_setcsr` are deprecated // (the deprecation message itself points at inline assembly), so we use the @@ -34,6 +56,7 @@ unsafe fn write_mxcsr(v: u32) { #[test] #[cfg(target_arch = "x86_64")] +#[cfg(target_endian = "little")] #[cfg_attr(miri, ignore = "MXCSR + SIMD intrinsics unsupported by Miri")] fn rgbf32_to_rgb_row_simd_matches_scalar_under_truncate_mxcsr() { if !std::arch::is_x86_feature_detected!("sse4.1") { @@ -83,6 +106,7 @@ fn pseudo_random_rgbf32(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgb_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -100,6 +124,7 @@ fn sse41_rgbf32_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgba_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -117,6 +142,7 @@ fn sse41_rgbf32_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgb_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -134,6 +160,7 @@ fn sse41_rgbf32_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgba_u16_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -151,6 +178,7 @@ fn sse41_rgbf32_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] fn sse41_rgbf32_to_rgb_f32_matches_scalar() { if !std::arch::is_x86_feature_detected!("sse4.1") { return; @@ -178,6 +206,7 @@ fn pseudo_random_rgbf16(width: usize) -> std::vec::Vec { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -200,6 +229,7 @@ fn sse41_rgbf16_to_rgb_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -222,6 +252,7 @@ fn sse41_rgbf16_to_rgba_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -247,6 +278,7 @@ fn sse41_rgbf16_to_rgb_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -272,6 +304,7 @@ fn sse41_rgbf16_to_rgba_u16_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" @@ -297,6 +330,7 @@ fn 
sse41_rgbf16_to_rgb_f32_matches_scalar() { } #[test] +#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" From b915754ec9f4994882b0737dc36fb0a5a51654b3 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 16:04:23 +1200 Subject: [PATCH 10/10] docs+perf(be-tier9): address Copilot review on PR #83 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three Copilot findings addressed (others were already fixed in dcf40a3 / 56342c0 / 4340b15 — Copilot reviewed pre-fix state): 1. **Doc rewording** at `rgbf32_to_rgb_row` — was "bytes swapped relative to the host's native little-endian layout" which is misleading on BE hosts. Reframed in terms of the input buffer's encoded byte order vs the host CPU's native order. 2. **Pass-through perf fast path** for `rgbf32_to_rgb_f32_row` — added `BE == HOST_NATIVE_BE` branch that becomes a single `copy_from_slice` (memcpy) when the encoded byte order matches the host. Restores the pre-BE-aware "lossless pass-through" perf characteristic. The const-generic dead branch is eliminated per monomorphization, so the slow byte-swap path is only emitted for `BE != HOST_NATIVE_BE` callers. 3. **Pass-through perf fast path** for `rgbf16_to_rgb_f16_row` — mirror of (2) for half-precision input. Verified: - cargo test --target aarch64-apple-darwin --lib: 2207 pass - cargo build --target x86_64-apple-darwin --tests: 0 warnings - cargo fmt --check: clean - cargo clippy --all-targets --all-features -D warnings: clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/packed_rgb_float.rs | 41 ++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/row/scalar/packed_rgb_float.rs b/src/row/scalar/packed_rgb_float.rs index 6ff1e2fb..79cd2c4e 100644 --- a/src/row/scalar/packed_rgb_float.rs +++ b/src/row/scalar/packed_rgb_float.rs @@ -100,8 +100,12 @@ fn load_f16(rgb_in: &[half::f16], i: usize) -> half::f16 { /// Converts packed `R, G, B` `f32` input to packed `R, G, B` `u8` /// output. Each `f32` is clamped to `[0, 1]` and scaled by 255. /// -/// When `BE = true` the input `f32` values are encoded big-endian -/// (bytes swapped relative to the host's native little-endian layout). +/// `BE` selects the **encoded byte order** of the input buffer: +/// `false` = LE-encoded on disk/wire, `true` = BE-encoded. This is +/// independent of the host CPU's native byte order — a swap happens +/// only when the encoded order differs from the host CPU's native order +/// (handled internally via `u32::from_le` / `u32::from_be`, both +/// target-endian-aware). /// /// # Panics /// @@ -217,10 +221,19 @@ pub(crate) fn rgbf32_to_rgb_f32_row( ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short"); - // Decode each source f32 from `BE` byte order to host-native. - // `u32::from_be` / `u32::from_le` is target-endian aware: a no-op - // when encoded byte order matches the host, a byte-swap when they - // differ. Output is always host-native f32 on every target. + // Fast path: encoded byte order matches host-native — pure memcpy. + // (LE-encoded data on LE host, or BE-encoded data on BE host.) + // The const-generic `BE == HOST_NATIVE_BE` branch is dead-code- + // eliminated per monomorphization, so this becomes a single + // `copy_from_slice` call with no swap loop. 
+ const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + if BE == HOST_NATIVE_BE { + rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]); + return; + } + // Slow path: encoded byte order differs from host — byte-swap each + // f32 element via `u32::from_be` / `u32::from_le` (the dead branch + // is eliminated since `BE` is const). Output is always host-native. for (dst, src) in rgb_out[..width * 3] .iter_mut() .zip(rgb_in[..width * 3].iter()) @@ -400,10 +413,18 @@ pub(crate) fn rgbf16_to_rgb_f16_row( ) { debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short"); - // Decode each source f16 from `BE` byte order to host-native, mirror - // of `rgbf32_to_rgb_f32_row`. `u16::from_be` / `u16::from_le` is - // target-endian aware: no-op when encoded byte order matches the - // host, swap when they differ. Output is always host-native f16. + // Fast path: encoded byte order matches host-native — pure memcpy. + // Mirrors the `rgbf32_to_rgb_f32_row` fast path; the const-generic + // `BE == HOST_NATIVE_BE` branch is dead-code-eliminated per + // monomorphization, so this becomes a single `copy_from_slice`. + const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + if BE == HOST_NATIVE_BE { + rgb_out[..width * 3].copy_from_slice(&rgb_in[..width * 3]); + return; + } + // Slow path: encoded byte order differs from host — byte-swap each + // f16 element via `u16::from_be` / `u16::from_le`. Output is always + // host-native f16. for (dst, src) in rgb_out[..width * 3] .iter_mut() .zip(rgb_in[..width * 3].iter())