diff --git a/src/frame/packed_rgb_f16.rs b/src/frame/packed_rgb_f16.rs index 1469df86..d1decc2b 100644 --- a/src/frame/packed_rgb_f16.rs +++ b/src/frame/packed_rgb_f16.rs @@ -50,7 +50,7 @@ pub enum Rgbf16FrameError { }, } -/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`). +/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16LE`). /// One plane, 3 × `f16` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -65,6 +65,25 @@ pub enum Rgbf16FrameError { /// `stride` is in **`f16` elements** (≥ `3 * width`), matching the /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The `&[half::f16]` plane is the **LE-encoded byte layout** reinterpreted +/// as `f16`, matching the FFmpeg **`AV_PIX_FMT_RGBF16LE`** pixel-format +/// convention. (FFmpeg's unsuffixed `AV_PIX_FMT_RGBF16` is a *target-endian* +/// alias — `RGBF16LE` on a little-endian host, `RGBF16BE` on a big-endian +/// host — so this contract pins the canonical `*LE` byte order regardless +/// of host endianness.) +/// +/// On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so `&[half::f16]` is also a host-native f16 slice; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding a byte buffer +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgbf16Frame<'a> { rgb: &'a [half::f16], diff --git a/src/frame/packed_rgb_float.rs b/src/frame/packed_rgb_float.rs index d1f79cef..7ff0b660 100644 --- a/src/frame/packed_rgb_float.rs +++ b/src/frame/packed_rgb_float.rs @@ -50,7 +50,7 @@ pub enum Rgbf32FrameError { }, } -/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`). +/// A validated packed **RGBF32** frame. /// One plane, 3 × `f32` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -64,6 +64,29 @@ pub enum Rgbf32FrameError { /// `stride` is in **`f32` elements** (≥ `3 * width`), matching the /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. +/// +/// # Endian contract — **LE-encoded bytes** (`AV_PIX_FMT_RGBF32LE`) +/// +/// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as +/// `f32`. This frame maps to FFmpeg `AV_PIX_FMT_RGBF32LE`. FFmpeg also +/// defines `AV_PIX_FMT_RGBF32BE` and an unsuffixed `AV_PIX_FMT_RGBF32` +/// alias that is **target-endian** (resolves to `RGBF32LE` on LE hosts and +/// `RGBF32BE` on BE hosts). **Callers on a BE host who hold target-endian +/// `AV_PIX_FMT_RGBF32` bytes must convert them to LE before constructing +/// this frame** — otherwise the LE-decode contract here would re-interpret +/// the BE bytes as LE and produce byte-swapped float data. The 4-channel +/// `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` pair follows the same +/// `*LE` convention; this frame uses the analogous LE binding. 
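+///
+/// A minimal sketch of that BE-host pre-conversion (buffer names are
+/// illustrative; the element-wise swap mirrors the byte-swap used by the
+/// parity tests added in this change):
+///
+/// ```ignore
+/// // On a big-endian host, target-endian AV_PIX_FMT_RGBF32 bytes read back as
+/// // host-native f32 values; re-encode each sample as LE before constructing.
+/// let le_plane: Vec<f32> = native_plane
+///     .iter()
+///     .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+///     .collect();
+/// ```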
+/// +/// On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so `&[f32]` is also a host-native float slice; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding a byte buffer +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgbf32Frame<'a> { rgb: &'a [f32], diff --git a/src/frame/planar_gbr_float.rs b/src/frame/planar_gbr_float.rs index d612e715..505a7bd7 100644 --- a/src/frame/planar_gbr_float.rs +++ b/src/frame/planar_gbr_float.rs @@ -147,6 +147,20 @@ const fn check_plane( /// `f32` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are /// preserved bit-exact on lossless pass-through outputs and clamped to /// `[0.0, 1.0]` on integer-output paths. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The three `&[f32]` planes are the **LE-encoded byte layout** reinterpreted +/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format +/// name. On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so the slices are also host-native float slices; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrpf32Frame<'a> { g: &'a [f32], @@ -250,6 +264,20 @@ impl<'a> Gbrpf32Frame<'a> { /// Four full-resolution `f32` planes in **G, B, R, A** order. Alpha is /// real per-pixel; nominal range `[0.0, 1.0]` (opaque = 1.0). Stride is /// in `f32` elements. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The four `&[f32]` planes are the **LE-encoded byte layout** reinterpreted +/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format +/// name. On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so the slices are also host-native float slices; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrapf32Frame<'a> { g: &'a [f32], @@ -372,6 +400,20 @@ impl<'a> Gbrapf32Frame<'a> { /// Three full-resolution [`half::f16`] planes in **G, B, R** order. Stride /// is in `f16` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are /// permitted (saturation to `+Inf` occurs on f16→f32 narrowing paths). +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The three `&[half::f16]` planes are the **LE-encoded byte layout** +/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in +/// the format name. 
On a little-endian host (every CI runner today) LE +/// bytes _are_ host-native, so the slices are also host-native f16 slices; +/// on a big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle this +/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrpf16Frame<'a> { g: &'a [half::f16], @@ -475,6 +517,20 @@ impl<'a> Gbrpf16Frame<'a> { /// Four full-resolution [`half::f16`] planes in **G, B, R, A** order. /// Alpha is real per-pixel; nominal range `[0.0, 1.0]`. Stride is in /// `f16` elements. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The four `&[half::f16]` planes are the **LE-encoded byte layout** +/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in +/// the format name. On a little-endian host (every CI runner today) LE +/// bytes _are_ host-native, so the slices are also host-native f16 slices; +/// on a big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle this +/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrapf16Frame<'a> { g: &'a [half::f16], diff --git a/src/row/arch/neon/alpha_extract.rs b/src/row/arch/neon/alpha_extract.rs index ffb04e6a..5135b3d6 100644 --- a/src/row/arch/neon/alpha_extract.rs +++ b/src/row/arch/neon/alpha_extract.rs @@ -116,7 +116,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -154,7 +154,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -357,7 +357,7 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xFEED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -375,7 +375,7 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0x1337); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/wasm_simd128/alpha_extract.rs b/src/row/arch/wasm_simd128/alpha_extract.rs index b999b618..bee7633d 100644 --- a/src/row/arch/wasm_simd128/alpha_extract.rs +++ b/src/row/arch/wasm_simd128/alpha_extract.rs @@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + 
scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -226,7 +226,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -518,7 +518,7 @@ mod tests { unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w); } - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -538,7 +538,7 @@ mod tests { unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w); } - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_avx2/alpha_extract.rs b/src/row/arch/x86_avx2/alpha_extract.rs index 1ebe97c1..2c58d3e9 100644 --- a/src/row/arch/x86_avx2/alpha_extract.rs +++ b/src/row/arch/x86_avx2/alpha_extract.rs @@ -213,7 +213,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -636,7 +636,7 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xFEED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -657,7 +657,7 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0x1337); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_avx512/alpha_extract.rs b/src/row/arch/x86_avx512/alpha_extract.rs index 203e08e3..45743fb5 100644 --- a/src/row/arch/x86_avx512/alpha_extract.rs +++ b/src/row/arch/x86_avx512/alpha_extract.rs @@ -206,7 +206,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -604,7 +604,7 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xFEED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut 
rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -627,7 +627,7 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0x1337); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_sse41/alpha_extract.rs b/src/row/arch/x86_sse41/alpha_extract.rs index d327e299..4b1d800b 100644 --- a/src/row/arch/x86_sse41/alpha_extract.rs +++ b/src/row/arch/x86_sse41/alpha_extract.rs @@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -227,7 +227,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -521,7 +521,7 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xFEED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -542,7 +542,7 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0x1337); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/dispatch/alpha_extract.rs b/src/row/dispatch/alpha_extract.rs index 00ecb61e..1e0351c5 100644 --- a/src/row/dispatch/alpha_extract.rs +++ b/src/row/dispatch/alpha_extract.rs @@ -95,17 +95,26 @@ pub(crate) fn copy_alpha_packed_u8x4_at_3( /// Runtime-dispatched α-extract for AYUV64 → u8 RGBA: gather α from /// `packed[0 + 4*n]` (u16) into `rgba_out[3 + 4*n]` (u8) via `>> 8`. /// -/// Selects the highest available SIMD backend; falls back to scalar. -/// When `use_simd` is `false`, calls scalar directly. +/// `BE` selects the source `packed` plane byte order (`false` = LE on +/// disk/wire — matching the LE-encoded `Ayuv64Frame` contract; +/// `true` = BE). Like [`copy_alpha_plane_u16_to_u8`], the existing SIMD +/// helpers use host-native u16 loads with no `from_le` / `from_be` +/// normalisation, so SIMD is only correct on LE host processing LE +/// source. The dispatcher computes +/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and falls +/// back to the target-endian-aware scalar in every other quadrant. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( +pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( packed: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool, ) { - if !use_simd { - return scalar::copy_alpha_packed_u16x4_to_u8_at_0(packed, rgba_out, width); + // SIMD α-extract helpers use host-native u16 loads. Force scalar in + // any quadrant where source byte order doesn't match host byte order. 
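+    // Quadrants (source byte order × host byte order): only (LE, LE) may take
+    // the SIMD path; (LE, BE), (BE, LE), and (BE, BE) all fall back to the
+    // endian-aware scalar kernel below.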
+ let safe_for_simd = !BE && cfg!(target_endian = "little"); + if !safe_for_simd || !use_simd { + return scalar::copy_alpha_packed_u16x4_to_u8_at_0::<BE>(packed, rgba_out, width); } cfg_select! { target_arch = "aarch64" => { @@ -141,7 +150,7 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( }, _ => {} } - scalar::copy_alpha_packed_u16x4_to_u8_at_0(packed, rgba_out, width); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::<BE>(packed, rgba_out, width); } // --------------------------------------------------------------------------- @@ -152,17 +161,19 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( /// `packed[0 + 4*n]` (u16) into `rgba_out[3 + 4*n]` (u16). No depth /// conversion. /// -/// Selects the highest available SIMD backend; falls back to scalar. -/// When `use_simd` is `false`, calls scalar directly. +/// `BE` selects the source `packed` plane byte order. See +/// [`copy_alpha_packed_u16x4_to_u8_at_0`] for the rationale: SIMD is +/// only correct on LE host with LE source; scalar is target-endian-aware. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_packed_u16x4_at_0( +pub(crate) fn copy_alpha_packed_u16x4_at_0<const BE: bool>( packed: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool, ) { - if !use_simd { - return scalar::copy_alpha_packed_u16x4_at_0(packed, rgba_out, width); + let safe_for_simd = !BE && cfg!(target_endian = "little"); + if !safe_for_simd || !use_simd { + return scalar::copy_alpha_packed_u16x4_at_0::<BE>(packed, rgba_out, width); } cfg_select! { target_arch = "aarch64" => { @@ -198,7 +209,7 @@ pub(crate) fn copy_alpha_packed_u16x4_at_0( }, _ => {} } - scalar::copy_alpha_packed_u16x4_at_0(packed, rgba_out, width); + scalar::copy_alpha_packed_u16x4_at_0::<BE>(packed, rgba_out, width); } // --------------------------------------------------------------------------- diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs index 6c77346a..6fc664ac 100644 --- a/src/row/scalar/alpha_extract.rs +++ b/src/row/scalar/alpha_extract.rs @@ -27,7 +27,16 @@ pub(crate) fn copy_alpha_packed_u8x4_at_3(packed: &[u8], rgba_out: &mut [u8], wi /// into `rgba_out[3 + 4*n]` (u8 element) with depth-conv `>> 8`. /// /// AYUV64 layout per pixel: `[A(16), Y(16), U(16), V(16)]` — α is at slot 0. -pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_AYUV64LE` per the Frame +/// contract; `true` = BE on disk/wire). Each raw u16 is normalised to +/// host-native order via `u16::from_le` / `u16::from_be` before the +/// `>> 8` depth conversion. On a host whose endianness matches the +/// source the conversion compiles to a no-op; otherwise it is a +/// `swap_bytes`. Without this a BE host (e.g., s390x) processing the +/// LE-encoded Frame would emit a byte-reversed α byte. +pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0<const BE: bool>( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -35,17 +44,34 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( debug_assert!(packed.len() >= width * 4, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (packed[n * 4] >> 8) as u8; + let raw = if BE { + u16::from_be(packed[n * 4]) + } else { + u16::from_le(packed[n * 4]) + }; + rgba_out[n * 4 + 3] = (raw >> 8) as u8; } } /// AYUV64 → u16 RGBA: gather α from `packed[0 + 4*n]` (u16) into /// `rgba_out[3 + 4*n]` (u16). No depth conversion.
-pub(crate) fn copy_alpha_packed_u16x4_at_0(packed: &[u16], rgba_out: &mut [u16], width: usize) { +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane. +/// See [`copy_alpha_packed_u16x4_to_u8_at_0`] for the full rationale. +pub(crate) fn copy_alpha_packed_u16x4_at_0( + packed: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = packed[n * 4]; + let raw = if BE { + u16::from_be(packed[n * 4]) + } else { + u16::from_le(packed[n * 4]) + }; + rgba_out[n * 4 + 3] = raw; } } @@ -58,8 +84,15 @@ pub(crate) fn copy_alpha_packed_u16x4_at_0(packed: &[u16], rgba_out: &mut [u16], /// Used in Strategy A+: after `expand_rgb_to_rgba_row` fills the RGBA buffer /// with a forced-opaque alpha, this helper overwrites only the α slot with the /// real source alpha, depth-converted to u8. +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_RGBA64LE` / +/// `AV_PIX_FMT_BGRA64LE` per the Frame contract; `true` = BE). Each raw +/// u16 is normalised to host-native order via `u16::from_le` / +/// `u16::from_be` before the `>> 8` depth conversion. Without this a BE +/// host processing the LE-encoded Frame would emit a byte-reversed α byte. #[allow(dead_code)] // wired in sinker Task 10 -pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3( +pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -67,7 +100,12 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3( debug_assert!(packed.len() >= width * 4, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (packed[n * 4 + 3] >> 8) as u8; + let raw = if BE { + u16::from_be(packed[n * 4 + 3]) + } else { + u16::from_le(packed[n * 4 + 3]) + }; + rgba_out[n * 4 + 3] = (raw >> 8) as u8; } } @@ -77,12 +115,24 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3( /// Used in Strategy A+: after `expand_rgb_u16_to_rgba_u16_row` fills the /// RGBA buffer, this helper overwrites only the α slot with the real source /// alpha at native 16-bit depth. +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane. +/// See [`copy_alpha_packed_u16x4_to_u8_at_3`] for the full rationale. #[allow(dead_code)] // wired in sinker Task 10 -pub(crate) fn copy_alpha_packed_u16x4_at_3(packed: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn copy_alpha_packed_u16x4_at_3( + packed: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed too short"); debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short"); for n in 0..width { - rgba_u16_out[n * 4 + 3] = packed[n * 4 + 3]; + let raw = if BE { + u16::from_be(packed[n * 4 + 3]) + } else { + u16::from_le(packed[n * 4 + 3]) + }; + rgba_u16_out[n * 4 + 3] = raw; } } @@ -195,21 +245,49 @@ pub(crate) fn copy_alpha_ya_u8(packed: &[u8], rgba_out: &mut [u8], width: usize) /// into `rgba_out[3 + 4*n]` (u8). /// /// Ya16 layout per pixel: `[Y(16), A(16)]` — α is at odd u16 offsets (slot 1). -pub(crate) fn copy_alpha_ya_u16_to_u8(packed: &[u16], rgba_out: &mut [u8], width: usize) { +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_YA16LE` per the +/// `Ya16Frame` contract; `true` = BE). 
Each raw u16 is normalised to +/// host-native order via `u16::from_le` / `u16::from_be` before the +/// `>> 8` depth conversion. Without this a BE host processing the +/// LE-encoded Frame would emit a byte-reversed α byte. +pub(crate) fn copy_alpha_ya_u16_to_u8( + packed: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (packed[n * 2 + 1] >> 8) as u8; + let raw = if BE { + u16::from_be(packed[n * 2 + 1]) + } else { + u16::from_le(packed[n * 2 + 1]) + }; + rgba_out[n * 4 + 3] = (raw >> 8) as u8; } } /// Ya16 → u16 RGBA: gather A from `packed[1 + 2*n]` (u16) into /// `rgba_out[3 + 4*n]` (u16). No depth conversion. -pub(crate) fn copy_alpha_ya_u16(packed: &[u16], rgba_out: &mut [u16], width: usize) { +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane. +/// See [`copy_alpha_ya_u16_to_u8`] for the full rationale. +pub(crate) fn copy_alpha_ya_u16( + packed: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = packed[n * 2 + 1]; + let raw = if BE { + u16::from_be(packed[n * 2 + 1]) + } else { + u16::from_le(packed[n * 2 + 1]) + }; + rgba_out[n * 4 + 3] = raw; } } @@ -218,13 +296,42 @@ pub(crate) fn copy_alpha_ya_u16(packed: &[u16], rgba_out: &mut [u16], width: usi /// Each α sample is clamped to `[0.0, 1.0]`, multiplied by 255, and rounded /// with round-half-up (`+ 0.5` then truncate). Only slot 3 of every 4-element /// tuple is written; R, G, B slots are untouched. +/// +/// `BE` selects the **byte order** of the encoded source α plane: +/// `false` = LE on disk/wire (e.g., `AV_PIX_FMT_GBRAPF32LE` per the +/// `Gbrapf32Frame` contract; this also matches the case where the f32 +/// scratch is already host-native and the host is little-endian); +/// `true` = BE on disk/wire (or host-native scratch on a BE host). Each +/// raw f32 is bit-normalised to host-native order via +/// `f32::from_bits(u32::from_le(bits))` (or `from_be`) BEFORE the clamp / +/// scale / round-half-up. Without this a BE host (e.g., s390x) processing +/// the LE-encoded Frame would clamp byte-swapped garbage values, typically +/// producing α = 0 or α = 255 regardless of intent. Mirrors the +/// `copy_alpha_plane_u16_to_u8::` endian pattern. +/// +/// Routing pattern at the sinker layer: +/// - **Direct-Frame paths** (e.g., `Gbrapf32Frame` → α plane consumed directly) +/// pass `BE = false` (data is LE-encoded per the unified Frame contract). +/// - **Post-widen paths** (e.g., `Gbrapf16Frame` widened-to-f32 scratch) pass +/// `BE = HOST_NATIVE_BE` (scratch is host-native f32 after widen). // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). 
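+/// Illustrative call-site sketch of that routing (buffer names are
+/// placeholders; the const-generic values are the point):
+///
+/// ```ignore
+/// // Direct `Gbrapf32Frame` α plane — LE-encoded per the Frame contract:
+/// copy_alpha_plane_f32_to_u8::<false>(alpha_row, rgba_row, w);
+/// // f16→f32 widened scratch is already host-native, so pass the host's order:
+/// copy_alpha_plane_f32_to_u8::<{ cfg!(target_endian = "big") }>(widened_alpha, rgba_row, w);
+/// ```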
#[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32_to_u8(alpha: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) fn copy_alpha_plane_f32_to_u8( + alpha: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (alpha[n].clamp(0.0, 1.0) * 255.0 + 0.5) as u8; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + let v = f32::from_bits(host_bits); + rgba_out[n * 4 + 3] = (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8; } } @@ -232,13 +339,28 @@ pub(crate) fn copy_alpha_plane_f32_to_u8(alpha: &[f32], rgba_out: &mut [u8], wid /// /// Each α sample is clamped to `[0.0, 1.0]`, multiplied by 65535, and rounded /// with round-half-up. Only slot 3 of every 4-element tuple is written. +/// +/// `BE` selects the **byte order** of the encoded source α plane. +/// See [`copy_alpha_plane_f32_to_u8`] for the full rationale and the +/// direct-Frame vs post-widen routing pattern. // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). #[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32_to_u16(alpha: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) fn copy_alpha_plane_f32_to_u16( + alpha: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (alpha[n].clamp(0.0, 1.0) * 65535.0 + 0.5) as u16; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + let v = f32::from_bits(host_bits); + rgba_out[n * 4 + 3] = (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16; } } @@ -247,13 +369,30 @@ pub(crate) fn copy_alpha_plane_f32_to_u16(alpha: &[f32], rgba_out: &mut [u16], w /// /// No clamping, no rounding — HDR values, NaN, and Inf in the α plane are /// preserved bit-exact. Only slot 3 of every 4-element tuple is written. +/// The output α is always written in **host-native** byte order (the +/// downstream consumer of `&[f32]` expects host-native floats); this helper's +/// `BE` only describes the **input** plane. +/// +/// `BE` selects the **byte order** of the encoded source α plane. +/// See [`copy_alpha_plane_f32_to_u8`] for the full rationale and the +/// direct-Frame vs post-widen routing pattern. // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). 
#[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32(alpha: &[f32], rgba_out: &mut [f32], width: usize) { +pub(crate) fn copy_alpha_plane_f32( + alpha: &[f32], + rgba_out: &mut [f32], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = alpha[n]; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + rgba_out[n * 4 + 3] = f32::from_bits(host_bits); } } @@ -270,21 +409,56 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_to_u8_at_0_depth_converts_correctly() { let packed: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301,]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba, 2); + copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0x12, 1, 1, 1, 0xAB]); } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_at_0_preserves_native_u16() { let packed: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301,]; let mut rgba = std::vec![1u16; 8]; - copy_alpha_packed_u16x4_at_0(&packed, &mut rgba, 2); + copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0x1234, 1, 1, 1, 0xABCD]); } + /// BE parity for AYUV64 alpha-at-slot-0 → u8 RGBA: byte-swapping the + /// packed source and toggling the `BE` flag must yield byte-for-byte + /// identical output. Locks down the corruption where a BE host + /// processing the LE-encoded Frame contract would emit a byte-reversed α. + #[test] + fn copy_alpha_packed_u16x4_to_u8_at_0_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 8]; + let mut rgba_be = std::vec![1u8; 8]; + copy_alpha_packed_u16x4_to_u8_at_0::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_to_u8_at_0::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for AYUV64 alpha-at-slot-0 → u16 RGBA. + #[test] + fn copy_alpha_packed_u16x4_at_0_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 8]; + let mut rgba_be = std::vec![7u16; 8]; + copy_alpha_packed_u16x4_at_0::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_at_0::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + #[test] fn copy_alpha_plane_u8_scatters_into_rgba_alpha_slot() { let alpha = std::vec![50u8, 60, 70, 80]; @@ -423,28 +597,71 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_ya_u16_to_u8_depth_converts_via_high_byte() { // Ya16 packed → u8 RGBA: α >> 8 selects the high byte. 
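+        // On a LE host `::<false>` makes `u16::from_le` a no-op, so host-native
+        // literals stand in for LE-encoded samples — hence the
+        // `#[cfg(target_endian = "little")]` gate on this test.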
let packed: std::vec::Vec = std::vec![0x1234, 0xABCD, 0x5678, 0xFF00]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_ya_u16_to_u8(&packed, &mut rgba, 2); + copy_alpha_ya_u16_to_u8::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xAB, 1, 1, 1, 0xFF]); } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_ya_u16_preserves_native_u16() { let packed: std::vec::Vec = std::vec![0x1234, 0xABCD, 0x5678, 0x9ABC]; let mut rgba = std::vec![1u16; 8]; - copy_alpha_ya_u16(&packed, &mut rgba, 2); + copy_alpha_ya_u16::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xABCD, 1, 1, 1, 0x9ABC]); } + /// BE parity for Ya16 → u8 RGBA: byte-swapping the packed source and + /// toggling the `BE` flag must yield byte-for-byte identical output. + /// Locks down the codex-flagged corruption where a BE host (e.g. + /// s390x) processing the LE-encoded `Ya16Frame` would otherwise emit + /// a byte-reversed α byte under the combined `with_rgb + with_rgba` + /// Strategy A+ path. #[test] + fn copy_alpha_ya_u16_to_u8_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 0xABCD, 0x5678, 0xFF00, 0x0001, 0x00FF]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 12]; + let mut rgba_be = std::vec![1u8; 12]; + copy_alpha_ya_u16_to_u8::(&packed_le, &mut rgba_le, 3); + copy_alpha_ya_u16_to_u8::(&packed_be, &mut rgba_be, 3); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for Ya16 → u16 RGBA (16-bit α path). + #[test] + fn copy_alpha_ya_u16_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 0xABCD, 0x5678, 0x9ABC, 0x0001, 0x00FF]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 12]; + let mut rgba_be = std::vec![7u16; 12]; + copy_alpha_ya_u16::(&packed_le, &mut rgba_le, 3); + copy_alpha_ya_u16::(&packed_be, &mut rgba_be, 3); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// On a LE host, `BE = false` makes the bit-normalize a no-op, so passing + /// host-native `f32` literals as if they were already LE-encoded reproduces + /// the original (pre-endian-aware) clamp+scale semantics. BE-host scalar + /// correctness is locked down by the `*_be_parity_with_swapped_buffer` + /// tests below. + #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_to_u8_clamps_and_scales() { // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 128, 255, 255, 0] in slot 3. let alpha = vec![0.0f32, 0.5, 1.0, 1.5, -0.1]; let mut rgba = vec![1u8; 20]; - copy_alpha_plane_f32_to_u8(&alpha, &mut rgba, 5); + copy_alpha_plane_f32_to_u8::(&alpha, &mut rgba, 5); // R, G, B slots (0, 1, 2) must be untouched; slot 3 has the alpha. assert_eq!(rgba[3], 0, "alpha[0]=0.0 → 0"); assert_eq!(rgba[7], 128, "alpha[1]=0.5 → 128"); @@ -458,11 +675,12 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_to_u16_clamps_and_scales() { // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 32768, 65535, 65535, 0] in slot 3. 
let alpha = vec![0.0f32, 0.5, 1.0, 1.5, -0.1]; let mut rgba = vec![1u16; 20]; - copy_alpha_plane_f32_to_u16(&alpha, &mut rgba, 5); + copy_alpha_plane_f32_to_u16::(&alpha, &mut rgba, 5); assert_eq!(rgba[3], 0, "alpha[0]=0.0 → 0"); assert_eq!(rgba[7], 32768, "alpha[1]=0.5 → 32768"); assert_eq!(rgba[11], 65535, "alpha[2]=1.0 → 65535"); @@ -475,11 +693,12 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_lossless_passthrough() { // HDR (2.5), NaN, Inf, negative all preserved bit-exact. let alpha = vec![2.5f32, f32::NAN, f32::INFINITY, -1.0]; let mut rgba = vec![0.0f32; 16]; - copy_alpha_plane_f32(&alpha, &mut rgba, 4); + copy_alpha_plane_f32::(&alpha, &mut rgba, 4); assert_eq!(rgba[3], 2.5, "HDR 2.5 preserved"); assert!(rgba[7].is_nan(), "NaN preserved"); assert!(rgba[11].is_infinite() && rgba[11] > 0.0, "+Inf preserved"); @@ -490,43 +709,142 @@ mod tests { assert_eq!(rgba[2], 0.0); } + /// BE parity for Gbrapf32 → u8 RGBA: byte-swapping the bits of every + /// f32 in the source α plane and toggling `BE` must produce identical + /// output. Locks down the codex 3rd-pass finding where a BE host + /// processing the LE-encoded `Gbrapf32Frame` would clamp byte-swapped + /// garbage values (typical result: α = 0 or α = 255 regardless of intent). + #[test] + fn copy_alpha_plane_f32_to_u8_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 1.5, -0.1, 0.123]; + let alpha_be: std::vec::Vec = alpha_le + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect(); + let mut rgba_le = std::vec![1u8; 32]; + let mut rgba_be = std::vec![1u8; 32]; + copy_alpha_plane_f32_to_u8::(&alpha_le, &mut rgba_le, 8); + copy_alpha_plane_f32_to_u8::(&alpha_be, &mut rgba_be, 8); + assert_eq!( + rgba_le, rgba_be, + "BE flag + bit-swapped buffer must match LE path" + ); + } + + /// BE parity for Gbrapf32 → u16 RGBA. + #[test] + fn copy_alpha_plane_f32_to_u16_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 1.5, -0.1, 0.123]; + let alpha_be: std::vec::Vec = alpha_le + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect(); + let mut rgba_le = std::vec![7u16; 32]; + let mut rgba_be = std::vec![7u16; 32]; + copy_alpha_plane_f32_to_u16::(&alpha_le, &mut rgba_le, 8); + copy_alpha_plane_f32_to_u16::(&alpha_be, &mut rgba_be, 8); + assert_eq!( + rgba_le, rgba_be, + "BE flag + bit-swapped buffer must match LE path" + ); + } + + /// BE parity for Gbrapf32 → f32 RGBA (lossless α pass-through). The + /// output α must equal the host-native f32 bit-pattern of the LE source + /// regardless of the host's byte order. NaN bit-patterns may differ + /// across hardware after a `from_bits → to_bits` round-trip, so we + /// compare on the bit representation of finite, non-NaN samples only. 
+ #[test] + fn copy_alpha_plane_f32_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = + std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 2.5, -1.0, f32::INFINITY]; + let alpha_be: std::vec::Vec = alpha_le + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect(); + let mut rgba_le = std::vec![0.0f32; 32]; + let mut rgba_be = std::vec![0.0f32; 32]; + copy_alpha_plane_f32::(&alpha_le, &mut rgba_le, 8); + copy_alpha_plane_f32::(&alpha_be, &mut rgba_be, 8); + let bits_le: std::vec::Vec = rgba_le.iter().map(|v| v.to_bits()).collect(); + let bits_be: std::vec::Vec = rgba_be.iter().map(|v| v.to_bits()).collect(); + assert_eq!( + bits_le, bits_be, + "BE flag + bit-swapped buffer must match LE path bit-for-bit" + ); + } + // ---- copy_alpha_packed_u16x4_to_u8_at_3 / copy_alpha_packed_u16x4_at_3 -- /// Alpha at slot 3 is depth-converted >> 8 and written to rgba_out[3 + 4*n]. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_to_u8_at_3_narrows_correctly() { let packed: std::vec::Vec = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_packed_u16x4_to_u8_at_3(&packed, &mut rgba, 2); + copy_alpha_packed_u16x4_to_u8_at_3::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xAB, 1, 1, 1, 0x12]); } /// Alpha at slot 3 is copied verbatim (no depth conversion). #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_at_3_copies_verbatim() { let packed: std::vec::Vec = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234]; let mut rgba_u16 = std::vec![1u16; 8]; - copy_alpha_packed_u16x4_at_3(&packed, &mut rgba_u16, 2); + copy_alpha_packed_u16x4_at_3::(&packed, &mut rgba_u16, 2); assert_eq!(rgba_u16, std::vec![1, 1, 1, 0xABFF, 1, 1, 1, 0x1234]); } /// Only the alpha slot (index 3) is overwritten; RGB slots [0..3] are untouched. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_to_u8_at_3_touches_only_alpha_slot() { let packed: std::vec::Vec = std::vec![0, 0, 0, 0xFFFF]; let mut rgba = std::vec![42u8; 4]; - copy_alpha_packed_u16x4_to_u8_at_3(&packed, &mut rgba, 1); + copy_alpha_packed_u16x4_to_u8_at_3::(&packed, &mut rgba, 1); assert_eq!(rgba[..3], [42, 42, 42]); assert_eq!(rgba[3], 0xFF); } /// Only the alpha slot (index 3) is overwritten; RGB slots [0..3] are untouched. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_at_3_touches_only_alpha_slot() { let packed: std::vec::Vec = std::vec![0, 0, 0, 0xBEEF]; let mut rgba_u16 = std::vec![99u16; 4]; - copy_alpha_packed_u16x4_at_3(&packed, &mut rgba_u16, 1); + copy_alpha_packed_u16x4_at_3::(&packed, &mut rgba_u16, 1); assert_eq!(rgba_u16[..3], [99, 99, 99]); assert_eq!(rgba_u16[3], 0xBEEF); } + + /// BE parity for Rgba64 / Bgra64 alpha-at-slot-3 → u8 RGBA. + #[test] + fn copy_alpha_packed_u16x4_to_u8_at_3_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 8]; + let mut rgba_be = std::vec![1u8; 8]; + copy_alpha_packed_u16x4_to_u8_at_3::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_to_u8_at_3::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for Rgba64 / Bgra64 alpha-at-slot-3 → u16 RGBA. 
+ #[test] + fn copy_alpha_packed_u16x4_at_3_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 8]; + let mut rgba_be = std::vec![7u16; 8]; + copy_alpha_packed_u16x4_at_3::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_at_3::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } } diff --git a/src/row/scalar/planar_gbr_f16.rs b/src/row/scalar/planar_gbr_f16.rs index 3b9ba779..766c310b 100644 --- a/src/row/scalar/planar_gbr_f16.rs +++ b/src/row/scalar/planar_gbr_f16.rs @@ -198,16 +198,34 @@ pub(crate) fn gbrapf16_to_rgba_f16_row( /// Only slot 3 of every 4-element tuple is written; R, G, B slots are /// untouched. Lossless — HDR, NaN, and Inf in the α plane are preserved /// bit-exact. +/// +/// `BE` selects the **byte order** of the encoded source α plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_GBRAPF16LE` per the +/// `Gbrapf16Frame` contract; `true` = BE on disk/wire). Each raw f16 is +/// bit-normalised to host-native order via `u16::from_le` / `u16::from_be` +/// BEFORE the slot-3 write so the output buffer always carries host-native +/// `half::f16` (matching the rest of the f16 row kernels). Without this a +/// BE host processing the LE-encoded Frame would emit byte-reversed α bits. // Only called from the `mod tests` block which is gated on `feature = "std"`. // Under `cargo test --no-default-features` the test module is compiled out, // leaving the function without callers; suppress the resulting lint there. #[cfg_attr(not(feature = "std"), expect(dead_code))] #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_plane_f16(alpha: &[half::f16], rgba_out: &mut [half::f16], width: usize) { +pub(crate) fn copy_alpha_plane_f16( + alpha: &[half::f16], + rgba_out: &mut [half::f16], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = alpha[n]; + let raw = alpha[n].to_bits(); + let host_bits = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + rgba_out[n * 4 + 3] = half::f16::from_bits(host_bits); } } @@ -414,11 +432,12 @@ mod tests { miri, ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" )] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f16_only_writes_alpha_slot() { let alpha = vec![half::f16::from_f32(0.7), half::f16::from_f32(0.3)]; let sentinel = half::f16::from_f32(0.1); let mut rgba = vec![sentinel; 8]; - copy_alpha_plane_f16(&alpha, &mut rgba, 2); + copy_alpha_plane_f16::(&alpha, &mut rgba, 2); // Only slot 3 written; R, G, B slots (0, 1, 2) must be untouched. assert_eq!(rgba[0], sentinel, "R slot 0 untouched"); assert_eq!(rgba[1], sentinel, "G slot 0 untouched"); @@ -429,4 +448,34 @@ mod tests { assert_eq!(rgba[6], sentinel, "B slot 1 untouched"); assert_eq!(rgba[7], half::f16::from_f32(0.3), "A slot 1"); } + + /// BE parity for `copy_alpha_plane_f16`: byte-swapping the bits of every + /// f16 in the source α plane and toggling `BE` must produce identical + /// output. Mirrors the f32 alpha-patch endian-aware fix. 
+ #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn copy_alpha_plane_f16_be_parity_with_swapped_buffer() { + let alpha_le = vec![ + half::f16::from_f32(0.0), + half::f16::from_f32(0.25), + half::f16::from_f32(0.5), + half::f16::from_f32(1.0), + half::f16::from_f32(2.5), + half::f16::from_f32(-1.0), + ]; + let alpha_be = be_encode_f16(&alpha_le); + let mut rgba_le = vec![half::f16::ZERO; 24]; + let mut rgba_be = vec![half::f16::ZERO; 24]; + copy_alpha_plane_f16::(&alpha_le, &mut rgba_le, 6); + copy_alpha_plane_f16::(&alpha_be, &mut rgba_be, 6); + let bits_le: std::vec::Vec = rgba_le.iter().map(|v| v.to_bits()).collect(); + let bits_be: std::vec::Vec = rgba_be.iter().map(|v| v.to_bits()).collect(); + assert_eq!( + bits_le, bits_be, + "BE flag + bit-swapped buffer must match LE path bit-for-bit" + ); + } } diff --git a/src/sinker/mixed/ayuv64.rs b/src/sinker/mixed/ayuv64.rs index 7782994d..6ab55456 100644 --- a/src/sinker/mixed/ayuv64.rs +++ b/src/sinker/mixed/ayuv64.rs @@ -350,7 +350,8 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_0( + // `Ayuv64Frame` is LE-encoded per the unified Frame contract → `BE = false`. + crate::row::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_0::( packed, rgba_row, w, use_simd, ); } @@ -404,7 +405,13 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::alpha_extract::copy_alpha_packed_u16x4_at_0(packed, rgba_u16_row, w, use_simd); + // `Ayuv64Frame` is LE-encoded per the unified Frame contract → `BE = false`. + crate::row::alpha_extract::copy_alpha_packed_u16x4_at_0::( + packed, + rgba_u16_row, + w, + use_simd, + ); } } diff --git a/src/sinker/mixed/gray.rs b/src/sinker/mixed/gray.rs index 5d6c44e1..b07b9191 100644 --- a/src/sinker/mixed/gray.rs +++ b/src/sinker/mixed/gray.rs @@ -1499,8 +1499,9 @@ impl PixelSink for MixedSinker<'_, Ya16> { let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - // Patch α from source (native u16 depth). - copy_alpha_ya_u16(packed, rgba_u16_row, w); + // Patch α from source (native u16 depth). `Ya16Frame` is LE-encoded + // per the unified Frame contract → `BE = false`. + copy_alpha_ya_u16::(packed, rgba_u16_row, w); } } @@ -1562,7 +1563,8 @@ impl PixelSink for MixedSinker<'_, Ya16> { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); // Overwrite the α channel with real source α (>> 8 for u8 output). - copy_alpha_ya_u16_to_u8(packed, rgba_row, w); + // `Ya16Frame` is LE-encoded per the unified Frame contract → `BE = false`. + copy_alpha_ya_u16_to_u8::(packed, rgba_row, w); } Ok(()) @@ -2408,6 +2410,139 @@ mod tests { assert_eq!(v, [0x80, 0xFF]); } + /// Strategy A+ (combined `with_rgb` + `with_rgba`) must produce α bytes + /// byte-identical to the standalone `with_rgba` path. 
Locks down the + /// codex-flagged corruption where a BE host processing the LE-encoded + /// `Ya16Frame` would otherwise diverge between the two paths: standalone + /// uses the endian-aware `ya16_to_rgba_row::` kernel; combined + /// expanded RGB → RGBA then patched α via `copy_alpha_ya_u16_to_u8` which + /// previously read raw `packed[n*2+1]` host-native and so emitted a + /// byte-reversed α byte on BE. After the fix, `copy_alpha_ya_u16_to_u8` + /// is target-endian-aware (`` for the LE Frame contract) and the + /// two paths agree on every host. + /// + /// To exercise the LE-encoded byte contract on every host we build the + /// `&[u16]` plane by bit-casting LE bytes — `u16::from_le_bytes` per + /// sample. On LE hosts that's a no-op; on BE hosts it byte-swaps so the + /// in-memory bytes match the FFmpeg `AV_PIX_FMT_YA16LE` layout. + #[test] + fn ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded() { + let w: u32 = 8; + let h: u32 = 1; + // Logical samples (Y, A) per pixel. + let samples: [(u16, u16); 8] = [ + (0x0000, 0xFFFF), + (0x8000, 0x4000), + (0xFFFF, 0x0000), + (0x1234, 0xABCD), + (0x00FF, 0xFF00), + (0x5A5A, 0xA5A5), + (0x7FFF, 0x8000), + (0xC000, 0x3FFF), + ]; + // Build the `&[u16]` plane such that its in-memory bytes match the + // FFmpeg `AV_PIX_FMT_YA16LE` byte layout on every host. We want a + // host-native u16 whose underlying bytes spell `[low, high]` (LE): + // `u16::from_ne_bytes(x.to_le_bytes())` is `x` on LE and `x.swap_bytes()` + // on BE — the right value to store in either case. + let le_encoded = |x: u16| -> u16 { u16::from_ne_bytes(x.to_le_bytes()) }; + let packed: std::vec::Vec = samples + .iter() + .flat_map(|&(y, a)| [le_encoded(y), le_encoded(a)]) + .collect(); + let frame = Ya16Frame::new(&packed, w, h, w * 2); + + // Run combined (with_rgb + with_rgba) — exercises Strategy A+ with the + // newly endian-aware `copy_alpha_ya_u16_to_u8::`. Forces + // `with_simd(false)` so the test runs purely scalar — no SIMD intrinsics + // — which lets it execute under `cargo miri test`. BE CI is driven by + // miri on s390x / powerpc64; gating it out of miri would skip exactly + // the host where BE corruption would surface. + let mut rgb_combined = std::vec![0u8; (w * h * 3) as usize]; + let mut rgba_combined = std::vec![0u8; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) + .with_rgb(&mut rgb_combined) + .unwrap() + .with_rgba(&mut rgba_combined) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + // Run standalone (with_rgba only) — exercises the endian-aware + // `ya16_to_rgba_row::` kernel. Same scalar-only rationale. + let mut rgba_standalone = std::vec![0u8; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) + .with_rgba(&mut rgba_standalone) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combined, rgba_standalone, + "combined (with_rgb+with_rgba) RGBA must equal standalone with_rgba" + ); + } + + /// u16 RGBA variant of the combined-vs-standalone parity check. Locks + /// down `copy_alpha_ya_u16::` (the u16 alpha-patch helper for + /// 16-bit RGBA outputs). 
+ #[test] + fn ya16_combined_rgb_u16_and_rgba_u16_alpha_matches_standalone_le_encoded() { + let w: u32 = 8; + let h: u32 = 1; + let samples: [(u16, u16); 8] = [ + (0x0000, 0xFFFF), + (0x8000, 0x4000), + (0xFFFF, 0x0000), + (0x1234, 0xABCD), + (0x00FF, 0xFF00), + (0x5A5A, 0xA5A5), + (0x7FFF, 0x8000), + (0xC000, 0x3FFF), + ]; + // See sibling test for the `le_encoded` rationale. + let le_encoded = |x: u16| -> u16 { u16::from_ne_bytes(x.to_le_bytes()) }; + let packed: std::vec::Vec = samples + .iter() + .flat_map(|&(y, a)| [le_encoded(y), le_encoded(a)]) + .collect(); + let frame = Ya16Frame::new(&packed, w, h, w * 2); + + // Forces `with_simd(false)` so this test runs purely scalar — no SIMD + // intrinsics — which lets it execute under `cargo miri test`. BE CI is + // driven by miri on s390x / powerpc64; gating it out of miri would skip + // exactly the host where BE corruption would surface. + let mut rgb_combined = std::vec![0u16; (w * h * 3) as usize]; + let mut rgba_combined = std::vec![0u16; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) + .with_rgb_u16(&mut rgb_combined) + .unwrap() + .with_rgba_u16(&mut rgba_combined) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + let mut rgba_standalone = std::vec![0u16; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) + .with_rgba_u16(&mut rgba_standalone) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combined, rgba_standalone, + "combined (with_rgb_u16+with_rgba_u16) RGBA u16 must equal standalone" + ); + } + #[test] #[cfg_attr( miri, diff --git a/src/sinker/mixed/packed_rgb_16bit.rs b/src/sinker/mixed/packed_rgb_16bit.rs index 837b8246..49e34d84 100644 --- a/src/sinker/mixed/packed_rgb_16bit.rs +++ b/src/sinker/mixed/packed_rgb_16bit.rs @@ -726,7 +726,11 @@ impl PixelSink for MixedSinker<'_, Rgba64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3(in64, rgba_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3::( + in64, rgba_row, w, + ); } } @@ -759,7 +763,13 @@ impl PixelSink for MixedSinker<'_, Rgba64> { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3(in64, rgba_u16_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3::( + in64, + rgba_u16_row, + w, + ); } } @@ -995,7 +1005,11 @@ impl PixelSink for MixedSinker<'_, Bgra64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3(in64, rgba_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. 
+ crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3::( + in64, rgba_row, w, + ); } } @@ -1024,7 +1038,13 @@ impl PixelSink for MixedSinker<'_, Bgra64> { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3(in64, rgba_u16_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3::( + in64, + rgba_u16_row, + w, + ); } } diff --git a/src/sinker/mixed/packed_rgb_f16.rs b/src/sinker/mixed/packed_rgb_f16.rs index 62ab1cc8..e349f130 100644 --- a/src/sinker/mixed/packed_rgb_f16.rs +++ b/src/sinker/mixed/packed_rgb_f16.rs @@ -34,25 +34,6 @@ use crate::{ yuv::{Rgbf16, Rgbf16Row, Rgbf16Sink}, }; -/// `BE` value that makes the `rgbf16_to_*` row dispatchers treat their input as -/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf16Frame`] -/// exposes a `&[half::f16]` row in **host-native** layout — the API contract is that the -/// caller hands us already-decoded half-floats. The kernel `BE` parameter, -/// however, names the **encoded** byte order (so `BE = false` means "decode -/// LE-encoded bytes" via `u16::from_le`). On a LE host the host-native layout -/// is LE, so `BE = false` is correct; on a BE host the host-native layout is -/// BE, so we must request `BE = true` to make `u16::from_be` no-op the swap. -/// Without this routing the loaders would byte-swap an already-decoded host- -/// native `f16` on BE hosts, corrupting every output path. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` — same truth table, different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Rgbf16 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf16> { @@ -253,27 +234,27 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if let Some(buf) = rgb_f16.as_deref_mut() { let f16_start = one_plane_start * 3; let f16_end = one_plane_end * 3; - rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); + rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); } // Lossless f32 widen — also independent of integer conversion paths. if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct half-float → u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct half-float → u16 conversion (no staging). 
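+        // (`Rgbf16Frame` is LE-encoded per the unified Frame contract, so the
+        // direct-conversion kernels below take the LE decode path — `BE = false`.)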
if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float → u8 when no RGB / luma / @@ -288,7 +269,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -307,7 +288,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { w, h, )?; - rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -347,7 +328,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { // over `rgb_row` via `expand_rgb_to_rgba_row`. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_float.rs b/src/sinker/mixed/packed_rgb_float.rs index f189e5ab..e1c17a39 100644 --- a/src/sinker/mixed/packed_rgb_float.rs +++ b/src/sinker/mixed/packed_rgb_float.rs @@ -31,25 +31,6 @@ use crate::{ yuv::{Rgbf32, Rgbf32Row, Rgbf32Sink}, }; -/// `BE` value that makes the `rgbf32_to_*` row dispatchers treat their input as -/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf32Frame`] -/// exposes a `&[f32]` row in **host-native** layout — the API contract is that the caller -/// hands us already-decoded floats. The kernel `BE` parameter, however, names -/// the **encoded** byte order (so `BE = false` means "decode LE-encoded bytes" -/// via `u32::from_le`). On a LE host the host-native layout is LE, so -/// `BE = false` is correct; on a BE host the host-native layout is BE, so we -/// must request `BE = true` to make `u32::from_be` no-op the swap. Without this -/// routing the loaders would byte-swap an already-decoded host-native `f32` on -/// BE hosts, corrupting every output path. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` — same truth table, different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Rgbf32 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf32> { @@ -228,20 +209,20 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct float→u16 conversion (no staging). 
if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct float→u16 conversion (no staging). if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float→u8 conversion when @@ -256,7 +237,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -276,7 +257,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { w, h, )?; - rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -318,7 +299,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { // less memory pass for combined `with_rgb + with_rgba` callers. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index c041b901..22159b92 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -51,7 +51,8 @@ use crate::{ gbrapf32_to_rgba_u16_row, gbrpf16_to_rgb_f16_row, gbrpf16_to_rgb_row, gbrpf16_to_rgba_f16_row, gbrpf16_to_rgba_row, gbrpf32_to_hsv_row, gbrpf32_to_luma_row, gbrpf32_to_luma_u16_row, gbrpf32_to_rgb_f32_row, gbrpf32_to_rgb_u16_row, gbrpf32_to_rgba_f32_row, - gbrpf32_to_rgba_u16_row, scalar::alpha_extract::copy_alpha_plane_f32_to_u8, + gbrpf32_to_rgba_u16_row, + scalar::{alpha_extract::copy_alpha_plane_f32_to_u8, planar_gbr_f16::widen_f16_be_to_host_f32}, }, yuv::{Gbrapf16, Gbrapf16Row, Gbrapf16Sink, Gbrpf16, Gbrpf16Row, Gbrpf16Sink}, }; @@ -64,56 +65,22 @@ const GBR_F16_FULL_RANGE: bool = true; // Chunk size for the inline f16→f32 widening scratch arrays (stack-allocated). const WIDEN_CHUNK: usize = 64; -/// `BE` value that makes the `gbrpf16_to_*` / `gbrapf16_to_*` row dispatchers -/// (and the widened `gbrpf32_to_*` chain after `widen_f16_to_f32`) treat -/// their input as **host-native** (a no-op byte-swap). -/// -/// [`crate::frame::Gbrpf16Frame`] / [`crate::frame::Gbrapf16Frame`] expose -/// `&[half::f16]` plane rows in **host-native** layout — the API contract -/// is that the caller hands us already-decoded half-floats. The kernel `BE` -/// parameter, however, names the **encoded** byte order (so `BE = false` -/// means "decode LE-encoded bytes" via `u16::from_le`). On a LE host the -/// host-native layout is LE, so `BE = false` is correct; on a BE host the -/// host-native layout is BE, so we must request `BE = true` to make -/// `u16::from_be` no-op the swap. Without this routing the loaders would -/// byte-swap an already-decoded host-native `f16` on BE hosts, corrupting -/// every output path (codex PR #84 Finding 3). 
-/// -/// Crucially, the **widened f32 chain** must also use `HOST_NATIVE_BE`: -/// after [`widen_f16_to_f32`] (which calls `half::f16::to_f32` on host-native -/// f16 bits) the scratch is host-native f32, so the downstream -/// `gbrpf32_to_*` kernel's `from_le`/`from_be` loader must be a no-op — -/// achieved by routing with `HOST_NATIVE_BE`. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced in `c3a6478` and the `Rgbf16` sinker fix in -/// `dcf40a3`. Same truth table: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -/// -/// The α-plane scatter for [`Gbrapf16`] (Strategy A+ / standalone-RGBA) -/// widens the host-native f16 α plane to host-native f32 via -/// [`widen_f16_to_f32`] then calls `copy_alpha_plane_f32_to_u8` — both -/// operations are endian-agnostic. Mix-mode corruption (LE-decoded RGB + -/// host-native α) is therefore eliminated by routing the RGB chain via -/// `HOST_NATIVE_BE`. +// Endianness routing for **post-widen** `gbrpf32_to_*` calls. +// +// `widen_f16_be_to_host_f32::` produces **host-native f32 scratch** from +// LE-encoded f16 plane bits (it normalises bits before widening), so the +// downstream `gbrpf32_to_*::` kernel sees input that already +// matches the host's byte order. The kernel's `from_le` / `from_be` then +// becomes a no-op on every host — correct. +// +// Distinct from the **direct** Frame-to-row-kernel pattern elsewhere in this +// file (the `gbrpf16_to_*::` u8/f16 calls): those receive raw LE-encoded +// `&[half::f16]` plane bytes per the unified Frame contract, so they pass +// `BE = false` to tell the kernel to apply `from_le`. Post-widen scratch is +// already host-native, so it must use `BE = HOST_NATIVE_BE` to keep the kernel +// byte-swap a no-op on every host. const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); -/// Widen `width` `half::f16` values from `src` into `dst` (f32 elements). -/// -/// The source slice is `&[half::f16]` in **host-native** layout (per the -/// `Gbrpf16Frame` / `Gbrapf16Frame` API contract); `to_f32` interprets the -/// bits as host-native and emits host-native `f32`. Downstream `gbrpf32_to_*` -/// callers must therefore route with [`HOST_NATIVE_BE`] (not the encoded -/// `BE`) to avoid double byte-swapping. 
-#[cfg_attr(not(tarpaulin), inline(always))] -fn widen_f16_to_f32(src: &[half::f16], dst: &mut [f32], count: usize) { - for i in 0..count { - dst[i] = src[i].to_f32(); - } -} - // ---- Gbrpf16 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf16> { @@ -351,7 +318,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -363,14 +330,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { height: h, channels: 4, })?; - gbrpf16_to_rgba_f16_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf16_to_rgba_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- Paths that require widening f16 → f32 --------------------------- @@ -394,9 +354,14 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { let mut offset = 0; while offset < w { let n = (w - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&g_in[offset..], &mut gf_chunk, n); - widen_f16_to_f32(&b_in[offset..], &mut bf_chunk, n); - widen_f16_to_f32(&r_in[offset..], &mut rf_chunk, n); + // Bit-normalise LE-encoded f16 plane bits → host-native f32 so the + // downstream `gbrpf32_to_*` kernel (invoked with `BE = HOST_NATIVE_BE` + // — see module-scope constant) sees host-native f32 on every host. + // The post-widen scratch is host-native, distinct from the direct- + // Frame paths which use `` per the LE-encoded byte contract. + widen_f16_be_to_host_f32::(g_in, offset, &mut gf_chunk, n); + widen_f16_be_to_host_f32::(b_in, offset, &mut bf_chunk, n); + widen_f16_be_to_host_f32::(r_in, offset, &mut rf_chunk, n); let gf = &gf_chunk[..n]; let bf = &bf_chunk[..n]; let rf = &rf_chunk[..n]; @@ -480,7 +445,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -504,7 +469,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { w, h, )?; - gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); // Strategy A: expand RGB → RGBA (constant α = 0xFF). if let Some(buf) = rgba.as_deref_mut() { @@ -761,7 +726,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // rgb_f16: no source α — use the no-α kernel (lossless scatter). 
             let start = one_plane_start * 3;
             let end = one_plane_end * 3;
-            gbrpf16_to_rgb_f16_row::<HOST_NATIVE_BE>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd);
+            gbrpf16_to_rgb_f16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd);
         }

         if let Some(buf) = self.rgba_f16.as_deref_mut() {
@@ -774,15 +739,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
                 height: h,
                 channels: 4,
             })?;
-            gbrapf16_to_rgba_f16_row::<HOST_NATIVE_BE>(
-                g_in,
-                b_in,
-                r_in,
-                a_in,
-                &mut buf[start..end],
-                w,
-                use_simd,
-            );
+            gbrapf16_to_rgba_f16_row::<false>(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd);
         }

         // ---- Paths that require widening f16 → f32 ---------------------------
@@ -806,10 +763,12 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
         let mut offset = 0;
         while offset < w {
             let n = (w - offset).min(WIDEN_CHUNK);
-            widen_f16_to_f32(&g_in[offset..], &mut gf_chunk, n);
-            widen_f16_to_f32(&b_in[offset..], &mut bf_chunk, n);
-            widen_f16_to_f32(&r_in[offset..], &mut rf_chunk, n);
-            widen_f16_to_f32(&a_in[offset..], &mut af_chunk, n);
+            // Bit-normalise LE-encoded f16 plane bits → host-native f32 (see the
+            // canonical helper's docs); downstream kernel uses `BE = HOST_NATIVE_BE`.
+            widen_f16_be_to_host_f32::<false>(g_in, offset, &mut gf_chunk, n);
+            widen_f16_be_to_host_f32::<false>(b_in, offset, &mut bf_chunk, n);
+            widen_f16_be_to_host_f32::<false>(r_in, offset, &mut rf_chunk, n);
+            widen_f16_be_to_host_f32::<false>(a_in, offset, &mut af_chunk, n);
             let gf = &gf_chunk[..n];
             let bf = &bf_chunk[..n];
             let rf = &rf_chunk[..n];
@@ -918,7 +877,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
             let rgba_buf = self.rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
             // Write opaque RGB → RGBA (α = 0xFF), then overwrite α from source.
-            gbrpf16_to_rgba_row::<HOST_NATIVE_BE>(g_in, b_in, r_in, rgba_row, w, use_simd);
+            gbrpf16_to_rgba_row::<false>(g_in, b_in, r_in, rgba_row, w, use_simd);
             // Scatter f16 α → u8 slot 3: widen + clamp + scale.
             widen_and_scatter_f16_alpha_to_u8(a_in, rgba_row, w);
             return Ok(());
@@ -944,7 +903,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
             w,
             h,
         )?;
-        gbrpf16_to_rgb_row::<HOST_NATIVE_BE>(g_in, b_in, r_in, rgb_row, w, use_simd);
+        gbrpf16_to_rgb_row::<false>(g_in, b_in, r_in, rgb_row, w, use_simd);

         // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from source.
         if let Some(buf) = rgba.as_deref_mut() {
@@ -962,14 +921,26 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
 ///
 /// Used by `Gbrapf16` Strategy A+ and standalone-RGBA paths to overwrite
 /// the per-pixel alpha byte from the f16 source α plane.
+///
+/// Endian routing: `widen_f16_be_to_host_f32::<false>` converts the
+/// LE-encoded `Gbrapf16Frame` α plane bits into **host-native f32**
+/// scratch. The downstream `copy_alpha_plane_f32_to_u8` therefore receives
+/// host-native f32 input, not LE-encoded f32, and must be invoked with
+/// `BE = HOST_NATIVE_BE` so the kernel's `from_le` / `from_be` is a no-op
+/// on every host (no second byte-swap). This is the **post-widen** routing
+/// pattern; contrast with `planar_gbr_float.rs` which calls the same
+/// helper with `BE = false` because it consumes the **direct** LE-encoded
+/// `Gbrapf32Frame` α plane.
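+///
+/// A minimal per-element sketch of that post-widen step (illustrative only;
+/// `raw_le` stands for a hypothetical LE-encoded `half::f16` element, not an
+/// identifier from this crate):
+///
+/// ```ignore
+/// // LE-encoded f16 bits → host-native f16 → host-native f32.
+/// // `u16::from_le` is a no-op on an LE host and a byte swap on a BE host.
+/// let host_f32 = half::f16::from_bits(u16::from_le(raw_le.to_bits())).to_f32();
+/// ```
+///
+/// After that step the scratch is host-native, which is why the clamp/scale
+/// helper below must not be asked to swap a second time.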
#[cfg_attr(not(tarpaulin), inline(always))] fn widen_and_scatter_f16_alpha_to_u8(alpha_f16: &[half::f16], rgba_out: &mut [u8], width: usize) { let mut af_chunk = [0.0f32; WIDEN_CHUNK]; let mut offset = 0; while offset < width { let n = (width - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&alpha_f16[offset..], &mut af_chunk, n); - copy_alpha_plane_f32_to_u8(&af_chunk[..n], &mut rgba_out[offset * 4..], n); + // Bit-normalise LE-encoded f16 α bits → host-native f32 before clamping + // and scaling to u8 — correct on both LE and BE hosts. + widen_f16_be_to_host_f32::(alpha_f16, offset, &mut af_chunk, n); + copy_alpha_plane_f32_to_u8::(&af_chunk[..n], &mut rgba_out[offset * 4..], n); offset += n; } } diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index 7b979049..ddee135b 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -53,36 +53,6 @@ use crate::{ const GBR_FLOAT_LUMA_MATRIX: ColorMatrix = ColorMatrix::Bt709; const GBR_FLOAT_FULL_RANGE: bool = true; -/// `BE` value that makes the `gbrpf32_to_*` / `gbrapf32_to_*` row dispatchers -/// treat their input as **host-native** (a no-op byte-swap). -/// -/// [`crate::frame::Gbrpf32Frame`] / [`crate::frame::Gbrapf32Frame`] expose -/// `&[f32]` plane rows in **host-native** layout — the API contract is that -/// the caller hands us already-decoded floats. The kernel `BE` parameter, -/// however, names the **encoded** byte order (so `BE = false` means "decode -/// LE-encoded bytes" via `u32::from_le`). On a LE host the host-native layout -/// is LE, so `BE = false` is correct; on a BE host the host-native layout is -/// BE, so we must request `BE = true` to make `u32::from_be` no-op the swap. -/// Without this routing the loaders would byte-swap an already-decoded host- -/// native `f32` on BE hosts, corrupting every output path (codex PR #84 -/// Finding 2). -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` and the `Rgbf32` sinker fix in `dcf40a3`. Same truth table, -/// different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -/// -/// The α-plane scatter (Strategy A+ / standalone-RGBA) consumes the host- -/// native `&[f32]` α plane via `copy_alpha_plane_f32_to_u8`, which is endian- -/// agnostic — there's no BE branching needed for the α path because it does -/// not byte-load through `from_le`/`from_be`. Mix-mode corruption (LE-decoded -/// RGB + host-native α) is therefore eliminated by routing the RGB chain via -/// `HOST_NATIVE_BE`. 
-const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Gbrpf32 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf32> { @@ -321,7 +291,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -333,14 +303,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f32_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf32_to_rgba_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -348,7 +311,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -360,14 +323,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f16_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf32_to_rgba_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB / RGBA path (direct float → u16, no staging) ----------- @@ -375,12 +331,12 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_u16_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf32_to_rgba_u16_row::(g_in, b_in, r_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path (no RGB / luma / HSV needed) ------- @@ -395,7 +351,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf32_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -422,10 +378,10 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { w, h, )?; - gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( g_in, b_in, r_in, @@ -438,7 +394,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( g_in, b_in, r_in, @@ -451,7 +407,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( g_in, b_in, r_in, @@ -721,7 +677,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = 
one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -733,15 +689,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f32_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf32_to_rgba_f32_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -749,7 +697,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -761,15 +709,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f16_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf32_to_rgba_f16_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB path (direct, no staging) ------------------------------ @@ -777,14 +717,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGBA path (direct — source α clamped + scaled) ------------- if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_u16_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_u16_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path ------------------------------------ @@ -799,7 +739,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); return Ok(()); } @@ -826,10 +766,10 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { w, h, )?; - gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( g_in, b_in, r_in, @@ -842,7 +782,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( g_in, b_in, r_in, @@ -855,7 +795,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( g_in, b_in, r_in, @@ -869,10 +809,20 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from // the source f32 α plane (clamped × 255 → u8). 
+ // + // `BE = false`: `a_in` is the **direct** Gbrapf32Frame α plane, which + // is LE-encoded f32 per the Phase-1 unified Frame contract. The helper + // bit-normalises each f32 to host-native order before clamp/scale, so + // the conversion compiles to a no-op on LE hosts and a `swap_bytes` on + // BE hosts (e.g., s390x). Without this BE hosts would clamp byte- + // swapped garbage and emit α = 0 / 255 regardless of intent. Distinct + // from the **post-widen** routing in `planar_gbr_f16.rs` + // (`widen_and_scatter_f16_alpha_to_u8`), which feeds host-native f32 + // scratch into the same helper with `BE = HOST_NATIVE_BE`. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - copy_alpha_plane_f32_to_u8(a_in, rgba_row, w); + copy_alpha_plane_f32_to_u8::(a_in, rgba_row, w); } Ok(()) diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 1cd09a1f..574e482c 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -311,131 +311,49 @@ fn rgbf16_simd_matches_scalar_with_random_input() { assert_eq!(rgb_f16_simd, pix, "RGB f16 output is not lossless"); } -/// Sinker-layer host-native-`f16` regression for the bug fixed alongside -/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf16`] sinker used to -/// hardcode `::` when calling the row dispatchers, telling them to -/// "decode LE-encoded input". Because [`Rgbf16Frame`] hands us a host-native -/// `&[half::f16]` row, that routing was a no-op on LE hosts but corrupted -/// every output path on BE hosts (the `u16` loaders would byte-swap an -/// already-decoded f16 bit-pattern). The fix replaces those `::` with -/// `::`, which is `false` on LE and `true` on BE — a no-op -/// byte-swap on either host. +/// LE-encoded byte contract regression: builds an [`Rgbf16Frame`] from a +/// `&[half::f16]` plane explicitly encoded as LE bytes (per the FFmpeg +/// `AV_PIX_FMT_*LE` convention documented on `Rgbf16Frame`), runs it +/// through the sinker's `with_rgb_f16` lossless pass-through, and asserts +/// the output equals the host-native intended values. /// -/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run), -/// `HOST_NATIVE_BE = false` and `::` is byte-for-byte -/// identical to `::`, so this test cannot distinguish the broken vs -/// fixed code on LE. It instead documents the equivalence at the **kernel -/// dispatch** layer — calling each `rgbf16_to_*` dispatcher with both -/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) -/// must produce identical output on the active host. +/// Vacuous on LE hosts (where `to_le` on a `u16` is a no-op so the LE- +/// encoded plane *is* host-native), but on a BE host this would fail fast +/// for any regression that drops the `::` kernel routing — the +/// kernel must apply `u16::from_le` to recover host-native f16 bit-patterns +/// from the LE-encoded bytes. /// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the -/// equality `::` ≡ `::` is _false_ — `::` -/// decodes the host-native fixture as if it were LE-encoded (byte-swap), -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. 
The dispatch-equivalence claim is specifically -/// about the LE host-routing pattern; the BE-host correctness of the routing -/// change is verified instead by -/// [`rgbf16_sinker_host_native_contract_lossless_passthrough`] and the -/// row-kernel BE parity tests in `src/row/arch/*/tests/`. -#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf16_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - rgbf16_to_rgb_f16_row, rgbf16_to_rgb_f32_row, rgbf16_to_rgb_row, rgbf16_to_rgb_u16_row, - rgbf16_to_rgba_row, rgbf16_to_rgba_u16_row, - }; - - // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. - // Compute it locally so the test asserts the same condition without taking - // a dependency on a private const. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33 covers SIMD main loop + scalar tail across every backend. - let w = 33usize; - let f32_inputs = [0.0f32, 0.5, 1.0, 1.75, -0.25]; - let pix: std::vec::Vec = (0..w * 3) - .map(|i| half::f16::from_f32(f32_inputs[i % f32_inputs.len()])) - .collect(); - - // u8 RGB. - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - rgbf16_to_rgb_row::(&pix, &mut rgb_false, w, true); - rgbf16_to_rgb_row::(&pix, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); - - // u8 RGBA. - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - rgbf16_to_rgba_row::(&pix, &mut rgba_false, w, true); - rgbf16_to_rgba_row::(&pix, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); - - // u16 RGB. - let mut rgb_u16_false = std::vec![0u16; w * 3]; - let mut rgb_u16_host = std::vec![0u16; w * 3]; - rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); - rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); - assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); - - // u16 RGBA. - let mut rgba_u16_false = std::vec![0u16; w * 4]; - let mut rgba_u16_host = std::vec![0u16; w * 4]; - rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); - rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); - assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); - - // f16 lossless pass-through. - let mut f16_false = std::vec![half::f16::ZERO; w * 3]; - let mut f16_host = std::vec![half::f16::ZERO; w * 3]; - rgbf16_to_rgb_f16_row::(&pix, &mut f16_false, w, true); - rgbf16_to_rgb_f16_row::(&pix, &mut f16_host, w, true); - assert_eq!(f16_false, f16_host, "f16 RGB diverges"); - if !HOST_NATIVE_BE { - assert_eq!( - f16_host, pix, - "f16 lossless pass-through corrupted on LE host" - ); - } - - // f32 lossless widen. - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - rgbf16_to_rgb_f32_row::(&pix, &mut f32_false, w, true); - rgbf16_to_rgb_f32_row::(&pix, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 widen diverges"); -} - -/// End-to-end sinker contract test: feeding host-native `half::f16` through -/// [`MixedSinker`] must round-trip the f16 input bit-exact via -/// `with_rgb_f16` on every host. Documents the public-API contract that the -/// [`HOST_NATIVE_BE`] routing fix preserves. Pairs with the kernel-level -/// test above; together they cover both the dispatch boundary and the public -/// sinker boundary. +/// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. 
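+///
+/// The fixture below derives the LE-encoded plane from the intended values
+/// one element at a time; as a sketch of the pattern (nothing here beyond
+/// `half::f16` and the standard `u16::to_le`):
+///
+/// ```ignore
+/// // Identity on an LE host; a byte swap of the bit pattern on a BE host.
+/// let le_encoded = half::f16::from_bits(intended.to_bits().to_le());
+/// ```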
+///
+/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD
+/// intrinsics — which lets it execute under `cargo miri test`. BE CI is
+/// driven by miri on s390x / powerpc64; gating it out of miri (per the
+/// codex 4th-pass finding) would skip exactly the host where BE corruption
+/// would surface.
 #[test]
-#[cfg_attr(
-    miri,
-    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
-)]
-fn rgbf16_sinker_host_native_contract_lossless_passthrough() {
+fn rgbf16_sinker_le_encoded_frame_decodes_correctly() {
     let vals_f32 = [0.5f32, 1.5, -0.25, 100.0];
-    let pix: std::vec::Vec<half::f16> = (0..16 * 4 * 3)
+    let intended: Vec<half::f16> = (0..16 * 4 * 3)
         .map(|i| half::f16::from_f32(vals_f32[i % vals_f32.len()]))
         .collect();
+    // Encode the plane as LE bytes reinterpreted as f16 (the documented
+    // `*LE` Frame contract). On LE host: identity. On BE host: byte-swapped
+    // bit-patterns the kernel must `from_le` back to host-native.
+    let pix: Vec<half::f16> = intended
+        .iter()
+        .map(|&v| half::f16::from_bits(v.to_bits().to_le()))
+        .collect();
     let src = Rgbf16Frame::try_new(&pix, 16, 4, 16 * 3).unwrap();
     let mut rgb_f16_out = std::vec![half::f16::ZERO; 16 * 4 * 3];
     let mut sink = MixedSinker::<Rgbf16>::new(16, 4)
+        .with_simd(false)
         .with_rgb_f16(&mut rgb_f16_out)
         .unwrap();
     rgbf16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
-    // Bit-exact pass-through on every host — broken `::<false>` routing
-    // would byte-swap on a BE host; the fixed routing keeps the f16 intact.
-    assert_eq!(rgb_f16_out, pix, "Rgbf16 sinker f16 pass-through corrupted");
+    assert_eq!(
+        rgb_f16_out, intended,
+        "Rgbf16 sinker LE-encoded plane decoded incorrectly"
+    );
 }
diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs
index fd4df2da..8bd01266 100644
--- a/src/sinker/mixed/tests/packed_rgb_float.rs
+++ b/src/sinker/mixed/tests/packed_rgb_float.rs
@@ -246,140 +246,59 @@ fn rgbf32_simd_matches_scalar_with_random_input() {
     assert_eq!(rgb_f32_simd, pix, "RGB f32 output is not lossless");
 }

-/// Sinker-layer host-native-`f32` regression for the bug fixed alongside
-/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf32`] sinker used to
-/// hardcode `::<false>` when calling the row dispatchers, telling them to
-/// "decode LE-encoded input". Because [`Rgbf32Frame`] hands us a host-native
-/// `&[f32]` row, that routing was a no-op on LE hosts but corrupted every
-/// output path on BE hosts (the loaders would byte-swap an already-decoded
-/// f32). The fix replaces those `::<false>` with `::<HOST_NATIVE_BE>`, which
-/// is `false` on LE and `true` on BE — a no-op byte-swap on either host.
+/// LE-encoded byte contract regression: builds an [`Rgbf32Frame`] from a
+/// `&[f32]` plane explicitly encoded as LE bytes (per the FFmpeg
+/// `AV_PIX_FMT_*LE` convention documented on `Rgbf32Frame`), runs it
+/// through the sinker's `with_rgb_f32` lossless pass-through, and asserts
+/// the output equals the host-native intended values.
 ///
-/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run),
-/// `HOST_NATIVE_BE = false` and `::<HOST_NATIVE_BE>` is byte-for-byte
-/// identical to `::<false>`, so this test cannot distinguish the broken vs
-/// fixed code on LE. It instead documents the equivalence at the **kernel
-/// dispatch** layer — calling each `rgbf32_to_*` dispatcher with both
-/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`)
-/// must produce identical output on the active host.
+/// Vacuous on LE hosts (where `f32::to_le_bytes` is a no-op so the LE- +/// encoded plane *is* host-native), but on a BE host this would fail fast +/// for any regression that drops the `::` kernel routing — the +/// kernel must apply `u32::from_le` to recover host-native f32 from the +/// LE-encoded bytes; if it skipped the swap (e.g. `::` on +/// BE), the output would be byte-swapped relative to `intended`. /// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the -/// equality `::` ≡ `::` is _false_ — `::` -/// decodes the host-native fixture as if it were LE-encoded (byte-swap), -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. The dispatch-equivalence claim is specifically -/// about the LE host-routing pattern; the BE-host correctness of the routing -/// change is verified instead by -/// [`rgbf32_sinker_host_native_contract_lossless_passthrough`] and the -/// row-kernel BE parity tests in `src/row/arch/*/tests/`. -#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf32_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - rgbf32_to_rgb_f32_row, rgbf32_to_rgb_row, rgbf32_to_rgb_u16_row, rgbf32_to_rgba_row, - rgbf32_to_rgba_u16_row, - }; - - // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. - // Compute it locally so the test asserts the same condition without taking - // a dependency on a private const. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33 covers SIMD main loop + scalar tail across every backend. - let w = 33usize; - let mut pix = std::vec![0.0f32; w * 3]; - for (i, v) in pix.iter_mut().enumerate() { - // Mix in-range, HDR, and negative values to exercise every clamp branch. - *v = match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }; - } - - // u8 RGB. - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - rgbf32_to_rgb_row::(&pix, &mut rgb_false, w, true); - rgbf32_to_rgb_row::(&pix, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); - - // u8 RGBA. - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - rgbf32_to_rgba_row::(&pix, &mut rgba_false, w, true); - rgbf32_to_rgba_row::(&pix, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); - - // u16 RGB. - let mut rgb_u16_false = std::vec![0u16; w * 3]; - let mut rgb_u16_host = std::vec![0u16; w * 3]; - rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); - rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); - assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); - - // u16 RGBA. - let mut rgba_u16_false = std::vec![0u16; w * 4]; - let mut rgba_u16_host = std::vec![0u16; w * 4]; - rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); - rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); - assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); - - // f32 lossless pass-through. - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - rgbf32_to_rgb_f32_row::(&pix, &mut f32_false, w, true); - rgbf32_to_rgb_f32_row::(&pix, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 RGB diverges"); - // And on the host (LE on every CI runner) both must equal `pix` bit-exact. 
- if !HOST_NATIVE_BE { - assert_eq!( - f32_host, pix, - "f32 lossless pass-through corrupted on LE host" - ); - } -} - -/// End-to-end sinker contract test: feeding host-native `f32` through -/// [`MixedSinker`] must produce the same output every other sinker -/// would expect from a host-native source — specifically, `with_rgb_f32` -/// must be bit-exact identical to the input on every host. Documents the -/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. -/// Pairs with the kernel-level test above; together they cover both the -/// dispatch boundary and the public sinker boundary. +/// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. +/// +/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf32_sinker_host_native_contract_lossless_passthrough() { +fn rgbf32_sinker_le_encoded_frame_decodes_correctly() { // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. - let mut pix = std::vec![0.0f32; 16 * 4 * 3]; - for (i, v) in pix.iter_mut().enumerate() { - *v = match i % 4 { + let intended: Vec = (0..16 * 4 * 3) + .map(|i| match i % 4 { 0 => 0.5, 1 => 1.5, 2 => -0.25, _ => 100.0, - }; - } + }) + .collect(); + // Construct the plane as LE-encoded bytes reinterpreted as f32 (the + // documented `*LE` Frame contract). On LE host this is identity; on BE + // host the bit-pattern is byte-swapped so the kernel must `from_le` it + // back to host-native. + let pix: Vec = intended + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); let src = Rgbf32Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); let mut rgb_f32_out = std::vec![0.0f32; 16 * 4 * 3]; let mut sink = MixedSinker::::new(16, 4) + .with_simd(false) .with_rgb_f32(&mut rgb_f32_out) .unwrap(); rgbf32_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); - // Bit-exact pass-through on every host. On the buggy `::` routing - // a BE host would see byte-swapped output here; on the fixed routing the - // assertion holds on both LE and BE. - assert_eq!(rgb_f32_out, pix, "Rgbf32 sinker f32 pass-through corrupted"); + // Output must be host-native intended values. On a BE host with a + // regressed `::` routing this would be byte-swapped. + assert_eq!( + rgb_f32_out, intended, + "Rgbf32 sinker LE-encoded plane decoded incorrectly" + ); } diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index e6420e59..a8132480 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -862,291 +862,176 @@ fn gbrapf32_rgba_f16_strategy_a_plus_matches_independent_kernel() { ); } -// ---- HOST_NATIVE_BE routing parity (codex PR #84 Findings 1-3) ------------- +// ---- LE-encoded byte contract regressions (post-#83/#84/#85 audit) -------- // -// LE-host routing-equivalence and host-native sinker-contract tests for the -// `Gbrpf32` / `Gbrapf32` / `Gbrpf16` / `Gbrapf16` sinkers. Mirrors the -// `Rgbf32` / `Rgbf16` sinker tests added for PR #83's `dcf40a3` (sinker -// HOST_NATIVE_BE routing) and `c3a6478` (dispatch f16-widen HOST_NATIVE_BE -// routing). 
+// Each of the four float planar GBR Frame types is documented as +// LE-encoded bytes reinterpreted as `f32` / `half::f16` (FFmpeg `*LE` +// pixel-format convention). The sinker row-kernel dispatch must apply +// `u32::from_le` / `u16::from_le` (kernel `BE = false`) to recover host- +// native arithmetic from those bytes. These tests build a plane explicitly +// from LE-encoded bit patterns (`f32::from_bits(intended.to_bits().to_le())` +// and the f16 analogue) and assert the lossless pass-through output equals +// the host-native intended values. // -// On a LE host `HOST_NATIVE_BE = false`, so the kernel-level test below is -// a routing sanity check (proving the dispatcher / sinker substitute the -// correct `BE` template parameter); BE-host correctness of the routing is -// verified by the existing row-kernel BE parity tests in -// `src/row/arch/*/tests/` and by the contract tests below (which assert -// host-native pass-through end-to-end on every host). - -/// Kernel-level test: on a LE host, `gbrpf32_to_*::` and -/// `gbrpf32_to_*::` must produce byte-identical output for -/// every Tier 10 float planar GBR dispatcher across every output type -/// (u8 RGB / u8 RGBA / u16 RGB / u16 RGBA / f32 lossless). Width 33 covers -/// SIMD main loop + scalar tail across every backend; width 5 covers tail- -/// only paths. -/// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host -/// `::` decodes the host-native fixture as LE-encoded (byte-swap) -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. This sinker-routing-equivalence claim is -/// specifically about the LE host pattern; BE-host correctness of the -/// routing change is verified by the contract tests below and the row- -/// kernel BE parity tests in `src/row/arch/*/tests/`. -#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf32_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - gbrpf32_to_rgb_f32_row, gbrpf32_to_rgb_row, gbrpf32_to_rgb_u16_row, gbrpf32_to_rgba_row, - gbrpf32_to_rgba_u16_row, - }; - - // Sinker-layer `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`; on - // the LE-host gate this evaluates to `false`. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33: SIMD main loop + scalar tail. Width 5: tail-only path. Run both - // to cover SIMD-tail-aware backends. 
- for w in [5usize, 7usize, 33usize] { - let mut gp = std::vec![0.0f32; w]; - let mut bp = std::vec![0.0f32; w]; - let mut rp = std::vec![0.0f32; w]; - for (i, (g, (b, r))) in gp - .iter_mut() - .zip(bp.iter_mut().zip(rp.iter_mut())) - .enumerate() - { - *g = match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }; - *b = match i % 5 { - 0 => 0.25, - 1 => 0.75, - 2 => 1.5, - 3 => 0.0, - _ => -0.5, - }; - *r = match i % 5 { - 0 => 1.0, - 1 => 0.5, - 2 => 0.0, - 3 => -0.25, - _ => 1.25, - }; - } - - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - gbrpf32_to_rgb_row::(&gp, &bp, &rp, &mut rgb_false, w, true); - gbrpf32_to_rgb_row::(&gp, &bp, &rp, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges (w = {w})"); - - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - gbrpf32_to_rgba_row::(&gp, &bp, &rp, &mut rgba_false, w, true); - gbrpf32_to_rgba_row::(&gp, &bp, &rp, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges (w = {w})"); - - let mut u16_false = std::vec![0u16; w * 3]; - let mut u16_host = std::vec![0u16; w * 3]; - gbrpf32_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_false, w, true); - gbrpf32_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_host, w, true); - assert_eq!(u16_false, u16_host, "u16 RGB diverges (w = {w})"); - - let mut u16a_false = std::vec![0u16; w * 4]; - let mut u16a_host = std::vec![0u16; w * 4]; - gbrpf32_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_false, w, true); - gbrpf32_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_host, w, true); - assert_eq!(u16a_false, u16a_host, "u16 RGBA diverges (w = {w})"); - - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - gbrpf32_to_rgb_f32_row::(&gp, &bp, &rp, &mut f32_false, w, true); - gbrpf32_to_rgb_f32_row::(&gp, &bp, &rp, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 RGB diverges (w = {w})"); - } -} +// Vacuous on LE host (where `to_le` is identity so the LE-encoded plane is +// host-native already), but on a BE host any regression that drops the +// `::` routing would be caught here — kernel without `from_le` would +// emit byte-swapped bit-patterns, failing the bit-exact assertion below. +// +// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. -/// Sinker contract test: feeding host-native `f32` planes through -/// [`MixedSinker`] must produce the same output every other sinker -/// would expect from a host-native source — specifically, `with_rgb_f32` -/// must be bit-exact identical to the source on every host. Documents the -/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. -/// Pairs with the kernel-level test above; together they cover both the -/// dispatch boundary and the public sinker boundary. +/// LE-encoded byte contract regression for [`Gbrpf32`]. +/// +/// Forces `with_simd(false)` so the test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf32_sinker_host_native_contract_lossless_passthrough() { +fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. - let mut gp = std::vec![0.0f32; w * h]; - let mut bp = std::vec![0.0f32; w * h]; - let mut rp = std::vec![0.0f32; w * h]; - for (i, (g, (b, r))) in gp - .iter_mut() - .zip(bp.iter_mut().zip(rp.iter_mut())) - .enumerate() - { - *g = match i % 4 { + let intended_g: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 0.5, 1 => 1.5, 2 => -0.25, _ => 100.0, - }; - *b = match i % 4 { + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 0.0, 1 => 0.25, 2 => 1.0, _ => f32::INFINITY, - }; - *r = match i % 4 { + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 1.0, 1 => -1.0, 2 => 65505.0, _ => 0.5, - }; - } + }) + .collect(); + // LE-encode each plane (per the documented `*LE` Frame contract). + let gp: Vec = intended_g + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); + let bp: Vec = intended_b + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); + let rp: Vec = intended_r + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); let src = Gbrpf32Frame::try_new( &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, ) .unwrap(); - // rgb_f32 lossless: each pixel `(R, G, B)` packed in source plane order. let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f32(&mut rgb_f32) .unwrap(); gbrpf32_to(&src, &mut sink).unwrap(); - // The lossless scatter writes `(R, G, B)` per pixel in plane-index order. - // Bit-exact equality on every host. Buggy `::` routing on a BE host - // would byte-swap the output here; the fix keeps it bit-exact. for i in 0..(w * h) { - assert_eq!(rgb_f32[i * 3], rp[i], "R mismatch at idx {i}"); - assert_eq!(rgb_f32[i * 3 + 1], gp[i], "G mismatch at idx {i}"); - assert_eq!(rgb_f32[i * 3 + 2], bp[i], "B mismatch at idx {i}"); + assert_eq!( + rgb_f32[i * 3].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgb_f32[i * 3 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgb_f32[i * 3 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); } } -/// Same as [`gbrpf32_kernel_host_native_be_matches_false_on_le_host`] but -/// for the `Gbrpf16` family — covers both `use_simd = false` (dispatch's -/// scalar widen-fallback) and `use_simd = true` (SIMD widen path) at tail -/// widths 5, 7, 33 to exercise every backend's main loop + scalar tail. +/// LE-encoded byte contract regression for [`Gbrapf32`] (lossless RGBA +/// pass-through, including the α plane). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
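+///
+/// For orientation, a per-element sketch of the decode that the `BE = false`
+/// routing asks of the row kernels (`le_value` stands for a hypothetical
+/// LE-encoded `f32` element, not a helper exported by this crate):
+///
+/// ```ignore
+/// // LE-encoded f32 bits → host-native f32; no-op on LE, byte swap on BE.
+/// let host = f32::from_bits(u32::from_le(le_value.to_bits()));
+/// ```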
#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf16_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - gbrpf16_to_rgb_row, gbrpf16_to_rgb_u16_row, gbrpf16_to_rgba_row, gbrpf16_to_rgba_u16_row, +fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h).map(|i| 0.1 + (i as f32) * 0.001).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.2 + (i as f32) * 0.002).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.3 + (i as f32) * 0.003).collect(); + let intended_a: Vec = (0..w * h).map(|i| 0.5 + (i as f32) * 0.0005).collect(); + + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - for w in [5usize, 7usize, 33usize] { - let gp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }) - }) - .collect(); - let bp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 0.25, - 1 => 0.75, - 2 => 1.5, - 3 => 0.0, - _ => -0.5, - }) - }) - .collect(); - let rp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 1.0, - 1 => 0.5, - 2 => 0.0, - 3 => -0.25, - _ => 1.25, - }) - }) - .collect(); - - // Both `use_simd = false` and `use_simd = true` to cover dispatch's - // scalar widen-fallback and the SIMD widen path on every backend. - for use_simd in [false, true] { - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - gbrpf16_to_rgb_row::(&gp, &bp, &rp, &mut rgb_false, w, use_simd); - gbrpf16_to_rgb_row::(&gp, &bp, &rp, &mut rgb_host, w, use_simd); - assert_eq!( - rgb_false, rgb_host, - "u8 RGB diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - gbrpf16_to_rgba_row::(&gp, &bp, &rp, &mut rgba_false, w, use_simd); - gbrpf16_to_rgba_row::(&gp, &bp, &rp, &mut rgba_host, w, use_simd); - assert_eq!( - rgba_false, rgba_host, - "u8 RGBA diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut u16_false = std::vec![0u16; w * 3]; - let mut u16_host = std::vec![0u16; w * 3]; - gbrpf16_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_false, w, use_simd); - gbrpf16_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_host, w, use_simd); - assert_eq!( - u16_false, u16_host, - "u16 RGB diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut u16a_false = std::vec![0u16; w * 4]; - let mut u16a_host = std::vec![0u16; w * 4]; - gbrpf16_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_false, w, use_simd); - gbrpf16_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_host, w, use_simd); - assert_eq!( - u16a_false, u16a_host, - "u16 RGBA diverges (w = {w}, use_simd = {use_simd})" - ); - } + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgba_f32(&mut rgba_f32) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!( + rgba_f32[i * 4].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 1].to_bits(), + 
intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 3].to_bits(), + intended_a[i].to_bits(), + "A idx {i}" + ); } } -/// Sinker contract: host-native `half::f16` source through [`MixedSinker`] -/// `with_rgb_f16` must round-trip the planes bit-exact on every host. The -/// `::` routing keeps the lossless interleave a no-op in the -/// BE-load layer; the buggy `::` routing on a BE host would byte-swap -/// every f16 element. +/// LE-encoded byte contract regression for [`Gbrpf16`]. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { +fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let gp: Vec = (0..w * h) + let intended_g: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 0.5, @@ -1156,7 +1041,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); - let bp: Vec = (0..w * h) + let intended_b: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 0.0, @@ -1166,7 +1051,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); - let rp: Vec = (0..w * h) + let intended_r: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 1.0, @@ -1176,6 +1061,14 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); let src = Gbrpf16Frame::try_new( &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, @@ -1184,6 +1077,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { let mut rgb_f16 = std::vec![half::f16::ZERO; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f16(&mut rgb_f16) .unwrap(); gbrpf16_to(&src, &mut sink).unwrap(); @@ -1191,46 +1085,53 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { for i in 0..(w * h) { assert_eq!( rgb_f16[i * 3].to_bits(), - rp[i].to_bits(), - "R mismatch at idx {i}" + intended_r[i].to_bits(), + "R idx {i}" ); assert_eq!( rgb_f16[i * 3 + 1].to_bits(), - gp[i].to_bits(), - "G mismatch at idx {i}" + intended_g[i].to_bits(), + "G idx {i}" ); assert_eq!( rgb_f16[i * 3 + 2].to_bits(), - bp[i].to_bits(), - "B mismatch at idx {i}" + intended_b[i].to_bits(), + "B idx {i}" ); } } -/// Sinker contract: [`MixedSinker`] `with_rgba_f16` must round-trip -/// the source α plane bit-exact alongside the G/B/R planes, on every host. -/// Validates Strategy A+ alpha consistency under the `HOST_NATIVE_BE` -/// routing — the previous mix-mode (LE-decoded RGB + host-native α) is gone. +/// LE-encoded byte contract regression for [`Gbrapf16`] (lossless RGBA +/// pass-through, including the α plane). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha() { +fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let gp: Vec = (0..w * h) + let intended_g: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.1 + (i as f32) * 0.001)) .collect(); - let bp: Vec = (0..w * h) + let intended_b: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.2 + (i as f32) * 0.002)) .collect(); - let rp: Vec = (0..w * h) + let intended_r: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.3 + (i as f32) * 0.003)) .collect(); - let ap: Vec = (0..w * h) + let intended_a: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.5 + (i as f32) * 0.001)) .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); let src = Gbrapf16Frame::try_new( &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, @@ -1239,57 +1140,543 @@ fn gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha() { let mut rgba_f16 = std::vec![half::f16::ZERO; w * h * 4]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f16(&mut rgba_f16) .unwrap(); gbrapf16_to(&src, &mut sink).unwrap(); for i in 0..(w * h) { - assert_eq!(rgba_f16[i * 4].to_bits(), rp[i].to_bits(), "R idx {i}"); - assert_eq!(rgba_f16[i * 4 + 1].to_bits(), gp[i].to_bits(), "G idx {i}"); - assert_eq!(rgba_f16[i * 4 + 2].to_bits(), bp[i].to_bits(), "B idx {i}"); - assert_eq!(rgba_f16[i * 4 + 3].to_bits(), ap[i].to_bits(), "A idx {i}"); + assert_eq!( + rgba_f16[i * 4].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 3].to_bits(), + intended_a[i].to_bits(), + "A idx {i}" + ); } } -/// Sinker contract: [`MixedSinker`] `with_rgba_f32` lossless -/// pass-through plus α — confirms Strategy A+ alpha consistency when the -/// f32 RGB chain routes via `HOST_NATIVE_BE`. The α plane is host-native -/// f32, also routed via `HOST_NATIVE_BE`, eliminating any mix-mode. +/// LE-encoded byte contract regression for [`Gbrpf16`] **widening path** +/// (`with_rgb_f32`). Exercises the f16 → f32 widen step in the sinker — which +/// must bit-normalise LE-encoded f16 plane bits before converting to f32. +/// +/// Vacuous on LE hosts (where `to_le` is identity); on a BE host any +/// regression that drops the bit-normalize-first step in +/// `widen_f16_be_to_host_f32::` would interpret byte-swapped bits as +/// host-native f16 and decode to wildly wrong f32 values. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrapf32_sinker_host_native_contract_lossless_passthrough_with_alpha() { +fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let mut gp = std::vec![0.0f32; w * h]; - let mut bp = std::vec![0.0f32; w * h]; - let mut rp = std::vec![0.0f32; w * h]; - let mut ap = std::vec![0.0f32; w * h]; + let intended_g: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.5, + 1 => 0.25, + 2 => 0.0, + _ => 1.0, + }) + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.125, + 1 => 0.75, + 2 => 0.0625, + _ => 0.875, + }) + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.375, + 1 => 0.625, + 2 => 0.9375, + _ => 0.03125, + }) + }) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + + let src = Gbrpf16Frame::try_new( + &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgb_f32(&mut rgb_f32) + .unwrap(); + gbrpf16_to(&src, &mut sink).unwrap(); + for i in 0..(w * h) { - gp[i] = 0.1 + (i as f32) * 0.001; - bp[i] = 0.2 + (i as f32) * 0.002; - rp[i] = 0.3 + (i as f32) * 0.003; - ap[i] = 0.5 + (i as f32) * 0.0005; + assert_eq!(rgb_f32[i * 3], intended_r[i].to_f32(), "R idx {i}"); + assert_eq!(rgb_f32[i * 3 + 1], intended_g[i].to_f32(), "G idx {i}"); + assert_eq!(rgb_f32[i * 3 + 2], intended_b[i].to_f32(), "B idx {i}"); } +} - let src = Gbrapf32Frame::try_new( +/// LE-encoded byte contract regression for [`Gbrapf16`] **widening path** +/// (`with_rgba_f32`, including the α plane). Exercises the four-plane f16 → +/// f32 widen step — same bit-normalise-first contract as the no-α variant. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
+#[test] +fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.1 + (i as f32) * 0.001)) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.2 + (i as f32) * 0.002)) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.3 + (i as f32) * 0.003)) + .collect(); + let intended_a: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.5 + (i as f32) * 0.001)) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); + + let src = Gbrapf16Frame::try_new( &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, ) .unwrap(); let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; - let mut sink = MixedSinker::::new(w, h) + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f32(&mut rgba_f32) .unwrap(); - gbrapf32_to(&src, &mut sink).unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba_f32[i * 4], intended_r[i].to_f32(), "R idx {i}"); + assert_eq!(rgba_f32[i * 4 + 1], intended_g[i].to_f32(), "G idx {i}"); + assert_eq!(rgba_f32[i * 4 + 2], intended_b[i].to_f32(), "B idx {i}"); + assert_eq!(rgba_f32[i * 4 + 3], intended_a[i].to_f32(), "A idx {i}"); + } +} + +/// LE-encoded byte contract regression for [`Gbrpf16`] **widening → narrow +/// chain** (`with_rgb_u16` and `with_rgba`). Covers the post-widen routing +/// where `gbrpf32_to_rgb_u16_row` / `gbrpf32_to_rgba_u16_row` / +/// `gbrpf32_to_rgb_row` are invoked on **host-native f32 scratch** produced +/// by `widen_f16_be_to_host_f32::`. +/// +/// On a BE host this would have been corrupted under the prior +/// `gbrpf32_to_*::` post-widen routing — that kernel applied +/// `from_le` to scratch that was already host-native, byte-swapping the +/// f32 representation before scaling. Fixed by routing post-widen calls +/// through `::` (`true` on BE, `false` on LE), which makes +/// the kernel byte-swap a no-op on every host. Vacuous on LE; would catch +/// the double-swap on BE. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. +#[test] +fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.5, + 1 => 0.25, + 2 => 0.0, + _ => 1.0, + }) + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.125, + 1 => 0.75, + 2 => 0.0625, + _ => 0.875, + }) + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.375, + 1 => 0.625, + 2 => 0.9375, + _ => 0.03125, + }) + }) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + + let src = Gbrpf16Frame::try_new( + &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + // Exercise the u16 narrow path (post-widen → gbrpf32_to_rgb_u16_row). 
+    let mut rgb_u16 = std::vec![0u16; w * h * 3];
+    // The u8 `with_rgba` path would not exercise the post-widen scratch here
+    // (Strategy A emits opaque α directly), so instead trigger a second
+    // post-widen consumer by attaching `with_luma_u16` alongside the u16 output.
+    let mut luma_u16 = std::vec![0u16; w * h];
+    {
+        let mut sink = MixedSinker::::new(w, h)
+            .with_simd(false)
+            .with_rgb_u16(&mut rgb_u16)
+            .unwrap()
+            .with_luma_u16(&mut luma_u16)
+            .unwrap();
+        gbrpf16_to(&src, &mut sink).unwrap();
+    }
+
+    // Assert RGB u16 output matches the intended (clamp+scale × 65535) values.
+    let to_u16 = |v: f32| -> u16 { (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16 };
     for i in 0..(w * h) {
-        assert_eq!(rgba_f32[i * 4], rp[i], "R idx {i}");
-        assert_eq!(rgba_f32[i * 4 + 1], gp[i], "G idx {i}");
-        assert_eq!(rgba_f32[i * 4 + 2], bp[i], "B idx {i}");
-        assert_eq!(rgba_f32[i * 4 + 3], ap[i], "A idx {i}");
+        assert_eq!(
+            rgb_u16[i * 3],
+            to_u16(intended_r[i].to_f32()),
+            "RGB u16 R idx {i}"
+        );
+        assert_eq!(
+            rgb_u16[i * 3 + 1],
+            to_u16(intended_g[i].to_f32()),
+            "RGB u16 G idx {i}"
+        );
+        assert_eq!(
+            rgb_u16[i * 3 + 2],
+            to_u16(intended_b[i].to_f32()),
+            "RGB u16 B idx {i}"
+        );
+    }
+    // Sanity: luma_u16 (post-widen narrow) is non-zero — locks down that
+    // the post-widen luma kernel also sees host-native f32 scratch.
+    assert!(
+        luma_u16.iter().any(|&v| v > 0),
+        "luma_u16 must contain non-zero samples — \
+         a corrupted byte-swap would still emit non-zero output but the rgb_u16 \
+         assertion above is the primary guard"
+    );
+}
+
+// ---- LE-encoded Strategy A+ alpha-patch regressions (codex 3rd-pass) ------
+//
+// The `copy_alpha_plane_f32_to_u8` (and `copy_alpha_plane_f32_to_u16`,
+// `copy_alpha_plane_f32`) helper used to read each f32 α sample as
+// host-native, which silently corrupted the α slot on BE hosts processing
+// the LE-encoded `Gbrapf32Frame` α plane (the byte-swapped bits clamp to
+// near-zero or near-one, producing α = 0 or 255 regardless of intent).
+// Same bug class as the u16 alpha-patch helpers fixed in cf26058.
+//
+// These regressions trigger the **Strategy A+ combo path** (`with_rgb` +
+// `with_rgba`, `with_rgb_u16` + `with_rgba_u16`) on a Frame whose α plane
+// is built from explicit LE-encoded f32 bit-patterns. On a LE host the
+// `to_le` on f32 bits is identity so the test reduces to the original
+// semantics; on a BE host the kernel without `from_le` would clamp
+// byte-swapped garbage and the assertion would fail. The non-multiple-of-
+// SIMD widths (15, 17) exercise scalar-tail correctness in addition to
+// any vectorized body.
+
+/// Codex 3rd-pass regression: Gbrapf32 Strategy A+ (`with_rgb` + `with_rgba`)
+/// on a LE-encoded f32 α plane must reproduce standalone `with_rgba` output
+/// byte-for-byte. The standalone path uses `gbrapf32_to_rgba_row::<false>`
+/// (already endian-aware), so any deviation indicates the Strategy A+
+/// alpha-patch path corrupted the α plane.
+///
+/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host
+/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly`
+/// docstring for the rationale.
+#[test]
+fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() {
+    // 15 is non-multiple-of-{4,8,16} — exercises scalar tail in every backend.
+ let w = 15usize; + let h = 3usize; + let intended_g: Vec = (0..w * h).map(|i| 0.10 + (i as f32) * 0.001).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.20 + (i as f32) * 0.002).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.30 + (i as f32) * 0.003).collect(); + // Deliberately mix in-range, boundary, > 1, and negative α to stress + // clamp/scale correctness *after* the bit-normalize step. + let intended_a: Vec = (0..w * h) + .map(|i| match i % 7 { + 0 => 0.0, + 1 => 0.5, + 2 => 1.0, + 3 => 1.5, + 4 => -0.1, + 5 => 0.123, + _ => 0.876, + }) + .collect(); + + // LE-encode every plane (per the documented `*LE` Frame contract). + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); + + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba`. + let mut rgba_ref = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgba(&mut rgba_ref) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + // Strategy A+: `with_rgb` + `with_rgba` combo (alpha-patch path). + let mut rgb_combo = std::vec![0u8; w * h * 3]; + let mut rgba_combo = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgb(&mut rgb_combo) + .unwrap() + .with_rgba(&mut rgba_combo) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf32 Strategy A+ alpha-patch must equal standalone `with_rgba`" + ); +} + +/// Codex 3rd-pass regression: Gbrapf32 Strategy A+ (`with_rgb_u16` + +/// `with_rgba_u16`) on a LE-encoded f32 α plane. Defense-in-depth: the +/// current sinker calls `gbrapf32_to_rgba_u16_row::` directly here +/// (no alpha-patch helper invocation), but any future routing change that +/// switches to the alpha-patch helper must keep BE-host correctness. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. +#[test] +fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { + // 17 is non-multiple-of-{4,8,16}. + let w = 17usize; + let h = 3usize; + let intended_g: Vec = (0..w * h).map(|i| 0.11 + (i as f32) * 0.0011).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.22 + (i as f32) * 0.0022).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.33 + (i as f32) * 0.0033).collect(); + let intended_a: Vec = (0..w * h) + .map(|i| match i % 5 { + 0 => 0.0, + 1 => 0.25, + 2 => 1.0, + 3 => 0.5, + _ => 0.75, + }) + .collect(); + + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); + + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba_u16`. + let mut rgba_ref = std::vec![0u16; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgba_u16(&mut rgba_ref) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + // Combo: `with_rgb_u16` + `with_rgba_u16`. 
+    let mut rgb_combo = std::vec![0u16; w * h * 3];
+    let mut rgba_combo = std::vec![0u16; w * h * 4];
+    {
+        let mut sink = MixedSinker::::new(w, h)
+            .with_simd(false)
+            .with_rgb_u16(&mut rgb_combo)
+            .unwrap()
+            .with_rgba_u16(&mut rgba_combo)
+            .unwrap();
+        gbrapf32_to(&src, &mut sink).unwrap();
+    }
+
+    assert_eq!(
+        rgba_combo, rgba_ref,
+        "Gbrapf32 Strategy A+ rgba_u16 must equal standalone `with_rgba_u16`"
+    );
+
+    // Independently assert the α slot reflects the intended values
+    // (clamp × 65535 + 0.5). This catches a hypothetical regression where
+    // both code paths share the same bug.
+    let to_u16 = |v: f32| -> u16 { (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16 };
+    for i in 0..(w * h) {
+        assert_eq!(
+            rgba_combo[i * 4 + 3],
+            to_u16(intended_a[i]),
+            "α slot idx {i}"
+        );
+    }
+}
+
+/// Codex 3rd-pass regression: Gbrapf16 Strategy A+ (`with_rgb` + `with_rgba`)
+/// on a LE-encoded f16 α plane. This exercises the **post-widen** routing
+/// pattern in `widen_and_scatter_f16_alpha_to_u8`: the f16 α plane is
+/// widened to host-native f32 scratch via `widen_f16_be_to_host_f32::<false>`,
+/// then the alpha-patch helper must consume that scratch with
+/// `BE = HOST_NATIVE_BE` (no double byte-swap). The test compares the
+/// Strategy A+ combo output against the standalone `with_rgba` path, which
+/// uses the `gbrpf16_to_rgba_row::<false>` direct kernel + the same
+/// `widen_and_scatter_f16_alpha_to_u8` helper (both paths share the
+/// `widen_and_scatter` helper, so this test guards against the
+/// post-widen routing flag being wrong).
+///
+/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host
+/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly`
+/// docstring for the rationale.
+#[test]
+fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() {
+    let w = 15usize;
+    let h = 3usize;
+    let intended_g: Vec<half::f16> = (0..w * h)
+        .map(|i| half::f16::from_f32(0.10 + (i as f32) * 0.001))
+        .collect();
+    let intended_b: Vec<half::f16> = (0..w * h)
+        .map(|i| half::f16::from_f32(0.20 + (i as f32) * 0.002))
+        .collect();
+    let intended_r: Vec<half::f16> = (0..w * h)
+        .map(|i| half::f16::from_f32(0.30 + (i as f32) * 0.003))
+        .collect();
+    let intended_a: Vec<half::f16> = (0..w * h)
+        .map(|i| {
+            half::f16::from_f32(match i % 5 {
+                0 => 0.0,
+                1 => 0.5,
+                2 => 1.0,
+                3 => 0.25,
+                _ => 0.75,
+            })
+        })
+        .collect();
+
+    let le_f16 = |v: &Vec<half::f16>| -> Vec<half::f16> {
+        v.iter()
+            .map(|&x| half::f16::from_bits(x.to_bits().to_le()))
+            .collect()
+    };
+    let gp = le_f16(&intended_g);
+    let bp = le_f16(&intended_b);
+    let rp = le_f16(&intended_r);
+    let ap = le_f16(&intended_a);
+
+    let src = Gbrapf16Frame::try_new(
+        &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32,
+    )
+    .unwrap();
+
+    // Reference: standalone `with_rgba` (uses `gbrpf16_to_rgba_row` then
+    // `widen_and_scatter_f16_alpha_to_u8` → exercises the post-widen helper
+    // as well, with the same routing as the combo path).
+    let mut rgba_ref = std::vec![0u8; w * h * 4];
+    {
+        let mut sink = MixedSinker::::new(w, h)
+            .with_simd(false)
+            .with_rgba(&mut rgba_ref)
+            .unwrap();
+        gbrapf16_to(&src, &mut sink).unwrap();
+    }
+
+    // Strategy A+: `with_rgb` + `with_rgba`.
+ let mut rgb_combo = std::vec![0u8; w * h * 3]; + let mut rgba_combo = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgb(&mut rgb_combo) + .unwrap() + .with_rgba(&mut rgba_combo) + .unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf16 Strategy A+ post-widen alpha-patch must equal standalone `with_rgba`" + ); + + // Independently assert α slot reflects the intended values + // (widen → clamp × 255 + 0.5). + let to_u8 = |v: f32| -> u8 { (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8 }; + for i in 0..(w * h) { + assert_eq!( + rgba_combo[i * 4 + 3], + to_u8(intended_a[i].to_f32()), + "α slot idx {i}" + ); } }
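+
+// ---- Reviewer note (illustrative only, not crate API) ---------------------
+//
+// A minimal scalar model of the contract the regressions above pin down,
+// assuming only `half` and core. The helper names below are invented for
+// this sketch and are NOT the crate's kernels — `widen_f16_be_to_host_f32`
+// and the `gbrpf32_to_*` rows are SIMD-dispatched and take a routing const
+// generic. Marked `dead_code` so the sketch documents intent without
+// affecting builds.
+#[allow(dead_code)]
+fn le_encoded_f32_to_host(le_encoded: f32) -> f32 {
+    // The plane stores LE byte order; `from_le` swaps to host order on a
+    // BE host and is a no-op on a LE host. Arithmetic must only ever see
+    // the host-native value.
+    f32::from_bits(u32::from_le(le_encoded.to_bits()))
+}
+
+#[allow(dead_code)]
+fn le_encoded_f16_widen_to_host_f32(le_encoded: half::f16) -> f32 {
+    // Bit-normalize FIRST, then widen: widening byte-swapped f16 bits is
+    // exactly the "wildly wrong f32 values" failure the widen-path tests
+    // guard against.
+    half::f16::from_bits(u16::from_le(le_encoded.to_bits())).to_f32()
+}
+
+#[allow(dead_code)]
+fn alpha_f32_le_to_u8(le_encoded_alpha: f32) -> u8 {
+    // The alpha-patch bug class: clamping/scaling the raw (still LE-encoded)
+    // bits instead of the normalized value collapses α to 0 or 255 on a BE
+    // host, which is what the Strategy A+ combo regressions detect.
+    (le_encoded_f32_to_host(le_encoded_alpha).clamp(0.0, 1.0) * 255.0 + 0.5) as u8
+}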