From fb3c4bc8d8bc8577847b3a5238cbd6f3ba55ac0b Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 19:36:30 +1200 Subject: [PATCH 01/10] docs(frame): clarify LE-encoded byte contract on f32/f16 frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Affected types: Rgbf32Frame, Rgbf16Frame, Gbrpf32Frame, Gbrapf32Frame, Gbrpf16Frame, Gbrapf16Frame. Each doc-comment now explicitly states the LE-encoded byte contract, mirroring the established Grayf32Frame doc pattern: the &[f32] / &[f16] plane is the LE-encoded byte layout reinterpreted as f32 / f16, matching FFmpeg's canonical *LE pixel-format suffixes (AV_PIX_FMT_GBRPF32LE etc). Adds the bytemuck::cast_slice + linesize-division-by-element-size instruction so callers wiring up FFmpeg buffers don't have to guess. Codex 3rd-pass review of PR #85 caught that PR #83 + PR #84 introduced sinker routings that assumed the planes are already host-native f32 / f16 — a wrong reading of the *LE contract that would corrupt every output path on a BE host. Pinning the doc-level contract here so future readers don't repeat the mistake; sinker revert lands in the next commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/packed_rgb_f16.rs | 17 ++++++++++- src/frame/packed_rgb_float.rs | 17 ++++++++++- src/frame/planar_gbr_float.rs | 56 +++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/src/frame/packed_rgb_f16.rs b/src/frame/packed_rgb_f16.rs index 1469df86..ecf9e8c6 100644 --- a/src/frame/packed_rgb_f16.rs +++ b/src/frame/packed_rgb_f16.rs @@ -50,7 +50,8 @@ pub enum Rgbf16FrameError { }, } -/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`). +/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`, +/// FFmpeg canonical `*LE` convention — see endian note below). /// One plane, 3 × `f16` per pixel, channel order `R, G, B`. 
/// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -65,6 +66,20 @@ pub enum Rgbf16FrameError { /// `stride` is in **`f16` elements** (≥ `3 * width`), matching the /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The `&[half::f16]` plane is the **LE-encoded byte layout** reinterpreted +/// as `f16`, matching the FFmpeg `*LE` pixel-format convention. On a +/// little-endian host (every CI runner today) LE bytes _are_ host-native, +/// so `&[half::f16]` is also a host-native f16 slice; on a big-endian host +/// the bytes have to be byte-swapped back to host-native before arithmetic. +/// Downstream row kernels handle this byte-swap (or no-op on LE) under the +/// hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding a byte buffer +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgbf16Frame<'a> { rgb: &'a [half::f16], diff --git a/src/frame/packed_rgb_float.rs b/src/frame/packed_rgb_float.rs index d1f79cef..166b0848 100644 --- a/src/frame/packed_rgb_float.rs +++ b/src/frame/packed_rgb_float.rs @@ -50,7 +50,8 @@ pub enum Rgbf32FrameError { }, } -/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`). +/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`, +/// FFmpeg canonical `*LE` convention — see endian note below). /// One plane, 3 × `f32` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -64,6 +65,20 @@ pub enum Rgbf32FrameError { /// `stride` is in **`f32` elements** (≥ `3 * width`), matching the /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. 
+/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as +/// `f32`, matching the FFmpeg `*LE` pixel-format convention. On a +/// little-endian host (every CI runner today) LE bytes _are_ host-native, +/// so `&[f32]` is also a host-native float slice; on a big-endian host the +/// bytes have to be byte-swapped back to host-native before arithmetic. +/// Downstream row kernels handle this byte-swap (or no-op on LE) under the +/// hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding a byte buffer +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgbf32Frame<'a> { rgb: &'a [f32], diff --git a/src/frame/planar_gbr_float.rs b/src/frame/planar_gbr_float.rs index d612e715..505a7bd7 100644 --- a/src/frame/planar_gbr_float.rs +++ b/src/frame/planar_gbr_float.rs @@ -147,6 +147,20 @@ const fn check_plane( /// `f32` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are /// preserved bit-exact on lossless pass-through outputs and clamped to /// `[0.0, 1.0]` on integer-output paths. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The three `&[f32]` planes are the **LE-encoded byte layout** reinterpreted +/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format +/// name. On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so the slices are also host-native float slices; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 4 before constructing. 
#[derive(Debug, Clone, Copy)] pub struct Gbrpf32Frame<'a> { g: &'a [f32], @@ -250,6 +264,20 @@ impl<'a> Gbrpf32Frame<'a> { /// Four full-resolution `f32` planes in **G, B, R, A** order. Alpha is /// real per-pixel; nominal range `[0.0, 1.0]` (opaque = 1.0). Stride is /// in `f32` elements. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The four `&[f32]` planes are the **LE-encoded byte layout** reinterpreted +/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format +/// name. On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so the slices are also host-native float slices; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrapf32Frame<'a> { g: &'a [f32], @@ -372,6 +400,20 @@ impl<'a> Gbrapf32Frame<'a> { /// Three full-resolution [`half::f16`] planes in **G, B, R** order. Stride /// is in `f16` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are /// permitted (saturation to `+Inf` occurs on f16→f32 narrowing paths). +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The three `&[half::f16]` planes are the **LE-encoded byte layout** +/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in +/// the format name. On a little-endian host (every CI runner today) LE +/// bytes _are_ host-native, so the slices are also host-native f16 slices; +/// on a big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle this +/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap. 
+/// +/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrpf16Frame<'a> { g: &'a [half::f16], @@ -475,6 +517,20 @@ impl<'a> Gbrpf16Frame<'a> { /// Four full-resolution [`half::f16`] planes in **G, B, R, A** order. /// Alpha is real per-pixel; nominal range `[0.0, 1.0]`. Stride is in /// `f16` elements. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The four `&[half::f16]` planes are the **LE-encoded byte layout** +/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in +/// the format name. On a little-endian host (every CI runner today) LE +/// bytes _are_ host-native, so the slices are also host-native f16 slices; +/// on a big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle this +/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrapf16Frame<'a> { g: &'a [half::f16], From 740bc0d696918ad70fb1a50f9d4287d73faa9e1f Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 19:40:09 +1200 Subject: [PATCH 02/10] fix(sinker): revert HOST_NATIVE_BE routing for f32/f16 RGB & GBR sinkers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors PR #85's Grayf32 sinker revert (52f8191) for the other six float frame types affected by PR #83 (dcf40a3) and PR #84 (8627280): Rgbf32, Rgbf16, Gbrpf32, Gbrapf32, Gbrpf16, Gbrapf16. 
The earlier PRs introduced sinker-layer `HOST_NATIVE_BE` consts that made the row-kernel `BE` parameter a no-op byte-swap on whichever host the build targeted. That assumed the Frame's plane bytes are already host-native f32/f16 — which contradicts the LE-encoded byte contract clarified in the previous commit. On a BE host the routing skipped the required `u32::from_le` / `u16::from_le` swap and would have corrupted every output path. Reverted call sites (all `::<HOST_NATIVE_BE>` -> `::<false>`): - src/sinker/mixed/packed_rgb_float.rs: 6 sites (Rgbf32) - src/sinker/mixed/packed_rgb_f16.rs: 7 sites (Rgbf16) - src/sinker/mixed/planar_gbr_float.rs: 22 sites (Gbrpf32 + Gbrapf32) - src/sinker/mixed/planar_gbr_f16.rs: 22 sites (Gbrpf16 + Gbrapf16) Removes the four `const HOST_NATIVE_BE: bool = cfg!(target_endian = "big")` definitions and their explanatory doc-comments. Rewrites the `widen_f16_to_f32` doc in planar_gbr_f16.rs to accurately describe the LE-encoded contract (helper is correct on LE host; cross-endian widen should use the bit-normalising `widen_f16_be_to_host_f32` pattern). Untouched: `BE == HOST_NATIVE_BE` `copy_from_slice` fast paths in scalar row kernels (`rgbf32_to_rgb_f32_row`, `gbrpf32_to_rgb_f32_row`, etc.) — generic in `BE`, correct under either contract. Per-arch SIMD kernels and `widen_f16_be_to_host_f32` left alone — different layer. 
Verification (aarch64-apple-darwin): cargo test --target aarch64-apple-darwin --lib # 2246 ok Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/packed_rgb_f16.rs | 33 ++------- src/sinker/mixed/packed_rgb_float.rs | 31 ++------ src/sinker/mixed/planar_gbr_f16.rs | 102 +++++++++++---------------- src/sinker/mixed/planar_gbr_float.rs | 74 ++++++------------- 4 files changed, 75 insertions(+), 165 deletions(-) diff --git a/src/sinker/mixed/packed_rgb_f16.rs b/src/sinker/mixed/packed_rgb_f16.rs index 62ab1cc8..e349f130 100644 --- a/src/sinker/mixed/packed_rgb_f16.rs +++ b/src/sinker/mixed/packed_rgb_f16.rs @@ -34,25 +34,6 @@ use crate::{ yuv::{Rgbf16, Rgbf16Row, Rgbf16Sink}, }; -/// `BE` value that makes the `rgbf16_to_*` row dispatchers treat their input as -/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf16Frame`] -/// exposes a `&[half::f16]` row in **host-native** layout — the API contract is that the -/// caller hands us already-decoded half-floats. The kernel `BE` parameter, -/// however, names the **encoded** byte order (so `BE = false` means "decode -/// LE-encoded bytes" via `u16::from_le`). On a LE host the host-native layout -/// is LE, so `BE = false` is correct; on a BE host the host-native layout is -/// BE, so we must request `BE = true` to make `u16::from_be` no-op the swap. -/// Without this routing the loaders would byte-swap an already-decoded host- -/// native `f16` on BE hosts, corrupting every output path. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` — same truth table, different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. 
-const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Rgbf16 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf16> { @@ -253,27 +234,27 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if let Some(buf) = rgb_f16.as_deref_mut() { let f16_start = one_plane_start * 3; let f16_end = one_plane_end * 3; - rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); + rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); } // Lossless f32 widen — also independent of integer conversion paths. if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct half-float → u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct half-float → u16 conversion (no staging). 
if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float → u8 when no RGB / luma / @@ -288,7 +269,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -307,7 +288,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { w, h, )?; - rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -347,7 +328,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { // over `rgb_row` via `expand_rgb_to_rgba_row`. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_float.rs b/src/sinker/mixed/packed_rgb_float.rs index f189e5ab..e1c17a39 100644 --- a/src/sinker/mixed/packed_rgb_float.rs +++ b/src/sinker/mixed/packed_rgb_float.rs @@ -31,25 +31,6 @@ use crate::{ yuv::{Rgbf32, Rgbf32Row, Rgbf32Sink}, }; -/// `BE` value that makes the `rgbf32_to_*` row dispatchers treat their input as -/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf32Frame`] -/// exposes a `&[f32]` row in **host-native** layout — the API contract is that the caller -/// hands us already-decoded floats. 
The kernel `BE` parameter, however, names -/// the **encoded** byte order (so `BE = false` means "decode LE-encoded bytes" -/// via `u32::from_le`). On a LE host the host-native layout is LE, so -/// `BE = false` is correct; on a BE host the host-native layout is BE, so we -/// must request `BE = true` to make `u32::from_be` no-op the swap. Without this -/// routing the loaders would byte-swap an already-decoded host-native `f32` on -/// BE hosts, corrupting every output path. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` — same truth table, different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Rgbf32 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf32> { @@ -228,20 +209,20 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct float→u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct float→u16 conversion (no staging). 
if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float→u8 conversion when @@ -256,7 +237,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -276,7 +257,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { w, h, )?; - rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -318,7 +299,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { // less memory pass for combined `with_rgb + with_rgba` callers. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index c041b901..6a0a351e 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -64,49 +64,27 @@ const GBR_F16_FULL_RANGE: bool = true; // Chunk size for the inline f16→f32 widening scratch arrays (stack-allocated). const WIDEN_CHUNK: usize = 64; -/// `BE` value that makes the `gbrpf16_to_*` / `gbrapf16_to_*` row dispatchers -/// (and the widened `gbrpf32_to_*` chain after `widen_f16_to_f32`) treat -/// their input as **host-native** (a no-op byte-swap). 
-/// -/// [`crate::frame::Gbrpf16Frame`] / [`crate::frame::Gbrapf16Frame`] expose -/// `&[half::f16]` plane rows in **host-native** layout — the API contract -/// is that the caller hands us already-decoded half-floats. The kernel `BE` -/// parameter, however, names the **encoded** byte order (so `BE = false` -/// means "decode LE-encoded bytes" via `u16::from_le`). On a LE host the -/// host-native layout is LE, so `BE = false` is correct; on a BE host the -/// host-native layout is BE, so we must request `BE = true` to make -/// `u16::from_be` no-op the swap. Without this routing the loaders would -/// byte-swap an already-decoded host-native `f16` on BE hosts, corrupting -/// every output path (codex PR #84 Finding 3). -/// -/// Crucially, the **widened f32 chain** must also use `HOST_NATIVE_BE`: -/// after [`widen_f16_to_f32`] (which calls `half::f16::to_f32` on host-native -/// f16 bits) the scratch is host-native f32, so the downstream -/// `gbrpf32_to_*` kernel's `from_le`/`from_be` loader must be a no-op — -/// achieved by routing with `HOST_NATIVE_BE`. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced in `c3a6478` and the `Rgbf16` sinker fix in -/// `dcf40a3`. Same truth table: +/// Widen `width` `half::f16` values from `src` into `dst` (f32 elements). /// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. +/// The source slice is `&[half::f16]` from a [`Gbrpf16Frame`] / +/// [`Gbrapf16Frame`] plane — i.e. **LE-encoded bytes** reinterpreted as +/// `half::f16` (per the FFmpeg `*LE` contract). On a little-endian host +/// LE bytes _are_ host-native, so `half::f16::to_f32` interprets the bits +/// correctly and the downstream `gbrpf32_to_*` kernel — invoked with +/// `BE = false` — sees host-native f32 (its `u32::from_le` is a no-op on +/// LE host) → correct. 
/// -/// The α-plane scatter for [`Gbrapf16`] (Strategy A+ / standalone-RGBA) -/// widens the host-native f16 α plane to host-native f32 via -/// [`widen_f16_to_f32`] then calls `copy_alpha_plane_f32_to_u8` — both -/// operations are endian-agnostic. Mix-mode corruption (LE-decoded RGB + -/// host-native α) is therefore eliminated by routing the RGB chain via -/// `HOST_NATIVE_BE`. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - -/// Widen `width` `half::f16` values from `src` into `dst` (f32 elements). +/// On a big-endian host this helper would mis-interpret the LE-encoded +/// bits as host-native (a separate bug from the sinker-layer routing +/// reverted in this PR); the bit-normalising scalar helper +/// [`crate::row::scalar::widen_f16_be_to_host_f32`] is the correct pattern +/// for cross-endian widen-then-convert chains and is what the per-arch +/// SIMD backends use. Wiring this sinker-local helper to that bit- +/// normalising version is left as a future improvement — every CI runner +/// today is LE and exercises the correct path here. /// -/// The source slice is `&[half::f16]` in **host-native** layout (per the -/// `Gbrpf16Frame` / `Gbrapf16Frame` API contract); `to_f32` interprets the -/// bits as host-native and emits host-native `f32`. Downstream `gbrpf32_to_*` -/// callers must therefore route with [`HOST_NATIVE_BE`] (not the encoded -/// `BE`) to avoid double byte-swapping. 
+/// [`Gbrpf16Frame`]: crate::frame::Gbrpf16Frame +/// [`Gbrapf16Frame`]: crate::frame::Gbrapf16Frame #[cfg_attr(not(tarpaulin), inline(always))] fn widen_f16_to_f32(src: &[half::f16], dst: &mut [f32], count: usize) { for i in 0..count { @@ -351,7 +329,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -363,7 +341,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { height: h, channels: 4, })?; - gbrpf16_to_rgba_f16_row::( + gbrpf16_to_rgba_f16_row::( g_in, b_in, r_in, @@ -407,29 +385,29 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.luma.as_deref_mut() { - 
gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( gf, bf, rf, @@ -442,7 +420,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( gf, bf, rf, @@ -455,7 +433,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( gf, bf, rf, @@ -480,7 +458,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -504,7 +482,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { w, h, )?; - gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); // Strategy A: expand RGB → RGBA (constant α = 0xFF). if let Some(buf) = rgba.as_deref_mut() { @@ -761,7 +739,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // rgb_f16: no source α — use the no-α kernel (lossless scatter). 
let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -774,7 +752,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { height: h, channels: 4, })?; - gbrapf16_to_rgba_f16_row::( + gbrapf16_to_rgba_f16_row::( g_in, b_in, r_in, @@ -821,14 +799,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { // gbrapf32_to_rgba_f32_row with widened source α (lossless). let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_f32_row::( + gbrapf32_to_rgba_f32_row::( gf, bf, rf, @@ -842,14 +820,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { // gbrapf32_to_rgba_u16_row with widened source α. 
let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_u16_row::( + gbrapf32_to_rgba_u16_row::( gf, bf, rf, @@ -861,7 +839,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(buf) = self.luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( gf, bf, rf, @@ -874,7 +852,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( gf, bf, rf, @@ -887,7 +865,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( gf, bf, rf, @@ -918,7 +896,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; // Write opaque RGB → RGBA (α = 0xFF), then overwrite α from source. - gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); // Scatter f16 α → u8 slot 3: widen + clamp + scale. widen_and_scatter_f16_alpha_to_u8(a_in, rgba_row, w); return Ok(()); @@ -944,7 +922,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { w, h, )?; - gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from source. 
if let Some(buf) = rgba.as_deref_mut() { diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index 7b979049..361eee76 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -53,36 +53,6 @@ use crate::{ const GBR_FLOAT_LUMA_MATRIX: ColorMatrix = ColorMatrix::Bt709; const GBR_FLOAT_FULL_RANGE: bool = true; -/// `BE` value that makes the `gbrpf32_to_*` / `gbrapf32_to_*` row dispatchers -/// treat their input as **host-native** (a no-op byte-swap). -/// -/// [`crate::frame::Gbrpf32Frame`] / [`crate::frame::Gbrapf32Frame`] expose -/// `&[f32]` plane rows in **host-native** layout — the API contract is that -/// the caller hands us already-decoded floats. The kernel `BE` parameter, -/// however, names the **encoded** byte order (so `BE = false` means "decode -/// LE-encoded bytes" via `u32::from_le`). On a LE host the host-native layout -/// is LE, so `BE = false` is correct; on a BE host the host-native layout is -/// BE, so we must request `BE = true` to make `u32::from_be` no-op the swap. -/// Without this routing the loaders would byte-swap an already-decoded host- -/// native `f32` on BE hosts, corrupting every output path (codex PR #84 -/// Finding 2). -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` and the `Rgbf32` sinker fix in `dcf40a3`. Same truth table, -/// different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -/// -/// The α-plane scatter (Strategy A+ / standalone-RGBA) consumes the host- -/// native `&[f32]` α plane via `copy_alpha_plane_f32_to_u8`, which is endian- -/// agnostic — there's no BE branching needed for the α path because it does -/// not byte-load through `from_le`/`from_be`. 
Mix-mode corruption (LE-decoded -/// RGB + host-native α) is therefore eliminated by routing the RGB chain via -/// `HOST_NATIVE_BE`. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Gbrpf32 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf32> { @@ -321,7 +291,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -333,7 +303,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f32_row::( + gbrpf32_to_rgba_f32_row::( g_in, b_in, r_in, @@ -348,7 +318,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -360,7 +330,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f16_row::( + gbrpf32_to_rgba_f16_row::( g_in, b_in, r_in, @@ -375,12 +345,12 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_u16_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + 
gbrpf32_to_rgba_u16_row::(g_in, b_in, r_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path (no RGB / luma / HSV needed) ------- @@ -395,7 +365,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf32_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -422,10 +392,10 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { w, h, )?; - gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( g_in, b_in, r_in, @@ -438,7 +408,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( g_in, b_in, r_in, @@ -451,7 +421,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( g_in, b_in, r_in, @@ -721,7 +691,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -733,7 +703,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f32_row::( + gbrapf32_to_rgba_f32_row::( g_in, b_in, r_in, @@ -749,7 +719,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut 
buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -761,7 +731,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f16_row::( + gbrapf32_to_rgba_f16_row::( g_in, b_in, r_in, @@ -777,14 +747,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGBA path (direct — source α clamped + scaled) ------------- if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_u16_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_u16_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path ------------------------------------ @@ -799,7 +769,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); return Ok(()); } @@ -826,10 +796,10 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { w, h, )?; - gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( g_in, b_in, r_in, @@ -842,7 +812,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + 
gbrpf32_to_luma_u16_row::( g_in, b_in, r_in, @@ -855,7 +825,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( g_in, b_in, r_in, From da794cedfa61508dcc1d85fefffdb76b140665e7 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 19:46:55 +1200 Subject: [PATCH 03/10] test(sinker): replace wrong-contract f32/f16 sinker tests with LE-encoded regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the host-native-contract tests that PR #83 + PR #84 added — they were typed against an incorrect host-native-f32/f16 reading of the Frame contract and would mask the BE-host corruption introduced by the now- reverted `::` sinker routings: - rgbf32_kernel_host_native_be_matches_false_on_le_host (deleted) - rgbf32_sinker_host_native_contract_lossless_passthrough (deleted) - rgbf16_kernel_host_native_be_matches_false_on_le_host (deleted) - rgbf16_sinker_host_native_contract_lossless_passthrough (deleted) - gbrpf32_kernel_host_native_be_matches_false_on_le_host (deleted) - gbrpf32_sinker_host_native_contract_lossless_passthrough (deleted) - gbrpf16_kernel_host_native_be_matches_false_on_le_host (deleted) - gbrpf16_sinker_host_native_contract_lossless_passthrough (deleted) - gbrapf32_sinker_host_native_contract_lossless_passthrough_with_alpha (deleted) - gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha (deleted) Adds LE-encoded byte contract regressions following PR #85's `52f8191` pattern (`grayf32_sinker_le_encoded_frame_decodes_correctly`): - rgbf32_sinker_le_encoded_frame_decodes_correctly (new) - rgbf16_sinker_le_encoded_frame_decodes_correctly (new) - gbrpf32_sinker_le_encoded_frame_decodes_correctly (new) - gbrapf32_sinker_le_encoded_frame_decodes_correctly (new) - gbrpf16_sinker_le_encoded_frame_decodes_correctly (new) - gbrapf16_sinker_le_encoded_frame_decodes_correctly 
(new) Each test constructs an explicitly LE-encoded plane via `f32::from_bits(intended.to_bits().to_le())` (or the f16 analogue), builds the Frame, runs the sinker's lossless pass-through, and asserts the output equals the host-native intended values. Vacuous on LE hosts (where `to_le` is identity) but on a BE host any regression that drops the `::` kernel routing would fail fast. Also picks up two trivial pre-existing trailing-blank-line cleanups in `src/row/arch/{neon,x86_sse41}/endian.rs` produced by `cargo fmt` while running it across the modified files. Verification (aarch64-apple-darwin, x86_64-apple-darwin, wasm32): cargo test --target aarch64-apple-darwin --lib # 2242 ok cargo build --target x86_64-apple-darwin --tests # ok RUSTFLAGS="-C target-feature=+simd128" \ cargo build --target wasm32-unknown-unknown --tests # ok cargo build --no-default-features # ok cargo fmt --check # ok cargo clippy --all-targets --all-features -- -D warnings # ok Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/planar_gbr_f16.rs | 39 +- src/sinker/mixed/planar_gbr_float.rs | 38 +- src/sinker/mixed/tests/packed_rgb_f16.rs | 133 ++---- src/sinker/mixed/tests/packed_rgb_float.rs | 146 ++----- src/sinker/mixed/tests/planar_gbr_float.rs | 454 ++++++++------------- 5 files changed, 225 insertions(+), 585 deletions(-) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 6a0a351e..150ef729 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -341,14 +341,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { height: h, channels: 4, })?; - gbrpf16_to_rgba_f16_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf16_to_rgba_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- Paths that require widening f16 → f32 --------------------------- @@ -752,15 +745,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { height: h, channels: 4, })?; - 
gbrapf16_to_rgba_f16_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf16_to_rgba_f16_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- Paths that require widening f16 → f32 --------------------------- @@ -806,15 +791,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // gbrapf32_to_rgba_f32_row with widened source α (lossless). let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_f32_row::( - gf, - bf, - rf, - af, - &mut buf[start..end], - n, - use_simd, - ); + gbrapf32_to_rgba_f32_row::(gf, bf, rf, af, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgb_u16.as_deref_mut() { @@ -827,15 +804,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // gbrapf32_to_rgba_u16_row with widened source α. let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_u16_row::( - gf, - bf, - rf, - af, - &mut buf[start..end], - n, - use_simd, - ); + gbrapf32_to_rgba_u16_row::(gf, bf, rf, af, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.luma.as_deref_mut() { diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index 361eee76..cb71de5f 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -303,14 +303,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f32_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf32_to_rgba_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -330,14 +323,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f16_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf32_to_rgba_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB / RGBA path (direct float → 
u16, no staging) ----------- @@ -703,15 +689,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f32_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf32_to_rgba_f32_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -731,15 +709,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f16_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf32_to_rgba_f16_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB path (direct, no staging) ------------------------------ diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 1cd09a1f..0f227c14 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -311,122 +311,36 @@ fn rgbf16_simd_matches_scalar_with_random_input() { assert_eq!(rgb_f16_simd, pix, "RGB f16 output is not lossless"); } -/// Sinker-layer host-native-`f16` regression for the bug fixed alongside -/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf16`] sinker used to -/// hardcode `::` when calling the row dispatchers, telling them to -/// "decode LE-encoded input". Because [`Rgbf16Frame`] hands us a host-native -/// `&[half::f16]` row, that routing was a no-op on LE hosts but corrupted -/// every output path on BE hosts (the `u16` loaders would byte-swap an -/// already-decoded f16 bit-pattern). The fix replaces those `::` with -/// `::`, which is `false` on LE and `true` on BE — a no-op -/// byte-swap on either host. 
+/// LE-encoded byte contract regression: builds an [`Rgbf16Frame`] from a +/// `&[half::f16]` plane explicitly encoded as LE bytes (per the FFmpeg +/// `AV_PIX_FMT_*LE` convention documented on `Rgbf16Frame`), runs it +/// through the sinker's `with_rgb_f16` lossless pass-through, and asserts +/// the output equals the host-native intended values. /// -/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run), -/// `HOST_NATIVE_BE = false` and `::` is byte-for-byte -/// identical to `::`, so this test cannot distinguish the broken vs -/// fixed code on LE. It instead documents the equivalence at the **kernel -/// dispatch** layer — calling each `rgbf16_to_*` dispatcher with both -/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) -/// must produce identical output on the active host. +/// Vacuous on LE hosts (where `to_le` on a `u16` is a no-op so the LE- +/// encoded plane *is* host-native), but on a BE host this would fail fast +/// for any regression that drops the `::` kernel routing — the +/// kernel must apply `u16::from_le` to recover host-native f16 bit-patterns +/// from the LE-encoded bytes. /// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the -/// equality `::` ≡ `::` is _false_ — `::` -/// decodes the host-native fixture as if it were LE-encoded (byte-swap), -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. The dispatch-equivalence claim is specifically -/// about the LE host-routing pattern; the BE-host correctness of the routing -/// change is verified instead by -/// [`rgbf16_sinker_host_native_contract_lossless_passthrough`] and the -/// row-kernel BE parity tests in `src/row/arch/*/tests/`. +/// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. 
#[test] -#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn rgbf16_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - rgbf16_to_rgb_f16_row, rgbf16_to_rgb_f32_row, rgbf16_to_rgb_row, rgbf16_to_rgb_u16_row, - rgbf16_to_rgba_row, rgbf16_to_rgba_u16_row, - }; - - // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. - // Compute it locally so the test asserts the same condition without taking - // a dependency on a private const. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33 covers SIMD main loop + scalar tail across every backend. - let w = 33usize; - let f32_inputs = [0.0f32, 0.5, 1.0, 1.75, -0.25]; - let pix: std::vec::Vec = (0..w * 3) - .map(|i| half::f16::from_f32(f32_inputs[i % f32_inputs.len()])) - .collect(); - - // u8 RGB. - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - rgbf16_to_rgb_row::(&pix, &mut rgb_false, w, true); - rgbf16_to_rgb_row::(&pix, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); - - // u8 RGBA. - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - rgbf16_to_rgba_row::(&pix, &mut rgba_false, w, true); - rgbf16_to_rgba_row::(&pix, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); - - // u16 RGB. - let mut rgb_u16_false = std::vec![0u16; w * 3]; - let mut rgb_u16_host = std::vec![0u16; w * 3]; - rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); - rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); - assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); - - // u16 RGBA. 
- let mut rgba_u16_false = std::vec![0u16; w * 4]; - let mut rgba_u16_host = std::vec![0u16; w * 4]; - rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); - rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); - assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); - - // f16 lossless pass-through. - let mut f16_false = std::vec![half::f16::ZERO; w * 3]; - let mut f16_host = std::vec![half::f16::ZERO; w * 3]; - rgbf16_to_rgb_f16_row::(&pix, &mut f16_false, w, true); - rgbf16_to_rgb_f16_row::(&pix, &mut f16_host, w, true); - assert_eq!(f16_false, f16_host, "f16 RGB diverges"); - if !HOST_NATIVE_BE { - assert_eq!( - f16_host, pix, - "f16 lossless pass-through corrupted on LE host" - ); - } - - // f32 lossless widen. - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - rgbf16_to_rgb_f32_row::(&pix, &mut f32_false, w, true); - rgbf16_to_rgb_f32_row::(&pix, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 widen diverges"); -} - -/// End-to-end sinker contract test: feeding host-native `half::f16` through -/// [`MixedSinker`] must round-trip the f16 input bit-exact via -/// `with_rgb_f16` on every host. Documents the public-API contract that the -/// [`HOST_NATIVE_BE`] routing fix preserves. Pairs with the kernel-level -/// test above; together they cover both the dispatch boundary and the public -/// sinker boundary. -#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf16_sinker_host_native_contract_lossless_passthrough() { +fn rgbf16_sinker_le_encoded_frame_decodes_correctly() { let vals_f32 = [0.5f32, 1.5, -0.25, 100.0]; - let pix: std::vec::Vec = (0..16 * 4 * 3) + let intended: Vec = (0..16 * 4 * 3) .map(|i| half::f16::from_f32(vals_f32[i % vals_f32.len()])) .collect(); + // Encode the plane as LE bytes reinterpreted as f16 (the documented + // `*LE` Frame contract). On LE host: identity. 
On BE host: byte-swapped + // bit-patterns the kernel must `from_le` back to host-native. + let pix: Vec = intended + .iter() + .map(|&v| half::f16::from_bits(v.to_bits().to_le())) + .collect(); let src = Rgbf16Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); let mut rgb_f16_out = std::vec![half::f16::ZERO; 16 * 4 * 3]; @@ -435,7 +349,8 @@ fn rgbf16_sinker_host_native_contract_lossless_passthrough() { .unwrap(); rgbf16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); - // Bit-exact pass-through on every host — broken `::` routing - // would byte-swap on a BE host; the fixed routing keeps the f16 intact. - assert_eq!(rgb_f16_out, pix, "Rgbf16 sinker f16 pass-through corrupted"); + assert_eq!( + rgb_f16_out, intended, + "Rgbf16 sinker LE-encoded plane decoded incorrectly" + ); } diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs index fd4df2da..38f23f28 100644 --- a/src/sinker/mixed/tests/packed_rgb_float.rs +++ b/src/sinker/mixed/tests/packed_rgb_float.rs @@ -246,130 +246,44 @@ fn rgbf32_simd_matches_scalar_with_random_input() { assert_eq!(rgb_f32_simd, pix, "RGB f32 output is not lossless"); } -/// Sinker-layer host-native-`f32` regression for the bug fixed alongside -/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf32`] sinker used to -/// hardcode `::` when calling the row dispatchers, telling them to -/// "decode LE-encoded input". Because [`Rgbf32Frame`] hands us a host-native -/// `&[f32]` row, that routing was a no-op on LE hosts but corrupted every -/// output path on BE hosts (the loaders would byte-swap an already-decoded -/// f32). The fix replaces those `::` with `::`, which -/// is `false` on LE and `true` on BE — a no-op byte-swap on either host. 
+/// LE-encoded byte contract regression: builds an [`Rgbf32Frame`] from a +/// `&[f32]` plane explicitly encoded as LE bytes (per the FFmpeg +/// `AV_PIX_FMT_*LE` convention documented on `Rgbf32Frame`), runs it +/// through the sinker's `with_rgb_f32` lossless pass-through, and asserts +/// the output equals the host-native intended values. /// -/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run), -/// `HOST_NATIVE_BE = false` and `::` is byte-for-byte -/// identical to `::`, so this test cannot distinguish the broken vs -/// fixed code on LE. It instead documents the equivalence at the **kernel -/// dispatch** layer — calling each `rgbf32_to_*` dispatcher with both -/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) -/// must produce identical output on the active host. +/// Vacuous on LE hosts (where `f32::to_le_bytes` is a no-op so the LE- +/// encoded plane *is* host-native), but on a BE host this would fail fast +/// for any regression that drops the `::` kernel routing — the +/// kernel must apply `u32::from_le` to recover host-native f32 from the +/// LE-encoded bytes; if it skipped the swap (e.g. `::` on +/// BE), the output would be byte-swapped relative to `intended`. /// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the -/// equality `::` ≡ `::` is _false_ — `::` -/// decodes the host-native fixture as if it were LE-encoded (byte-swap), -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. The dispatch-equivalence claim is specifically -/// about the LE host-routing pattern; the BE-host correctness of the routing -/// change is verified instead by -/// [`rgbf32_sinker_host_native_contract_lossless_passthrough`] and the -/// row-kernel BE parity tests in `src/row/arch/*/tests/`. +/// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. 
#[test] -#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn rgbf32_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - rgbf32_to_rgb_f32_row, rgbf32_to_rgb_row, rgbf32_to_rgb_u16_row, rgbf32_to_rgba_row, - rgbf32_to_rgba_u16_row, - }; - - // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. - // Compute it locally so the test asserts the same condition without taking - // a dependency on a private const. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33 covers SIMD main loop + scalar tail across every backend. - let w = 33usize; - let mut pix = std::vec![0.0f32; w * 3]; - for (i, v) in pix.iter_mut().enumerate() { - // Mix in-range, HDR, and negative values to exercise every clamp branch. - *v = match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }; - } - - // u8 RGB. - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - rgbf32_to_rgb_row::(&pix, &mut rgb_false, w, true); - rgbf32_to_rgb_row::(&pix, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); - - // u8 RGBA. - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - rgbf32_to_rgba_row::(&pix, &mut rgba_false, w, true); - rgbf32_to_rgba_row::(&pix, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); - - // u16 RGB. - let mut rgb_u16_false = std::vec![0u16; w * 3]; - let mut rgb_u16_host = std::vec![0u16; w * 3]; - rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); - rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); - assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); - - // u16 RGBA. 
- let mut rgba_u16_false = std::vec![0u16; w * 4]; - let mut rgba_u16_host = std::vec![0u16; w * 4]; - rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); - rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); - assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); - - // f32 lossless pass-through. - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - rgbf32_to_rgb_f32_row::(&pix, &mut f32_false, w, true); - rgbf32_to_rgb_f32_row::(&pix, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 RGB diverges"); - // And on the host (LE on every CI runner) both must equal `pix` bit-exact. - if !HOST_NATIVE_BE { - assert_eq!( - f32_host, pix, - "f32 lossless pass-through corrupted on LE host" - ); - } -} - -/// End-to-end sinker contract test: feeding host-native `f32` through -/// [`MixedSinker`] must produce the same output every other sinker -/// would expect from a host-native source — specifically, `with_rgb_f32` -/// must be bit-exact identical to the input on every host. Documents the -/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. -/// Pairs with the kernel-level test above; together they cover both the -/// dispatch boundary and the public sinker boundary. -#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf32_sinker_host_native_contract_lossless_passthrough() { +fn rgbf32_sinker_le_encoded_frame_decodes_correctly() { // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. - let mut pix = std::vec![0.0f32; 16 * 4 * 3]; - for (i, v) in pix.iter_mut().enumerate() { - *v = match i % 4 { + let intended: Vec = (0..16 * 4 * 3) + .map(|i| match i % 4 { 0 => 0.5, 1 => 1.5, 2 => -0.25, _ => 100.0, - }; - } + }) + .collect(); + // Construct the plane as LE-encoded bytes reinterpreted as f32 (the + // documented `*LE` Frame contract). 
On LE host this is identity; on BE + // host the bit-pattern is byte-swapped so the kernel must `from_le` it + // back to host-native. + let pix: Vec = intended + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); let src = Rgbf32Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); let mut rgb_f32_out = std::vec![0.0f32; 16 * 4 * 3]; @@ -378,8 +292,10 @@ fn rgbf32_sinker_host_native_contract_lossless_passthrough() { .unwrap(); rgbf32_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); - // Bit-exact pass-through on every host. On the buggy `::` routing - // a BE host would see byte-swapped output here; on the fixed routing the - // assertion holds on both LE and BE. - assert_eq!(rgb_f32_out, pix, "Rgbf32 sinker f32 pass-through corrupted"); + // Output must be host-native intended values. On a BE host with a + // regressed `::` routing this would be byte-swapped. + assert_eq!( + rgb_f32_out, intended, + "Rgbf32 sinker LE-encoded plane decoded incorrectly" + ); } diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index e6420e59..0676c34e 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -862,291 +862,172 @@ fn gbrapf32_rgba_f16_strategy_a_plus_matches_independent_kernel() { ); } -// ---- HOST_NATIVE_BE routing parity (codex PR #84 Findings 1-3) ------------- +// ---- LE-encoded byte contract regressions (post-#83/#84/#85 audit) -------- // -// LE-host routing-equivalence and host-native sinker-contract tests for the -// `Gbrpf32` / `Gbrapf32` / `Gbrpf16` / `Gbrapf16` sinkers. Mirrors the -// `Rgbf32` / `Rgbf16` sinker tests added for PR #83's `dcf40a3` (sinker -// HOST_NATIVE_BE routing) and `c3a6478` (dispatch f16-widen HOST_NATIVE_BE -// routing). +// Each of the four float planar GBR Frame types is documented as +// LE-encoded bytes reinterpreted as `f32` / `half::f16` (FFmpeg `*LE` +// pixel-format convention). 
The sinker row-kernel dispatch must apply +// `u32::from_le` / `u16::from_le` (kernel `BE = false`) to recover host- +// native arithmetic from those bytes. These tests build a plane explicitly +// from LE-encoded bit patterns (`f32::from_bits(intended.to_bits().to_le())` +// and the f16 analogue) and assert the lossless pass-through output equals +// the host-native intended values. // -// On a LE host `HOST_NATIVE_BE = false`, so the kernel-level test below is -// a routing sanity check (proving the dispatcher / sinker substitute the -// correct `BE` template parameter); BE-host correctness of the routing is -// verified by the existing row-kernel BE parity tests in -// `src/row/arch/*/tests/` and by the contract tests below (which assert -// host-native pass-through end-to-end on every host). - -/// Kernel-level test: on a LE host, `gbrpf32_to_*::` and -/// `gbrpf32_to_*::` must produce byte-identical output for -/// every Tier 10 float planar GBR dispatcher across every output type -/// (u8 RGB / u8 RGBA / u16 RGB / u16 RGBA / f32 lossless). Width 33 covers -/// SIMD main loop + scalar tail across every backend; width 5 covers tail- -/// only paths. -/// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host -/// `::` decodes the host-native fixture as LE-encoded (byte-swap) -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. This sinker-routing-equivalence claim is -/// specifically about the LE host pattern; BE-host correctness of the -/// routing change is verified by the contract tests below and the row- -/// kernel BE parity tests in `src/row/arch/*/tests/`. 
-#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf32_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - gbrpf32_to_rgb_f32_row, gbrpf32_to_rgb_row, gbrpf32_to_rgb_u16_row, gbrpf32_to_rgba_row, - gbrpf32_to_rgba_u16_row, - }; - - // Sinker-layer `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`; on - // the LE-host gate this evaluates to `false`. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33: SIMD main loop + scalar tail. Width 5: tail-only path. Run both - // to cover SIMD-tail-aware backends. - for w in [5usize, 7usize, 33usize] { - let mut gp = std::vec![0.0f32; w]; - let mut bp = std::vec![0.0f32; w]; - let mut rp = std::vec![0.0f32; w]; - for (i, (g, (b, r))) in gp - .iter_mut() - .zip(bp.iter_mut().zip(rp.iter_mut())) - .enumerate() - { - *g = match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }; - *b = match i % 5 { - 0 => 0.25, - 1 => 0.75, - 2 => 1.5, - 3 => 0.0, - _ => -0.5, - }; - *r = match i % 5 { - 0 => 1.0, - 1 => 0.5, - 2 => 0.0, - 3 => -0.25, - _ => 1.25, - }; - } - - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - gbrpf32_to_rgb_row::(&gp, &bp, &rp, &mut rgb_false, w, true); - gbrpf32_to_rgb_row::(&gp, &bp, &rp, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges (w = {w})"); - - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - gbrpf32_to_rgba_row::(&gp, &bp, &rp, &mut rgba_false, w, true); - gbrpf32_to_rgba_row::(&gp, &bp, &rp, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges (w = {w})"); - - let mut u16_false = std::vec![0u16; w * 3]; - let mut u16_host = std::vec![0u16; w * 3]; - gbrpf32_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_false, w, true); - gbrpf32_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_host, w, true); - 
assert_eq!(u16_false, u16_host, "u16 RGB diverges (w = {w})"); - - let mut u16a_false = std::vec![0u16; w * 4]; - let mut u16a_host = std::vec![0u16; w * 4]; - gbrpf32_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_false, w, true); - gbrpf32_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_host, w, true); - assert_eq!(u16a_false, u16a_host, "u16 RGBA diverges (w = {w})"); - - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - gbrpf32_to_rgb_f32_row::(&gp, &bp, &rp, &mut f32_false, w, true); - gbrpf32_to_rgb_f32_row::(&gp, &bp, &rp, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 RGB diverges (w = {w})"); - } -} +// Vacuous on LE host (where `to_le` is identity so the LE-encoded plane is +// host-native already), but on a BE host any regression that drops the +// `::` routing would be caught here — kernel without `from_le` would +// emit byte-swapped bit-patterns, failing the bit-exact assertion below. +// +// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. -/// Sinker contract test: feeding host-native `f32` planes through -/// [`MixedSinker`] must produce the same output every other sinker -/// would expect from a host-native source — specifically, `with_rgb_f32` -/// must be bit-exact identical to the source on every host. Documents the -/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. -/// Pairs with the kernel-level test above; together they cover both the -/// dispatch boundary and the public sinker boundary. +/// LE-encoded byte contract regression for [`Gbrpf32`]. #[test] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn gbrpf32_sinker_host_native_contract_lossless_passthrough() { +fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. 
- let mut gp = std::vec![0.0f32; w * h]; - let mut bp = std::vec![0.0f32; w * h]; - let mut rp = std::vec![0.0f32; w * h]; - for (i, (g, (b, r))) in gp - .iter_mut() - .zip(bp.iter_mut().zip(rp.iter_mut())) - .enumerate() - { - *g = match i % 4 { + let intended_g: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 0.5, 1 => 1.5, 2 => -0.25, _ => 100.0, - }; - *b = match i % 4 { + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 0.0, 1 => 0.25, 2 => 1.0, _ => f32::INFINITY, - }; - *r = match i % 4 { + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 1.0, 1 => -1.0, 2 => 65505.0, _ => 0.5, - }; - } + }) + .collect(); + // LE-encode each plane (per the documented `*LE` Frame contract). + let gp: Vec = intended_g + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); + let bp: Vec = intended_b + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); + let rp: Vec = intended_r + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); let src = Gbrpf32Frame::try_new( &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, ) .unwrap(); - // rgb_f32 lossless: each pixel `(R, G, B)` packed in source plane order. let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; let mut sink = MixedSinker::::new(w, h) .with_rgb_f32(&mut rgb_f32) .unwrap(); gbrpf32_to(&src, &mut sink).unwrap(); - // The lossless scatter writes `(R, G, B)` per pixel in plane-index order. - // Bit-exact equality on every host. Buggy `::` routing on a BE host - // would byte-swap the output here; the fix keeps it bit-exact. 
for i in 0..(w * h) { - assert_eq!(rgb_f32[i * 3], rp[i], "R mismatch at idx {i}"); - assert_eq!(rgb_f32[i * 3 + 1], gp[i], "G mismatch at idx {i}"); - assert_eq!(rgb_f32[i * 3 + 2], bp[i], "B mismatch at idx {i}"); + assert_eq!( + rgb_f32[i * 3].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgb_f32[i * 3 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgb_f32[i * 3 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); } } -/// Same as [`gbrpf32_kernel_host_native_be_matches_false_on_le_host`] but -/// for the `Gbrpf16` family — covers both `use_simd = false` (dispatch's -/// scalar widen-fallback) and `use_simd = true` (SIMD widen path) at tail -/// widths 5, 7, 33 to exercise every backend's main loop + scalar tail. +/// LE-encoded byte contract regression for [`Gbrapf32`] (lossless RGBA +/// pass-through, including the α plane). #[test] -#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn gbrpf16_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - gbrpf16_to_rgb_row, gbrpf16_to_rgb_u16_row, gbrpf16_to_rgba_row, gbrpf16_to_rgba_u16_row, +fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h).map(|i| 0.1 + (i as f32) * 0.001).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.2 + (i as f32) * 0.002).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.3 + (i as f32) * 0.003).collect(); + let intended_a: Vec = (0..w * h).map(|i| 0.5 + (i as f32) * 0.0005).collect(); + + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - for w in [5usize, 7usize, 33usize] { - let gp: Vec = (0..w) - .map(|i| { - 
half::f16::from_f32(match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }) - }) - .collect(); - let bp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 0.25, - 1 => 0.75, - 2 => 1.5, - 3 => 0.0, - _ => -0.5, - }) - }) - .collect(); - let rp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 1.0, - 1 => 0.5, - 2 => 0.0, - 3 => -0.25, - _ => 1.25, - }) - }) - .collect(); - - // Both `use_simd = false` and `use_simd = true` to cover dispatch's - // scalar widen-fallback and the SIMD widen path on every backend. - for use_simd in [false, true] { - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - gbrpf16_to_rgb_row::(&gp, &bp, &rp, &mut rgb_false, w, use_simd); - gbrpf16_to_rgb_row::(&gp, &bp, &rp, &mut rgb_host, w, use_simd); - assert_eq!( - rgb_false, rgb_host, - "u8 RGB diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - gbrpf16_to_rgba_row::(&gp, &bp, &rp, &mut rgba_false, w, use_simd); - gbrpf16_to_rgba_row::(&gp, &bp, &rp, &mut rgba_host, w, use_simd); - assert_eq!( - rgba_false, rgba_host, - "u8 RGBA diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut u16_false = std::vec![0u16; w * 3]; - let mut u16_host = std::vec![0u16; w * 3]; - gbrpf16_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_false, w, use_simd); - gbrpf16_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_host, w, use_simd); - assert_eq!( - u16_false, u16_host, - "u16 RGB diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut u16a_false = std::vec![0u16; w * 4]; - let mut u16a_host = std::vec![0u16; w * 4]; - gbrpf16_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_false, w, use_simd); - gbrpf16_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_host, w, use_simd); - assert_eq!( - u16a_false, u16a_host, - "u16 RGBA diverges (w = {w}, use_simd = {use_simd})" - ); - } + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, 
w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgba_f32(&mut rgba_f32) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!( + rgba_f32[i * 4].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 3].to_bits(), + intended_a[i].to_bits(), + "A idx {i}" + ); } } -/// Sinker contract: host-native `half::f16` source through [`MixedSinker`] -/// `with_rgb_f16` must round-trip the planes bit-exact on every host. The -/// `::` routing keeps the lossless interleave a no-op in the -/// BE-load layer; the buggy `::` routing on a BE host would byte-swap -/// every f16 element. +/// LE-encoded byte contract regression for [`Gbrpf16`]. 
#[test] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { +fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let gp: Vec = (0..w * h) + let intended_g: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 0.5, @@ -1156,7 +1037,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); - let bp: Vec = (0..w * h) + let intended_b: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 0.0, @@ -1166,7 +1047,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); - let rp: Vec = (0..w * h) + let intended_r: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 1.0, @@ -1176,6 +1057,14 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); let src = Gbrpf16Frame::try_new( &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, @@ -1191,46 +1080,53 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { for i in 0..(w * h) { assert_eq!( rgb_f16[i * 3].to_bits(), - rp[i].to_bits(), - "R mismatch at idx {i}" + intended_r[i].to_bits(), + "R idx {i}" ); assert_eq!( rgb_f16[i * 3 + 1].to_bits(), - gp[i].to_bits(), - "G mismatch at idx {i}" + intended_g[i].to_bits(), + "G idx {i}" ); assert_eq!( rgb_f16[i * 3 + 2].to_bits(), - bp[i].to_bits(), - "B mismatch at idx {i}" + intended_b[i].to_bits(), + "B idx {i}" ); } } -/// Sinker contract: [`MixedSinker`] `with_rgba_f16` must round-trip -/// the source α plane bit-exact alongside the G/B/R planes, on every host. 
-/// Validates Strategy A+ alpha consistency under the `HOST_NATIVE_BE` -/// routing — the previous mix-mode (LE-decoded RGB + host-native α) is gone. +/// LE-encoded byte contract regression for [`Gbrapf16`] (lossless RGBA +/// pass-through, including the α plane). #[test] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha() { +fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let gp: Vec = (0..w * h) + let intended_g: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.1 + (i as f32) * 0.001)) .collect(); - let bp: Vec = (0..w * h) + let intended_b: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.2 + (i as f32) * 0.002)) .collect(); - let rp: Vec = (0..w * h) + let intended_r: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.3 + (i as f32) * 0.003)) .collect(); - let ap: Vec = (0..w * h) + let intended_a: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.5 + (i as f32) * 0.001)) .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); let src = Gbrapf16Frame::try_new( &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, @@ -1244,52 +1140,26 @@ fn gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha() { gbrapf16_to(&src, &mut sink).unwrap(); for i in 0..(w * h) { - assert_eq!(rgba_f16[i * 4].to_bits(), rp[i].to_bits(), "R idx {i}"); - assert_eq!(rgba_f16[i * 4 + 1].to_bits(), gp[i].to_bits(), "G idx {i}"); - assert_eq!(rgba_f16[i * 4 + 2].to_bits(), bp[i].to_bits(), "B idx {i}"); - assert_eq!(rgba_f16[i * 4 + 3].to_bits(), ap[i].to_bits(), "A idx {i}"); - } -} - -/// Sinker contract: [`MixedSinker`] `with_rgba_f32` lossless -/// pass-through plus α — confirms Strategy A+ 
alpha consistency when the -/// f32 RGB chain routes via `HOST_NATIVE_BE`. The α plane is host-native -/// f32, also routed via `HOST_NATIVE_BE`, eliminating any mix-mode. -#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrapf32_sinker_host_native_contract_lossless_passthrough_with_alpha() { - let w = 16usize; - let h = 4usize; - let mut gp = std::vec![0.0f32; w * h]; - let mut bp = std::vec![0.0f32; w * h]; - let mut rp = std::vec![0.0f32; w * h]; - let mut ap = std::vec![0.0f32; w * h]; - for i in 0..(w * h) { - gp[i] = 0.1 + (i as f32) * 0.001; - bp[i] = 0.2 + (i as f32) * 0.002; - rp[i] = 0.3 + (i as f32) * 0.003; - ap[i] = 0.5 + (i as f32) * 0.0005; - } - - let src = Gbrapf32Frame::try_new( - &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, - ) - .unwrap(); - - let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; - let mut sink = MixedSinker::::new(w, h) - .with_rgba_f32(&mut rgba_f32) - .unwrap(); - gbrapf32_to(&src, &mut sink).unwrap(); - - for i in 0..(w * h) { - assert_eq!(rgba_f32[i * 4], rp[i], "R idx {i}"); - assert_eq!(rgba_f32[i * 4 + 1], gp[i], "G idx {i}"); - assert_eq!(rgba_f32[i * 4 + 2], bp[i], "B idx {i}"); - assert_eq!(rgba_f32[i * 4 + 3], ap[i], "A idx {i}"); + assert_eq!( + rgba_f16[i * 4].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 3].to_bits(), + intended_a[i].to_bits(), + "A idx {i}" + ); } } From 78f1c2f8ae0562b41a5635e1cb4014a19dbf69ba Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 20:35:28 +1200 Subject: [PATCH 04/10] fix(sinker): replace LE-host-only widen_f16_to_f32 helper with bit-normalize-first canonical helper MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the BE-host correctness gap flagged in the original body of #92:
the sinker-local `widen_f16_to_f32` in `src/sinker/mixed/planar_gbr_f16.rs`
called `half::f16::to_f32` directly on f16 plane bits, treating the bytes
as host-native. That assumption only holds on a little-endian host with
an LE-encoded source — on a big-endian host the LE-encoded f16 bits would
be byte-swapped from the intended values, and `to_f32` would decode them
as arbitrarily wrong magnitudes / signs / NaNs.

Replaces every call site with the canonical bit-normalize-first helper
`crate::row::scalar::planar_gbr_f16::widen_f16_be_to_host_f32::<false>`
(BE = false because Frame plane bytes are LE-encoded per the contract
documented in Phase 1 of this PR). The shared helper does
`f32 = f16::from_bits(u16::from_le(plane_bits)).to_f32()` — correct on
both LE hosts (where `from_le` is identity) and BE hosts (where it byte-
swaps the LE-encoded plane bits back to host-native f16 bits before the
f16 → f32 conversion). The downstream `gbrpf32_to_*` row kernels are
still invoked with `BE = false` since the widened scratch is now
host-native f32 (its `u32::from_le` is then a no-op on LE host and the
right byte-swap on BE host — same contract as the dispatch f16-widen
fallback and the per-backend SIMD scalar tails already use).

Updated 8 call sites:
- 3 in the `Gbrpf16` `process` widen loop (G, B, R planes).
- 4 in the `Gbrapf16` `process` widen loop (G, B, R, A planes).
- 1 in the `widen_and_scatter_f16_alpha_to_u8` u8 RGBA fast-path helper
  (counted as the eighth distinct call site; the other seven are the two
  widen-loop blocks).

Removed the local `widen_f16_to_f32` helper (and its now-stale
LE-host-only-caveat doc-comment) entirely — no remaining callers.
Adds two LE-encoded byte-contract regression tests covering the widening path (the existing PR #92 tests covered only the lossless f16 pass-through path which doesn't go through the widen helper): - gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly (Gbrpf16 → with_rgb_f32; exercises the 3-plane widen loop) - gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly (Gbrapf16 → with_rgba_f32; exercises the 4-plane widen loop incl. α) Each test constructs LE-encoded `&[half::f16]` planes via `half::f16::from_bits(intended.to_bits().to_le())`, builds a Frame, runs the widening sinker path, and asserts the f32 output equals the host- native expected values. Vacuous on LE hosts (where `to_le` is identity) but on a BE host any regression that drops the bit-normalize-first step in `widen_f16_be_to_host_f32::` would fail fast. Verification (aarch64-apple-darwin, x86_64-apple-darwin, wasm32): cargo test --target aarch64-apple-darwin --lib # 2269 ok cargo build --target x86_64-apple-darwin --tests # 0 warnings RUSTFLAGS="-C target-feature=+simd128" \ cargo build --target wasm32-unknown-unknown --tests # ok cargo build --no-default-features # ok cargo fmt --check # ok cargo clippy --all-targets --all-features -- -D warnings # ok Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/planar_gbr_f16.rs | 54 +++------ src/sinker/mixed/tests/planar_gbr_float.rs | 125 +++++++++++++++++++++ 2 files changed, 142 insertions(+), 37 deletions(-) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 150ef729..94fc963f 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -51,7 +51,8 @@ use crate::{ gbrapf32_to_rgba_u16_row, gbrpf16_to_rgb_f16_row, gbrpf16_to_rgb_row, gbrpf16_to_rgba_f16_row, gbrpf16_to_rgba_row, gbrpf32_to_hsv_row, gbrpf32_to_luma_row, gbrpf32_to_luma_u16_row, gbrpf32_to_rgb_f32_row, gbrpf32_to_rgb_u16_row, gbrpf32_to_rgba_f32_row, - gbrpf32_to_rgba_u16_row, 
scalar::alpha_extract::copy_alpha_plane_f32_to_u8, + gbrpf32_to_rgba_u16_row, + scalar::{alpha_extract::copy_alpha_plane_f32_to_u8, planar_gbr_f16::widen_f16_be_to_host_f32}, }, yuv::{Gbrapf16, Gbrapf16Row, Gbrapf16Sink, Gbrpf16, Gbrpf16Row, Gbrpf16Sink}, }; @@ -64,34 +65,6 @@ const GBR_F16_FULL_RANGE: bool = true; // Chunk size for the inline f16→f32 widening scratch arrays (stack-allocated). const WIDEN_CHUNK: usize = 64; -/// Widen `width` `half::f16` values from `src` into `dst` (f32 elements). -/// -/// The source slice is `&[half::f16]` from a [`Gbrpf16Frame`] / -/// [`Gbrapf16Frame`] plane — i.e. **LE-encoded bytes** reinterpreted as -/// `half::f16` (per the FFmpeg `*LE` contract). On a little-endian host -/// LE bytes _are_ host-native, so `half::f16::to_f32` interprets the bits -/// correctly and the downstream `gbrpf32_to_*` kernel — invoked with -/// `BE = false` — sees host-native f32 (its `u32::from_le` is a no-op on -/// LE host) → correct. -/// -/// On a big-endian host this helper would mis-interpret the LE-encoded -/// bits as host-native (a separate bug from the sinker-layer routing -/// reverted in this PR); the bit-normalising scalar helper -/// [`crate::row::scalar::widen_f16_be_to_host_f32`] is the correct pattern -/// for cross-endian widen-then-convert chains and is what the per-arch -/// SIMD backends use. Wiring this sinker-local helper to that bit- -/// normalising version is left as a future improvement — every CI runner -/// today is LE and exercises the correct path here. 
-/// -/// [`Gbrpf16Frame`]: crate::frame::Gbrpf16Frame -/// [`Gbrapf16Frame`]: crate::frame::Gbrapf16Frame -#[cfg_attr(not(tarpaulin), inline(always))] -fn widen_f16_to_f32(src: &[half::f16], dst: &mut [f32], count: usize) { - for i in 0..count { - dst[i] = src[i].to_f32(); - } -} - // ---- Gbrpf16 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf16> { @@ -365,9 +338,12 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { let mut offset = 0; while offset < w { let n = (w - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&g_in[offset..], &mut gf_chunk, n); - widen_f16_to_f32(&b_in[offset..], &mut bf_chunk, n); - widen_f16_to_f32(&r_in[offset..], &mut rf_chunk, n); + // Bit-normalise LE-encoded f16 plane bits → host-native f32 so the + // downstream `gbrpf32_to_*` kernel (invoked with `BE = false`) sees + // host-native f32 on every host. See the canonical helper's docs. + widen_f16_be_to_host_f32::(g_in, offset, &mut gf_chunk, n); + widen_f16_be_to_host_f32::(b_in, offset, &mut bf_chunk, n); + widen_f16_be_to_host_f32::(r_in, offset, &mut rf_chunk, n); let gf = &gf_chunk[..n]; let bf = &bf_chunk[..n]; let rf = &rf_chunk[..n]; @@ -769,10 +745,12 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { let mut offset = 0; while offset < w { let n = (w - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&g_in[offset..], &mut gf_chunk, n); - widen_f16_to_f32(&b_in[offset..], &mut bf_chunk, n); - widen_f16_to_f32(&r_in[offset..], &mut rf_chunk, n); - widen_f16_to_f32(&a_in[offset..], &mut af_chunk, n); + // Bit-normalise LE-encoded f16 plane bits → host-native f32 (see the + // canonical helper's docs); downstream kernel uses `BE = false`. 
+ widen_f16_be_to_host_f32::(g_in, offset, &mut gf_chunk, n); + widen_f16_be_to_host_f32::(b_in, offset, &mut bf_chunk, n); + widen_f16_be_to_host_f32::(r_in, offset, &mut rf_chunk, n); + widen_f16_be_to_host_f32::(a_in, offset, &mut af_chunk, n); let gf = &gf_chunk[..n]; let bf = &bf_chunk[..n]; let rf = &rf_chunk[..n]; @@ -915,7 +893,9 @@ fn widen_and_scatter_f16_alpha_to_u8(alpha_f16: &[half::f16], rgba_out: &mut [u8 let mut offset = 0; while offset < width { let n = (width - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&alpha_f16[offset..], &mut af_chunk, n); + // Bit-normalise LE-encoded f16 α bits → host-native f32 before clamping + // and scaling to u8 — correct on both LE and BE hosts. + widen_f16_be_to_host_f32::(alpha_f16, offset, &mut af_chunk, n); copy_alpha_plane_f32_to_u8(&af_chunk[..n], &mut rgba_out[offset * 4..], n); offset += n; } diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index 0676c34e..7673e0a2 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -1163,6 +1163,131 @@ fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { } } +/// LE-encoded byte contract regression for [`Gbrpf16`] **widening path** +/// (`with_rgb_f32`). Exercises the f16 → f32 widen step in the sinker — which +/// must bit-normalise LE-encoded f16 plane bits before converting to f32. +/// +/// Vacuous on LE hosts (where `to_le` is identity); on a BE host any +/// regression that drops the bit-normalize-first step in +/// `widen_f16_be_to_host_f32::` would interpret byte-swapped bits as +/// host-native f16 and decode to wildly wrong f32 values. 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.5, + 1 => 0.25, + 2 => 0.0, + _ => 1.0, + }) + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.125, + 1 => 0.75, + 2 => 0.0625, + _ => 0.875, + }) + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.375, + 1 => 0.625, + 2 => 0.9375, + _ => 0.03125, + }) + }) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + + let src = Gbrpf16Frame::try_new( + &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; + let mut sink = MixedSinker::::new(w, h) + .with_rgb_f32(&mut rgb_f32) + .unwrap(); + gbrpf16_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgb_f32[i * 3], intended_r[i].to_f32(), "R idx {i}"); + assert_eq!(rgb_f32[i * 3 + 1], intended_g[i].to_f32(), "G idx {i}"); + assert_eq!(rgb_f32[i * 3 + 2], intended_b[i].to_f32(), "B idx {i}"); + } +} + +/// LE-encoded byte contract regression for [`Gbrapf16`] **widening path** +/// (`with_rgba_f32`, including the α plane). Exercises the four-plane f16 → +/// f32 widen step — same bit-normalise-first contract as the no-α variant. 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.1 + (i as f32) * 0.001)) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.2 + (i as f32) * 0.002)) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.3 + (i as f32) * 0.003)) + .collect(); + let intended_a: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.5 + (i as f32) * 0.001)) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); + + let src = Gbrapf16Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgba_f32(&mut rgba_f32) + .unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba_f32[i * 4], intended_r[i].to_f32(), "R idx {i}"); + assert_eq!(rgba_f32[i * 4 + 1], intended_g[i].to_f32(), "G idx {i}"); + assert_eq!(rgba_f32[i * 4 + 2], intended_b[i].to_f32(), "B idx {i}"); + assert_eq!(rgba_f32[i * 4 + 3], intended_a[i].to_f32(), "A idx {i}"); + } +} + // ---- 32-bit overflow guards ------------------------------------------------ // // Feeding width = usize::MAX / 2 + 1 to a dispatcher must panic with a message From cf2605812c681d800b574c3fe99c682289385d8a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 21:01:31 +1200 Subject: [PATCH 05/10] fix(sinker): post-widen HOST_NATIVE_BE routing + Ya16 alpha endian-aware + frame docs cite LE names MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses three follow-up findings from Codex's review of PR #92. Finding 1 — Post-widen f32 routing on Gbrpf16 / Gbrapf16. The Phase 2 follow-up replaced the local `widen_f16_to_f32` helper with the canonical `widen_f16_be_to_host_f32::`, but the immediate downstream `gbrpf32_to_*::` calls still told the f32 row kernel to apply `from_le` AGAIN — on a BE host this would byte-swap the already-host-native f32 scratch, corrupting output. Introduce a module-scope `HOST_NATIVE_BE = cfg!(target_endian = "big")` constant in `src/sinker/mixed/planar_gbr_f16.rs` and route ALL post-widen `gbrpf32_to_*` / `gbrapf32_to_*` calls through `::` (14 sites total across both Gbrpf16 and Gbrapf16 sinker bodies). The kernel's `from_le` / `from_be` then becomes a no-op on every host since post-widen scratch already matches the host byte order. Distinct from the Phase 2 revert: direct Frame-to-row-kernel calls (`gbrpf16_to_*::` for u8 / f16 outputs) still use `` per the LE-encoded Frame contract — those receive raw LE-encoded `&[half::f16]` plane bytes, so the kernel must apply `from_le`. Post-widen scratch is host-native f32, so it must use `` to keep the swap a no-op everywhere. The module-level comment block makes this distinction explicit. Finding 2 — Ya16 RGB+RGBA combined-path alpha not endian-aware. When `MixedSinker` attached BOTH `with_rgb` and `with_rgba`, the sinker ran `ya16_to_rgb_*::` (correct), expanded the RGB output to RGBA prefix, and called `copy_alpha_ya_u16` / `copy_alpha_ya_u16_to_u8` to overlay the alpha plane from the source — but those helpers read raw `packed[n*2+1]` directly without `u16::from_le`. On a BE host processing the LE-encoded `Ya16Frame`, the host-native u16 read would interpret LE-encoded bytes as host-native (BE), yielding the wrong alpha byte. 
Audit found six alpha-patch helpers with the same pattern across three
formats — Ya16 (`copy_alpha_ya_u16` / `_to_u8`), AYUV64
(`copy_alpha_packed_u16x4_at_0` / `_to_u8_at_0`), and Rgba64 / Bgra64
(`copy_alpha_packed_u16x4_at_3` / `_to_u8_at_3`). All six were made
endian-aware via a `<const BE: bool>` parameter applying
`u16::from_le(packed[..])` (or `from_be` per BE) before scaling /
storing, mirroring the pre-existing `copy_alpha_plane_u16` /
`copy_alpha_plane_u16_to_u8` pattern from PR #81. Updated all 8 sinker
call sites to pass `<false>` (Frame contract is LE).

The two AYUV64 helpers also have SIMD dispatchers — those gained a
`<const BE: bool>` generic and the
`safe_for_simd = !BE && cfg!(target_endian = "little")` gate that
already exists for `copy_alpha_plane_u16_to_u8`, falling back to scalar
in the BE-host or BE-data quadrants. The five SIMD scalar tails were
pinned to `::<false>` matching the precedent from PR #81 / PR #82.

Finding 3 — Public Frame docs cite wrong FFmpeg names.

`Rgbf16Frame` docs cited `AV_PIX_FMT_RGBF16` (target-endian alias —
`RGBF16LE` on LE, `RGBF16BE` on BE). `Rgbf32Frame` docs cited
`AV_PIX_FMT_RGBF32` which doesn't exist as an unsuffixed alias; FFmpeg
has only the alpha-bearing `RGBAF32LE` / `RGBAF32BE` pair. A BE caller
following the unsuffixed contract could pass native-BE bytes into an
LE-decoder.

Updated `src/frame/packed_rgb_f16.rs` to cite `AV_PIX_FMT_RGBF16LE`
explicitly, with a parenthetical noting that the unsuffixed name is a
target-endian alias to avoid future confusion. Updated
`src/frame/packed_rgb_float.rs` to drop the citation of the
non-existent `AV_PIX_FMT_RGBF32` constant entirely and instead document
the contract as "LE-encoded f32 bytes — `colconv` pins to the canonical
FFmpeg `*LE` byte-order convention regardless of host endianness,
exactly as the existing `AV_PIX_FMT_RGBAF32LE` flavour".
Tests added: - `gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly` exercises the Gbrpf16 widen → f32 → u16 chain (with `with_luma_u16` alongside) under the LE-encoded Frame contract — vacuous on LE, catches the double-swap on BE. - `ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded` and the u16 sibling assert that Strategy A+ (combined `with_rgb` + `with_rgba`) produces α bytes byte-identical to the standalone `with_rgba` path on every host. Builds the `&[u16]` plane via `u16::from_ne_bytes(x.to_le_bytes())` so the in-memory bytes spell the LE-encoded contract on both LE and BE hosts. - 8 new BE-parity tests in `src/row/scalar/alpha_extract.rs` for the newly endian-aware AYUV64 / Rgba64 / Ya16 helpers (4 helpers × u8 / u16 variants), all using the `swap_bytes`-fixture pattern that exists for `copy_alpha_plane_u16` so the BE arm runs on every host without needing a real BE runner. Verified: - `cargo test --target aarch64-apple-darwin --lib` — 2278 passed - `cargo build --target x86_64-apple-darwin --tests` — clean - `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests` — clean - `cargo build --no-default-features` — clean - `cargo fmt --check` — clean - `cargo clippy --all-targets --all-features -- -D warnings` — clean - `cargo check --target s390x-unknown-linux-gnu --lib` — clean (BE-host smoke) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/packed_rgb_f16.rs | 20 +- src/frame/packed_rgb_float.rs | 22 +- src/row/arch/neon/alpha_extract.rs | 8 +- src/row/arch/wasm_simd128/alpha_extract.rs | 8 +- src/row/arch/x86_avx2/alpha_extract.rs | 8 +- src/row/arch/x86_avx512/alpha_extract.rs | 8 +- src/row/arch/x86_sse41/alpha_extract.rs | 8 +- src/row/dispatch/alpha_extract.rs | 35 ++-- src/row/scalar/alpha_extract.rs | 224 +++++++++++++++++++-- src/sinker/mixed/ayuv64.rs | 11 +- src/sinker/mixed/gray.rs | 129 +++++++++++- src/sinker/mixed/packed_rgb_16bit.rs | 28 ++- 
src/sinker/mixed/planar_gbr_f16.rs | 60 ++++-- src/sinker/mixed/tests/planar_gbr_float.rs | 109 ++++++++++ 14 files changed, 587 insertions(+), 91 deletions(-) diff --git a/src/frame/packed_rgb_f16.rs b/src/frame/packed_rgb_f16.rs index ecf9e8c6..d1decc2b 100644 --- a/src/frame/packed_rgb_f16.rs +++ b/src/frame/packed_rgb_f16.rs @@ -50,8 +50,7 @@ pub enum Rgbf16FrameError { }, } -/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`, -/// FFmpeg canonical `*LE` convention — see endian note below). +/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16LE`). /// One plane, 3 × `f16` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -70,12 +69,17 @@ pub enum Rgbf16FrameError { /// # Endian contract — **LE-encoded bytes** /// /// The `&[half::f16]` plane is the **LE-encoded byte layout** reinterpreted -/// as `f16`, matching the FFmpeg `*LE` pixel-format convention. On a -/// little-endian host (every CI runner today) LE bytes _are_ host-native, -/// so `&[half::f16]` is also a host-native f16 slice; on a big-endian host -/// the bytes have to be byte-swapped back to host-native before arithmetic. -/// Downstream row kernels handle this byte-swap (or no-op on LE) under the -/// hood — callers do **not** pre-swap. +/// as `f16`, matching the FFmpeg **`AV_PIX_FMT_RGBF16LE`** pixel-format +/// convention. (FFmpeg's unsuffixed `AV_PIX_FMT_RGBF16` is a *target-endian* +/// alias — `RGBF16LE` on a little-endian host, `RGBF16BE` on a big-endian +/// host — so this contract pins the canonical `*LE` byte order regardless +/// of host endianness.) +/// +/// On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so `&[half::f16]` is also a host-native f16 slice; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. 
Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. /// /// Stride is in **f16 elements** (not bytes). Callers holding a byte buffer /// from FFmpeg should cast via `bytemuck::cast_slice` and divide diff --git a/src/frame/packed_rgb_float.rs b/src/frame/packed_rgb_float.rs index 166b0848..f002ecd2 100644 --- a/src/frame/packed_rgb_float.rs +++ b/src/frame/packed_rgb_float.rs @@ -50,8 +50,7 @@ pub enum Rgbf32FrameError { }, } -/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`, -/// FFmpeg canonical `*LE` convention — see endian note below). +/// A validated packed **RGBF32** frame. /// One plane, 3 × `f32` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -69,12 +68,19 @@ pub enum Rgbf32FrameError { /// # Endian contract — **LE-encoded bytes** /// /// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as -/// `f32`, matching the FFmpeg `*LE` pixel-format convention. On a -/// little-endian host (every CI runner today) LE bytes _are_ host-native, -/// so `&[f32]` is also a host-native float slice; on a big-endian host the -/// bytes have to be byte-swapped back to host-native before arithmetic. -/// Downstream row kernels handle this byte-swap (or no-op on LE) under the -/// hood — callers do **not** pre-swap. +/// `f32`. Note that, unlike RGBA float and the f16 variants, FFmpeg does +/// **not** define an `AV_PIX_FMT_RGBF32` constant for non-α 3-channel f32 +/// (only the 4-channel `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` +/// pair exists). `colconv` pins this frame to the canonical FFmpeg `*LE` +/// byte-order convention — i.e. callers should pass LE-encoded bytes +/// regardless of host endianness, exactly as they would for the existing +/// `AV_PIX_FMT_RGBAF32LE` flavour. 
+///
+/// On a little-endian host (every CI runner today) LE bytes _are_
+/// host-native, so `&[f32]` is also a host-native float slice; on a
+/// big-endian host the bytes have to be byte-swapped back to host-native
+/// before arithmetic. Downstream row kernels handle this byte-swap (or
+/// no-op on LE) under the hood — callers do **not** pre-swap.
 ///
 /// Stride is in **f32 elements** (not bytes). Callers holding a byte buffer
 /// from FFmpeg should cast via `bytemuck::cast_slice` and divide
diff --git a/src/row/arch/neon/alpha_extract.rs b/src/row/arch/neon/alpha_extract.rs
index ffb04e6a..5135b3d6 100644
--- a/src/row/arch/neon/alpha_extract.rs
+++ b/src/row/arch/neon/alpha_extract.rs
@@ -116,7 +116,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -154,7 +154,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -357,7 +357,7 @@ mod tests {
             pseudo_random_u8(&mut rgba_simd, 0xFEED);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -375,7 +375,7 @@ mod tests {
             pseudo_random_u16(&mut rgba_simd, 0x1337);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/arch/wasm_simd128/alpha_extract.rs b/src/row/arch/wasm_simd128/alpha_extract.rs
index b999b618..bee7633d 100644
--- a/src/row/arch/wasm_simd128/alpha_extract.rs
+++ b/src/row/arch/wasm_simd128/alpha_extract.rs
@@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -226,7 +226,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -518,7 +518,7 @@ mod tests {
             unsafe {
                 super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w);
             }
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -538,7 +538,7 @@ mod tests {
             unsafe {
                 super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w);
             }
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/arch/x86_avx2/alpha_extract.rs b/src/row/arch/x86_avx2/alpha_extract.rs
index 1ebe97c1..2c58d3e9 100644
--- a/src/row/arch/x86_avx2/alpha_extract.rs
+++ b/src/row/arch/x86_avx2/alpha_extract.rs
@@ -213,7 +213,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -636,7 +636,7 @@ mod tests {
             pseudo_random_u8(&mut rgba_simd, 0xFEED);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -657,7 +657,7 @@ mod tests {
             pseudo_random_u16(&mut rgba_simd, 0x1337);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/arch/x86_avx512/alpha_extract.rs b/src/row/arch/x86_avx512/alpha_extract.rs
index 203e08e3..45743fb5 100644
--- a/src/row/arch/x86_avx512/alpha_extract.rs
+++ b/src/row/arch/x86_avx512/alpha_extract.rs
@@ -206,7 +206,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -604,7 +604,7 @@ mod tests {
             pseudo_random_u8(&mut rgba_simd, 0xFEED);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -627,7 +627,7 @@ mod tests {
             pseudo_random_u16(&mut rgba_simd, 0x1337);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/arch/x86_sse41/alpha_extract.rs b/src/row/arch/x86_sse41/alpha_extract.rs
index d327e299..4b1d800b 100644
--- a/src/row/arch/x86_sse41/alpha_extract.rs
+++ b/src/row/arch/x86_sse41/alpha_extract.rs
@@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -227,7 +227,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -521,7 +521,7 @@ mod tests {
             pseudo_random_u8(&mut rgba_simd, 0xFEED);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -542,7 +542,7 @@ mod tests {
             pseudo_random_u16(&mut rgba_simd, 0x1337);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/dispatch/alpha_extract.rs b/src/row/dispatch/alpha_extract.rs
index 00ecb61e..1e0351c5 100644
--- a/src/row/dispatch/alpha_extract.rs
+++ b/src/row/dispatch/alpha_extract.rs
@@ -95,17 +95,26 @@ pub(crate) fn copy_alpha_packed_u8x4_at_3(
 /// Runtime-dispatched α-extract for AYUV64 → u8 RGBA: gather α from
 /// `packed[0 + 4*n]` (u16) into `rgba_out[3 + 4*n]` (u8) via `>> 8`.
 ///
-/// Selects the highest available SIMD backend; falls back to scalar.
-/// When `use_simd` is `false`, calls scalar directly.
+/// `BE` selects the source `packed` plane byte order (`false` = LE on
+/// disk/wire — matching the LE-encoded `Ayuv64Frame` contract;
+/// `true` = BE). Like [`copy_alpha_plane_u16_to_u8`], the existing SIMD
+/// helpers use host-native u16 loads with no `from_le` / `from_be`
+/// normalisation, so SIMD is only correct on LE host processing LE
+/// source. The dispatcher computes
+/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and falls
+/// back to the target-endian-aware scalar in every other quadrant.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
+pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0<const BE: bool>(
     packed: &[u16],
     rgba_out: &mut [u8],
     width: usize,
     use_simd: bool,
 ) {
-    if !use_simd {
-        return scalar::copy_alpha_packed_u16x4_to_u8_at_0(packed, rgba_out, width);
+    // SIMD α-extract helpers use host-native u16 loads. Force scalar in
+    // any quadrant where source byte order doesn't match host byte order.
+    let safe_for_simd = !BE && cfg!(target_endian = "little");
+    if !safe_for_simd || !use_simd {
+        return scalar::copy_alpha_packed_u16x4_to_u8_at_0::<BE>(packed, rgba_out, width);
     }
     cfg_select! {
         target_arch = "aarch64" => {
@@ -141,7 +150,7 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
         },
         _ => {}
     }
-    scalar::copy_alpha_packed_u16x4_to_u8_at_0(packed, rgba_out, width);
+    scalar::copy_alpha_packed_u16x4_to_u8_at_0::<BE>(packed, rgba_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -152,17 +161,19 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
 /// `packed[0 + 4*n]` (u16) into `rgba_out[3 + 4*n]` (u16). No depth
 /// conversion.
 ///
-/// Selects the highest available SIMD backend; falls back to scalar.
-/// When `use_simd` is `false`, calls scalar directly.
+/// `BE` selects the source `packed` plane byte order. See
+/// [`copy_alpha_packed_u16x4_to_u8_at_0`] for the rationale: SIMD is
+/// only correct on LE host with LE source; scalar is target-endian-aware.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn copy_alpha_packed_u16x4_at_0(
+pub(crate) fn copy_alpha_packed_u16x4_at_0<const BE: bool>(
     packed: &[u16],
     rgba_out: &mut [u16],
     width: usize,
     use_simd: bool,
 ) {
-    if !use_simd {
-        return scalar::copy_alpha_packed_u16x4_at_0(packed, rgba_out, width);
+    let safe_for_simd = !BE && cfg!(target_endian = "little");
+    if !safe_for_simd || !use_simd {
+        return scalar::copy_alpha_packed_u16x4_at_0::<BE>(packed, rgba_out, width);
     }
     cfg_select! {
         target_arch = "aarch64" => {
@@ -198,7 +209,7 @@ pub(crate) fn copy_alpha_packed_u16x4_at_0(
         },
         _ => {}
     }
-    scalar::copy_alpha_packed_u16x4_at_0(packed, rgba_out, width);
+    scalar::copy_alpha_packed_u16x4_at_0::<BE>(packed, rgba_out, width);
 }
 
 // ---------------------------------------------------------------------------
diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs
index 6c77346a..bbf5922b 100644
--- a/src/row/scalar/alpha_extract.rs
+++ b/src/row/scalar/alpha_extract.rs
@@ -27,7 +27,16 @@ pub(crate) fn copy_alpha_packed_u8x4_at_3(packed: &[u8], rgba_out: &mut [u8], wi
 /// into `rgba_out[3 + 4*n]` (u8 element) with depth-conv `>> 8`.
 ///
 /// AYUV64 layout per pixel: `[A(16), Y(16), U(16), V(16)]` — α is at slot 0.
-pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane
+/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_AYUV64LE` per the Frame
+/// contract; `true` = BE on disk/wire). Each raw u16 is normalised to
+/// host-native order via `u16::from_le` / `u16::from_be` before the
+/// `>> 8` depth conversion. On a host whose endianness matches the
+/// source the conversion compiles to a no-op; otherwise it is a
+/// `swap_bytes`. Without this a BE host (e.g., s390x) processing the
+/// LE-encoded Frame would emit a byte-reversed α byte.
+pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0<const BE: bool>(
     packed: &[u16],
     rgba_out: &mut [u8],
     width: usize,
@@ -35,17 +44,34 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
     debug_assert!(packed.len() >= width * 4, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = (packed[n * 4] >> 8) as u8;
+        let raw = if BE {
+            u16::from_be(packed[n * 4])
+        } else {
+            u16::from_le(packed[n * 4])
+        };
+        rgba_out[n * 4 + 3] = (raw >> 8) as u8;
     }
 }
 
 /// AYUV64 → u16 RGBA: gather α from `packed[0 + 4*n]` (u16) into
 /// `rgba_out[3 + 4*n]` (u16). No depth conversion.
-pub(crate) fn copy_alpha_packed_u16x4_at_0(packed: &[u16], rgba_out: &mut [u16], width: usize) {
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane.
+/// See [`copy_alpha_packed_u16x4_to_u8_at_0`] for the full rationale.
+pub(crate) fn copy_alpha_packed_u16x4_at_0<const BE: bool>(
+    packed: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(packed.len() >= width * 4, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = packed[n * 4];
+        let raw = if BE {
+            u16::from_be(packed[n * 4])
+        } else {
+            u16::from_le(packed[n * 4])
+        };
+        rgba_out[n * 4 + 3] = raw;
     }
 }
 
@@ -58,8 +84,15 @@ pub(crate) fn copy_alpha_packed_u16x4_at_0(packed: &[u16], rgba_out: &mut [u16],
 /// Used in Strategy A+: after `expand_rgb_to_rgba_row` fills the RGBA buffer
 /// with a forced-opaque alpha, this helper overwrites only the α slot with the
 /// real source alpha, depth-converted to u8.
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane
+/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_RGBA64LE` /
+/// `AV_PIX_FMT_BGRA64LE` per the Frame contract; `true` = BE). Each raw
+/// u16 is normalised to host-native order via `u16::from_le` /
+/// `u16::from_be` before the `>> 8` depth conversion. Without this a BE
+/// host processing the LE-encoded Frame would emit a byte-reversed α byte.
 #[allow(dead_code)] // wired in sinker Task 10
-pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3(
+pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3<const BE: bool>(
     packed: &[u16],
     rgba_out: &mut [u8],
     width: usize,
@@ -67,7 +100,12 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3(
     debug_assert!(packed.len() >= width * 4, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = (packed[n * 4 + 3] >> 8) as u8;
+        let raw = if BE {
+            u16::from_be(packed[n * 4 + 3])
+        } else {
+            u16::from_le(packed[n * 4 + 3])
+        };
+        rgba_out[n * 4 + 3] = (raw >> 8) as u8;
     }
 }
 
@@ -77,12 +115,24 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3(
 /// Used in Strategy A+: after `expand_rgb_u16_to_rgba_u16_row` fills the
 /// RGBA buffer, this helper overwrites only the α slot with the real source
 /// alpha at native 16-bit depth.
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane.
+/// See [`copy_alpha_packed_u16x4_to_u8_at_3`] for the full rationale.
 #[allow(dead_code)] // wired in sinker Task 10
-pub(crate) fn copy_alpha_packed_u16x4_at_3(packed: &[u16], rgba_u16_out: &mut [u16], width: usize) {
+pub(crate) fn copy_alpha_packed_u16x4_at_3<const BE: bool>(
+    packed: &[u16],
+    rgba_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(packed.len() >= width * 4, "packed too short");
     debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
     for n in 0..width {
-        rgba_u16_out[n * 4 + 3] = packed[n * 4 + 3];
+        let raw = if BE {
+            u16::from_be(packed[n * 4 + 3])
+        } else {
+            u16::from_le(packed[n * 4 + 3])
+        };
+        rgba_u16_out[n * 4 + 3] = raw;
     }
 }
 
@@ -195,21 +245,49 @@ pub(crate) fn copy_alpha_ya_u8(packed: &[u8], rgba_out: &mut [u8], width: usize)
 /// into `rgba_out[3 + 4*n]` (u8).
 ///
 /// Ya16 layout per pixel: `[Y(16), A(16)]` — α is at odd u16 offsets (slot 1).
-pub(crate) fn copy_alpha_ya_u16_to_u8(packed: &[u16], rgba_out: &mut [u8], width: usize) {
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane
+/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_YA16LE` per the
+/// `Ya16Frame` contract; `true` = BE). Each raw u16 is normalised to
+/// host-native order via `u16::from_le` / `u16::from_be` before the
+/// `>> 8` depth conversion. Without this a BE host processing the
+/// LE-encoded Frame would emit a byte-reversed α byte.
+pub(crate) fn copy_alpha_ya_u16_to_u8<const BE: bool>(
+    packed: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(packed.len() >= width * 2, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = (packed[n * 2 + 1] >> 8) as u8;
+        let raw = if BE {
+            u16::from_be(packed[n * 2 + 1])
+        } else {
+            u16::from_le(packed[n * 2 + 1])
+        };
+        rgba_out[n * 4 + 3] = (raw >> 8) as u8;
     }
 }
 
 /// Ya16 → u16 RGBA: gather A from `packed[1 + 2*n]` (u16) into
 /// `rgba_out[3 + 4*n]` (u16). No depth conversion.
-pub(crate) fn copy_alpha_ya_u16(packed: &[u16], rgba_out: &mut [u16], width: usize) {
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane.
+/// See [`copy_alpha_ya_u16_to_u8`] for the full rationale.
+pub(crate) fn copy_alpha_ya_u16<const BE: bool>(
+    packed: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(packed.len() >= width * 2, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = packed[n * 2 + 1];
+        let raw = if BE {
+            u16::from_be(packed[n * 2 + 1])
+        } else {
+            u16::from_le(packed[n * 2 + 1])
+        };
+        rgba_out[n * 4 + 3] = raw;
     }
 }
 
@@ -270,21 +348,56 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_to_u8_at_0_depth_converts_correctly() {
         let packed: std::vec::Vec<u16> = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301,];
         let mut rgba = std::vec![1u8; 8];
-        copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba, 2);
+        copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0x12, 1, 1, 1, 0xAB]);
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_at_0_preserves_native_u16() {
         let packed: std::vec::Vec<u16> = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301,];
         let mut rgba = std::vec![1u16; 8];
-        copy_alpha_packed_u16x4_at_0(&packed, &mut rgba, 2);
+        copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0x1234, 1, 1, 1, 0xABCD]);
     }
 
+    /// BE parity for AYUV64 alpha-at-slot-0 → u8 RGBA: byte-swapping the
+    /// packed source and toggling the `BE` flag must yield byte-for-byte
+    /// identical output. Locks down the corruption where a BE host
+    /// processing the LE-encoded Frame contract would emit a byte-reversed α.
+ #[test] + fn copy_alpha_packed_u16x4_to_u8_at_0_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 8]; + let mut rgba_be = std::vec![1u8; 8]; + copy_alpha_packed_u16x4_to_u8_at_0::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_to_u8_at_0::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for AYUV64 alpha-at-slot-0 → u16 RGBA. + #[test] + fn copy_alpha_packed_u16x4_at_0_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 8]; + let mut rgba_be = std::vec![7u16; 8]; + copy_alpha_packed_u16x4_at_0::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_at_0::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + #[test] fn copy_alpha_plane_u8_scatters_into_rgba_alpha_slot() { let alpha = std::vec![50u8, 60, 70, 80]; @@ -423,22 +536,59 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_ya_u16_to_u8_depth_converts_via_high_byte() { // Ya16 packed → u8 RGBA: α >> 8 selects the high byte. 
         let packed: std::vec::Vec<u16> = std::vec![0x1234, 0xABCD, 0x5678, 0xFF00];
         let mut rgba = std::vec![1u8; 8];
-        copy_alpha_ya_u16_to_u8(&packed, &mut rgba, 2);
+        copy_alpha_ya_u16_to_u8::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0xAB, 1, 1, 1, 0xFF]);
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_ya_u16_preserves_native_u16() {
         let packed: std::vec::Vec<u16> = std::vec![0x1234, 0xABCD, 0x5678, 0x9ABC];
         let mut rgba = std::vec![1u16; 8];
-        copy_alpha_ya_u16(&packed, &mut rgba, 2);
+        copy_alpha_ya_u16::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0xABCD, 1, 1, 1, 0x9ABC]);
     }
 
+    /// BE parity for Ya16 → u8 RGBA: byte-swapping the packed source and
+    /// toggling the `BE` flag must yield byte-for-byte identical output.
+    /// Locks down the codex-flagged corruption where a BE host (e.g.
+    /// s390x) processing the LE-encoded `Ya16Frame` would otherwise emit
+    /// a byte-reversed α byte under the combined `with_rgb + with_rgba`
+    /// Strategy A+ path.
+    #[test]
+    fn copy_alpha_ya_u16_to_u8_be_parity_with_swapped_buffer() {
+        let packed_le: std::vec::Vec<u16> = std::vec![0x1234, 0xABCD, 0x5678, 0xFF00, 0x0001, 0x00FF];
+        let packed_be: std::vec::Vec<u16> = packed_le.iter().map(|x| x.swap_bytes()).collect();
+        let mut rgba_le = std::vec![1u8; 12];
+        let mut rgba_be = std::vec![1u8; 12];
+        copy_alpha_ya_u16_to_u8::<false>(&packed_le, &mut rgba_le, 3);
+        copy_alpha_ya_u16_to_u8::<true>(&packed_be, &mut rgba_be, 3);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + byte-swapped buffer must match LE path"
+        );
+    }
+
+    /// BE parity for Ya16 → u16 RGBA (16-bit α path).
+    #[test]
+    fn copy_alpha_ya_u16_be_parity_with_swapped_buffer() {
+        let packed_le: std::vec::Vec<u16> = std::vec![0x1234, 0xABCD, 0x5678, 0x9ABC, 0x0001, 0x00FF];
+        let packed_be: std::vec::Vec<u16> = packed_le.iter().map(|x| x.swap_bytes()).collect();
+        let mut rgba_le = std::vec![7u16; 12];
+        let mut rgba_be = std::vec![7u16; 12];
+        copy_alpha_ya_u16::<false>(&packed_le, &mut rgba_le, 3);
+        copy_alpha_ya_u16::<true>(&packed_be, &mut rgba_be, 3);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + byte-swapped buffer must match LE path"
+        );
+    }
+
     #[test]
     fn copy_alpha_plane_f32_to_u8_clamps_and_scales() {
         // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 128, 255, 255, 0] in slot 3.
@@ -494,39 +644,73 @@ mod tests {
     /// Alpha at slot 3 is depth-converted >> 8 and written to rgba_out[3 + 4*n].
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_to_u8_at_3_narrows_correctly() {
         let packed: std::vec::Vec<u16> = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234];
         let mut rgba = std::vec![1u8; 8];
-        copy_alpha_packed_u16x4_to_u8_at_3(&packed, &mut rgba, 2);
+        copy_alpha_packed_u16x4_to_u8_at_3::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0xAB, 1, 1, 1, 0x12]);
     }
 
     /// Alpha at slot 3 is copied verbatim (no depth conversion).
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_at_3_copies_verbatim() {
         let packed: std::vec::Vec<u16> = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234];
         let mut rgba_u16 = std::vec![1u16; 8];
-        copy_alpha_packed_u16x4_at_3(&packed, &mut rgba_u16, 2);
+        copy_alpha_packed_u16x4_at_3::<false>(&packed, &mut rgba_u16, 2);
         assert_eq!(rgba_u16, std::vec![1, 1, 1, 0xABFF, 1, 1, 1, 0x1234]);
     }
 
     /// Only the alpha slot (index 3) is overwritten; RGB slots [0..3] are untouched.
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_to_u8_at_3_touches_only_alpha_slot() {
         let packed: std::vec::Vec<u16> = std::vec![0, 0, 0, 0xFFFF];
         let mut rgba = std::vec![42u8; 4];
-        copy_alpha_packed_u16x4_to_u8_at_3(&packed, &mut rgba, 1);
+        copy_alpha_packed_u16x4_to_u8_at_3::<false>(&packed, &mut rgba, 1);
         assert_eq!(rgba[..3], [42, 42, 42]);
         assert_eq!(rgba[3], 0xFF);
     }
 
     /// Only the alpha slot (index 3) is overwritten; RGB slots [0..3] are untouched.
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_at_3_touches_only_alpha_slot() {
         let packed: std::vec::Vec<u16> = std::vec![0, 0, 0, 0xBEEF];
         let mut rgba_u16 = std::vec![99u16; 4];
-        copy_alpha_packed_u16x4_at_3(&packed, &mut rgba_u16, 1);
+        copy_alpha_packed_u16x4_at_3::<false>(&packed, &mut rgba_u16, 1);
         assert_eq!(rgba_u16[..3], [99, 99, 99]);
         assert_eq!(rgba_u16[3], 0xBEEF);
     }
+
+    /// BE parity for Rgba64 / Bgra64 alpha-at-slot-3 → u8 RGBA.
+    #[test]
+    fn copy_alpha_packed_u16x4_to_u8_at_3_be_parity_with_swapped_buffer() {
+        let packed_le: std::vec::Vec<u16> = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234];
+        let packed_be: std::vec::Vec<u16> = packed_le.iter().map(|x| x.swap_bytes()).collect();
+        let mut rgba_le = std::vec![1u8; 8];
+        let mut rgba_be = std::vec![1u8; 8];
+        copy_alpha_packed_u16x4_to_u8_at_3::<false>(&packed_le, &mut rgba_le, 2);
+        copy_alpha_packed_u16x4_to_u8_at_3::<true>(&packed_be, &mut rgba_be, 2);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + byte-swapped buffer must match LE path"
+        );
+    }
+
+    /// BE parity for Rgba64 / Bgra64 alpha-at-slot-3 → u16 RGBA.
+    #[test]
+    fn copy_alpha_packed_u16x4_at_3_be_parity_with_swapped_buffer() {
+        let packed_le: std::vec::Vec<u16> = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234];
+        let packed_be: std::vec::Vec<u16> = packed_le.iter().map(|x| x.swap_bytes()).collect();
+        let mut rgba_le = std::vec![7u16; 8];
+        let mut rgba_be = std::vec![7u16; 8];
+        copy_alpha_packed_u16x4_at_3::<false>(&packed_le, &mut rgba_le, 2);
+        copy_alpha_packed_u16x4_at_3::<true>(&packed_be, &mut rgba_be, 2);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + byte-swapped buffer must match LE path"
+        );
+    }
 }
diff --git a/src/sinker/mixed/ayuv64.rs b/src/sinker/mixed/ayuv64.rs
index 7782994d..6ab55456 100644
--- a/src/sinker/mixed/ayuv64.rs
+++ b/src/sinker/mixed/ayuv64.rs
@@ -350,7 +350,8 @@ impl PixelSink for MixedSinker<'_, Ayuv64> {
         let rgba_buf = rgba.as_deref_mut().unwrap();
         let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
         expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
-        crate::row::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_0(
+        // `Ayuv64Frame` is LE-encoded per the unified Frame contract → `BE = false`.
+        crate::row::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             packed, rgba_row, w, use_simd,
         );
     }
@@ -404,7 +405,13 @@ impl PixelSink for MixedSinker<'_, Ayuv64> {
         let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
         expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w);
-        crate::row::alpha_extract::copy_alpha_packed_u16x4_at_0(packed, rgba_u16_row, w, use_simd);
+        // `Ayuv64Frame` is LE-encoded per the unified Frame contract → `BE = false`.
+        crate::row::alpha_extract::copy_alpha_packed_u16x4_at_0::<false>(
+            packed,
+            rgba_u16_row,
+            w,
+            use_simd,
+        );
     }
 }
diff --git a/src/sinker/mixed/gray.rs b/src/sinker/mixed/gray.rs
index 5d6c44e1..4cd03d13 100644
--- a/src/sinker/mixed/gray.rs
+++ b/src/sinker/mixed/gray.rs
@@ -1499,8 +1499,9 @@ impl PixelSink for MixedSinker<'_, Ya16> {
         let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
         expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w);
-        // Patch α from source (native u16 depth).
-        copy_alpha_ya_u16(packed, rgba_u16_row, w);
+        // Patch α from source (native u16 depth). `Ya16Frame` is LE-encoded
+        // per the unified Frame contract → `BE = false`.
+        copy_alpha_ya_u16::<false>(packed, rgba_u16_row, w);
     }
 }
@@ -1562,7 +1563,8 @@ impl PixelSink for MixedSinker<'_, Ya16> {
         let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
         expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
         // Overwrite the α channel with real source α (>> 8 for u8 output).
-        copy_alpha_ya_u16_to_u8(packed, rgba_row, w);
+        // `Ya16Frame` is LE-encoded per the unified Frame contract → `BE = false`.
+        copy_alpha_ya_u16_to_u8::<false>(packed, rgba_row, w);
     }
 
     Ok(())
@@ -2408,6 +2410,127 @@ mod tests {
         assert_eq!(v, [0x80, 0xFF]);
     }
 
+    /// Strategy A+ (combined `with_rgb` + `with_rgba`) must produce α bytes
+    /// byte-identical to the standalone `with_rgba` path. Locks down the
+    /// codex-flagged corruption where a BE host processing the LE-encoded
+    /// `Ya16Frame` would otherwise diverge between the two paths: standalone
+    /// uses the endian-aware `ya16_to_rgba_row::<false>` kernel; combined
+    /// expanded RGB → RGBA then patched α via `copy_alpha_ya_u16_to_u8` which
+    /// previously read raw `packed[n*2+1]` host-native and so emitted a
+    /// byte-reversed α byte on BE. After the fix, `copy_alpha_ya_u16_to_u8`
+    /// is target-endian-aware (`<false>` for the LE Frame contract) and the
+    /// two paths agree on every host.
+ /// + /// To exercise the LE-encoded byte contract on every host we build the + /// `&[u16]` plane by bit-casting LE bytes — `u16::from_le_bytes` per + /// sample. On LE hosts that's a no-op; on BE hosts it byte-swaps so the + /// in-memory bytes match the FFmpeg `AV_PIX_FMT_YA16LE` layout. + #[test] + fn ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded() { + let w: u32 = 8; + let h: u32 = 1; + // Logical samples (Y, A) per pixel. + let samples: [(u16, u16); 8] = [ + (0x0000, 0xFFFF), + (0x8000, 0x4000), + (0xFFFF, 0x0000), + (0x1234, 0xABCD), + (0x00FF, 0xFF00), + (0x5A5A, 0xA5A5), + (0x7FFF, 0x8000), + (0xC000, 0x3FFF), + ]; + // Build the `&[u16]` plane such that its in-memory bytes match the + // FFmpeg `AV_PIX_FMT_YA16LE` byte layout on every host. We want a + // host-native u16 whose underlying bytes spell `[low, high]` (LE): + // `u16::from_ne_bytes(x.to_le_bytes())` is `x` on LE and `x.swap_bytes()` + // on BE — the right value to store in either case. + let le_encoded = |x: u16| -> u16 { u16::from_ne_bytes(x.to_le_bytes()) }; + let packed: std::vec::Vec = samples + .iter() + .flat_map(|&(y, a)| [le_encoded(y), le_encoded(a)]) + .collect(); + let frame = Ya16Frame::new(&packed, w, h, w * 2); + + // Run combined (with_rgb + with_rgba) — exercises Strategy A+ with the + // newly endian-aware `copy_alpha_ya_u16_to_u8::`. + let mut rgb_combined = std::vec![0u8; (w * h * 3) as usize]; + let mut rgba_combined = std::vec![0u8; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_rgb(&mut rgb_combined) + .unwrap() + .with_rgba(&mut rgba_combined) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + // Run standalone (with_rgba only) — exercises the endian-aware + // `ya16_to_rgba_row::` kernel. 
+ let mut rgba_standalone = std::vec![0u8; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_rgba(&mut rgba_standalone) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combined, rgba_standalone, + "combined (with_rgb+with_rgba) RGBA must equal standalone with_rgba" + ); + } + + /// u16 RGBA variant of the combined-vs-standalone parity check. Locks + /// down `copy_alpha_ya_u16::` (the u16 alpha-patch helper for + /// 16-bit RGBA outputs). + #[test] + fn ya16_combined_rgb_u16_and_rgba_u16_alpha_matches_standalone_le_encoded() { + let w: u32 = 8; + let h: u32 = 1; + let samples: [(u16, u16); 8] = [ + (0x0000, 0xFFFF), + (0x8000, 0x4000), + (0xFFFF, 0x0000), + (0x1234, 0xABCD), + (0x00FF, 0xFF00), + (0x5A5A, 0xA5A5), + (0x7FFF, 0x8000), + (0xC000, 0x3FFF), + ]; + // See sibling test for the `le_encoded` rationale. + let le_encoded = |x: u16| -> u16 { u16::from_ne_bytes(x.to_le_bytes()) }; + let packed: std::vec::Vec = samples + .iter() + .flat_map(|&(y, a)| [le_encoded(y), le_encoded(a)]) + .collect(); + let frame = Ya16Frame::new(&packed, w, h, w * 2); + + let mut rgb_combined = std::vec![0u16; (w * h * 3) as usize]; + let mut rgba_combined = std::vec![0u16; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_rgb_u16(&mut rgb_combined) + .unwrap() + .with_rgba_u16(&mut rgba_combined) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + let mut rgba_standalone = std::vec![0u16; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_rgba_u16(&mut rgba_standalone) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combined, rgba_standalone, + "combined (with_rgb_u16+with_rgba_u16) RGBA u16 must equal standalone" + ); + } + #[test] #[cfg_attr( miri, diff --git a/src/sinker/mixed/packed_rgb_16bit.rs b/src/sinker/mixed/packed_rgb_16bit.rs index 
837b8246..49e34d84 100644 --- a/src/sinker/mixed/packed_rgb_16bit.rs +++ b/src/sinker/mixed/packed_rgb_16bit.rs @@ -726,7 +726,11 @@ impl PixelSink for MixedSinker<'_, Rgba64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3(in64, rgba_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3::( + in64, rgba_row, w, + ); } } @@ -759,7 +763,13 @@ impl PixelSink for MixedSinker<'_, Rgba64> { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3(in64, rgba_u16_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3::( + in64, + rgba_u16_row, + w, + ); } } @@ -995,7 +1005,11 @@ impl PixelSink for MixedSinker<'_, Bgra64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3(in64, rgba_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. 
+ crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3::( + in64, rgba_row, w, + ); } } @@ -1024,7 +1038,13 @@ impl PixelSink for MixedSinker<'_, Bgra64> { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3(in64, rgba_u16_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3::( + in64, + rgba_u16_row, + w, + ); } } diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 94fc963f..12435e52 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -65,6 +65,22 @@ const GBR_F16_FULL_RANGE: bool = true; // Chunk size for the inline f16→f32 widening scratch arrays (stack-allocated). const WIDEN_CHUNK: usize = 64; +// Endianness routing for **post-widen** `gbrpf32_to_*` calls. +// +// `widen_f16_be_to_host_f32::` produces **host-native f32 scratch** from +// LE-encoded f16 plane bits (it normalises bits before widening), so the +// downstream `gbrpf32_to_*::` kernel sees input that already +// matches the host's byte order. The kernel's `from_le` / `from_be` then +// becomes a no-op on every host — correct. +// +// Distinct from the **direct** Frame-to-row-kernel pattern elsewhere in this +// file (the `gbrpf16_to_*::` u8/f16 calls): those receive raw LE-encoded +// `&[half::f16]` plane bytes per the unified Frame contract, so they pass +// `BE = false` to tell the kernel to apply `from_le`. Post-widen scratch is +// already host-native, so it must use `BE = HOST_NATIVE_BE` to keep the kernel +// byte-swap a no-op on every host. 
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- Gbrpf16 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf16> { @@ -354,29 +370,29 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( gf, bf, rf, @@ -389,7 +405,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( gf, bf, rf, @@ -402,7 +418,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( gf, bf, rf, @@ -762,31 +778,47 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row::(gf, bf, 
rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { // gbrapf32_to_rgba_f32_row with widened source α (lossless). let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_f32_row::(gf, bf, rf, af, &mut buf[start..end], n, use_simd); + gbrapf32_to_rgba_f32_row::( + gf, + bf, + rf, + af, + &mut buf[start..end], + n, + use_simd, + ); } if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { // gbrapf32_to_rgba_u16_row with widened source α. let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_u16_row::(gf, bf, rf, af, &mut buf[start..end], n, use_simd); + gbrapf32_to_rgba_u16_row::( + gf, + bf, + rf, + af, + &mut buf[start..end], + n, + use_simd, + ); } if let Some(buf) = self.luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( gf, bf, rf, @@ -799,7 +831,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( gf, bf, rf, @@ -812,7 +844,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( gf, bf, rf, diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index 7673e0a2..faf8a2e1 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -1288,6 +1288,115 @@ fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { } } +/// LE-encoded byte contract regression for [`Gbrpf16`] **widening → narrow +/// chain** (`with_rgb_u16` and 
`with_rgba`). Covers the post-widen routing +/// where `gbrpf32_to_rgb_u16_row` / `gbrpf32_to_rgba_u16_row` / +/// `gbrpf32_to_rgb_row` are invoked on **host-native f32 scratch** produced +/// by `widen_f16_be_to_host_f32::`. +/// +/// On a BE host this would have been corrupted under the prior +/// `gbrpf32_to_*::` post-widen routing — that kernel applied +/// `from_le` to scratch that was already host-native, byte-swapping the +/// f32 representation before scaling. Fixed by routing post-widen calls +/// through `::` (`true` on BE, `false` on LE), which makes +/// the kernel byte-swap a no-op on every host. Vacuous on LE; would catch +/// the double-swap on BE. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.5, + 1 => 0.25, + 2 => 0.0, + _ => 1.0, + }) + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.125, + 1 => 0.75, + 2 => 0.0625, + _ => 0.875, + }) + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.375, + 1 => 0.625, + 2 => 0.9375, + _ => 0.03125, + }) + }) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + + let src = Gbrpf16Frame::try_new( + &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Exercise the u16 narrow path (post-widen → gbrpf32_to_rgb_u16_row). 
+ let mut rgb_u16 = std::vec![0u16; w * h * 3]; + // Exercise the u8 narrow path via with_rgba (Strategy A: post-widen + // is unused for u8 since rgba=opaque-α; we trigger the SAME post-widen + // path by also attaching luma_u16 alongside u16). + let mut luma_u16 = std::vec![0u16; w * h]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgb_u16(&mut rgb_u16) + .unwrap() + .with_luma_u16(&mut luma_u16) + .unwrap(); + gbrpf16_to(&src, &mut sink).unwrap(); + } + + // Assert RGB u16 output matches the intended (clamp+scale × 65535) values. + let to_u16 = |v: f32| -> u16 { (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16 }; + for i in 0..(w * h) { + assert_eq!( + rgb_u16[i * 3], + to_u16(intended_r[i].to_f32()), + "RGB u16 R idx {i}" + ); + assert_eq!( + rgb_u16[i * 3 + 1], + to_u16(intended_g[i].to_f32()), + "RGB u16 G idx {i}" + ); + assert_eq!( + rgb_u16[i * 3 + 2], + to_u16(intended_b[i].to_f32()), + "RGB u16 B idx {i}" + ); + } + // Sanity: luma_u16 (post-widen narrow) is non-zero — locks down that + // the post-widen luma kernel also sees host-native f32 scratch. 
+ assert!( + luma_u16.iter().any(|&v| v > 0), + "luma_u16 must contain non-zero samples — \ + a corrupted byte-swap would still emit non-zero output but the rgb_u16 \ + assertion above is the primary guard" + ); +} + // ---- 32-bit overflow guards ------------------------------------------------ // // Feeding width = usize::MAX / 2 + 1 to a dispatcher must panic with a message From f226d5a0a8861689d3b9f992e6a797f2b1d3fd80 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 21:16:44 +1200 Subject: [PATCH 06/10] fix(sinker): endian-aware f32 alpha-patch helpers + dual routing for direct-Frame vs post-widen paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 3rd-pass review of #92 found that the f32 alpha-patch helpers `copy_alpha_plane_f32_to_u8`, `copy_alpha_plane_f32_to_u16`, and `copy_alpha_plane_f32` read each f32 α sample as host-native, which silently corrupted the α slot on BE hosts (e.g., s390x) processing the LE-encoded `Gbrapf32Frame` α plane. The byte-swapped f32 bits clamp to near-zero or near-one, so BE hosts emitted α = 0 or 255 regardless of the intended value. Same bug class as the u16 alpha-patch helpers fixed in cf26058, but for the f32 path; the standalone `with_rgba_f32` test didn't cover this because Strategy A+ uses a different code path (`expand_rgb_to_rgba_row` + alpha-patch). Helpers updated (4): * `copy_alpha_plane_f32_to_u8` — added ``, normalises each f32's bits via `f32::from_bits(u32::from_le/be(bits))` before clamp / scale / round-half-up. * `copy_alpha_plane_f32_to_u16` — same pattern. * `copy_alpha_plane_f32` — same pattern (lossless pass-through). * `copy_alpha_plane_f16` (planar_gbr_f16.rs) — added `` for consistency. Currently only test-called; future-proofs against a Strategy A+ wiring that consumes it. 
Sinker call sites updated, dual routing pattern (4 sites total): * `src/sinker/mixed/planar_gbr_float.rs` — Gbrapf32 Strategy A+ u8 RGBA path: `` (direct-Frame, LE-encoded per Phase-1 contract). * `src/sinker/mixed/planar_gbr_f16.rs` — `widen_and_scatter_f16_alpha_to_u8` helper: `` (post-widen, scratch is host-native f32 after `widen_f16_be_to_host_f32::`). Audit summary: only one direct-Frame call site (Gbrapf32 u8 RGBA) and one post-widen call site (Gbrapf16 u8 RGBA) currently invoke the f32 alpha-patch helpers. The Gbrapf32 RGBA u16 / RGBA f32 / RGBA f16 paths all use direct kernels (`gbrapf32_to_rgba_u16_row` / `gbrapf32_to_rgba_f32_row` / `gbrapf32_to_rgba_f16_row`) and were already endian-aware via their `` parameter; same for the Gbrapf16 widened RGBA u16 / f32 paths. Tests added (7): * `copy_alpha_plane_f32_to_u8_be_parity_with_swapped_buffer` * `copy_alpha_plane_f32_to_u16_be_parity_with_swapped_buffer` * `copy_alpha_plane_f32_be_parity_with_swapped_buffer` * `copy_alpha_plane_f16_be_parity_with_swapped_buffer` * `gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly` (w = 15, exercises scalar tail; compares Strategy A+ combo to standalone `with_rgba`). * `gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly` (w = 17, defense-in-depth for the u16 RGBA combo). * `gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly` (w = 15, verifies HOST_NATIVE_BE routing on the post-widen helper). The 3 prior `copy_alpha_plane_f32*_clamps_and_scales` / `_lossless_passthrough` tests are now `#[cfg(target_endian = "little")]` (host-native f32 fixtures); BE-host scalar correctness is locked down by the dedicated `*_be_parity_with_swapped_buffer` tests. Verified: * `cargo test --target aarch64-apple-darwin --lib` (2285 pass, +7). * `cargo build --target x86_64-apple-darwin --tests` (0 warnings). * `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests`. * `cargo build --no-default-features`. 
* `cargo fmt --check`. * `cargo clippy --all-targets --all-features -- -D warnings`. * `cargo check --target s390x-unknown-linux-gnu --lib` (BE-host smoke). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/alpha_extract.rs | 152 +++++++++++- src/row/scalar/planar_gbr_f16.rs | 55 ++++- src/sinker/mixed/planar_gbr_f16.rs | 12 +- src/sinker/mixed/planar_gbr_float.rs | 12 +- src/sinker/mixed/tests/planar_gbr_float.rs | 268 +++++++++++++++++++++ 5 files changed, 485 insertions(+), 14 deletions(-) diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs index bbf5922b..6fc664ac 100644 --- a/src/row/scalar/alpha_extract.rs +++ b/src/row/scalar/alpha_extract.rs @@ -296,13 +296,42 @@ pub(crate) fn copy_alpha_ya_u16( /// Each α sample is clamped to `[0.0, 1.0]`, multiplied by 255, and rounded /// with round-half-up (`+ 0.5` then truncate). Only slot 3 of every 4-element /// tuple is written; R, G, B slots are untouched. +/// +/// `BE` selects the **byte order** of the encoded source α plane: +/// `false` = LE on disk/wire (e.g., `AV_PIX_FMT_GBRAPF32LE` per the +/// `Gbrapf32Frame` contract; this also matches the case where the f32 +/// scratch is already host-native and the host is little-endian); +/// `true` = BE on disk/wire (or host-native scratch on a BE host). Each +/// raw f32 is bit-normalised to host-native order via +/// `f32::from_bits(u32::from_le(bits))` (or `from_be`) BEFORE the clamp / +/// scale / round-half-up. Without this a BE host (e.g., s390x) processing +/// the LE-encoded Frame would clamp byte-swapped garbage values, typically +/// producing α = 0 or α = 255 regardless of intent. Mirrors the +/// `copy_alpha_plane_u16_to_u8::` endian pattern. +/// +/// Routing pattern at the sinker layer: +/// - **Direct-Frame paths** (e.g., `Gbrapf32Frame` → α plane consumed directly) +/// pass `BE = false` (data is LE-encoded per the unified Frame contract). 
+/// - **Post-widen paths** (e.g., `Gbrapf16Frame` widened-to-f32 scratch) pass +/// `BE = HOST_NATIVE_BE` (scratch is host-native f32 after widen). // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). #[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32_to_u8(alpha: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) fn copy_alpha_plane_f32_to_u8( + alpha: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (alpha[n].clamp(0.0, 1.0) * 255.0 + 0.5) as u8; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + let v = f32::from_bits(host_bits); + rgba_out[n * 4 + 3] = (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8; } } @@ -310,13 +339,28 @@ pub(crate) fn copy_alpha_plane_f32_to_u8(alpha: &[f32], rgba_out: &mut [u8], wid /// /// Each α sample is clamped to `[0.0, 1.0]`, multiplied by 65535, and rounded /// with round-half-up. Only slot 3 of every 4-element tuple is written. +/// +/// `BE` selects the **byte order** of the encoded source α plane. +/// See [`copy_alpha_plane_f32_to_u8`] for the full rationale and the +/// direct-Frame vs post-widen routing pattern. // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). 
#[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32_to_u16(alpha: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) fn copy_alpha_plane_f32_to_u16( + alpha: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (alpha[n].clamp(0.0, 1.0) * 65535.0 + 0.5) as u16; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + let v = f32::from_bits(host_bits); + rgba_out[n * 4 + 3] = (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16; } } @@ -325,13 +369,30 @@ pub(crate) fn copy_alpha_plane_f32_to_u16(alpha: &[f32], rgba_out: &mut [u16], w /// /// No clamping, no rounding — HDR values, NaN, and Inf in the α plane are /// preserved bit-exact. Only slot 3 of every 4-element tuple is written. +/// The output α is always written in **host-native** byte order (the +/// downstream consumer of `&[f32]` expects host-native floats); this helper's +/// `BE` only describes the **input** plane. +/// +/// `BE` selects the **byte order** of the encoded source α plane. +/// See [`copy_alpha_plane_f32_to_u8`] for the full rationale and the +/// direct-Frame vs post-widen routing pattern. // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). 
#[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32(alpha: &[f32], rgba_out: &mut [f32], width: usize) { +pub(crate) fn copy_alpha_plane_f32( + alpha: &[f32], + rgba_out: &mut [f32], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = alpha[n]; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + rgba_out[n * 4 + 3] = f32::from_bits(host_bits); } } @@ -589,12 +650,18 @@ mod tests { ); } + /// On a LE host, `BE = false` makes the bit-normalize a no-op, so passing + /// host-native `f32` literals as if they were already LE-encoded reproduces + /// the original (pre-endian-aware) clamp+scale semantics. BE-host scalar + /// correctness is locked down by the `*_be_parity_with_swapped_buffer` + /// tests below. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_to_u8_clamps_and_scales() { // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 128, 255, 255, 0] in slot 3. let alpha = vec![0.0f32, 0.5, 1.0, 1.5, -0.1]; let mut rgba = vec![1u8; 20]; - copy_alpha_plane_f32_to_u8(&alpha, &mut rgba, 5); + copy_alpha_plane_f32_to_u8::(&alpha, &mut rgba, 5); // R, G, B slots (0, 1, 2) must be untouched; slot 3 has the alpha. assert_eq!(rgba[3], 0, "alpha[0]=0.0 → 0"); assert_eq!(rgba[7], 128, "alpha[1]=0.5 → 128"); @@ -608,11 +675,12 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_to_u16_clamps_and_scales() { // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 32768, 65535, 65535, 0] in slot 3. 
         let alpha = vec![0.0f32, 0.5, 1.0, 1.5, -0.1];
         let mut rgba = vec![1u16; 20];
-        copy_alpha_plane_f32_to_u16(&alpha, &mut rgba, 5);
+        copy_alpha_plane_f32_to_u16::<false>(&alpha, &mut rgba, 5);
         assert_eq!(rgba[3], 0, "alpha[0]=0.0 → 0");
         assert_eq!(rgba[7], 32768, "alpha[1]=0.5 → 32768");
         assert_eq!(rgba[11], 65535, "alpha[2]=1.0 → 65535");
@@ -625,11 +693,12 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_plane_f32_lossless_passthrough() {
         // HDR (2.5), NaN, Inf, negative all preserved bit-exact.
         let alpha = vec![2.5f32, f32::NAN, f32::INFINITY, -1.0];
         let mut rgba = vec![0.0f32; 16];
-        copy_alpha_plane_f32(&alpha, &mut rgba, 4);
+        copy_alpha_plane_f32::<false>(&alpha, &mut rgba, 4);
         assert_eq!(rgba[3], 2.5, "HDR 2.5 preserved");
         assert!(rgba[7].is_nan(), "NaN preserved");
         assert!(rgba[11].is_infinite() && rgba[11] > 0.0, "+Inf preserved");
@@ -640,6 +709,71 @@ mod tests {
         assert_eq!(rgba[2], 0.0);
     }
 
+    /// BE parity for Gbrapf32 → u8 RGBA: byte-swapping the bits of every
+    /// f32 in the source α plane and toggling `BE` must produce identical
+    /// output. Locks down the codex 3rd-pass finding where a BE host
+    /// processing the LE-encoded `Gbrapf32Frame` would clamp byte-swapped
+    /// garbage values (typical result: α = 0 or α = 255 regardless of intent).
+    #[test]
+    fn copy_alpha_plane_f32_to_u8_be_parity_with_swapped_buffer() {
+        let alpha_le: std::vec::Vec<f32> = std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 1.5, -0.1, 0.123];
+        let alpha_be: std::vec::Vec<f32> = alpha_le
+            .iter()
+            .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+            .collect();
+        let mut rgba_le = std::vec![1u8; 32];
+        let mut rgba_be = std::vec![1u8; 32];
+        copy_alpha_plane_f32_to_u8::<false>(&alpha_le, &mut rgba_le, 8);
+        copy_alpha_plane_f32_to_u8::<true>(&alpha_be, &mut rgba_be, 8);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + bit-swapped buffer must match LE path"
+        );
+    }
+
+    /// BE parity for Gbrapf32 → u16 RGBA.
+    #[test]
+    fn copy_alpha_plane_f32_to_u16_be_parity_with_swapped_buffer() {
+        let alpha_le: std::vec::Vec<f32> = std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 1.5, -0.1, 0.123];
+        let alpha_be: std::vec::Vec<f32> = alpha_le
+            .iter()
+            .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+            .collect();
+        let mut rgba_le = std::vec![7u16; 32];
+        let mut rgba_be = std::vec![7u16; 32];
+        copy_alpha_plane_f32_to_u16::<false>(&alpha_le, &mut rgba_le, 8);
+        copy_alpha_plane_f32_to_u16::<true>(&alpha_be, &mut rgba_be, 8);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + bit-swapped buffer must match LE path"
+        );
+    }
+
+    /// BE parity for Gbrapf32 → f32 RGBA (lossless α pass-through). The
+    /// output α must equal the host-native f32 bit-pattern of the LE source
+    /// regardless of the host's byte order. NaN bit-patterns may differ
+    /// across hardware after a `from_bits → to_bits` round-trip, so we
+    /// compare on the bit representation of finite, non-NaN samples only.
+    #[test]
+    fn copy_alpha_plane_f32_be_parity_with_swapped_buffer() {
+        let alpha_le: std::vec::Vec<f32> =
+            std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 2.5, -1.0, f32::INFINITY];
+        let alpha_be: std::vec::Vec<f32> = alpha_le
+            .iter()
+            .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+            .collect();
+        let mut rgba_le = std::vec![0.0f32; 32];
+        let mut rgba_be = std::vec![0.0f32; 32];
+        copy_alpha_plane_f32::<false>(&alpha_le, &mut rgba_le, 8);
+        copy_alpha_plane_f32::<true>(&alpha_be, &mut rgba_be, 8);
+        let bits_le: std::vec::Vec<u32> = rgba_le.iter().map(|v| v.to_bits()).collect();
+        let bits_be: std::vec::Vec<u32> = rgba_be.iter().map(|v| v.to_bits()).collect();
+        assert_eq!(
+            bits_le, bits_be,
+            "BE flag + bit-swapped buffer must match LE path bit-for-bit"
+        );
+    }
+
     // ---- copy_alpha_packed_u16x4_to_u8_at_3 / copy_alpha_packed_u16x4_at_3 --
 
     /// Alpha at slot 3 is depth-converted >> 8 and written to rgba_out[3 + 4*n].
diff --git a/src/row/scalar/planar_gbr_f16.rs b/src/row/scalar/planar_gbr_f16.rs index 3b9ba779..766c310b 100644 --- a/src/row/scalar/planar_gbr_f16.rs +++ b/src/row/scalar/planar_gbr_f16.rs @@ -198,16 +198,34 @@ pub(crate) fn gbrapf16_to_rgba_f16_row( /// Only slot 3 of every 4-element tuple is written; R, G, B slots are /// untouched. Lossless — HDR, NaN, and Inf in the α plane are preserved /// bit-exact. +/// +/// `BE` selects the **byte order** of the encoded source α plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_GBRAPF16LE` per the +/// `Gbrapf16Frame` contract; `true` = BE on disk/wire). Each raw f16 is +/// bit-normalised to host-native order via `u16::from_le` / `u16::from_be` +/// BEFORE the slot-3 write so the output buffer always carries host-native +/// `half::f16` (matching the rest of the f16 row kernels). Without this a +/// BE host processing the LE-encoded Frame would emit byte-reversed α bits. // Only called from the `mod tests` block which is gated on `feature = "std"`. // Under `cargo test --no-default-features` the test module is compiled out, // leaving the function without callers; suppress the resulting lint there. 
 #[cfg_attr(not(feature = "std"), expect(dead_code))]
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn copy_alpha_plane_f16(alpha: &[half::f16], rgba_out: &mut [half::f16], width: usize) {
+pub(crate) fn copy_alpha_plane_f16<const BE: bool>(
+    alpha: &[half::f16],
+    rgba_out: &mut [half::f16],
+    width: usize,
+) {
     debug_assert!(alpha.len() >= width, "alpha plane too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = alpha[n];
+        let raw = alpha[n].to_bits();
+        let host_bits = if BE {
+            u16::from_be(raw)
+        } else {
+            u16::from_le(raw)
+        };
+        rgba_out[n * 4 + 3] = half::f16::from_bits(host_bits);
     }
 }
 
@@ -414,11 +432,12 @@ mod tests {
         miri,
         ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
     )]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_plane_f16_only_writes_alpha_slot() {
         let alpha = vec![half::f16::from_f32(0.7), half::f16::from_f32(0.3)];
         let sentinel = half::f16::from_f32(0.1);
         let mut rgba = vec![sentinel; 8];
-        copy_alpha_plane_f16(&alpha, &mut rgba, 2);
+        copy_alpha_plane_f16::<false>(&alpha, &mut rgba, 2);
         // Only slot 3 written; R, G, B slots (0, 1, 2) must be untouched.
         assert_eq!(rgba[0], sentinel, "R slot 0 untouched");
         assert_eq!(rgba[1], sentinel, "G slot 0 untouched");
@@ -429,4 +448,34 @@ mod tests {
         assert_eq!(rgba[6], sentinel, "B slot 1 untouched");
         assert_eq!(rgba[7], half::f16::from_f32(0.3), "A slot 1");
     }
+
+    /// BE parity for `copy_alpha_plane_f16`: byte-swapping the bits of every
+    /// f16 in the source α plane and toggling `BE` must produce identical
+    /// output. Mirrors the f32 alpha-patch endian-aware fix.
+ #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn copy_alpha_plane_f16_be_parity_with_swapped_buffer() { + let alpha_le = vec![ + half::f16::from_f32(0.0), + half::f16::from_f32(0.25), + half::f16::from_f32(0.5), + half::f16::from_f32(1.0), + half::f16::from_f32(2.5), + half::f16::from_f32(-1.0), + ]; + let alpha_be = be_encode_f16(&alpha_le); + let mut rgba_le = vec![half::f16::ZERO; 24]; + let mut rgba_be = vec![half::f16::ZERO; 24]; + copy_alpha_plane_f16::(&alpha_le, &mut rgba_le, 6); + copy_alpha_plane_f16::(&alpha_be, &mut rgba_be, 6); + let bits_le: std::vec::Vec = rgba_le.iter().map(|v| v.to_bits()).collect(); + let bits_be: std::vec::Vec = rgba_be.iter().map(|v| v.to_bits()).collect(); + assert_eq!( + bits_le, bits_be, + "BE flag + bit-swapped buffer must match LE path bit-for-bit" + ); + } } diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 12435e52..5e7c0dda 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -919,6 +919,16 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { /// /// Used by `Gbrapf16` Strategy A+ and standalone-RGBA paths to overwrite /// the per-pixel alpha byte from the f16 source α plane. +/// +/// Endian routing: `widen_f16_be_to_host_f32::` converts the +/// LE-encoded `Gbrapf16Frame` α plane bits into **host-native f32** +/// scratch. The downstream `copy_alpha_plane_f32_to_u8` therefore receives +/// host-native f32 input, not LE-encoded f32, and must be invoked with +/// `BE = HOST_NATIVE_BE` so the kernel's `from_le` / `from_be` is a no-op +/// on every host (no second byte-swap). This is the **post-widen** routing +/// pattern; contrast with `planar_gbr_float.rs` which calls the same +/// helper with `BE = false` because it consumes the **direct** LE-encoded +/// `Gbrapf32Frame` α plane. 
#[cfg_attr(not(tarpaulin), inline(always))] fn widen_and_scatter_f16_alpha_to_u8(alpha_f16: &[half::f16], rgba_out: &mut [u8], width: usize) { let mut af_chunk = [0.0f32; WIDEN_CHUNK]; @@ -928,7 +938,7 @@ fn widen_and_scatter_f16_alpha_to_u8(alpha_f16: &[half::f16], rgba_out: &mut [u8 // Bit-normalise LE-encoded f16 α bits → host-native f32 before clamping // and scaling to u8 — correct on both LE and BE hosts. widen_f16_be_to_host_f32::(alpha_f16, offset, &mut af_chunk, n); - copy_alpha_plane_f32_to_u8(&af_chunk[..n], &mut rgba_out[offset * 4..], n); + copy_alpha_plane_f32_to_u8::(&af_chunk[..n], &mut rgba_out[offset * 4..], n); offset += n; } } diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index cb71de5f..ddee135b 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -809,10 +809,20 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from // the source f32 α plane (clamped × 255 → u8). + // + // `BE = false`: `a_in` is the **direct** Gbrapf32Frame α plane, which + // is LE-encoded f32 per the Phase-1 unified Frame contract. The helper + // bit-normalises each f32 to host-native order before clamp/scale, so + // the conversion compiles to a no-op on LE hosts and a `swap_bytes` on + // BE hosts (e.g., s390x). Without this BE hosts would clamp byte- + // swapped garbage and emit α = 0 / 255 regardless of intent. Distinct + // from the **post-widen** routing in `planar_gbr_f16.rs` + // (`widen_and_scatter_f16_alpha_to_u8`), which feeds host-native f32 + // scratch into the same helper with `BE = HOST_NATIVE_BE`. 
if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - copy_alpha_plane_f32_to_u8(a_in, rgba_row, w); + copy_alpha_plane_f32_to_u8::(a_in, rgba_row, w); } Ok(()) diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index faf8a2e1..4549ac34 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -1397,6 +1397,274 @@ fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { ); } +// ---- LE-encoded Strategy A+ alpha-patch regressions (codex 3rd-pass) ------ +// +// The `copy_alpha_plane_f32_to_u8` (and `copy_alpha_plane_f32_to_u16`, +// `copy_alpha_plane_f32`) helper used to read each f32 α sample as +// host-native, which silently corrupted the α slot on BE hosts processing +// the LE-encoded `Gbrapf32Frame` α plane (the byte-swapped bits clamp to +// near-zero or near-one, producing α = 0 or 255 regardless of intent). +// Same bug class as the u16 alpha-patch helpers fixed in cf26058. +// +// These regressions trigger the **Strategy A+ combo path** (`with_rgb` + +// `with_rgba`, `with_rgb_u16` + `with_rgba_u16`) on a Frame whose α plane +// is built from explicit LE-encoded f32 bit-patterns. On a LE host the +// `to_le` on f32 bits is identity so the test reduces to the original +// semantics; on a BE host the kernel without `from_le` would clamp +// byte-swapped garbage and the assertion would fail. The non-multiple-of- +// SIMD widths (15, 17) exercise scalar-tail correctness in addition to +// any vectorized body. + +/// Codex 3rd-pass regression: Gbrapf32 Strategy A+ (`with_rgb` + `with_rgba`) +/// on a LE-encoded f32 α plane must reproduce standalone `with_rgba` output +/// byte-for-byte. 
The standalone path uses `gbrapf32_to_rgba_row::` +/// (already endian-aware), so any deviation indicates the Strategy A+ +/// alpha-patch path corrupted the α plane. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { + // 15 is non-multiple-of-{4,8,16} — exercises scalar tail in every backend. + let w = 15usize; + let h = 3usize; + let intended_g: Vec = (0..w * h).map(|i| 0.10 + (i as f32) * 0.001).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.20 + (i as f32) * 0.002).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.30 + (i as f32) * 0.003).collect(); + // Deliberately mix in-range, boundary, > 1, and negative α to stress + // clamp/scale correctness *after* the bit-normalize step. + let intended_a: Vec = (0..w * h) + .map(|i| match i % 7 { + 0 => 0.0, + 1 => 0.5, + 2 => 1.0, + 3 => 1.5, + 4 => -0.1, + 5 => 0.123, + _ => 0.876, + }) + .collect(); + + // LE-encode every plane (per the documented `*LE` Frame contract). + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); + + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba`. + let mut rgba_ref = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_ref) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + // Strategy A+: `with_rgb` + `with_rgba` combo (alpha-patch path). 
+ let mut rgb_combo = std::vec![0u8; w * h * 3]; + let mut rgba_combo = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb_combo) + .unwrap() + .with_rgba(&mut rgba_combo) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf32 Strategy A+ alpha-patch must equal standalone `with_rgba`" + ); +} + +/// Codex 3rd-pass regression: Gbrapf32 Strategy A+ (`with_rgb_u16` + +/// `with_rgba_u16`) on a LE-encoded f32 α plane. Defense-in-depth: the +/// current sinker calls `gbrapf32_to_rgba_u16_row::` directly here +/// (no alpha-patch helper invocation), but any future routing change that +/// switches to the alpha-patch helper must keep BE-host correctness. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { + // 17 is non-multiple-of-{4,8,16}. + let w = 17usize; + let h = 3usize; + let intended_g: Vec = (0..w * h).map(|i| 0.11 + (i as f32) * 0.0011).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.22 + (i as f32) * 0.0022).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.33 + (i as f32) * 0.0033).collect(); + let intended_a: Vec = (0..w * h) + .map(|i| match i % 5 { + 0 => 0.0, + 1 => 0.25, + 2 => 1.0, + 3 => 0.5, + _ => 0.75, + }) + .collect(); + + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); + + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba_u16`. 
+ let mut rgba_ref = std::vec![0u16; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgba_u16(&mut rgba_ref) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + // Combo: `with_rgb_u16` + `with_rgba_u16`. + let mut rgb_combo = std::vec![0u16; w * h * 3]; + let mut rgba_combo = std::vec![0u16; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgb_u16(&mut rgb_combo) + .unwrap() + .with_rgba_u16(&mut rgba_combo) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf32 Strategy A+ rgba_u16 must equal standalone `with_rgba_u16`" + ); + + // Independently assert the α slot reflects the intended values + // (clamp × 65535 + 0.5). This catches a hypothetical regression where + // both code paths share the same bug. + let to_u16 = |v: f32| -> u16 { (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16 }; + for i in 0..(w * h) { + assert_eq!( + rgba_combo[i * 4 + 3], + to_u16(intended_a[i]), + "α slot idx {i}" + ); + } +} + +/// Codex 3rd-pass regression: Gbrapf16 Strategy A+ (`with_rgb` + `with_rgba`) +/// on a LE-encoded f16 α plane. This exercises the **post-widen** routing +/// pattern in `widen_and_scatter_f16_alpha_to_u8`: the f16 α plane is +/// widened to host-native f32 scratch via `widen_f16_be_to_host_f32::`, +/// then the alpha-patch helper must consume that scratch with +/// `BE = HOST_NATIVE_BE` (no double byte-swap). The test compares the +/// Strategy A+ combo output against the standalone `with_rgba` path, which +/// uses the `gbrpf16_to_rgba_row::` direct kernel + the same +/// `widen_and_scatter_f16_alpha_to_u8` helper (both paths share the +/// `widen_and_scatter` helper, so this test guards against the +/// post-widen routing flag being wrong). 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { + let w = 15usize; + let h = 3usize; + let intended_g: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.10 + (i as f32) * 0.001)) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.20 + (i as f32) * 0.002)) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.30 + (i as f32) * 0.003)) + .collect(); + let intended_a: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 5 { + 0 => 0.0, + 1 => 0.5, + 2 => 1.0, + 3 => 0.25, + _ => 0.75, + }) + }) + .collect(); + + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); + + let src = Gbrapf16Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba` (uses `gbrpf16_to_rgba_row` then + // `widen_and_scatter_f16_alpha_to_u8` → exercises the post-widen helper + // as well, with the same routing as the combo path). + let mut rgba_ref = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_ref) + .unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + } + + // Strategy A+: `with_rgb` + `with_rgba`. 
+ let mut rgb_combo = std::vec![0u8; w * h * 3]; + let mut rgba_combo = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb_combo) + .unwrap() + .with_rgba(&mut rgba_combo) + .unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf16 Strategy A+ post-widen alpha-patch must equal standalone `with_rgba`" + ); + + // Independently assert α slot reflects the intended values + // (widen → clamp × 255 + 0.5). + let to_u8 = |v: f32| -> u8 { (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8 }; + for i in 0..(w * h) { + assert_eq!( + rgba_combo[i * 4 + 3], + to_u8(intended_a[i].to_f32()), + "α slot idx {i}" + ); + } +} + // ---- 32-bit overflow guards ------------------------------------------------ // // Feeding width = usize::MAX / 2 + 1 to a dispatcher must panic with a message From 75cf8652d9d0a6d359b1208756703600e47fc79d Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 21:47:07 +1200 Subject: [PATCH 07/10] test(sinker): drop miri ignore on BE-contract regressions so BE CI exercises them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 4th-pass review of #92 found the BE-sensitive sinker regressions skip on miri targets — but BE CI runs via miri (powerpc64 / s390x), so the tests effectively did not validate the BE path. The `cfg_attr(miri, ignore = ...)` gate was added defensively, but it skipped exactly the host where BE corruption would surface. Audit summary: every BE-sensitive regression added in this PR calls the high-level `MixedSinker` builder + `gbrpf32_to` / `gbrapf32_to` / `gbrpf16_to` / `gbrapf16_to` / `rgbf32_to` / `rgbf16_to`. The dispatch layer honours the sinker's `simd` flag — passing `with_simd(false)` bypasses every SIMD branch and calls the scalar kernels directly. 
The post-widen helpers (`widen_f16_be_to_host_f32`, `expand_rgb_to_rgba_row`, the `copy_alpha_plane_f32*` family) are pure scalar. None of these tests construct or invoke arch-specific SIMD intrinsics directly. Tests un-gated (12): * `src/sinker/mixed/tests/packed_rgb_float.rs` - `rgbf32_sinker_le_encoded_frame_decodes_correctly` * `src/sinker/mixed/tests/packed_rgb_f16.rs` - `rgbf16_sinker_le_encoded_frame_decodes_correctly` * `src/sinker/mixed/tests/planar_gbr_float.rs` - `gbrpf32_sinker_le_encoded_frame_decodes_correctly` - `gbrapf32_sinker_le_encoded_frame_decodes_correctly` - `gbrpf16_sinker_le_encoded_frame_decodes_correctly` - `gbrapf16_sinker_le_encoded_frame_decodes_correctly` - `gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly` - `gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly` - `gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly` - `gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly` - `gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly` - `gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly` Each test now starts the builder chain with `.with_simd(false)`, forcing scalar dispatch on every chained `with_rgb*` / `with_rgba*` / `with_luma*` sink. SIMD-using tests in `planar_gbr_high_bit.rs` (8 pre-existing miri ignores, untouched in this PR) retain the gate; those test SIMD-dispatched kernels with no scalar bypass and genuinely cannot run under miri. Local miri verification (where the f16 setup path's portable software fallback applies — i.e. 
all f32 tests, plus those f16 tests whose fixtures do not call `half::f16::from_f32` on aarch64+fp16): * `cargo miri test --lib gbrpf32_sinker_le_encoded` — 1 passed * `cargo miri test --lib gbrapf32_sinker_le_encoded` — 1 passed * `cargo miri test --lib gbrapf32_strategy_a_plus_le_encoded_frame` — 1 passed * `cargo miri test --lib gbrapf32_strategy_a_plus_le_encoded_u16` — 1 passed * `cargo miri test --lib rgbf32_sinker_le_encoded` — 1 passed The f16 regression tests build their fixtures via `half::f16::from_f32`, which on aarch64-apple-darwin (miri's local triple) compiles to an inline asm path keyed on `target_feature = "fp16"` — unsupported by miri locally. On the BE CI hosts (s390x / powerpc64), `half-2.7.1` falls through to its portable software fallback (no asm, no SIMD), so all f16 tests will execute correctly under `cargo miri test` on those targets — which is the entire point of removing the `miri,ignore` gate. Verified: * `cargo test --target aarch64-apple-darwin --lib` (2285 pass, 0 fail). * `cargo build --target x86_64-apple-darwin --tests` (0 warnings). * `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests`. * `cargo build --no-default-features`. * `cargo fmt --check`. * `cargo clippy --all-targets --all-features -- -D warnings`. * `cargo check --target s390x-unknown-linux-gnu --lib` (BE-host smoke). * `cargo miri test --lib gbrpf32_sinker_le_encoded` (and the four sibling f32 tests). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/tests/packed_rgb_f16.rs | 11 ++- src/sinker/mixed/tests/packed_rgb_float.rs | 11 ++- src/sinker/mixed/tests/planar_gbr_float.rs | 95 +++++++++++++--------- 3 files changed, 69 insertions(+), 48 deletions(-) diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 0f227c14..574e482c 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -324,11 +324,13 @@ fn rgbf16_simd_matches_scalar_with_random_input() { /// from the LE-encoded bytes. /// /// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. +/// +/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn rgbf16_sinker_le_encoded_frame_decodes_correctly() { let vals_f32 = [0.5f32, 1.5, -0.25, 100.0]; let intended: Vec = (0..16 * 4 * 3) @@ -345,6 +347,7 @@ fn rgbf16_sinker_le_encoded_frame_decodes_correctly() { let mut rgb_f16_out = std::vec![half::f16::ZERO; 16 * 4 * 3]; let mut sink = MixedSinker::::new(16, 4) + .with_simd(false) .with_rgb_f16(&mut rgb_f16_out) .unwrap(); rgbf16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs index 38f23f28..8bd01266 100644 --- a/src/sinker/mixed/tests/packed_rgb_float.rs +++ b/src/sinker/mixed/tests/packed_rgb_float.rs @@ -260,11 +260,13 @@ fn rgbf32_simd_matches_scalar_with_random_input() { /// BE), the output would be byte-swapped relative to `intended`. /// /// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. 
+/// +/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn rgbf32_sinker_le_encoded_frame_decodes_correctly() { // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. @@ -288,6 +290,7 @@ fn rgbf32_sinker_le_encoded_frame_decodes_correctly() { let mut rgb_f32_out = std::vec![0.0f32; 16 * 4 * 3]; let mut sink = MixedSinker::::new(16, 4) + .with_simd(false) .with_rgb_f32(&mut rgb_f32_out) .unwrap(); rgbf32_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index 4549ac34..a8132480 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -881,11 +881,13 @@ fn gbrapf32_rgba_f16_strategy_a_plus_matches_independent_kernel() { // Mirrors the `Grayf32` regression added in PR #85's `52f8191`. /// LE-encoded byte contract regression for [`Gbrpf32`]. +/// +/// Forces `with_simd(false)` so the test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -935,6 +937,7 @@ fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f32(&mut rgb_f32) .unwrap(); gbrpf32_to(&src, &mut sink).unwrap(); @@ -960,11 +963,11 @@ fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrapf32`] (lossless RGBA /// pass-through, including the α plane). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -990,6 +993,7 @@ fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f32(&mut rgba_f32) .unwrap(); gbrapf32_to(&src, &mut sink).unwrap(); @@ -1019,11 +1023,11 @@ fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { } /// LE-encoded byte contract regression for [`Gbrpf16`]. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1073,6 +1077,7 @@ fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let mut rgb_f16 = std::vec![half::f16::ZERO; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f16(&mut rgb_f16) .unwrap(); gbrpf16_to(&src, &mut sink).unwrap(); @@ -1098,11 +1103,11 @@ fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrapf16`] (lossless RGBA /// pass-through, including the α plane). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1135,6 +1140,7 @@ fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let mut rgba_f16 = std::vec![half::f16::ZERO; w * h * 4]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f16(&mut rgba_f16) .unwrap(); gbrapf16_to(&src, &mut sink).unwrap(); @@ -1171,11 +1177,11 @@ fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { /// regression that drops the bit-normalize-first step in /// `widen_f16_be_to_host_f32::` would interpret byte-swapped bits as /// host-native f16 and decode to wildly wrong f32 values. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1225,6 +1231,7 @@ fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f32(&mut rgb_f32) .unwrap(); gbrpf16_to(&src, &mut sink).unwrap(); @@ -1239,11 +1246,11 @@ fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrapf16`] **widening path** /// (`with_rgba_f32`, including the α plane). Exercises the four-plane f16 → /// f32 widen step — same bit-normalise-first contract as the no-α variant. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1276,6 +1283,7 @@ fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f32(&mut rgba_f32) .unwrap(); gbrapf16_to(&src, &mut sink).unwrap(); @@ -1301,11 +1309,11 @@ fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { /// through `::` (`true` on BE, `false` on LE), which makes /// the kernel byte-swap a no-op on every host. Vacuous on LE; would catch /// the double-swap on BE. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1361,6 +1369,7 @@ fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { let mut luma_u16 = std::vec![0u16; w * h]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_u16(&mut rgb_u16) .unwrap() .with_luma_u16(&mut luma_u16) @@ -1420,11 +1429,11 @@ fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { /// byte-for-byte. The standalone path uses `gbrapf32_to_rgba_row::` /// (already endian-aware), so any deviation indicates the Strategy A+ /// alpha-patch path corrupted the α plane. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { // 15 is non-multiple-of-{4,8,16} — exercises scalar tail in every backend. 
let w = 15usize; @@ -1466,6 +1475,7 @@ fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { let mut rgba_ref = std::vec![0u8; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba(&mut rgba_ref) .unwrap(); gbrapf32_to(&src, &mut sink).unwrap(); @@ -1476,6 +1486,7 @@ fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { let mut rgba_combo = std::vec![0u8; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb(&mut rgb_combo) .unwrap() .with_rgba(&mut rgba_combo) @@ -1494,11 +1505,11 @@ fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { /// current sinker calls `gbrapf32_to_rgba_u16_row::` directly here /// (no alpha-patch helper invocation), but any future routing change that /// switches to the alpha-patch helper must keep BE-host correctness. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { // 17 is non-multiple-of-{4,8,16}. 
let w = 17usize; @@ -1535,6 +1546,7 @@ fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { let mut rgba_ref = std::vec![0u16; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_u16(&mut rgba_ref) .unwrap(); gbrapf32_to(&src, &mut sink).unwrap(); @@ -1545,6 +1557,7 @@ fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { let mut rgba_combo = std::vec![0u16; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_u16(&mut rgb_combo) .unwrap() .with_rgba_u16(&mut rgba_combo) @@ -1581,11 +1594,11 @@ fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { /// `widen_and_scatter_f16_alpha_to_u8` helper (both paths share the /// `widen_and_scatter` helper, so this test guards against the /// post-widen routing flag being wrong). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { let w = 15usize; let h = 3usize; @@ -1631,6 +1644,7 @@ fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { let mut rgba_ref = std::vec![0u8; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba(&mut rgba_ref) .unwrap(); gbrapf16_to(&src, &mut sink).unwrap(); @@ -1641,6 +1655,7 @@ fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { let mut rgba_combo = std::vec![0u8; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb(&mut rgb_combo) .unwrap() .with_rgba(&mut rgba_combo) From fd797f94a052d107007e54a0032b07464d7ecfee Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 22:01:15 +1200 Subject: [PATCH 08/10] docs(frame): correct AV_PIX_FMT_RGBF32 binding statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 5th-pass review of PR #92 caught: the prior doc said FFmpeg "does not define AV_PIX_FMT_RGBF32" — incorrect. FFmpeg actually defines AV_PIX_FMT_RGBF32BE, AV_PIX_FMT_RGBF32LE, and an unsuffixed alias AV_PIX_FMT_RGBF32 that is target-endian (resolves to LE on LE hosts and BE on BE hosts). Corrected: Rgbf32Frame maps to AV_PIX_FMT_RGBF32LE explicitly, with a warning that BE-host callers holding target-endian RGBF32 bytes must convert to LE before constructing the frame (otherwise the LE-decode contract would re-interpret BE bytes as LE → byte-swapped float data). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/packed_rgb_float.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/frame/packed_rgb_float.rs b/src/frame/packed_rgb_float.rs index f002ecd2..7ff0b660 100644 --- a/src/frame/packed_rgb_float.rs +++ b/src/frame/packed_rgb_float.rs @@ -65,16 +65,18 @@ pub enum Rgbf32FrameError { /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. /// -/// # Endian contract — **LE-encoded bytes** +/// # Endian contract — **LE-encoded bytes** (`AV_PIX_FMT_RGBF32LE`) /// /// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as -/// `f32`. Note that, unlike RGBA float and the f16 variants, FFmpeg does -/// **not** define an `AV_PIX_FMT_RGBF32` constant for non-α 3-channel f32 -/// (only the 4-channel `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` -/// pair exists). `colconv` pins this frame to the canonical FFmpeg `*LE` -/// byte-order convention — i.e. callers should pass LE-encoded bytes -/// regardless of host endianness, exactly as they would for the existing -/// `AV_PIX_FMT_RGBAF32LE` flavour. +/// `f32`. This frame maps to FFmpeg `AV_PIX_FMT_RGBF32LE`. FFmpeg also +/// defines `AV_PIX_FMT_RGBF32BE` and an unsuffixed `AV_PIX_FMT_RGBF32` +/// alias that is **target-endian** (resolves to `RGBF32LE` on LE hosts and +/// `RGBF32BE` on BE hosts). **Callers on a BE host who hold target-endian +/// `AV_PIX_FMT_RGBF32` bytes must convert them to LE before constructing +/// this frame** — otherwise the LE-decode contract here would re-interpret +/// the BE bytes as LE and produce byte-swapped float data. The 4-channel +/// `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` pair follows the same +/// `*LE` convention; this frame uses the analogous LE binding. 
/// /// On a little-endian host (every CI runner today) LE bytes _are_ /// host-native, so `&[f32]` is also a host-native float slice; on a From fbb49a199232e0a9d0cbe5bd4c91538b6284669a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 22:21:57 +1200 Subject: [PATCH 09/10] test: gate or force-scalar Ya16 sinker tests that hit vld2q_u16 under miri MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #92's CI miri jobs (aarch64-apple-darwin) failed with `unsupported operation: can't call foreign function llvm.aarch64.neon.ld2.v8i16.p0`. Stack pointed at the `ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded` test calling `MixedSinker::::process` → `dispatch::ya16::ya16_to_rgb_row` → `arch::neon::gray::ya16_to_rgb_row::` → `vld2q_u16` (NEON intrinsic Miri does not implement). Both `ya16_combined_*` tests added in `cf26058` (this PR) use width=8 — the exact threshold where the NEON kernel kicks in (`while x + 8 <= width`) — so on aarch64 hosts the SIMD body executes. Other Ya16 sinker tests in this module use width 1 / 2 which never reaches the `vld2q_u16` block. These two tests are BE-contract regressions: they verify combined (`with_rgb` + `with_rgba`) and standalone (`with_rgba`) paths agree on the LE-encoded `Ya16Frame`, locking down the endian-aware `copy_alpha_ya_u16_to_u8::` and `copy_alpha_ya_u16::` helpers. They must execute on BE-host miri (s390x / powerpc64) — gating them with `cfg_attr(miri, ignore)` would skip exactly the host where BE corruption surfaces. So they get the same Option A treatment 75cf865 applied to the planar / packed-rgb BE-contract regressions: prepend `.with_simd(false)` to every `MixedSinker::new(..)` builder chain so the sinker dispatch routes to the scalar kernel — no SIMD intrinsic, miri clean, BE validation preserved. 
Audit summary (other tests in `sinker::mixed::gray::tests`): * All `gray*` / `gray*_walker_smoke_test` / `gray*_limited_range_*` / `grayf32_with_*` integration tests use width <= 4 — far below the NEON SIMD threshold (8 lanes) — so they never invoke `vld2q_u16`. * `gray*_walker_smoke_test` and similar use the gray-family kernels (`gray9` / `gray12` / `gray14`) which dispatch through different row kernels with different SIMD-eligibility thresholds; none hit `vld2q_u16` at width=4. * `grayf32_sinker_le_encoded_frame_decodes_correctly` retains its existing `cfg_attr(miri, ignore = "...")` (added in `cf26058`) — left untouched in this commit; if a follow-up wants to align it with the 75cf865 un-gating pattern it can be done independently. * `ya8_*` tests use the Ya8 kernels (no `vld2q_u16` — Ya8 deinterleave uses `vld2_u8` which Miri *does* support) and existing width=128/130 smoke is already `cfg_attr(miri, ignore)`. * `ya16_with_*` tests (rgba_u16 / rgba_u8 / rgb_and_rgba / hsv) all use width 1 / 2 — below the SIMD threshold — so they execute the scalar tail unconditionally on every host. * `ya16_width_128_and_130_smoke` is already `cfg_attr(miri, ignore)` (Option B) and stays so — it explicitly validates the SIMD-dispatched smoke path. Tests fixed (2 — both via Option A): * `ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded` * `ya16_combined_rgb_u16_and_rgba_u16_alpha_matches_standalone_le_encoded` Each gains `.with_simd(false)` on both the combined sinker (with_rgb + with_rgba builder chain) and the standalone sinker (with_rgba only), matching the pattern in `tests/packed_rgb_float.rs::rgbf32_sinker_le_encoded_frame_decodes_correctly`. Verified: * `cargo test --target aarch64-apple-darwin --lib` (2285 pass, 0 fail). * `cargo build --target x86_64-apple-darwin --tests` (0 warnings). * `cargo build --no-default-features`. * `cargo fmt --check`. * `cargo clippy --all-targets --all-features -- -D warnings`. 
* `cargo miri test --lib sinker::mixed::gray::tests::ya16_combined` (2 pass — confirms the fix on aarch64-apple-darwin miri, the exact CI host that previously failed). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/gray.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/sinker/mixed/gray.rs b/src/sinker/mixed/gray.rs index 4cd03d13..b07b9191 100644 --- a/src/sinker/mixed/gray.rs +++ b/src/sinker/mixed/gray.rs @@ -2453,11 +2453,16 @@ mod tests { let frame = Ya16Frame::new(&packed, w, h, w * 2); // Run combined (with_rgb + with_rgba) — exercises Strategy A+ with the - // newly endian-aware `copy_alpha_ya_u16_to_u8::`. + // newly endian-aware `copy_alpha_ya_u16_to_u8::`. Forces + // `with_simd(false)` so the test runs purely scalar — no SIMD intrinsics + // — which lets it execute under `cargo miri test`. BE CI is driven by + // miri on s390x / powerpc64; gating it out of miri would skip exactly + // the host where BE corruption would surface. let mut rgb_combined = std::vec![0u8; (w * h * 3) as usize]; let mut rgba_combined = std::vec![0u8; (w * h * 4) as usize]; { let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) .with_rgb(&mut rgb_combined) .unwrap() .with_rgba(&mut rgba_combined) @@ -2466,10 +2471,11 @@ mod tests { } // Run standalone (with_rgba only) — exercises the endian-aware - // `ya16_to_rgba_row::` kernel. + // `ya16_to_rgba_row::` kernel. Same scalar-only rationale. let mut rgba_standalone = std::vec![0u8; (w * h * 4) as usize]; { let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) .with_rgba(&mut rgba_standalone) .unwrap(); ya16_to(&frame, FR, M, &mut sink).unwrap(); @@ -2506,10 +2512,15 @@ mod tests { .collect(); let frame = Ya16Frame::new(&packed, w, h, w * 2); + // Forces `with_simd(false)` so this test runs purely scalar — no SIMD + // intrinsics — which lets it execute under `cargo miri test`. 
BE CI is + // driven by miri on s390x / powerpc64; gating it out of miri would skip + // exactly the host where BE corruption would surface. let mut rgb_combined = std::vec![0u16; (w * h * 3) as usize]; let mut rgba_combined = std::vec![0u16; (w * h * 4) as usize]; { let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) .with_rgb_u16(&mut rgb_combined) .unwrap() .with_rgba_u16(&mut rgba_combined) @@ -2520,6 +2531,7 @@ mod tests { let mut rgba_standalone = std::vec![0u16; (w * h * 4) as usize]; { let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) .with_rgba_u16(&mut rgba_standalone) .unwrap(); ya16_to(&frame, FR, M, &mut sink).unwrap(); From cd73821046161afb4bed241550c1c3421ff42475 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 22:23:57 +1200 Subject: [PATCH 10/10] docs(sinker): correct stale post-widen routing comment in planar_gbr_f16 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot review on PR #92 caught: the comment at planar_gbr_f16.rs:359 said the downstream gbrpf32_to_* kernel was invoked with `BE = false`, but cf26058 changed the routing to `BE = HOST_NATIVE_BE` (correct for host-native f32 scratch produced by the bit-normalising widen helper). Updated comment to reflect actual routing + reinforced the distinction between post-widen scratch (host-native → HOST_NATIVE_BE) and direct- Frame paths (LE-encoded → ::) so future readers don't conflate the two contracts. Copilot findings 2 + 3 (Ya16 tests at gray.rs:2470,2528 missing .with_simd(false)) were already addressed in fbb49a1 — Copilot reviewed pre-fix state. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/planar_gbr_f16.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 5e7c0dda..22159b92 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -355,8 +355,10 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { while offset < w { let n = (w - offset).min(WIDEN_CHUNK); // Bit-normalise LE-encoded f16 plane bits → host-native f32 so the - // downstream `gbrpf32_to_*` kernel (invoked with `BE = false`) sees - // host-native f32 on every host. See the canonical helper's docs. + // downstream `gbrpf32_to_*` kernel (invoked with `BE = HOST_NATIVE_BE` + // — see module-scope constant) sees host-native f32 on every host. + // The post-widen scratch is host-native, distinct from the direct- + // Frame paths which use `` per the LE-encoded byte contract. widen_f16_be_to_host_f32::(g_in, offset, &mut gf_chunk, n); widen_f16_be_to_host_f32::(b_in, offset, &mut bf_chunk, n); widen_f16_be_to_host_f32::(r_in, offset, &mut rf_chunk, n);