From fb3c4bc8d8bc8577847b3a5238cbd6f3ba55ac0b Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 19:36:30 +1200 Subject: [PATCH 01/10] docs(frame): clarify LE-encoded byte contract on f32/f16 frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Affected types: Rgbf32Frame, Rgbf16Frame, Gbrpf32Frame, Gbrapf32Frame, Gbrpf16Frame, Gbrapf16Frame. Each doc-comment now explicitly states the LE-encoded byte contract, mirroring the established Grayf32Frame doc pattern: the &[f32] / &[f16] plane is the LE-encoded byte layout reinterpreted as f32 / f16, matching FFmpeg's canonical *LE pixel-format suffixes (AV_PIX_FMT_GBRPF32LE etc). Adds the bytemuck::cast_slice + linesize-division-by-element-size instruction so callers wiring up FFmpeg buffers don't have to guess. Codex 3rd-pass review of PR #85 caught that PR #83 + PR #84 introduced sinker routings that assumed the planes are already host-native f32 / f16 — a wrong reading of the *LE contract that would corrupt every output path on a BE host. Pinning the doc-level contract here so future readers don't repeat the mistake; sinker revert lands in the next commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/packed_rgb_f16.rs | 17 ++++++++++- src/frame/packed_rgb_float.rs | 17 ++++++++++- src/frame/planar_gbr_float.rs | 56 +++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/src/frame/packed_rgb_f16.rs b/src/frame/packed_rgb_f16.rs index 1469df86..ecf9e8c6 100644 --- a/src/frame/packed_rgb_f16.rs +++ b/src/frame/packed_rgb_f16.rs @@ -50,7 +50,8 @@ pub enum Rgbf16FrameError { }, } -/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`). +/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`, +/// FFmpeg canonical `*LE` convention — see endian note below). /// One plane, 3 × `f16` per pixel, channel order `R, G, B`. 
/// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -65,6 +66,20 @@ pub enum Rgbf16FrameError { /// `stride` is in **`f16` elements** (≥ `3 * width`), matching the /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The `&[half::f16]` plane is the **LE-encoded byte layout** reinterpreted +/// as `f16`, matching the FFmpeg `*LE` pixel-format convention. On a +/// little-endian host (every CI runner today) LE bytes _are_ host-native, +/// so `&[half::f16]` is also a host-native f16 slice; on a big-endian host +/// the bytes have to be byte-swapped back to host-native before arithmetic. +/// Downstream row kernels handle this byte-swap (or no-op on LE) under the +/// hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding a byte buffer +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgbf16Frame<'a> { rgb: &'a [half::f16], diff --git a/src/frame/packed_rgb_float.rs b/src/frame/packed_rgb_float.rs index d1f79cef..166b0848 100644 --- a/src/frame/packed_rgb_float.rs +++ b/src/frame/packed_rgb_float.rs @@ -50,7 +50,8 @@ pub enum Rgbf32FrameError { }, } -/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`). +/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`, +/// FFmpeg canonical `*LE` convention — see endian note below). /// One plane, 3 × `f32` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -64,6 +65,20 @@ pub enum Rgbf32FrameError { /// `stride` is in **`f32` elements** (≥ `3 * width`), matching the /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. 
+/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as +/// `f32`, matching the FFmpeg `*LE` pixel-format convention. On a +/// little-endian host (every CI runner today) LE bytes _are_ host-native, +/// so `&[f32]` is also a host-native float slice; on a big-endian host the +/// bytes have to be byte-swapped back to host-native before arithmetic. +/// Downstream row kernels handle this byte-swap (or no-op on LE) under the +/// hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding a byte buffer +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgbf32Frame<'a> { rgb: &'a [f32], diff --git a/src/frame/planar_gbr_float.rs b/src/frame/planar_gbr_float.rs index d612e715..505a7bd7 100644 --- a/src/frame/planar_gbr_float.rs +++ b/src/frame/planar_gbr_float.rs @@ -147,6 +147,20 @@ const fn check_plane( /// `f32` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are /// preserved bit-exact on lossless pass-through outputs and clamped to /// `[0.0, 1.0]` on integer-output paths. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The three `&[f32]` planes are the **LE-encoded byte layout** reinterpreted +/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format +/// name. On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so the slices are also host-native float slices; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 4 before constructing. 
#[derive(Debug, Clone, Copy)] pub struct Gbrpf32Frame<'a> { g: &'a [f32], @@ -250,6 +264,20 @@ impl<'a> Gbrpf32Frame<'a> { /// Four full-resolution `f32` planes in **G, B, R, A** order. Alpha is /// real per-pixel; nominal range `[0.0, 1.0]` (opaque = 1.0). Stride is /// in `f32` elements. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The four `&[f32]` planes are the **LE-encoded byte layout** reinterpreted +/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format +/// name. On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so the slices are also host-native float slices; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrapf32Frame<'a> { g: &'a [f32], @@ -372,6 +400,20 @@ impl<'a> Gbrapf32Frame<'a> { /// Three full-resolution [`half::f16`] planes in **G, B, R** order. Stride /// is in `f16` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are /// permitted (saturation to `+Inf` occurs on f16→f32 narrowing paths). +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The three `&[half::f16]` planes are the **LE-encoded byte layout** +/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in +/// the format name. On a little-endian host (every CI runner today) LE +/// bytes _are_ host-native, so the slices are also host-native f16 slices; +/// on a big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle this +/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap. 
+/// +/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrpf16Frame<'a> { g: &'a [half::f16], @@ -475,6 +517,20 @@ impl<'a> Gbrpf16Frame<'a> { /// Four full-resolution [`half::f16`] planes in **G, B, R, A** order. /// Alpha is real per-pixel; nominal range `[0.0, 1.0]`. Stride is in /// `f16` elements. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The four `&[half::f16]` planes are the **LE-encoded byte layout** +/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in +/// the format name. On a little-endian host (every CI runner today) LE +/// bytes _are_ host-native, so the slices are also host-native f16 slices; +/// on a big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle this +/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrapf16Frame<'a> { g: &'a [half::f16], From 740bc0d696918ad70fb1a50f9d4287d73faa9e1f Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 19:40:09 +1200 Subject: [PATCH 02/10] fix(sinker): revert HOST_NATIVE_BE routing for f32/f16 RGB & GBR sinkers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors PR #85's Grayf32 sinker revert (52f8191) for the other six float frame types affected by PR #83 (dcf40a3) and PR #84 (8627280): Rgbf32, Rgbf16, Gbrpf32, Gbrapf32, Gbrpf16, Gbrapf16. 
The earlier PRs introduced sinker-layer `HOST_NATIVE_BE` consts that made the row-kernel `BE` parameter a no-op byte-swap on whichever host the build targeted. That assumed the Frame's plane bytes are already host-native f32/f16 — which contradicts the LE-encoded byte contract clarified in the previous commit. On a BE host the routing skipped the required `u32::from_le` / `u16::from_le` swap and would have corrupted every output path. Reverted call sites (all `::<HOST_NATIVE_BE>` -> `::<false>`): - src/sinker/mixed/packed_rgb_float.rs: 6 sites (Rgbf32) - src/sinker/mixed/packed_rgb_f16.rs: 7 sites (Rgbf16) - src/sinker/mixed/planar_gbr_float.rs: 22 sites (Gbrpf32 + Gbrapf32) - src/sinker/mixed/planar_gbr_f16.rs: 22 sites (Gbrpf16 + Gbrapf16) Removes the four `const HOST_NATIVE_BE: bool = cfg!(target_endian = "big")` definitions and their explanatory doc-comments. Rewrites the `widen_f16_to_f32` doc in planar_gbr_f16.rs to accurately describe the LE-encoded contract (helper is correct on LE host; cross-endian widen should use the bit-normalising `widen_f16_be_to_host_f32` pattern). Untouched: `BE == HOST_NATIVE_BE` `copy_from_slice` fast paths in scalar row kernels (`rgbf32_to_rgb_f32_row`, `gbrpf32_to_rgb_f32_row`, etc.) — generic in `BE`, correct under either contract. Per-arch SIMD kernels and `widen_f16_be_to_host_f32` left alone — different layer. 
Verification (aarch64-apple-darwin): cargo test --target aarch64-apple-darwin --lib # 2246 ok Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/packed_rgb_f16.rs | 33 ++------- src/sinker/mixed/packed_rgb_float.rs | 31 ++------ src/sinker/mixed/planar_gbr_f16.rs | 102 +++++++++++---------------- src/sinker/mixed/planar_gbr_float.rs | 74 ++++++------------- 4 files changed, 75 insertions(+), 165 deletions(-) diff --git a/src/sinker/mixed/packed_rgb_f16.rs b/src/sinker/mixed/packed_rgb_f16.rs index 62ab1cc8..e349f130 100644 --- a/src/sinker/mixed/packed_rgb_f16.rs +++ b/src/sinker/mixed/packed_rgb_f16.rs @@ -34,25 +34,6 @@ use crate::{ yuv::{Rgbf16, Rgbf16Row, Rgbf16Sink}, }; -/// `BE` value that makes the `rgbf16_to_*` row dispatchers treat their input as -/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf16Frame`] -/// exposes a `&[half::f16]` row in **host-native** layout — the API contract is that the -/// caller hands us already-decoded half-floats. The kernel `BE` parameter, -/// however, names the **encoded** byte order (so `BE = false` means "decode -/// LE-encoded bytes" via `u16::from_le`). On a LE host the host-native layout -/// is LE, so `BE = false` is correct; on a BE host the host-native layout is -/// BE, so we must request `BE = true` to make `u16::from_be` no-op the swap. -/// Without this routing the loaders would byte-swap an already-decoded host- -/// native `f16` on BE hosts, corrupting every output path. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` — same truth table, different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. 
-const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Rgbf16 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf16> { @@ -253,27 +234,27 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if let Some(buf) = rgb_f16.as_deref_mut() { let f16_start = one_plane_start * 3; let f16_end = one_plane_end * 3; - rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); + rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); } // Lossless f32 widen — also independent of integer conversion paths. if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct half-float → u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct half-float → u16 conversion (no staging). 
if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float → u8 when no RGB / luma / @@ -288,7 +269,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -307,7 +288,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { w, h, )?; - rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -347,7 +328,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { // over `rgb_row` via `expand_rgb_to_rgba_row`. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_float.rs b/src/sinker/mixed/packed_rgb_float.rs index f189e5ab..e1c17a39 100644 --- a/src/sinker/mixed/packed_rgb_float.rs +++ b/src/sinker/mixed/packed_rgb_float.rs @@ -31,25 +31,6 @@ use crate::{ yuv::{Rgbf32, Rgbf32Row, Rgbf32Sink}, }; -/// `BE` value that makes the `rgbf32_to_*` row dispatchers treat their input as -/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf32Frame`] -/// exposes a `&[f32]` row in **host-native** layout — the API contract is that the caller -/// hands us already-decoded floats. 
The kernel `BE` parameter, however, names -/// the **encoded** byte order (so `BE = false` means "decode LE-encoded bytes" -/// via `u32::from_le`). On a LE host the host-native layout is LE, so -/// `BE = false` is correct; on a BE host the host-native layout is BE, so we -/// must request `BE = true` to make `u32::from_be` no-op the swap. Without this -/// routing the loaders would byte-swap an already-decoded host-native `f32` on -/// BE hosts, corrupting every output path. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` — same truth table, different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Rgbf32 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf32> { @@ -228,20 +209,20 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct float→u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct float→u16 conversion (no staging). 
if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float→u8 conversion when @@ -256,7 +237,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -276,7 +257,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { w, h, )?; - rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -318,7 +299,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { // less memory pass for combined `with_rgb + with_rgba` callers. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index c041b901..6a0a351e 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -64,49 +64,27 @@ const GBR_F16_FULL_RANGE: bool = true; // Chunk size for the inline f16→f32 widening scratch arrays (stack-allocated). const WIDEN_CHUNK: usize = 64; -/// `BE` value that makes the `gbrpf16_to_*` / `gbrapf16_to_*` row dispatchers -/// (and the widened `gbrpf32_to_*` chain after `widen_f16_to_f32`) treat -/// their input as **host-native** (a no-op byte-swap). 
-/// -/// [`crate::frame::Gbrpf16Frame`] / [`crate::frame::Gbrapf16Frame`] expose -/// `&[half::f16]` plane rows in **host-native** layout — the API contract -/// is that the caller hands us already-decoded half-floats. The kernel `BE` -/// parameter, however, names the **encoded** byte order (so `BE = false` -/// means "decode LE-encoded bytes" via `u16::from_le`). On a LE host the -/// host-native layout is LE, so `BE = false` is correct; on a BE host the -/// host-native layout is BE, so we must request `BE = true` to make -/// `u16::from_be` no-op the swap. Without this routing the loaders would -/// byte-swap an already-decoded host-native `f16` on BE hosts, corrupting -/// every output path (codex PR #84 Finding 3). -/// -/// Crucially, the **widened f32 chain** must also use `HOST_NATIVE_BE`: -/// after [`widen_f16_to_f32`] (which calls `half::f16::to_f32` on host-native -/// f16 bits) the scratch is host-native f32, so the downstream -/// `gbrpf32_to_*` kernel's `from_le`/`from_be` loader must be a no-op — -/// achieved by routing with `HOST_NATIVE_BE`. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced in `c3a6478` and the `Rgbf16` sinker fix in -/// `dcf40a3`. Same truth table: +/// Widen `width` `half::f16` values from `src` into `dst` (f32 elements). /// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. +/// The source slice is `&[half::f16]` from a [`Gbrpf16Frame`] / +/// [`Gbrapf16Frame`] plane — i.e. **LE-encoded bytes** reinterpreted as +/// `half::f16` (per the FFmpeg `*LE` contract). On a little-endian host +/// LE bytes _are_ host-native, so `half::f16::to_f32` interprets the bits +/// correctly and the downstream `gbrpf32_to_*` kernel — invoked with +/// `BE = false` — sees host-native f32 (its `u32::from_le` is a no-op on +/// LE host) → correct. 
/// -/// The α-plane scatter for [`Gbrapf16`] (Strategy A+ / standalone-RGBA) -/// widens the host-native f16 α plane to host-native f32 via -/// [`widen_f16_to_f32`] then calls `copy_alpha_plane_f32_to_u8` — both -/// operations are endian-agnostic. Mix-mode corruption (LE-decoded RGB + -/// host-native α) is therefore eliminated by routing the RGB chain via -/// `HOST_NATIVE_BE`. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - -/// Widen `width` `half::f16` values from `src` into `dst` (f32 elements). +/// On a big-endian host this helper would mis-interpret the LE-encoded +/// bits as host-native (a separate bug from the sinker-layer routing +/// reverted in this PR); the bit-normalising scalar helper +/// [`crate::row::scalar::widen_f16_be_to_host_f32`] is the correct pattern +/// for cross-endian widen-then-convert chains and is what the per-arch +/// SIMD backends use. Wiring this sinker-local helper to that bit- +/// normalising version is left as a future improvement — every CI runner +/// today is LE and exercises the correct path here. /// -/// The source slice is `&[half::f16]` in **host-native** layout (per the -/// `Gbrpf16Frame` / `Gbrapf16Frame` API contract); `to_f32` interprets the -/// bits as host-native and emits host-native `f32`. Downstream `gbrpf32_to_*` -/// callers must therefore route with [`HOST_NATIVE_BE`] (not the encoded -/// `BE`) to avoid double byte-swapping. 
+/// [`Gbrpf16Frame`]: crate::frame::Gbrpf16Frame +/// [`Gbrapf16Frame`]: crate::frame::Gbrapf16Frame #[cfg_attr(not(tarpaulin), inline(always))] fn widen_f16_to_f32(src: &[half::f16], dst: &mut [f32], count: usize) { for i in 0..count { @@ -351,7 +329,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -363,7 +341,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { height: h, channels: 4, })?; - gbrpf16_to_rgba_f16_row::( + gbrpf16_to_rgba_f16_row::( g_in, b_in, r_in, @@ -407,29 +385,29 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.luma.as_deref_mut() { - 
gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( gf, bf, rf, @@ -442,7 +420,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( gf, bf, rf, @@ -455,7 +433,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( gf, bf, rf, @@ -480,7 +458,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -504,7 +482,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { w, h, )?; - gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); // Strategy A: expand RGB → RGBA (constant α = 0xFF). if let Some(buf) = rgba.as_deref_mut() { @@ -761,7 +739,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // rgb_f16: no source α — use the no-α kernel (lossless scatter). 
let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -774,7 +752,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { height: h, channels: 4, })?; - gbrapf16_to_rgba_f16_row::( + gbrapf16_to_rgba_f16_row::( g_in, b_in, r_in, @@ -821,14 +799,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { // gbrapf32_to_rgba_f32_row with widened source α (lossless). let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_f32_row::( + gbrapf32_to_rgba_f32_row::( gf, bf, rf, @@ -842,14 +820,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { // gbrapf32_to_rgba_u16_row with widened source α. 
let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_u16_row::( + gbrapf32_to_rgba_u16_row::( gf, bf, rf, @@ -861,7 +839,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(buf) = self.luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( gf, bf, rf, @@ -874,7 +852,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( gf, bf, rf, @@ -887,7 +865,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( gf, bf, rf, @@ -918,7 +896,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; // Write opaque RGB → RGBA (α = 0xFF), then overwrite α from source. - gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); // Scatter f16 α → u8 slot 3: widen + clamp + scale. widen_and_scatter_f16_alpha_to_u8(a_in, rgba_row, w); return Ok(()); @@ -944,7 +922,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { w, h, )?; - gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from source. 
if let Some(buf) = rgba.as_deref_mut() { diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index 7b979049..361eee76 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -53,36 +53,6 @@ use crate::{ const GBR_FLOAT_LUMA_MATRIX: ColorMatrix = ColorMatrix::Bt709; const GBR_FLOAT_FULL_RANGE: bool = true; -/// `BE` value that makes the `gbrpf32_to_*` / `gbrapf32_to_*` row dispatchers -/// treat their input as **host-native** (a no-op byte-swap). -/// -/// [`crate::frame::Gbrpf32Frame`] / [`crate::frame::Gbrapf32Frame`] expose -/// `&[f32]` plane rows in **host-native** layout — the API contract is that -/// the caller hands us already-decoded floats. The kernel `BE` parameter, -/// however, names the **encoded** byte order (so `BE = false` means "decode -/// LE-encoded bytes" via `u32::from_le`). On a LE host the host-native layout -/// is LE, so `BE = false` is correct; on a BE host the host-native layout is -/// BE, so we must request `BE = true` to make `u32::from_be` no-op the swap. -/// Without this routing the loaders would byte-swap an already-decoded host- -/// native `f32` on BE hosts, corrupting every output path (codex PR #84 -/// Finding 2). -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` and the `Rgbf32` sinker fix in `dcf40a3`. Same truth table, -/// different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -/// -/// The α-plane scatter (Strategy A+ / standalone-RGBA) consumes the host- -/// native `&[f32]` α plane via `copy_alpha_plane_f32_to_u8`, which is endian- -/// agnostic — there's no BE branching needed for the α path because it does -/// not byte-load through `from_le`/`from_be`. 
Mix-mode corruption (LE-decoded -/// RGB + host-native α) is therefore eliminated by routing the RGB chain via -/// `HOST_NATIVE_BE`. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Gbrpf32 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf32> { @@ -321,7 +291,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -333,7 +303,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f32_row::( + gbrpf32_to_rgba_f32_row::( g_in, b_in, r_in, @@ -348,7 +318,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -360,7 +330,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f16_row::( + gbrpf32_to_rgba_f16_row::( g_in, b_in, r_in, @@ -375,12 +345,12 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_u16_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + 
gbrpf32_to_rgba_u16_row::(g_in, b_in, r_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path (no RGB / luma / HSV needed) ------- @@ -395,7 +365,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf32_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -422,10 +392,10 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { w, h, )?; - gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( g_in, b_in, r_in, @@ -438,7 +408,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( g_in, b_in, r_in, @@ -451,7 +421,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( g_in, b_in, r_in, @@ -721,7 +691,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -733,7 +703,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f32_row::( + gbrapf32_to_rgba_f32_row::( g_in, b_in, r_in, @@ -749,7 +719,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut 
buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -761,7 +731,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f16_row::( + gbrapf32_to_rgba_f16_row::( g_in, b_in, r_in, @@ -777,14 +747,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGBA path (direct — source α clamped + scaled) ------------- if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_u16_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_u16_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path ------------------------------------ @@ -799,7 +769,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); return Ok(()); } @@ -826,10 +796,10 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { w, h, )?; - gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( g_in, b_in, r_in, @@ -842,7 +812,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + 
gbrpf32_to_luma_u16_row::( g_in, b_in, r_in, @@ -855,7 +825,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( g_in, b_in, r_in, From da794cedfa61508dcc1d85fefffdb76b140665e7 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 19:46:55 +1200 Subject: [PATCH 03/10] test(sinker): replace wrong-contract f32/f16 sinker tests with LE-encoded regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the host-native-contract tests that PR #83 + PR #84 added — they were typed against an incorrect host-native-f32/f16 reading of the Frame contract and would mask the BE-host corruption introduced by the now- reverted `::` sinker routings: - rgbf32_kernel_host_native_be_matches_false_on_le_host (deleted) - rgbf32_sinker_host_native_contract_lossless_passthrough (deleted) - rgbf16_kernel_host_native_be_matches_false_on_le_host (deleted) - rgbf16_sinker_host_native_contract_lossless_passthrough (deleted) - gbrpf32_kernel_host_native_be_matches_false_on_le_host (deleted) - gbrpf32_sinker_host_native_contract_lossless_passthrough (deleted) - gbrpf16_kernel_host_native_be_matches_false_on_le_host (deleted) - gbrpf16_sinker_host_native_contract_lossless_passthrough (deleted) - gbrapf32_sinker_host_native_contract_lossless_passthrough_with_alpha (deleted) - gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha (deleted) Adds LE-encoded byte contract regressions following PR #85's `52f8191` pattern (`grayf32_sinker_le_encoded_frame_decodes_correctly`): - rgbf32_sinker_le_encoded_frame_decodes_correctly (new) - rgbf16_sinker_le_encoded_frame_decodes_correctly (new) - gbrpf32_sinker_le_encoded_frame_decodes_correctly (new) - gbrapf32_sinker_le_encoded_frame_decodes_correctly (new) - gbrpf16_sinker_le_encoded_frame_decodes_correctly (new) - gbrapf16_sinker_le_encoded_frame_decodes_correctly 
(new) Each test constructs an explicitly LE-encoded plane via `f32::from_bits(intended.to_bits().to_le())` (or the f16 analogue), builds the Frame, runs the sinker's lossless pass-through, and asserts the output equals the host-native intended values. Vacuous on LE hosts (where `to_le` is identity) but on a BE host any regression that drops the `::` kernel routing would fail fast. Also picks up two trivial pre-existing trailing-blank-line cleanups in `src/row/arch/{neon,x86_sse41}/endian.rs` produced by `cargo fmt` while running it across the modified files. Verification (aarch64-apple-darwin, x86_64-apple-darwin, wasm32): cargo test --target aarch64-apple-darwin --lib # 2242 ok cargo build --target x86_64-apple-darwin --tests # ok RUSTFLAGS="-C target-feature=+simd128" \ cargo build --target wasm32-unknown-unknown --tests # ok cargo build --no-default-features # ok cargo fmt --check # ok cargo clippy --all-targets --all-features -- -D warnings # ok Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/planar_gbr_f16.rs | 39 +- src/sinker/mixed/planar_gbr_float.rs | 38 +- src/sinker/mixed/tests/packed_rgb_f16.rs | 133 ++---- src/sinker/mixed/tests/packed_rgb_float.rs | 146 ++----- src/sinker/mixed/tests/planar_gbr_float.rs | 454 ++++++++------------- 5 files changed, 225 insertions(+), 585 deletions(-) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 6a0a351e..150ef729 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -341,14 +341,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { height: h, channels: 4, })?; - gbrpf16_to_rgba_f16_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf16_to_rgba_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- Paths that require widening f16 → f32 --------------------------- @@ -752,15 +745,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { height: h, channels: 4, })?; - 
gbrapf16_to_rgba_f16_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf16_to_rgba_f16_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- Paths that require widening f16 → f32 --------------------------- @@ -806,15 +791,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // gbrapf32_to_rgba_f32_row with widened source α (lossless). let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_f32_row::( - gf, - bf, - rf, - af, - &mut buf[start..end], - n, - use_simd, - ); + gbrapf32_to_rgba_f32_row::(gf, bf, rf, af, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgb_u16.as_deref_mut() { @@ -827,15 +804,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // gbrapf32_to_rgba_u16_row with widened source α. let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_u16_row::( - gf, - bf, - rf, - af, - &mut buf[start..end], - n, - use_simd, - ); + gbrapf32_to_rgba_u16_row::(gf, bf, rf, af, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.luma.as_deref_mut() { diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index 361eee76..cb71de5f 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -303,14 +303,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f32_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf32_to_rgba_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -330,14 +323,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f16_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf32_to_rgba_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB / RGBA path (direct float → 
u16, no staging) ----------- @@ -703,15 +689,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f32_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf32_to_rgba_f32_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -731,15 +709,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f16_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf32_to_rgba_f16_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB path (direct, no staging) ------------------------------ diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 1cd09a1f..0f227c14 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -311,122 +311,36 @@ fn rgbf16_simd_matches_scalar_with_random_input() { assert_eq!(rgb_f16_simd, pix, "RGB f16 output is not lossless"); } -/// Sinker-layer host-native-`f16` regression for the bug fixed alongside -/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf16`] sinker used to -/// hardcode `::` when calling the row dispatchers, telling them to -/// "decode LE-encoded input". Because [`Rgbf16Frame`] hands us a host-native -/// `&[half::f16]` row, that routing was a no-op on LE hosts but corrupted -/// every output path on BE hosts (the `u16` loaders would byte-swap an -/// already-decoded f16 bit-pattern). The fix replaces those `::` with -/// `::`, which is `false` on LE and `true` on BE — a no-op -/// byte-swap on either host. 
+/// LE-encoded byte contract regression: builds an [`Rgbf16Frame`] from a +/// `&[half::f16]` plane explicitly encoded as LE bytes (per the FFmpeg +/// `AV_PIX_FMT_*LE` convention documented on `Rgbf16Frame`), runs it +/// through the sinker's `with_rgb_f16` lossless pass-through, and asserts +/// the output equals the host-native intended values. /// -/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run), -/// `HOST_NATIVE_BE = false` and `::` is byte-for-byte -/// identical to `::`, so this test cannot distinguish the broken vs -/// fixed code on LE. It instead documents the equivalence at the **kernel -/// dispatch** layer — calling each `rgbf16_to_*` dispatcher with both -/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) -/// must produce identical output on the active host. +/// Vacuous on LE hosts (where `to_le` on a `u16` is a no-op so the LE- +/// encoded plane *is* host-native), but on a BE host this would fail fast +/// for any regression that drops the `::` kernel routing — the +/// kernel must apply `u16::from_le` to recover host-native f16 bit-patterns +/// from the LE-encoded bytes. /// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the -/// equality `::` ≡ `::` is _false_ — `::` -/// decodes the host-native fixture as if it were LE-encoded (byte-swap), -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. The dispatch-equivalence claim is specifically -/// about the LE host-routing pattern; the BE-host correctness of the routing -/// change is verified instead by -/// [`rgbf16_sinker_host_native_contract_lossless_passthrough`] and the -/// row-kernel BE parity tests in `src/row/arch/*/tests/`. +/// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. 
#[test] -#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn rgbf16_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - rgbf16_to_rgb_f16_row, rgbf16_to_rgb_f32_row, rgbf16_to_rgb_row, rgbf16_to_rgb_u16_row, - rgbf16_to_rgba_row, rgbf16_to_rgba_u16_row, - }; - - // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. - // Compute it locally so the test asserts the same condition without taking - // a dependency on a private const. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33 covers SIMD main loop + scalar tail across every backend. - let w = 33usize; - let f32_inputs = [0.0f32, 0.5, 1.0, 1.75, -0.25]; - let pix: std::vec::Vec = (0..w * 3) - .map(|i| half::f16::from_f32(f32_inputs[i % f32_inputs.len()])) - .collect(); - - // u8 RGB. - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - rgbf16_to_rgb_row::(&pix, &mut rgb_false, w, true); - rgbf16_to_rgb_row::(&pix, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); - - // u8 RGBA. - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - rgbf16_to_rgba_row::(&pix, &mut rgba_false, w, true); - rgbf16_to_rgba_row::(&pix, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); - - // u16 RGB. - let mut rgb_u16_false = std::vec![0u16; w * 3]; - let mut rgb_u16_host = std::vec![0u16; w * 3]; - rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); - rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); - assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); - - // u16 RGBA. 
- let mut rgba_u16_false = std::vec![0u16; w * 4]; - let mut rgba_u16_host = std::vec![0u16; w * 4]; - rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); - rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); - assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); - - // f16 lossless pass-through. - let mut f16_false = std::vec![half::f16::ZERO; w * 3]; - let mut f16_host = std::vec![half::f16::ZERO; w * 3]; - rgbf16_to_rgb_f16_row::(&pix, &mut f16_false, w, true); - rgbf16_to_rgb_f16_row::(&pix, &mut f16_host, w, true); - assert_eq!(f16_false, f16_host, "f16 RGB diverges"); - if !HOST_NATIVE_BE { - assert_eq!( - f16_host, pix, - "f16 lossless pass-through corrupted on LE host" - ); - } - - // f32 lossless widen. - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - rgbf16_to_rgb_f32_row::(&pix, &mut f32_false, w, true); - rgbf16_to_rgb_f32_row::(&pix, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 widen diverges"); -} - -/// End-to-end sinker contract test: feeding host-native `half::f16` through -/// [`MixedSinker`] must round-trip the f16 input bit-exact via -/// `with_rgb_f16` on every host. Documents the public-API contract that the -/// [`HOST_NATIVE_BE`] routing fix preserves. Pairs with the kernel-level -/// test above; together they cover both the dispatch boundary and the public -/// sinker boundary. -#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf16_sinker_host_native_contract_lossless_passthrough() { +fn rgbf16_sinker_le_encoded_frame_decodes_correctly() { let vals_f32 = [0.5f32, 1.5, -0.25, 100.0]; - let pix: std::vec::Vec = (0..16 * 4 * 3) + let intended: Vec = (0..16 * 4 * 3) .map(|i| half::f16::from_f32(vals_f32[i % vals_f32.len()])) .collect(); + // Encode the plane as LE bytes reinterpreted as f16 (the documented + // `*LE` Frame contract). On LE host: identity. 
On BE host: byte-swapped + // bit-patterns the kernel must `from_le` back to host-native. + let pix: Vec = intended + .iter() + .map(|&v| half::f16::from_bits(v.to_bits().to_le())) + .collect(); let src = Rgbf16Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); let mut rgb_f16_out = std::vec![half::f16::ZERO; 16 * 4 * 3]; @@ -435,7 +349,8 @@ fn rgbf16_sinker_host_native_contract_lossless_passthrough() { .unwrap(); rgbf16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); - // Bit-exact pass-through on every host — broken `::` routing - // would byte-swap on a BE host; the fixed routing keeps the f16 intact. - assert_eq!(rgb_f16_out, pix, "Rgbf16 sinker f16 pass-through corrupted"); + assert_eq!( + rgb_f16_out, intended, + "Rgbf16 sinker LE-encoded plane decoded incorrectly" + ); } diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs index fd4df2da..38f23f28 100644 --- a/src/sinker/mixed/tests/packed_rgb_float.rs +++ b/src/sinker/mixed/tests/packed_rgb_float.rs @@ -246,130 +246,44 @@ fn rgbf32_simd_matches_scalar_with_random_input() { assert_eq!(rgb_f32_simd, pix, "RGB f32 output is not lossless"); } -/// Sinker-layer host-native-`f32` regression for the bug fixed alongside -/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf32`] sinker used to -/// hardcode `::` when calling the row dispatchers, telling them to -/// "decode LE-encoded input". Because [`Rgbf32Frame`] hands us a host-native -/// `&[f32]` row, that routing was a no-op on LE hosts but corrupted every -/// output path on BE hosts (the loaders would byte-swap an already-decoded -/// f32). The fix replaces those `::` with `::`, which -/// is `false` on LE and `true` on BE — a no-op byte-swap on either host. 
+/// LE-encoded byte contract regression: builds an [`Rgbf32Frame`] from a +/// `&[f32]` plane explicitly encoded as LE bytes (per the FFmpeg +/// `AV_PIX_FMT_*LE` convention documented on `Rgbf32Frame`), runs it +/// through the sinker's `with_rgb_f32` lossless pass-through, and asserts +/// the output equals the host-native intended values. /// -/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run), -/// `HOST_NATIVE_BE = false` and `::` is byte-for-byte -/// identical to `::`, so this test cannot distinguish the broken vs -/// fixed code on LE. It instead documents the equivalence at the **kernel -/// dispatch** layer — calling each `rgbf32_to_*` dispatcher with both -/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) -/// must produce identical output on the active host. +/// Vacuous on LE hosts (where `f32::to_le_bytes` is a no-op so the LE- +/// encoded plane *is* host-native), but on a BE host this would fail fast +/// for any regression that drops the `::` kernel routing — the +/// kernel must apply `u32::from_le` to recover host-native f32 from the +/// LE-encoded bytes; if it skipped the swap (e.g. `::` on +/// BE), the output would be byte-swapped relative to `intended`. /// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the -/// equality `::` ≡ `::` is _false_ — `::` -/// decodes the host-native fixture as if it were LE-encoded (byte-swap), -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. The dispatch-equivalence claim is specifically -/// about the LE host-routing pattern; the BE-host correctness of the routing -/// change is verified instead by -/// [`rgbf32_sinker_host_native_contract_lossless_passthrough`] and the -/// row-kernel BE parity tests in `src/row/arch/*/tests/`. +/// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. 
#[test] -#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn rgbf32_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - rgbf32_to_rgb_f32_row, rgbf32_to_rgb_row, rgbf32_to_rgb_u16_row, rgbf32_to_rgba_row, - rgbf32_to_rgba_u16_row, - }; - - // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. - // Compute it locally so the test asserts the same condition without taking - // a dependency on a private const. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33 covers SIMD main loop + scalar tail across every backend. - let w = 33usize; - let mut pix = std::vec![0.0f32; w * 3]; - for (i, v) in pix.iter_mut().enumerate() { - // Mix in-range, HDR, and negative values to exercise every clamp branch. - *v = match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }; - } - - // u8 RGB. - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - rgbf32_to_rgb_row::(&pix, &mut rgb_false, w, true); - rgbf32_to_rgb_row::(&pix, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); - - // u8 RGBA. - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - rgbf32_to_rgba_row::(&pix, &mut rgba_false, w, true); - rgbf32_to_rgba_row::(&pix, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); - - // u16 RGB. - let mut rgb_u16_false = std::vec![0u16; w * 3]; - let mut rgb_u16_host = std::vec![0u16; w * 3]; - rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); - rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); - assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); - - // u16 RGBA. 
- let mut rgba_u16_false = std::vec![0u16; w * 4]; - let mut rgba_u16_host = std::vec![0u16; w * 4]; - rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); - rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); - assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); - - // f32 lossless pass-through. - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - rgbf32_to_rgb_f32_row::(&pix, &mut f32_false, w, true); - rgbf32_to_rgb_f32_row::(&pix, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 RGB diverges"); - // And on the host (LE on every CI runner) both must equal `pix` bit-exact. - if !HOST_NATIVE_BE { - assert_eq!( - f32_host, pix, - "f32 lossless pass-through corrupted on LE host" - ); - } -} - -/// End-to-end sinker contract test: feeding host-native `f32` through -/// [`MixedSinker`] must produce the same output every other sinker -/// would expect from a host-native source — specifically, `with_rgb_f32` -/// must be bit-exact identical to the input on every host. Documents the -/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. -/// Pairs with the kernel-level test above; together they cover both the -/// dispatch boundary and the public sinker boundary. -#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf32_sinker_host_native_contract_lossless_passthrough() { +fn rgbf32_sinker_le_encoded_frame_decodes_correctly() { // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. - let mut pix = std::vec![0.0f32; 16 * 4 * 3]; - for (i, v) in pix.iter_mut().enumerate() { - *v = match i % 4 { + let intended: Vec = (0..16 * 4 * 3) + .map(|i| match i % 4 { 0 => 0.5, 1 => 1.5, 2 => -0.25, _ => 100.0, - }; - } + }) + .collect(); + // Construct the plane as LE-encoded bytes reinterpreted as f32 (the + // documented `*LE` Frame contract). 
On LE host this is identity; on BE + // host the bit-pattern is byte-swapped so the kernel must `from_le` it + // back to host-native. + let pix: Vec = intended + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); let src = Rgbf32Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); let mut rgb_f32_out = std::vec![0.0f32; 16 * 4 * 3]; @@ -378,8 +292,10 @@ fn rgbf32_sinker_host_native_contract_lossless_passthrough() { .unwrap(); rgbf32_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); - // Bit-exact pass-through on every host. On the buggy `::` routing - // a BE host would see byte-swapped output here; on the fixed routing the - // assertion holds on both LE and BE. - assert_eq!(rgb_f32_out, pix, "Rgbf32 sinker f32 pass-through corrupted"); + // Output must be host-native intended values. On a BE host with a + // regressed `::` routing this would be byte-swapped. + assert_eq!( + rgb_f32_out, intended, + "Rgbf32 sinker LE-encoded plane decoded incorrectly" + ); } diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index e6420e59..0676c34e 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -862,291 +862,172 @@ fn gbrapf32_rgba_f16_strategy_a_plus_matches_independent_kernel() { ); } -// ---- HOST_NATIVE_BE routing parity (codex PR #84 Findings 1-3) ------------- +// ---- LE-encoded byte contract regressions (post-#83/#84/#85 audit) -------- // -// LE-host routing-equivalence and host-native sinker-contract tests for the -// `Gbrpf32` / `Gbrapf32` / `Gbrpf16` / `Gbrapf16` sinkers. Mirrors the -// `Rgbf32` / `Rgbf16` sinker tests added for PR #83's `dcf40a3` (sinker -// HOST_NATIVE_BE routing) and `c3a6478` (dispatch f16-widen HOST_NATIVE_BE -// routing). +// Each of the four float planar GBR Frame types is documented as +// LE-encoded bytes reinterpreted as `f32` / `half::f16` (FFmpeg `*LE` +// pixel-format convention). 
The sinker row-kernel dispatch must apply +// `u32::from_le` / `u16::from_le` (kernel `BE = false`) to recover host- +// native arithmetic from those bytes. These tests build a plane explicitly +// from LE-encoded bit patterns (`f32::from_bits(intended.to_bits().to_le())` +// and the f16 analogue) and assert the lossless pass-through output equals +// the host-native intended values. // -// On a LE host `HOST_NATIVE_BE = false`, so the kernel-level test below is -// a routing sanity check (proving the dispatcher / sinker substitute the -// correct `BE` template parameter); BE-host correctness of the routing is -// verified by the existing row-kernel BE parity tests in -// `src/row/arch/*/tests/` and by the contract tests below (which assert -// host-native pass-through end-to-end on every host). - -/// Kernel-level test: on a LE host, `gbrpf32_to_*::` and -/// `gbrpf32_to_*::` must produce byte-identical output for -/// every Tier 10 float planar GBR dispatcher across every output type -/// (u8 RGB / u8 RGBA / u16 RGB / u16 RGBA / f32 lossless). Width 33 covers -/// SIMD main loop + scalar tail across every backend; width 5 covers tail- -/// only paths. -/// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host -/// `::` decodes the host-native fixture as LE-encoded (byte-swap) -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. This sinker-routing-equivalence claim is -/// specifically about the LE host pattern; BE-host correctness of the -/// routing change is verified by the contract tests below and the row- -/// kernel BE parity tests in `src/row/arch/*/tests/`. 
-#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf32_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - gbrpf32_to_rgb_f32_row, gbrpf32_to_rgb_row, gbrpf32_to_rgb_u16_row, gbrpf32_to_rgba_row, - gbrpf32_to_rgba_u16_row, - }; - - // Sinker-layer `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`; on - // the LE-host gate this evaluates to `false`. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33: SIMD main loop + scalar tail. Width 5: tail-only path. Run both - // to cover SIMD-tail-aware backends. - for w in [5usize, 7usize, 33usize] { - let mut gp = std::vec![0.0f32; w]; - let mut bp = std::vec![0.0f32; w]; - let mut rp = std::vec![0.0f32; w]; - for (i, (g, (b, r))) in gp - .iter_mut() - .zip(bp.iter_mut().zip(rp.iter_mut())) - .enumerate() - { - *g = match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }; - *b = match i % 5 { - 0 => 0.25, - 1 => 0.75, - 2 => 1.5, - 3 => 0.0, - _ => -0.5, - }; - *r = match i % 5 { - 0 => 1.0, - 1 => 0.5, - 2 => 0.0, - 3 => -0.25, - _ => 1.25, - }; - } - - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - gbrpf32_to_rgb_row::(&gp, &bp, &rp, &mut rgb_false, w, true); - gbrpf32_to_rgb_row::(&gp, &bp, &rp, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges (w = {w})"); - - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - gbrpf32_to_rgba_row::(&gp, &bp, &rp, &mut rgba_false, w, true); - gbrpf32_to_rgba_row::(&gp, &bp, &rp, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges (w = {w})"); - - let mut u16_false = std::vec![0u16; w * 3]; - let mut u16_host = std::vec![0u16; w * 3]; - gbrpf32_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_false, w, true); - gbrpf32_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_host, w, true); - 
assert_eq!(u16_false, u16_host, "u16 RGB diverges (w = {w})"); - - let mut u16a_false = std::vec![0u16; w * 4]; - let mut u16a_host = std::vec![0u16; w * 4]; - gbrpf32_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_false, w, true); - gbrpf32_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_host, w, true); - assert_eq!(u16a_false, u16a_host, "u16 RGBA diverges (w = {w})"); - - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - gbrpf32_to_rgb_f32_row::(&gp, &bp, &rp, &mut f32_false, w, true); - gbrpf32_to_rgb_f32_row::(&gp, &bp, &rp, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 RGB diverges (w = {w})"); - } -} +// Vacuous on LE host (where `to_le` is identity so the LE-encoded plane is +// host-native already), but on a BE host any regression that drops the +// `::` routing would be caught here — kernel without `from_le` would +// emit byte-swapped bit-patterns, failing the bit-exact assertion below. +// +// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. -/// Sinker contract test: feeding host-native `f32` planes through -/// [`MixedSinker`] must produce the same output every other sinker -/// would expect from a host-native source — specifically, `with_rgb_f32` -/// must be bit-exact identical to the source on every host. Documents the -/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. -/// Pairs with the kernel-level test above; together they cover both the -/// dispatch boundary and the public sinker boundary. +/// LE-encoded byte contract regression for [`Gbrpf32`]. #[test] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn gbrpf32_sinker_host_native_contract_lossless_passthrough() { +fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. 
- let mut gp = std::vec![0.0f32; w * h]; - let mut bp = std::vec![0.0f32; w * h]; - let mut rp = std::vec![0.0f32; w * h]; - for (i, (g, (b, r))) in gp - .iter_mut() - .zip(bp.iter_mut().zip(rp.iter_mut())) - .enumerate() - { - *g = match i % 4 { + let intended_g: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 0.5, 1 => 1.5, 2 => -0.25, _ => 100.0, - }; - *b = match i % 4 { + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 0.0, 1 => 0.25, 2 => 1.0, _ => f32::INFINITY, - }; - *r = match i % 4 { + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 1.0, 1 => -1.0, 2 => 65505.0, _ => 0.5, - }; - } + }) + .collect(); + // LE-encode each plane (per the documented `*LE` Frame contract). + let gp: Vec = intended_g + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); + let bp: Vec = intended_b + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); + let rp: Vec = intended_r + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); let src = Gbrpf32Frame::try_new( &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, ) .unwrap(); - // rgb_f32 lossless: each pixel `(R, G, B)` packed in source plane order. let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; let mut sink = MixedSinker::::new(w, h) .with_rgb_f32(&mut rgb_f32) .unwrap(); gbrpf32_to(&src, &mut sink).unwrap(); - // The lossless scatter writes `(R, G, B)` per pixel in plane-index order. - // Bit-exact equality on every host. Buggy `::` routing on a BE host - // would byte-swap the output here; the fix keeps it bit-exact. 
for i in 0..(w * h) { - assert_eq!(rgb_f32[i * 3], rp[i], "R mismatch at idx {i}"); - assert_eq!(rgb_f32[i * 3 + 1], gp[i], "G mismatch at idx {i}"); - assert_eq!(rgb_f32[i * 3 + 2], bp[i], "B mismatch at idx {i}"); + assert_eq!( + rgb_f32[i * 3].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgb_f32[i * 3 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgb_f32[i * 3 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); } } -/// Same as [`gbrpf32_kernel_host_native_be_matches_false_on_le_host`] but -/// for the `Gbrpf16` family — covers both `use_simd = false` (dispatch's -/// scalar widen-fallback) and `use_simd = true` (SIMD widen path) at tail -/// widths 5, 7, 33 to exercise every backend's main loop + scalar tail. +/// LE-encoded byte contract regression for [`Gbrapf32`] (lossless RGBA +/// pass-through, including the α plane). #[test] -#[cfg(target_endian = "little")] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn gbrpf16_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - gbrpf16_to_rgb_row, gbrpf16_to_rgb_u16_row, gbrpf16_to_rgba_row, gbrpf16_to_rgba_u16_row, +fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h).map(|i| 0.1 + (i as f32) * 0.001).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.2 + (i as f32) * 0.002).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.3 + (i as f32) * 0.003).collect(); + let intended_a: Vec = (0..w * h).map(|i| 0.5 + (i as f32) * 0.0005).collect(); + + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - for w in [5usize, 7usize, 33usize] { - let gp: Vec = (0..w) - .map(|i| { - 
half::f16::from_f32(match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }) - }) - .collect(); - let bp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 0.25, - 1 => 0.75, - 2 => 1.5, - 3 => 0.0, - _ => -0.5, - }) - }) - .collect(); - let rp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 1.0, - 1 => 0.5, - 2 => 0.0, - 3 => -0.25, - _ => 1.25, - }) - }) - .collect(); - - // Both `use_simd = false` and `use_simd = true` to cover dispatch's - // scalar widen-fallback and the SIMD widen path on every backend. - for use_simd in [false, true] { - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - gbrpf16_to_rgb_row::(&gp, &bp, &rp, &mut rgb_false, w, use_simd); - gbrpf16_to_rgb_row::(&gp, &bp, &rp, &mut rgb_host, w, use_simd); - assert_eq!( - rgb_false, rgb_host, - "u8 RGB diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - gbrpf16_to_rgba_row::(&gp, &bp, &rp, &mut rgba_false, w, use_simd); - gbrpf16_to_rgba_row::(&gp, &bp, &rp, &mut rgba_host, w, use_simd); - assert_eq!( - rgba_false, rgba_host, - "u8 RGBA diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut u16_false = std::vec![0u16; w * 3]; - let mut u16_host = std::vec![0u16; w * 3]; - gbrpf16_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_false, w, use_simd); - gbrpf16_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_host, w, use_simd); - assert_eq!( - u16_false, u16_host, - "u16 RGB diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut u16a_false = std::vec![0u16; w * 4]; - let mut u16a_host = std::vec![0u16; w * 4]; - gbrpf16_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_false, w, use_simd); - gbrpf16_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_host, w, use_simd); - assert_eq!( - u16a_false, u16a_host, - "u16 RGBA diverges (w = {w}, use_simd = {use_simd})" - ); - } + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, 
w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgba_f32(&mut rgba_f32) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!( + rgba_f32[i * 4].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 3].to_bits(), + intended_a[i].to_bits(), + "A idx {i}" + ); } } -/// Sinker contract: host-native `half::f16` source through [`MixedSinker`] -/// `with_rgb_f16` must round-trip the planes bit-exact on every host. The -/// `::` routing keeps the lossless interleave a no-op in the -/// BE-load layer; the buggy `::` routing on a BE host would byte-swap -/// every f16 element. +/// LE-encoded byte contract regression for [`Gbrpf16`]. 
#[test] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { +fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let gp: Vec = (0..w * h) + let intended_g: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 0.5, @@ -1156,7 +1037,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); - let bp: Vec = (0..w * h) + let intended_b: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 0.0, @@ -1166,7 +1047,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); - let rp: Vec = (0..w * h) + let intended_r: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 1.0, @@ -1176,6 +1057,14 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); let src = Gbrpf16Frame::try_new( &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, @@ -1191,46 +1080,53 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { for i in 0..(w * h) { assert_eq!( rgb_f16[i * 3].to_bits(), - rp[i].to_bits(), - "R mismatch at idx {i}" + intended_r[i].to_bits(), + "R idx {i}" ); assert_eq!( rgb_f16[i * 3 + 1].to_bits(), - gp[i].to_bits(), - "G mismatch at idx {i}" + intended_g[i].to_bits(), + "G idx {i}" ); assert_eq!( rgb_f16[i * 3 + 2].to_bits(), - bp[i].to_bits(), - "B mismatch at idx {i}" + intended_b[i].to_bits(), + "B idx {i}" ); } } -/// Sinker contract: [`MixedSinker`] `with_rgba_f16` must round-trip -/// the source α plane bit-exact alongside the G/B/R planes, on every host. 
-/// Validates Strategy A+ alpha consistency under the `HOST_NATIVE_BE` -/// routing — the previous mix-mode (LE-decoded RGB + host-native α) is gone. +/// LE-encoded byte contract regression for [`Gbrapf16`] (lossless RGBA +/// pass-through, including the α plane). #[test] #[cfg_attr( miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha() { +fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let gp: Vec = (0..w * h) + let intended_g: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.1 + (i as f32) * 0.001)) .collect(); - let bp: Vec = (0..w * h) + let intended_b: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.2 + (i as f32) * 0.002)) .collect(); - let rp: Vec = (0..w * h) + let intended_r: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.3 + (i as f32) * 0.003)) .collect(); - let ap: Vec = (0..w * h) + let intended_a: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.5 + (i as f32) * 0.001)) .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); let src = Gbrapf16Frame::try_new( &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, @@ -1244,52 +1140,26 @@ fn gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha() { gbrapf16_to(&src, &mut sink).unwrap(); for i in 0..(w * h) { - assert_eq!(rgba_f16[i * 4].to_bits(), rp[i].to_bits(), "R idx {i}"); - assert_eq!(rgba_f16[i * 4 + 1].to_bits(), gp[i].to_bits(), "G idx {i}"); - assert_eq!(rgba_f16[i * 4 + 2].to_bits(), bp[i].to_bits(), "B idx {i}"); - assert_eq!(rgba_f16[i * 4 + 3].to_bits(), ap[i].to_bits(), "A idx {i}"); - } -} - -/// Sinker contract: [`MixedSinker`] `with_rgba_f32` lossless -/// pass-through plus α — confirms Strategy A+ 
alpha consistency when the -/// f32 RGB chain routes via `HOST_NATIVE_BE`. The α plane is host-native -/// f32, also routed via `HOST_NATIVE_BE`, eliminating any mix-mode. -#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrapf32_sinker_host_native_contract_lossless_passthrough_with_alpha() { - let w = 16usize; - let h = 4usize; - let mut gp = std::vec![0.0f32; w * h]; - let mut bp = std::vec![0.0f32; w * h]; - let mut rp = std::vec![0.0f32; w * h]; - let mut ap = std::vec![0.0f32; w * h]; - for i in 0..(w * h) { - gp[i] = 0.1 + (i as f32) * 0.001; - bp[i] = 0.2 + (i as f32) * 0.002; - rp[i] = 0.3 + (i as f32) * 0.003; - ap[i] = 0.5 + (i as f32) * 0.0005; - } - - let src = Gbrapf32Frame::try_new( - &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, - ) - .unwrap(); - - let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; - let mut sink = MixedSinker::::new(w, h) - .with_rgba_f32(&mut rgba_f32) - .unwrap(); - gbrapf32_to(&src, &mut sink).unwrap(); - - for i in 0..(w * h) { - assert_eq!(rgba_f32[i * 4], rp[i], "R idx {i}"); - assert_eq!(rgba_f32[i * 4 + 1], gp[i], "G idx {i}"); - assert_eq!(rgba_f32[i * 4 + 2], bp[i], "B idx {i}"); - assert_eq!(rgba_f32[i * 4 + 3], ap[i], "A idx {i}"); + assert_eq!( + rgba_f16[i * 4].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 3].to_bits(), + intended_a[i].to_bits(), + "A idx {i}" + ); } } From 78f1c2f8ae0562b41a5635e1cb4014a19dbf69ba Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 20:35:28 +1200 Subject: [PATCH 04/10] fix(sinker): replace LE-host-only widen_f16_to_f32 helper with bit-normalize-first canonical helper MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the BE-host correctness gap flagged in the original body of #92:
the sinker-local `widen_f16_to_f32` in `src/sinker/mixed/planar_gbr_f16.rs`
called `half::f16::to_f32` directly on f16 plane bits, treating the bytes
as host-native. That assumption only holds on a little-endian host with
an LE-encoded source — on a big-endian host the LE-encoded f16 bits would
be byte-swapped from the intended values, and `to_f32` would decode them
as arbitrarily wrong magnitudes / signs / NaNs.

Replaces every call site with the canonical bit-normalize-first helper
`crate::row::scalar::planar_gbr_f16::widen_f16_be_to_host_f32::<false>`
(BE = false because Frame plane bytes are LE-encoded per the contract
documented in Phase 1 of this PR). The shared helper does
`f32 = f16::from_bits(u16::from_le(plane_bits)).to_f32()` — correct on
both LE hosts (where `from_le` is identity) and BE hosts (where it byte-
swaps the LE-encoded plane bits back to host-native f16 bits before the
f16 → f32 conversion). The downstream `gbrpf32_to_*` row kernels are
still invoked with `BE = false` since the widened scratch is now
host-native f32 (its `u32::from_le` is then a no-op on LE host and the
right byte-swap on BE host — same contract as the dispatch f16-widen
fallback and the per-backend SIMD scalar tails already use).

Updated 8 call sites:
- 3 in the `Gbrpf16` `process` widen loop (G, B, R planes).
- 4 in the `Gbrapf16` `process` widen loop (G, B, R, A planes).
- 1 in the `widen_and_scatter_f16_alpha_to_u8` u8 RGBA fast-path helper
  (counted as the eighth distinct call site; the other seven are the two
  widen-loop blocks).

Removed the local `widen_f16_to_f32` helper (and its now-stale
LE-host-only-caveat doc-comment) entirely — no remaining callers.
Adds two LE-encoded byte-contract regression tests covering the widening path (the existing PR #92 tests covered only the lossless f16 pass-through path which doesn't go through the widen helper): - gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly (Gbrpf16 → with_rgb_f32; exercises the 3-plane widen loop) - gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly (Gbrapf16 → with_rgba_f32; exercises the 4-plane widen loop incl. α) Each test constructs LE-encoded `&[half::f16]` planes via `half::f16::from_bits(intended.to_bits().to_le())`, builds a Frame, runs the widening sinker path, and asserts the f32 output equals the host- native expected values. Vacuous on LE hosts (where `to_le` is identity) but on a BE host any regression that drops the bit-normalize-first step in `widen_f16_be_to_host_f32::` would fail fast. Verification (aarch64-apple-darwin, x86_64-apple-darwin, wasm32): cargo test --target aarch64-apple-darwin --lib # 2269 ok cargo build --target x86_64-apple-darwin --tests # 0 warnings RUSTFLAGS="-C target-feature=+simd128" \ cargo build --target wasm32-unknown-unknown --tests # ok cargo build --no-default-features # ok cargo fmt --check # ok cargo clippy --all-targets --all-features -- -D warnings # ok Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/planar_gbr_f16.rs | 54 +++------ src/sinker/mixed/tests/planar_gbr_float.rs | 125 +++++++++++++++++++++ 2 files changed, 142 insertions(+), 37 deletions(-) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 150ef729..94fc963f 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -51,7 +51,8 @@ use crate::{ gbrapf32_to_rgba_u16_row, gbrpf16_to_rgb_f16_row, gbrpf16_to_rgb_row, gbrpf16_to_rgba_f16_row, gbrpf16_to_rgba_row, gbrpf32_to_hsv_row, gbrpf32_to_luma_row, gbrpf32_to_luma_u16_row, gbrpf32_to_rgb_f32_row, gbrpf32_to_rgb_u16_row, gbrpf32_to_rgba_f32_row, - gbrpf32_to_rgba_u16_row, 
scalar::alpha_extract::copy_alpha_plane_f32_to_u8, + gbrpf32_to_rgba_u16_row, + scalar::{alpha_extract::copy_alpha_plane_f32_to_u8, planar_gbr_f16::widen_f16_be_to_host_f32}, }, yuv::{Gbrapf16, Gbrapf16Row, Gbrapf16Sink, Gbrpf16, Gbrpf16Row, Gbrpf16Sink}, }; @@ -64,34 +65,6 @@ const GBR_F16_FULL_RANGE: bool = true; // Chunk size for the inline f16→f32 widening scratch arrays (stack-allocated). const WIDEN_CHUNK: usize = 64; -/// Widen `width` `half::f16` values from `src` into `dst` (f32 elements). -/// -/// The source slice is `&[half::f16]` from a [`Gbrpf16Frame`] / -/// [`Gbrapf16Frame`] plane — i.e. **LE-encoded bytes** reinterpreted as -/// `half::f16` (per the FFmpeg `*LE` contract). On a little-endian host -/// LE bytes _are_ host-native, so `half::f16::to_f32` interprets the bits -/// correctly and the downstream `gbrpf32_to_*` kernel — invoked with -/// `BE = false` — sees host-native f32 (its `u32::from_le` is a no-op on -/// LE host) → correct. -/// -/// On a big-endian host this helper would mis-interpret the LE-encoded -/// bits as host-native (a separate bug from the sinker-layer routing -/// reverted in this PR); the bit-normalising scalar helper -/// [`crate::row::scalar::widen_f16_be_to_host_f32`] is the correct pattern -/// for cross-endian widen-then-convert chains and is what the per-arch -/// SIMD backends use. Wiring this sinker-local helper to that bit- -/// normalising version is left as a future improvement — every CI runner -/// today is LE and exercises the correct path here. 
-/// -/// [`Gbrpf16Frame`]: crate::frame::Gbrpf16Frame -/// [`Gbrapf16Frame`]: crate::frame::Gbrapf16Frame -#[cfg_attr(not(tarpaulin), inline(always))] -fn widen_f16_to_f32(src: &[half::f16], dst: &mut [f32], count: usize) { - for i in 0..count { - dst[i] = src[i].to_f32(); - } -} - // ---- Gbrpf16 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf16> { @@ -365,9 +338,12 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { let mut offset = 0; while offset < w { let n = (w - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&g_in[offset..], &mut gf_chunk, n); - widen_f16_to_f32(&b_in[offset..], &mut bf_chunk, n); - widen_f16_to_f32(&r_in[offset..], &mut rf_chunk, n); + // Bit-normalise LE-encoded f16 plane bits → host-native f32 so the + // downstream `gbrpf32_to_*` kernel (invoked with `BE = false`) sees + // host-native f32 on every host. See the canonical helper's docs. + widen_f16_be_to_host_f32::(g_in, offset, &mut gf_chunk, n); + widen_f16_be_to_host_f32::(b_in, offset, &mut bf_chunk, n); + widen_f16_be_to_host_f32::(r_in, offset, &mut rf_chunk, n); let gf = &gf_chunk[..n]; let bf = &bf_chunk[..n]; let rf = &rf_chunk[..n]; @@ -769,10 +745,12 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { let mut offset = 0; while offset < w { let n = (w - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&g_in[offset..], &mut gf_chunk, n); - widen_f16_to_f32(&b_in[offset..], &mut bf_chunk, n); - widen_f16_to_f32(&r_in[offset..], &mut rf_chunk, n); - widen_f16_to_f32(&a_in[offset..], &mut af_chunk, n); + // Bit-normalise LE-encoded f16 plane bits → host-native f32 (see the + // canonical helper's docs); downstream kernel uses `BE = false`. 
+ widen_f16_be_to_host_f32::(g_in, offset, &mut gf_chunk, n); + widen_f16_be_to_host_f32::(b_in, offset, &mut bf_chunk, n); + widen_f16_be_to_host_f32::(r_in, offset, &mut rf_chunk, n); + widen_f16_be_to_host_f32::(a_in, offset, &mut af_chunk, n); let gf = &gf_chunk[..n]; let bf = &bf_chunk[..n]; let rf = &rf_chunk[..n]; @@ -915,7 +893,9 @@ fn widen_and_scatter_f16_alpha_to_u8(alpha_f16: &[half::f16], rgba_out: &mut [u8 let mut offset = 0; while offset < width { let n = (width - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&alpha_f16[offset..], &mut af_chunk, n); + // Bit-normalise LE-encoded f16 α bits → host-native f32 before clamping + // and scaling to u8 — correct on both LE and BE hosts. + widen_f16_be_to_host_f32::(alpha_f16, offset, &mut af_chunk, n); copy_alpha_plane_f32_to_u8(&af_chunk[..n], &mut rgba_out[offset * 4..], n); offset += n; } diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index 0676c34e..7673e0a2 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -1163,6 +1163,131 @@ fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { } } +/// LE-encoded byte contract regression for [`Gbrpf16`] **widening path** +/// (`with_rgb_f32`). Exercises the f16 → f32 widen step in the sinker — which +/// must bit-normalise LE-encoded f16 plane bits before converting to f32. +/// +/// Vacuous on LE hosts (where `to_le` is identity); on a BE host any +/// regression that drops the bit-normalize-first step in +/// `widen_f16_be_to_host_f32::` would interpret byte-swapped bits as +/// host-native f16 and decode to wildly wrong f32 values. 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.5, + 1 => 0.25, + 2 => 0.0, + _ => 1.0, + }) + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.125, + 1 => 0.75, + 2 => 0.0625, + _ => 0.875, + }) + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.375, + 1 => 0.625, + 2 => 0.9375, + _ => 0.03125, + }) + }) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + + let src = Gbrpf16Frame::try_new( + &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; + let mut sink = MixedSinker::::new(w, h) + .with_rgb_f32(&mut rgb_f32) + .unwrap(); + gbrpf16_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgb_f32[i * 3], intended_r[i].to_f32(), "R idx {i}"); + assert_eq!(rgb_f32[i * 3 + 1], intended_g[i].to_f32(), "G idx {i}"); + assert_eq!(rgb_f32[i * 3 + 2], intended_b[i].to_f32(), "B idx {i}"); + } +} + +/// LE-encoded byte contract regression for [`Gbrapf16`] **widening path** +/// (`with_rgba_f32`, including the α plane). Exercises the four-plane f16 → +/// f32 widen step — same bit-normalise-first contract as the no-α variant. 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.1 + (i as f32) * 0.001)) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.2 + (i as f32) * 0.002)) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.3 + (i as f32) * 0.003)) + .collect(); + let intended_a: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.5 + (i as f32) * 0.001)) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); + + let src = Gbrapf16Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgba_f32(&mut rgba_f32) + .unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba_f32[i * 4], intended_r[i].to_f32(), "R idx {i}"); + assert_eq!(rgba_f32[i * 4 + 1], intended_g[i].to_f32(), "G idx {i}"); + assert_eq!(rgba_f32[i * 4 + 2], intended_b[i].to_f32(), "B idx {i}"); + assert_eq!(rgba_f32[i * 4 + 3], intended_a[i].to_f32(), "A idx {i}"); + } +} + // ---- 32-bit overflow guards ------------------------------------------------ // // Feeding width = usize::MAX / 2 + 1 to a dispatcher must panic with a message From cf2605812c681d800b574c3fe99c682289385d8a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 21:01:31 +1200 Subject: [PATCH 05/10] fix(sinker): post-widen HOST_NATIVE_BE routing + Ya16 alpha endian-aware + frame docs cite LE names MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses three follow-up findings from Codex's review of PR #92. Finding 1 — Post-widen f32 routing on Gbrpf16 / Gbrapf16. The Phase 2 follow-up replaced the local `widen_f16_to_f32` helper with the canonical `widen_f16_be_to_host_f32::`, but the immediate downstream `gbrpf32_to_*::` calls still told the f32 row kernel to apply `from_le` AGAIN — on a BE host this would byte-swap the already-host-native f32 scratch, corrupting output. Introduce a module-scope `HOST_NATIVE_BE = cfg!(target_endian = "big")` constant in `src/sinker/mixed/planar_gbr_f16.rs` and route ALL post-widen `gbrpf32_to_*` / `gbrapf32_to_*` calls through `::` (14 sites total across both Gbrpf16 and Gbrapf16 sinker bodies). The kernel's `from_le` / `from_be` then becomes a no-op on every host since post-widen scratch already matches the host byte order. Distinct from the Phase 2 revert: direct Frame-to-row-kernel calls (`gbrpf16_to_*::` for u8 / f16 outputs) still use `` per the LE-encoded Frame contract — those receive raw LE-encoded `&[half::f16]` plane bytes, so the kernel must apply `from_le`. Post-widen scratch is host-native f32, so it must use `` to keep the swap a no-op everywhere. The module-level comment block makes this distinction explicit. Finding 2 — Ya16 RGB+RGBA combined-path alpha not endian-aware. When `MixedSinker` attached BOTH `with_rgb` and `with_rgba`, the sinker ran `ya16_to_rgb_*::` (correct), expanded the RGB output to RGBA prefix, and called `copy_alpha_ya_u16` / `copy_alpha_ya_u16_to_u8` to overlay the alpha plane from the source — but those helpers read raw `packed[n*2+1]` directly without `u16::from_le`. On a BE host processing the LE-encoded `Ya16Frame`, the host-native u16 read would interpret LE-encoded bytes as host-native (BE), yielding the wrong alpha byte. 
Audit found six alpha-patch helpers with the same pattern across three
formats — Ya16 (`copy_alpha_ya_u16` / `_to_u8`), AYUV64
(`copy_alpha_packed_u16x4_at_0` / `_to_u8_at_0`), and Rgba64 / Bgra64
(`copy_alpha_packed_u16x4_at_3` / `_to_u8_at_3`). All six were made
endian-aware via a `<const BE: bool>` parameter applying
`u16::from_le(packed[..])` (or `from_be` per BE) before scaling /
storing, mirroring the pre-existing `copy_alpha_plane_u16` /
`copy_alpha_plane_u16_to_u8` pattern from PR #81. Updated all 8 sinker
call sites to pass `<false>` (Frame contract is LE).

The two AYUV64 helpers also have SIMD dispatchers — those gained a
`<const BE: bool>` generic and the
`safe_for_simd = !BE && cfg!(target_endian = "little")` gate that
already exists for `copy_alpha_plane_u16_to_u8`, falling back to scalar
in the BE-host or BE-data quadrants. The five SIMD scalar tails were
pinned to `::<false>` matching the precedent from PR #81 / PR #82.

Finding 3 — Public Frame docs cite wrong FFmpeg names.

`Rgbf16Frame` docs cited `AV_PIX_FMT_RGBF16` (target-endian alias —
`RGBF16LE` on LE, `RGBF16BE` on BE). `Rgbf32Frame` docs cited
`AV_PIX_FMT_RGBF32` which doesn't exist as an unsuffixed alias; FFmpeg
has only the alpha-bearing `RGBAF32LE` / `RGBAF32BE` pair. A BE caller
following the unsuffixed contract could pass native-BE bytes into an
LE-decoder.

Updated `src/frame/packed_rgb_f16.rs` to cite `AV_PIX_FMT_RGBF16LE`
explicitly, with a parenthetical noting that the unsuffixed name is a
target-endian alias to avoid future confusion. Updated
`src/frame/packed_rgb_float.rs` to drop the citation of the
non-existent `AV_PIX_FMT_RGBF32` constant entirely and instead document
the contract as "LE-encoded f32 bytes — `colconv` pins to the canonical
FFmpeg `*LE` byte-order convention regardless of host endianness,
exactly as the existing `AV_PIX_FMT_RGBAF32LE` flavour".
Tests added: - `gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly` exercises the Gbrpf16 widen → f32 → u16 chain (with `with_luma_u16` alongside) under the LE-encoded Frame contract — vacuous on LE, catches the double-swap on BE. - `ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded` and the u16 sibling assert that Strategy A+ (combined `with_rgb` + `with_rgba`) produces α bytes byte-identical to the standalone `with_rgba` path on every host. Builds the `&[u16]` plane via `u16::from_ne_bytes(x.to_le_bytes())` so the in-memory bytes spell the LE-encoded contract on both LE and BE hosts. - 8 new BE-parity tests in `src/row/scalar/alpha_extract.rs` for the newly endian-aware AYUV64 / Rgba64 / Ya16 helpers (4 helpers × u8 / u16 variants), all using the `swap_bytes`-fixture pattern that exists for `copy_alpha_plane_u16` so the BE arm runs on every host without needing a real BE runner. Verified: - `cargo test --target aarch64-apple-darwin --lib` — 2278 passed - `cargo build --target x86_64-apple-darwin --tests` — clean - `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests` — clean - `cargo build --no-default-features` — clean - `cargo fmt --check` — clean - `cargo clippy --all-targets --all-features -- -D warnings` — clean - `cargo check --target s390x-unknown-linux-gnu --lib` — clean (BE-host smoke) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/packed_rgb_f16.rs | 20 +- src/frame/packed_rgb_float.rs | 22 +- src/row/arch/neon/alpha_extract.rs | 8 +- src/row/arch/wasm_simd128/alpha_extract.rs | 8 +- src/row/arch/x86_avx2/alpha_extract.rs | 8 +- src/row/arch/x86_avx512/alpha_extract.rs | 8 +- src/row/arch/x86_sse41/alpha_extract.rs | 8 +- src/row/dispatch/alpha_extract.rs | 35 ++-- src/row/scalar/alpha_extract.rs | 224 +++++++++++++++++++-- src/sinker/mixed/ayuv64.rs | 11 +- src/sinker/mixed/gray.rs | 129 +++++++++++- src/sinker/mixed/packed_rgb_16bit.rs | 28 ++- 
src/sinker/mixed/planar_gbr_f16.rs | 60 ++++-- src/sinker/mixed/tests/planar_gbr_float.rs | 109 ++++++++++ 14 files changed, 587 insertions(+), 91 deletions(-) diff --git a/src/frame/packed_rgb_f16.rs b/src/frame/packed_rgb_f16.rs index ecf9e8c6..d1decc2b 100644 --- a/src/frame/packed_rgb_f16.rs +++ b/src/frame/packed_rgb_f16.rs @@ -50,8 +50,7 @@ pub enum Rgbf16FrameError { }, } -/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`, -/// FFmpeg canonical `*LE` convention — see endian note below). +/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16LE`). /// One plane, 3 × `f16` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -70,12 +69,17 @@ pub enum Rgbf16FrameError { /// # Endian contract — **LE-encoded bytes** /// /// The `&[half::f16]` plane is the **LE-encoded byte layout** reinterpreted -/// as `f16`, matching the FFmpeg `*LE` pixel-format convention. On a -/// little-endian host (every CI runner today) LE bytes _are_ host-native, -/// so `&[half::f16]` is also a host-native f16 slice; on a big-endian host -/// the bytes have to be byte-swapped back to host-native before arithmetic. -/// Downstream row kernels handle this byte-swap (or no-op on LE) under the -/// hood — callers do **not** pre-swap. +/// as `f16`, matching the FFmpeg **`AV_PIX_FMT_RGBF16LE`** pixel-format +/// convention. (FFmpeg's unsuffixed `AV_PIX_FMT_RGBF16` is a *target-endian* +/// alias — `RGBF16LE` on a little-endian host, `RGBF16BE` on a big-endian +/// host — so this contract pins the canonical `*LE` byte order regardless +/// of host endianness.) +/// +/// On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so `&[half::f16]` is also a host-native f16 slice; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. 
Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. /// /// Stride is in **f16 elements** (not bytes). Callers holding a byte buffer /// from FFmpeg should cast via `bytemuck::cast_slice` and divide diff --git a/src/frame/packed_rgb_float.rs b/src/frame/packed_rgb_float.rs index 166b0848..f002ecd2 100644 --- a/src/frame/packed_rgb_float.rs +++ b/src/frame/packed_rgb_float.rs @@ -50,8 +50,7 @@ pub enum Rgbf32FrameError { }, } -/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`, -/// FFmpeg canonical `*LE` convention — see endian note below). +/// A validated packed **RGBF32** frame. /// One plane, 3 × `f32` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -69,12 +68,19 @@ pub enum Rgbf32FrameError { /// # Endian contract — **LE-encoded bytes** /// /// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as -/// `f32`, matching the FFmpeg `*LE` pixel-format convention. On a -/// little-endian host (every CI runner today) LE bytes _are_ host-native, -/// so `&[f32]` is also a host-native float slice; on a big-endian host the -/// bytes have to be byte-swapped back to host-native before arithmetic. -/// Downstream row kernels handle this byte-swap (or no-op on LE) under the -/// hood — callers do **not** pre-swap. +/// `f32`. Note that, unlike RGBA float and the f16 variants, FFmpeg does +/// **not** define an `AV_PIX_FMT_RGBF32` constant for non-α 3-channel f32 +/// (only the 4-channel `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` +/// pair exists). `colconv` pins this frame to the canonical FFmpeg `*LE` +/// byte-order convention — i.e. callers should pass LE-encoded bytes +/// regardless of host endianness, exactly as they would for the existing +/// `AV_PIX_FMT_RGBAF32LE` flavour. 
+///
+/// On a little-endian host (every CI runner today) LE bytes _are_
+/// host-native, so `&[f32]` is also a host-native float slice; on a
+/// big-endian host the bytes have to be byte-swapped back to host-native
+/// before arithmetic. Downstream row kernels handle this byte-swap (or
+/// no-op on LE) under the hood — callers do **not** pre-swap.
 ///
 /// Stride is in **f32 elements** (not bytes). Callers holding a byte buffer
 /// from FFmpeg should cast via `bytemuck::cast_slice` and divide
diff --git a/src/row/arch/neon/alpha_extract.rs b/src/row/arch/neon/alpha_extract.rs
index ffb04e6a..5135b3d6 100644
--- a/src/row/arch/neon/alpha_extract.rs
+++ b/src/row/arch/neon/alpha_extract.rs
@@ -116,7 +116,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -154,7 +154,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -357,7 +357,7 @@ mod tests {
             pseudo_random_u8(&mut rgba_simd, 0xFEED);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -375,7 +375,7 @@ mod tests {
             pseudo_random_u16(&mut rgba_simd, 0x1337);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/arch/wasm_simd128/alpha_extract.rs b/src/row/arch/wasm_simd128/alpha_extract.rs
index b999b618..bee7633d 100644
--- a/src/row/arch/wasm_simd128/alpha_extract.rs
+++ b/src/row/arch/wasm_simd128/alpha_extract.rs
@@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -226,7 +226,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -518,7 +518,7 @@ mod tests {
             unsafe {
                 super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w);
             }
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -538,7 +538,7 @@ mod tests {
             unsafe {
                 super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w);
             }
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/arch/x86_avx2/alpha_extract.rs b/src/row/arch/x86_avx2/alpha_extract.rs
index 1ebe97c1..2c58d3e9 100644
--- a/src/row/arch/x86_avx2/alpha_extract.rs
+++ b/src/row/arch/x86_avx2/alpha_extract.rs
@@ -213,7 +213,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -636,7 +636,7 @@ mod tests {
             pseudo_random_u8(&mut rgba_simd, 0xFEED);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -657,7 +657,7 @@ mod tests {
             pseudo_random_u16(&mut rgba_simd, 0x1337);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/arch/x86_avx512/alpha_extract.rs b/src/row/arch/x86_avx512/alpha_extract.rs
index 203e08e3..45743fb5 100644
--- a/src/row/arch/x86_avx512/alpha_extract.rs
+++ b/src/row/arch/x86_avx512/alpha_extract.rs
@@ -206,7 +206,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -604,7 +604,7 @@ mod tests {
             pseudo_random_u8(&mut rgba_simd, 0xFEED);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -627,7 +627,7 @@ mod tests {
             pseudo_random_u16(&mut rgba_simd, 0x1337);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/arch/x86_sse41/alpha_extract.rs b/src/row/arch/x86_sse41/alpha_extract.rs
index d327e299..4b1d800b 100644
--- a/src/row/arch/x86_sse41/alpha_extract.rs
+++ b/src/row/arch/x86_sse41/alpha_extract.rs
@@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_to_u8_at_0(
+        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -227,7 +227,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
     }
 
     if x < width {
-        scalar::copy_alpha_packed_u16x4_at_0(
+        scalar::copy_alpha_packed_u16x4_at_0::<false>(
             &packed[x * 4..width * 4],
             &mut rgba_out[x * 4..width * 4],
             width - x,
@@ -521,7 +521,7 @@ mod tests {
             pseudo_random_u8(&mut rgba_simd, 0xFEED);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
@@ -542,7 +542,7 @@ mod tests {
             pseudo_random_u16(&mut rgba_simd, 0x1337);
             let mut rgba_scalar = rgba_simd.clone();
             unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
-            scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
+            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
             assert_eq!(rgba_simd, rgba_scalar, "width={w}");
         }
     }
diff --git a/src/row/dispatch/alpha_extract.rs b/src/row/dispatch/alpha_extract.rs
index 00ecb61e..1e0351c5 100644
--- a/src/row/dispatch/alpha_extract.rs
+++ b/src/row/dispatch/alpha_extract.rs
@@ -95,17 +95,26 @@ pub(crate) fn copy_alpha_packed_u8x4_at_3(
 /// Runtime-dispatched α-extract for AYUV64 → u8 RGBA: gather α from
 /// `packed[0 + 4*n]` (u16) into `rgba_out[3 + 4*n]` (u8) via `>> 8`.
 ///
-/// Selects the highest available SIMD backend; falls back to scalar.
-/// When `use_simd` is `false`, calls scalar directly.
+/// `BE` selects the source `packed` plane byte order (`false` = LE on
+/// disk/wire — matching the LE-encoded `Ayuv64Frame` contract;
+/// `true` = BE). Like [`copy_alpha_plane_u16_to_u8`], the existing SIMD
+/// helpers use host-native u16 loads with no `from_le` / `from_be`
+/// normalisation, so SIMD is only correct on LE host processing LE
+/// source. The dispatcher computes
+/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and falls
+/// back to the target-endian-aware scalar in every other quadrant.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
+pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0<const BE: bool>(
     packed: &[u16],
     rgba_out: &mut [u8],
     width: usize,
     use_simd: bool,
 ) {
-    if !use_simd {
-        return scalar::copy_alpha_packed_u16x4_to_u8_at_0(packed, rgba_out, width);
+    // SIMD α-extract helpers use host-native u16 loads. Force scalar in
+    // any quadrant where source byte order doesn't match host byte order.
+    let safe_for_simd = !BE && cfg!(target_endian = "little");
+    if !safe_for_simd || !use_simd {
+        return scalar::copy_alpha_packed_u16x4_to_u8_at_0::<BE>(packed, rgba_out, width);
     }
     cfg_select! {
         target_arch = "aarch64" => {
@@ -141,7 +150,7 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
         },
         _ => {}
     }
-    scalar::copy_alpha_packed_u16x4_to_u8_at_0(packed, rgba_out, width);
+    scalar::copy_alpha_packed_u16x4_to_u8_at_0::<BE>(packed, rgba_out, width);
 }
 
 // ---------------------------------------------------------------------------
@@ -152,17 +161,19 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
 /// `packed[0 + 4*n]` (u16) into `rgba_out[3 + 4*n]` (u16). No depth
 /// conversion.
 ///
-/// Selects the highest available SIMD backend; falls back to scalar.
-/// When `use_simd` is `false`, calls scalar directly.
+/// `BE` selects the source `packed` plane byte order. See
+/// [`copy_alpha_packed_u16x4_to_u8_at_0`] for the rationale: SIMD is
+/// only correct on LE host with LE source; scalar is target-endian-aware.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn copy_alpha_packed_u16x4_at_0(
+pub(crate) fn copy_alpha_packed_u16x4_at_0<const BE: bool>(
     packed: &[u16],
     rgba_out: &mut [u16],
     width: usize,
     use_simd: bool,
 ) {
-    if !use_simd {
-        return scalar::copy_alpha_packed_u16x4_at_0(packed, rgba_out, width);
+    let safe_for_simd = !BE && cfg!(target_endian = "little");
+    if !safe_for_simd || !use_simd {
+        return scalar::copy_alpha_packed_u16x4_at_0::<BE>(packed, rgba_out, width);
     }
     cfg_select! {
         target_arch = "aarch64" => {
@@ -198,7 +209,7 @@ pub(crate) fn copy_alpha_packed_u16x4_at_0(
         },
         _ => {}
     }
-    scalar::copy_alpha_packed_u16x4_at_0(packed, rgba_out, width);
+    scalar::copy_alpha_packed_u16x4_at_0::<BE>(packed, rgba_out, width);
 }
 
 // ---------------------------------------------------------------------------
diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs
index 6c77346a..bbf5922b 100644
--- a/src/row/scalar/alpha_extract.rs
+++ b/src/row/scalar/alpha_extract.rs
@@ -27,7 +27,16 @@ pub(crate) fn copy_alpha_packed_u8x4_at_3(packed: &[u8], rgba_out: &mut [u8], wi
 /// into `rgba_out[3 + 4*n]` (u8 element) with depth-conv `>> 8`.
 ///
 /// AYUV64 layout per pixel: `[A(16), Y(16), U(16), V(16)]` — α is at slot 0.
-pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane
+/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_AYUV64LE` per the Frame
+/// contract; `true` = BE on disk/wire). Each raw u16 is normalised to
+/// host-native order via `u16::from_le` / `u16::from_be` before the
+/// `>> 8` depth conversion. On a host whose endianness matches the
+/// source the conversion compiles to a no-op; otherwise it is a
+/// `swap_bytes`. Without this a BE host (e.g., s390x) processing the
+/// LE-encoded Frame would emit a byte-reversed α byte.
+pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0<const BE: bool>(
     packed: &[u16],
     rgba_out: &mut [u8],
     width: usize,
@@ -35,17 +44,34 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0(
     debug_assert!(packed.len() >= width * 4, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = (packed[n * 4] >> 8) as u8;
+        let raw = if BE {
+            u16::from_be(packed[n * 4])
+        } else {
+            u16::from_le(packed[n * 4])
+        };
+        rgba_out[n * 4 + 3] = (raw >> 8) as u8;
     }
 }
 
 /// AYUV64 → u16 RGBA: gather α from `packed[0 + 4*n]` (u16) into
 /// `rgba_out[3 + 4*n]` (u16). No depth conversion.
-pub(crate) fn copy_alpha_packed_u16x4_at_0(packed: &[u16], rgba_out: &mut [u16], width: usize) {
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane.
+/// See [`copy_alpha_packed_u16x4_to_u8_at_0`] for the full rationale.
+pub(crate) fn copy_alpha_packed_u16x4_at_0<const BE: bool>(
+    packed: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(packed.len() >= width * 4, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = packed[n * 4];
+        let raw = if BE {
+            u16::from_be(packed[n * 4])
+        } else {
+            u16::from_le(packed[n * 4])
+        };
+        rgba_out[n * 4 + 3] = raw;
     }
 }
 
@@ -58,8 +84,15 @@ pub(crate) fn copy_alpha_packed_u16x4_at_0(packed: &[u16], rgba_out: &mut [u16],
 /// Used in Strategy A+: after `expand_rgb_to_rgba_row` fills the RGBA buffer
 /// with a forced-opaque alpha, this helper overwrites only the α slot with the
 /// real source alpha, depth-converted to u8.
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane
+/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_RGBA64LE` /
+/// `AV_PIX_FMT_BGRA64LE` per the Frame contract; `true` = BE). Each raw
+/// u16 is normalised to host-native order via `u16::from_le` /
+/// `u16::from_be` before the `>> 8` depth conversion. Without this a BE
+/// host processing the LE-encoded Frame would emit a byte-reversed α byte.
 #[allow(dead_code)] // wired in sinker Task 10
-pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3(
+pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3<const BE: bool>(
     packed: &[u16],
     rgba_out: &mut [u8],
     width: usize,
@@ -67,7 +100,12 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3(
     debug_assert!(packed.len() >= width * 4, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = (packed[n * 4 + 3] >> 8) as u8;
+        let raw = if BE {
+            u16::from_be(packed[n * 4 + 3])
+        } else {
+            u16::from_le(packed[n * 4 + 3])
+        };
+        rgba_out[n * 4 + 3] = (raw >> 8) as u8;
     }
 }
 
@@ -77,12 +115,24 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3(
 /// Used in Strategy A+: after `expand_rgb_u16_to_rgba_u16_row` fills the
 /// RGBA buffer, this helper overwrites only the α slot with the real source
 /// alpha at native 16-bit depth.
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane.
+/// See [`copy_alpha_packed_u16x4_to_u8_at_3`] for the full rationale.
 #[allow(dead_code)] // wired in sinker Task 10
-pub(crate) fn copy_alpha_packed_u16x4_at_3(packed: &[u16], rgba_u16_out: &mut [u16], width: usize) {
+pub(crate) fn copy_alpha_packed_u16x4_at_3<const BE: bool>(
+    packed: &[u16],
+    rgba_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(packed.len() >= width * 4, "packed too short");
     debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
     for n in 0..width {
-        rgba_u16_out[n * 4 + 3] = packed[n * 4 + 3];
+        let raw = if BE {
+            u16::from_be(packed[n * 4 + 3])
+        } else {
+            u16::from_le(packed[n * 4 + 3])
+        };
+        rgba_u16_out[n * 4 + 3] = raw;
     }
 }
 
@@ -195,21 +245,49 @@ pub(crate) fn copy_alpha_ya_u8(packed: &[u8], rgba_out: &mut [u8], width: usize)
 /// into `rgba_out[3 + 4*n]` (u8).
 ///
 /// Ya16 layout per pixel: `[Y(16), A(16)]` — α is at odd u16 offsets (slot 1).
-pub(crate) fn copy_alpha_ya_u16_to_u8(packed: &[u16], rgba_out: &mut [u8], width: usize) {
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane
+/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_YA16LE` per the
+/// `Ya16Frame` contract; `true` = BE). Each raw u16 is normalised to
+/// host-native order via `u16::from_le` / `u16::from_be` before the
+/// `>> 8` depth conversion. Without this a BE host processing the
+/// LE-encoded Frame would emit a byte-reversed α byte.
+pub(crate) fn copy_alpha_ya_u16_to_u8<const BE: bool>(
+    packed: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(packed.len() >= width * 2, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = (packed[n * 2 + 1] >> 8) as u8;
+        let raw = if BE {
+            u16::from_be(packed[n * 2 + 1])
+        } else {
+            u16::from_le(packed[n * 2 + 1])
+        };
+        rgba_out[n * 4 + 3] = (raw >> 8) as u8;
     }
 }
 
 /// Ya16 → u16 RGBA: gather A from `packed[1 + 2*n]` (u16) into
 /// `rgba_out[3 + 4*n]` (u16). No depth conversion.
-pub(crate) fn copy_alpha_ya_u16(packed: &[u16], rgba_out: &mut [u16], width: usize) {
+///
+/// `BE` selects the **byte order** of the encoded source `packed` plane.
+/// See [`copy_alpha_ya_u16_to_u8`] for the full rationale.
+pub(crate) fn copy_alpha_ya_u16<const BE: bool>(
+    packed: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(packed.len() >= width * 2, "packed too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = packed[n * 2 + 1];
+        let raw = if BE {
+            u16::from_be(packed[n * 2 + 1])
+        } else {
+            u16::from_le(packed[n * 2 + 1])
+        };
+        rgba_out[n * 4 + 3] = raw;
     }
 }
 
@@ -270,21 +348,56 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_to_u8_at_0_depth_converts_correctly() {
         let packed: std::vec::Vec<u16> = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301,];
         let mut rgba = std::vec![1u8; 8];
-        copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba, 2);
+        copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0x12, 1, 1, 1, 0xAB]);
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_at_0_preserves_native_u16() {
         let packed: std::vec::Vec<u16> = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301,];
         let mut rgba = std::vec![1u16; 8];
-        copy_alpha_packed_u16x4_at_0(&packed, &mut rgba, 2);
+        copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0x1234, 1, 1, 1, 0xABCD]);
     }
 
+    /// BE parity for AYUV64 alpha-at-slot-0 → u8 RGBA: byte-swapping the
+    /// packed source and toggling the `BE` flag must yield byte-for-byte
+    /// identical output. Locks down the corruption where a BE host
+    /// processing the LE-encoded Frame contract would emit a byte-reversed α.
+ #[test] + fn copy_alpha_packed_u16x4_to_u8_at_0_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 8]; + let mut rgba_be = std::vec![1u8; 8]; + copy_alpha_packed_u16x4_to_u8_at_0::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_to_u8_at_0::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for AYUV64 alpha-at-slot-0 → u16 RGBA. + #[test] + fn copy_alpha_packed_u16x4_at_0_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 8]; + let mut rgba_be = std::vec![7u16; 8]; + copy_alpha_packed_u16x4_at_0::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_at_0::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + #[test] fn copy_alpha_plane_u8_scatters_into_rgba_alpha_slot() { let alpha = std::vec![50u8, 60, 70, 80]; @@ -423,22 +536,59 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_ya_u16_to_u8_depth_converts_via_high_byte() { // Ya16 packed → u8 RGBA: α >> 8 selects the high byte. 
         let packed: std::vec::Vec<u16> = std::vec![0x1234, 0xABCD, 0x5678, 0xFF00];
         let mut rgba = std::vec![1u8; 8];
-        copy_alpha_ya_u16_to_u8(&packed, &mut rgba, 2);
+        copy_alpha_ya_u16_to_u8::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0xAB, 1, 1, 1, 0xFF]);
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_ya_u16_preserves_native_u16() {
         let packed: std::vec::Vec<u16> = std::vec![0x1234, 0xABCD, 0x5678, 0x9ABC];
         let mut rgba = std::vec![1u16; 8];
-        copy_alpha_ya_u16(&packed, &mut rgba, 2);
+        copy_alpha_ya_u16::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0xABCD, 1, 1, 1, 0x9ABC]);
     }
 
+    /// BE parity for Ya16 → u8 RGBA: byte-swapping the packed source and
+    /// toggling the `BE` flag must yield byte-for-byte identical output.
+    /// Locks down the codex-flagged corruption where a BE host (e.g.
+    /// s390x) processing the LE-encoded `Ya16Frame` would otherwise emit
+    /// a byte-reversed α byte under the combined `with_rgb + with_rgba`
+    /// Strategy A+ path.
+    #[test]
+    fn copy_alpha_ya_u16_to_u8_be_parity_with_swapped_buffer() {
+        let packed_le: std::vec::Vec<u16> = std::vec![0x1234, 0xABCD, 0x5678, 0xFF00, 0x0001, 0x00FF];
+        let packed_be: std::vec::Vec<u16> = packed_le.iter().map(|x| x.swap_bytes()).collect();
+        let mut rgba_le = std::vec![1u8; 12];
+        let mut rgba_be = std::vec![1u8; 12];
+        copy_alpha_ya_u16_to_u8::<false>(&packed_le, &mut rgba_le, 3);
+        copy_alpha_ya_u16_to_u8::<true>(&packed_be, &mut rgba_be, 3);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + byte-swapped buffer must match LE path"
+        );
+    }
+
+    /// BE parity for Ya16 → u16 RGBA (16-bit α path).
+    #[test]
+    fn copy_alpha_ya_u16_be_parity_with_swapped_buffer() {
+        let packed_le: std::vec::Vec<u16> = std::vec![0x1234, 0xABCD, 0x5678, 0x9ABC, 0x0001, 0x00FF];
+        let packed_be: std::vec::Vec<u16> = packed_le.iter().map(|x| x.swap_bytes()).collect();
+        let mut rgba_le = std::vec![7u16; 12];
+        let mut rgba_be = std::vec![7u16; 12];
+        copy_alpha_ya_u16::<false>(&packed_le, &mut rgba_le, 3);
+        copy_alpha_ya_u16::<true>(&packed_be, &mut rgba_be, 3);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + byte-swapped buffer must match LE path"
+        );
+    }
+
     #[test]
     fn copy_alpha_plane_f32_to_u8_clamps_and_scales() {
         // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 128, 255, 255, 0] in slot 3.
@@ -494,39 +644,73 @@ mod tests {
     /// Alpha at slot 3 is depth-converted >> 8 and written to rgba_out[3 + 4*n].
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_to_u8_at_3_narrows_correctly() {
         let packed: std::vec::Vec<u16> = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234];
         let mut rgba = std::vec![1u8; 8];
-        copy_alpha_packed_u16x4_to_u8_at_3(&packed, &mut rgba, 2);
+        copy_alpha_packed_u16x4_to_u8_at_3::<false>(&packed, &mut rgba, 2);
         assert_eq!(rgba, std::vec![1, 1, 1, 0xAB, 1, 1, 1, 0x12]);
     }
 
     /// Alpha at slot 3 is copied verbatim (no depth conversion).
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_at_3_copies_verbatim() {
         let packed: std::vec::Vec<u16> = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234];
         let mut rgba_u16 = std::vec![1u16; 8];
-        copy_alpha_packed_u16x4_at_3(&packed, &mut rgba_u16, 2);
+        copy_alpha_packed_u16x4_at_3::<false>(&packed, &mut rgba_u16, 2);
         assert_eq!(rgba_u16, std::vec![1, 1, 1, 0xABFF, 1, 1, 1, 0x1234]);
     }
 
     /// Only the alpha slot (index 3) is overwritten; RGB slots [0..3] are untouched.
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_to_u8_at_3_touches_only_alpha_slot() {
         let packed: std::vec::Vec<u16> = std::vec![0, 0, 0, 0xFFFF];
         let mut rgba = std::vec![42u8; 4];
-        copy_alpha_packed_u16x4_to_u8_at_3(&packed, &mut rgba, 1);
+        copy_alpha_packed_u16x4_to_u8_at_3::<false>(&packed, &mut rgba, 1);
         assert_eq!(rgba[..3], [42, 42, 42]);
         assert_eq!(rgba[3], 0xFF);
     }
 
     /// Only the alpha slot (index 3) is overwritten; RGB slots [0..3] are untouched.
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_packed_u16x4_at_3_touches_only_alpha_slot() {
         let packed: std::vec::Vec<u16> = std::vec![0, 0, 0, 0xBEEF];
         let mut rgba_u16 = std::vec![99u16; 4];
-        copy_alpha_packed_u16x4_at_3(&packed, &mut rgba_u16, 1);
+        copy_alpha_packed_u16x4_at_3::<false>(&packed, &mut rgba_u16, 1);
         assert_eq!(rgba_u16[..3], [99, 99, 99]);
         assert_eq!(rgba_u16[3], 0xBEEF);
     }
+
+    /// BE parity for Rgba64 / Bgra64 alpha-at-slot-3 → u8 RGBA.
+    #[test]
+    fn copy_alpha_packed_u16x4_to_u8_at_3_be_parity_with_swapped_buffer() {
+        let packed_le: std::vec::Vec<u16> = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234];
+        let packed_be: std::vec::Vec<u16> = packed_le.iter().map(|x| x.swap_bytes()).collect();
+        let mut rgba_le = std::vec![1u8; 8];
+        let mut rgba_be = std::vec![1u8; 8];
+        copy_alpha_packed_u16x4_to_u8_at_3::<false>(&packed_le, &mut rgba_le, 2);
+        copy_alpha_packed_u16x4_to_u8_at_3::<true>(&packed_be, &mut rgba_be, 2);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + byte-swapped buffer must match LE path"
+        );
+    }
+
+    /// BE parity for Rgba64 / Bgra64 alpha-at-slot-3 → u16 RGBA.
+    #[test]
+    fn copy_alpha_packed_u16x4_at_3_be_parity_with_swapped_buffer() {
+        let packed_le: std::vec::Vec<u16> = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234];
+        let packed_be: std::vec::Vec<u16> = packed_le.iter().map(|x| x.swap_bytes()).collect();
+        let mut rgba_le = std::vec![7u16; 8];
+        let mut rgba_be = std::vec![7u16; 8];
+        copy_alpha_packed_u16x4_at_3::<false>(&packed_le, &mut rgba_le, 2);
+        copy_alpha_packed_u16x4_at_3::<true>(&packed_be, &mut rgba_be, 2);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + byte-swapped buffer must match LE path"
+        );
+    }
 }
diff --git a/src/sinker/mixed/ayuv64.rs b/src/sinker/mixed/ayuv64.rs
index 7782994d..6ab55456 100644
--- a/src/sinker/mixed/ayuv64.rs
+++ b/src/sinker/mixed/ayuv64.rs
@@ -350,7 +350,8 @@ impl PixelSink for MixedSinker<'_, Ayuv64> {
         let rgba_buf = rgba.as_deref_mut().unwrap();
         let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
         expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
-        crate::row::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_0(
+        // `Ayuv64Frame` is LE-encoded per the unified Frame contract → `BE = false`.
+        crate::row::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
             packed, rgba_row, w, use_simd,
         );
     }
@@ -404,7 +405,13 @@ impl PixelSink for MixedSinker<'_, Ayuv64> {
         let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
         expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w);
-        crate::row::alpha_extract::copy_alpha_packed_u16x4_at_0(packed, rgba_u16_row, w, use_simd);
+        // `Ayuv64Frame` is LE-encoded per the unified Frame contract → `BE = false`.
+        crate::row::alpha_extract::copy_alpha_packed_u16x4_at_0::<false>(
+            packed,
+            rgba_u16_row,
+            w,
+            use_simd,
+        );
     }
 }
diff --git a/src/sinker/mixed/gray.rs b/src/sinker/mixed/gray.rs
index 5d6c44e1..4cd03d13 100644
--- a/src/sinker/mixed/gray.rs
+++ b/src/sinker/mixed/gray.rs
@@ -1499,8 +1499,9 @@ impl PixelSink for MixedSinker<'_, Ya16> {
         let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
         expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w);
-        // Patch α from source (native u16 depth).
-        copy_alpha_ya_u16(packed, rgba_u16_row, w);
+        // Patch α from source (native u16 depth). `Ya16Frame` is LE-encoded
+        // per the unified Frame contract → `BE = false`.
+        copy_alpha_ya_u16::<false>(packed, rgba_u16_row, w);
     }
 }
@@ -1562,7 +1563,8 @@ impl PixelSink for MixedSinker<'_, Ya16> {
         let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
         expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
         // Overwrite the α channel with real source α (>> 8 for u8 output).
-        copy_alpha_ya_u16_to_u8(packed, rgba_row, w);
+        // `Ya16Frame` is LE-encoded per the unified Frame contract → `BE = false`.
+        copy_alpha_ya_u16_to_u8::<false>(packed, rgba_row, w);
     }
 
     Ok(())
@@ -2408,6 +2410,127 @@ mod tests {
         assert_eq!(v, [0x80, 0xFF]);
     }
 
+    /// Strategy A+ (combined `with_rgb` + `with_rgba`) must produce α bytes
+    /// byte-identical to the standalone `with_rgba` path. Locks down the
+    /// codex-flagged corruption where a BE host processing the LE-encoded
+    /// `Ya16Frame` would otherwise diverge between the two paths: standalone
+    /// uses the endian-aware `ya16_to_rgba_row::<false>` kernel; combined
+    /// expanded RGB → RGBA then patched α via `copy_alpha_ya_u16_to_u8` which
+    /// previously read raw `packed[n*2+1]` host-native and so emitted a
+    /// byte-reversed α byte on BE. After the fix, `copy_alpha_ya_u16_to_u8`
+    /// is target-endian-aware (`<false>` for the LE Frame contract) and the
+    /// two paths agree on every host.
+ /// + /// To exercise the LE-encoded byte contract on every host we build the + /// `&[u16]` plane by bit-casting LE bytes — `u16::from_le_bytes` per + /// sample. On LE hosts that's a no-op; on BE hosts it byte-swaps so the + /// in-memory bytes match the FFmpeg `AV_PIX_FMT_YA16LE` layout. + #[test] + fn ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded() { + let w: u32 = 8; + let h: u32 = 1; + // Logical samples (Y, A) per pixel. + let samples: [(u16, u16); 8] = [ + (0x0000, 0xFFFF), + (0x8000, 0x4000), + (0xFFFF, 0x0000), + (0x1234, 0xABCD), + (0x00FF, 0xFF00), + (0x5A5A, 0xA5A5), + (0x7FFF, 0x8000), + (0xC000, 0x3FFF), + ]; + // Build the `&[u16]` plane such that its in-memory bytes match the + // FFmpeg `AV_PIX_FMT_YA16LE` byte layout on every host. We want a + // host-native u16 whose underlying bytes spell `[low, high]` (LE): + // `u16::from_ne_bytes(x.to_le_bytes())` is `x` on LE and `x.swap_bytes()` + // on BE — the right value to store in either case. + let le_encoded = |x: u16| -> u16 { u16::from_ne_bytes(x.to_le_bytes()) }; + let packed: std::vec::Vec = samples + .iter() + .flat_map(|&(y, a)| [le_encoded(y), le_encoded(a)]) + .collect(); + let frame = Ya16Frame::new(&packed, w, h, w * 2); + + // Run combined (with_rgb + with_rgba) — exercises Strategy A+ with the + // newly endian-aware `copy_alpha_ya_u16_to_u8::`. + let mut rgb_combined = std::vec![0u8; (w * h * 3) as usize]; + let mut rgba_combined = std::vec![0u8; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_rgb(&mut rgb_combined) + .unwrap() + .with_rgba(&mut rgba_combined) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + // Run standalone (with_rgba only) — exercises the endian-aware + // `ya16_to_rgba_row::` kernel. 
+ let mut rgba_standalone = std::vec![0u8; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_rgba(&mut rgba_standalone) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combined, rgba_standalone, + "combined (with_rgb+with_rgba) RGBA must equal standalone with_rgba" + ); + } + + /// u16 RGBA variant of the combined-vs-standalone parity check. Locks + /// down `copy_alpha_ya_u16::` (the u16 alpha-patch helper for + /// 16-bit RGBA outputs). + #[test] + fn ya16_combined_rgb_u16_and_rgba_u16_alpha_matches_standalone_le_encoded() { + let w: u32 = 8; + let h: u32 = 1; + let samples: [(u16, u16); 8] = [ + (0x0000, 0xFFFF), + (0x8000, 0x4000), + (0xFFFF, 0x0000), + (0x1234, 0xABCD), + (0x00FF, 0xFF00), + (0x5A5A, 0xA5A5), + (0x7FFF, 0x8000), + (0xC000, 0x3FFF), + ]; + // See sibling test for the `le_encoded` rationale. + let le_encoded = |x: u16| -> u16 { u16::from_ne_bytes(x.to_le_bytes()) }; + let packed: std::vec::Vec = samples + .iter() + .flat_map(|&(y, a)| [le_encoded(y), le_encoded(a)]) + .collect(); + let frame = Ya16Frame::new(&packed, w, h, w * 2); + + let mut rgb_combined = std::vec![0u16; (w * h * 3) as usize]; + let mut rgba_combined = std::vec![0u16; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_rgb_u16(&mut rgb_combined) + .unwrap() + .with_rgba_u16(&mut rgba_combined) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + let mut rgba_standalone = std::vec![0u16; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_rgba_u16(&mut rgba_standalone) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combined, rgba_standalone, + "combined (with_rgb_u16+with_rgba_u16) RGBA u16 must equal standalone" + ); + } + #[test] #[cfg_attr( miri, diff --git a/src/sinker/mixed/packed_rgb_16bit.rs b/src/sinker/mixed/packed_rgb_16bit.rs index 
837b8246..49e34d84 100644 --- a/src/sinker/mixed/packed_rgb_16bit.rs +++ b/src/sinker/mixed/packed_rgb_16bit.rs @@ -726,7 +726,11 @@ impl PixelSink for MixedSinker<'_, Rgba64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3(in64, rgba_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3::( + in64, rgba_row, w, + ); } } @@ -759,7 +763,13 @@ impl PixelSink for MixedSinker<'_, Rgba64> { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3(in64, rgba_u16_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3::( + in64, + rgba_u16_row, + w, + ); } } @@ -995,7 +1005,11 @@ impl PixelSink for MixedSinker<'_, Bgra64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3(in64, rgba_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. 
+ crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3::( + in64, rgba_row, w, + ); } } @@ -1024,7 +1038,13 @@ impl PixelSink for MixedSinker<'_, Bgra64> { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3(in64, rgba_u16_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3::( + in64, + rgba_u16_row, + w, + ); } } diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 94fc963f..12435e52 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -65,6 +65,22 @@ const GBR_F16_FULL_RANGE: bool = true; // Chunk size for the inline f16→f32 widening scratch arrays (stack-allocated). const WIDEN_CHUNK: usize = 64; +// Endianness routing for **post-widen** `gbrpf32_to_*` calls. +// +// `widen_f16_be_to_host_f32::` produces **host-native f32 scratch** from +// LE-encoded f16 plane bits (it normalises bits before widening), so the +// downstream `gbrpf32_to_*::` kernel sees input that already +// matches the host's byte order. The kernel's `from_le` / `from_be` then +// becomes a no-op on every host — correct. +// +// Distinct from the **direct** Frame-to-row-kernel pattern elsewhere in this +// file (the `gbrpf16_to_*::` u8/f16 calls): those receive raw LE-encoded +// `&[half::f16]` plane bytes per the unified Frame contract, so they pass +// `BE = false` to tell the kernel to apply `from_le`. Post-widen scratch is +// already host-native, so it must use `BE = HOST_NATIVE_BE` to keep the kernel +// byte-swap a no-op on every host. 
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- Gbrpf16 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf16> { @@ -354,29 +370,29 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( gf, bf, rf, @@ -389,7 +405,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( gf, bf, rf, @@ -402,7 +418,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( gf, bf, rf, @@ -762,31 +778,47 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row::(gf, bf, 
rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { // gbrapf32_to_rgba_f32_row with widened source α (lossless). let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_f32_row::(gf, bf, rf, af, &mut buf[start..end], n, use_simd); + gbrapf32_to_rgba_f32_row::( + gf, + bf, + rf, + af, + &mut buf[start..end], + n, + use_simd, + ); } if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { // gbrapf32_to_rgba_u16_row with widened source α. let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_u16_row::(gf, bf, rf, af, &mut buf[start..end], n, use_simd); + gbrapf32_to_rgba_u16_row::( + gf, + bf, + rf, + af, + &mut buf[start..end], + n, + use_simd, + ); } if let Some(buf) = self.luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( gf, bf, rf, @@ -799,7 +831,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( gf, bf, rf, @@ -812,7 +844,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( gf, bf, rf, diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index 7673e0a2..faf8a2e1 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -1288,6 +1288,115 @@ fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { } } +/// LE-encoded byte contract regression for [`Gbrpf16`] **widening → narrow +/// chain** (`with_rgb_u16` and 
`with_rgba`). Covers the post-widen routing +/// where `gbrpf32_to_rgb_u16_row` / `gbrpf32_to_rgba_u16_row` / +/// `gbrpf32_to_rgb_row` are invoked on **host-native f32 scratch** produced +/// by `widen_f16_be_to_host_f32::`. +/// +/// On a BE host this would have been corrupted under the prior +/// `gbrpf32_to_*::` post-widen routing — that kernel applied +/// `from_le` to scratch that was already host-native, byte-swapping the +/// f32 representation before scaling. Fixed by routing post-widen calls +/// through `::` (`true` on BE, `false` on LE), which makes +/// the kernel byte-swap a no-op on every host. Vacuous on LE; would catch +/// the double-swap on BE. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.5, + 1 => 0.25, + 2 => 0.0, + _ => 1.0, + }) + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.125, + 1 => 0.75, + 2 => 0.0625, + _ => 0.875, + }) + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.375, + 1 => 0.625, + 2 => 0.9375, + _ => 0.03125, + }) + }) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + + let src = Gbrpf16Frame::try_new( + &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Exercise the u16 narrow path (post-widen → gbrpf32_to_rgb_u16_row). 
+ let mut rgb_u16 = std::vec![0u16; w * h * 3]; + // Exercise the u8 narrow path via with_rgba (Strategy A: post-widen + // is unused for u8 since rgba=opaque-α; we trigger the SAME post-widen + // path by also attaching luma_u16 alongside u16). + let mut luma_u16 = std::vec![0u16; w * h]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgb_u16(&mut rgb_u16) + .unwrap() + .with_luma_u16(&mut luma_u16) + .unwrap(); + gbrpf16_to(&src, &mut sink).unwrap(); + } + + // Assert RGB u16 output matches the intended (clamp+scale × 65535) values. + let to_u16 = |v: f32| -> u16 { (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16 }; + for i in 0..(w * h) { + assert_eq!( + rgb_u16[i * 3], + to_u16(intended_r[i].to_f32()), + "RGB u16 R idx {i}" + ); + assert_eq!( + rgb_u16[i * 3 + 1], + to_u16(intended_g[i].to_f32()), + "RGB u16 G idx {i}" + ); + assert_eq!( + rgb_u16[i * 3 + 2], + to_u16(intended_b[i].to_f32()), + "RGB u16 B idx {i}" + ); + } + // Sanity: luma_u16 (post-widen narrow) is non-zero — locks down that + // the post-widen luma kernel also sees host-native f32 scratch. 
+ assert!( + luma_u16.iter().any(|&v| v > 0), + "luma_u16 must contain non-zero samples — \ + a corrupted byte-swap would still emit non-zero output but the rgb_u16 \ + assertion above is the primary guard" + ); +} + // ---- 32-bit overflow guards ------------------------------------------------ // // Feeding width = usize::MAX / 2 + 1 to a dispatcher must panic with a message From f226d5a0a8861689d3b9f992e6a797f2b1d3fd80 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 21:16:44 +1200 Subject: [PATCH 06/10] fix(sinker): endian-aware f32 alpha-patch helpers + dual routing for direct-Frame vs post-widen paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 3rd-pass review of #92 found that the f32 alpha-patch helpers `copy_alpha_plane_f32_to_u8`, `copy_alpha_plane_f32_to_u16`, and `copy_alpha_plane_f32` read each f32 α sample as host-native, which silently corrupted the α slot on BE hosts (e.g., s390x) processing the LE-encoded `Gbrapf32Frame` α plane. The byte-swapped f32 bits clamp to near-zero or near-one, so BE hosts emitted α = 0 or 255 regardless of the intended value. Same bug class as the u16 alpha-patch helpers fixed in cf26058, but for the f32 path; the standalone `with_rgba_f32` test didn't cover this because Strategy A+ uses a different code path (`expand_rgb_to_rgba_row` + alpha-patch). Helpers updated (4): * `copy_alpha_plane_f32_to_u8` — added ``, normalises each f32's bits via `f32::from_bits(u32::from_le/be(bits))` before clamp / scale / round-half-up. * `copy_alpha_plane_f32_to_u16` — same pattern. * `copy_alpha_plane_f32` — same pattern (lossless pass-through). * `copy_alpha_plane_f16` (planar_gbr_f16.rs) — added `` for consistency. Currently only test-called; future-proofs against a Strategy A+ wiring that consumes it. 
Sinker call sites updated, dual routing pattern (4 sites total): * `src/sinker/mixed/planar_gbr_float.rs` — Gbrapf32 Strategy A+ u8 RGBA path: `` (direct-Frame, LE-encoded per Phase-1 contract). * `src/sinker/mixed/planar_gbr_f16.rs` — `widen_and_scatter_f16_alpha_to_u8` helper: `` (post-widen, scratch is host-native f32 after `widen_f16_be_to_host_f32::`). Audit summary: only one direct-Frame call site (Gbrapf32 u8 RGBA) and one post-widen call site (Gbrapf16 u8 RGBA) currently invoke the f32 alpha-patch helpers. The Gbrapf32 RGBA u16 / RGBA f32 / RGBA f16 paths all use direct kernels (`gbrapf32_to_rgba_u16_row` / `gbrapf32_to_rgba_f32_row` / `gbrapf32_to_rgba_f16_row`) and were already endian-aware via their `` parameter; same for the Gbrapf16 widened RGBA u16 / f32 paths. Tests added (7): * `copy_alpha_plane_f32_to_u8_be_parity_with_swapped_buffer` * `copy_alpha_plane_f32_to_u16_be_parity_with_swapped_buffer` * `copy_alpha_plane_f32_be_parity_with_swapped_buffer` * `copy_alpha_plane_f16_be_parity_with_swapped_buffer` * `gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly` (w = 15, exercises scalar tail; compares Strategy A+ combo to standalone `with_rgba`). * `gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly` (w = 17, defense-in-depth for the u16 RGBA combo). * `gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly` (w = 15, verifies HOST_NATIVE_BE routing on the post-widen helper). The 3 prior `copy_alpha_plane_f32*_clamps_and_scales` / `_lossless_passthrough` tests are now `#[cfg(target_endian = "little")]` (host-native f32 fixtures); BE-host scalar correctness is locked down by the dedicated `*_be_parity_with_swapped_buffer` tests. Verified: * `cargo test --target aarch64-apple-darwin --lib` (2285 pass, +7). * `cargo build --target x86_64-apple-darwin --tests` (0 warnings). * `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests`. * `cargo build --no-default-features`. 
* `cargo fmt --check`. * `cargo clippy --all-targets --all-features -- -D warnings`. * `cargo check --target s390x-unknown-linux-gnu --lib` (BE-host smoke). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/alpha_extract.rs | 152 +++++++++++- src/row/scalar/planar_gbr_f16.rs | 55 ++++- src/sinker/mixed/planar_gbr_f16.rs | 12 +- src/sinker/mixed/planar_gbr_float.rs | 12 +- src/sinker/mixed/tests/planar_gbr_float.rs | 268 +++++++++++++++++++++ 5 files changed, 485 insertions(+), 14 deletions(-) diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs index bbf5922b..6fc664ac 100644 --- a/src/row/scalar/alpha_extract.rs +++ b/src/row/scalar/alpha_extract.rs @@ -296,13 +296,42 @@ pub(crate) fn copy_alpha_ya_u16( /// Each α sample is clamped to `[0.0, 1.0]`, multiplied by 255, and rounded /// with round-half-up (`+ 0.5` then truncate). Only slot 3 of every 4-element /// tuple is written; R, G, B slots are untouched. +/// +/// `BE` selects the **byte order** of the encoded source α plane: +/// `false` = LE on disk/wire (e.g., `AV_PIX_FMT_GBRAPF32LE` per the +/// `Gbrapf32Frame` contract; this also matches the case where the f32 +/// scratch is already host-native and the host is little-endian); +/// `true` = BE on disk/wire (or host-native scratch on a BE host). Each +/// raw f32 is bit-normalised to host-native order via +/// `f32::from_bits(u32::from_le(bits))` (or `from_be`) BEFORE the clamp / +/// scale / round-half-up. Without this a BE host (e.g., s390x) processing +/// the LE-encoded Frame would clamp byte-swapped garbage values, typically +/// producing α = 0 or α = 255 regardless of intent. Mirrors the +/// `copy_alpha_plane_u16_to_u8::` endian pattern. +/// +/// Routing pattern at the sinker layer: +/// - **Direct-Frame paths** (e.g., `Gbrapf32Frame` → α plane consumed directly) +/// pass `BE = false` (data is LE-encoded per the unified Frame contract). 
+/// - **Post-widen paths** (e.g., `Gbrapf16Frame` widened-to-f32 scratch) pass +/// `BE = HOST_NATIVE_BE` (scratch is host-native f32 after widen). // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). #[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32_to_u8(alpha: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) fn copy_alpha_plane_f32_to_u8( + alpha: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (alpha[n].clamp(0.0, 1.0) * 255.0 + 0.5) as u8; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + let v = f32::from_bits(host_bits); + rgba_out[n * 4 + 3] = (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8; } } @@ -310,13 +339,28 @@ pub(crate) fn copy_alpha_plane_f32_to_u8(alpha: &[f32], rgba_out: &mut [u8], wid /// /// Each α sample is clamped to `[0.0, 1.0]`, multiplied by 65535, and rounded /// with round-half-up. Only slot 3 of every 4-element tuple is written. +/// +/// `BE` selects the **byte order** of the encoded source α plane. +/// See [`copy_alpha_plane_f32_to_u8`] for the full rationale and the +/// direct-Frame vs post-widen routing pattern. // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). 
#[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32_to_u16(alpha: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) fn copy_alpha_plane_f32_to_u16( + alpha: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (alpha[n].clamp(0.0, 1.0) * 65535.0 + 0.5) as u16; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + let v = f32::from_bits(host_bits); + rgba_out[n * 4 + 3] = (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16; } } @@ -325,13 +369,30 @@ pub(crate) fn copy_alpha_plane_f32_to_u16(alpha: &[f32], rgba_out: &mut [u16], w /// /// No clamping, no rounding — HDR values, NaN, and Inf in the α plane are /// preserved bit-exact. Only slot 3 of every 4-element tuple is written. +/// The output α is always written in **host-native** byte order (the +/// downstream consumer of `&[f32]` expects host-native floats); this helper's +/// `BE` only describes the **input** plane. +/// +/// `BE` selects the **byte order** of the encoded source α plane. +/// See [`copy_alpha_plane_f32_to_u8`] for the full rationale and the +/// direct-Frame vs post-widen routing pattern. // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). 
#[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32(alpha: &[f32], rgba_out: &mut [f32], width: usize) { +pub(crate) fn copy_alpha_plane_f32( + alpha: &[f32], + rgba_out: &mut [f32], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = alpha[n]; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + rgba_out[n * 4 + 3] = f32::from_bits(host_bits); } } @@ -589,12 +650,18 @@ mod tests { ); } + /// On a LE host, `BE = false` makes the bit-normalize a no-op, so passing + /// host-native `f32` literals as if they were already LE-encoded reproduces + /// the original (pre-endian-aware) clamp+scale semantics. BE-host scalar + /// correctness is locked down by the `*_be_parity_with_swapped_buffer` + /// tests below. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_to_u8_clamps_and_scales() { // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 128, 255, 255, 0] in slot 3. let alpha = vec![0.0f32, 0.5, 1.0, 1.5, -0.1]; let mut rgba = vec![1u8; 20]; - copy_alpha_plane_f32_to_u8(&alpha, &mut rgba, 5); + copy_alpha_plane_f32_to_u8::(&alpha, &mut rgba, 5); // R, G, B slots (0, 1, 2) must be untouched; slot 3 has the alpha. assert_eq!(rgba[3], 0, "alpha[0]=0.0 → 0"); assert_eq!(rgba[7], 128, "alpha[1]=0.5 → 128"); @@ -608,11 +675,12 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_to_u16_clamps_and_scales() { // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 32768, 65535, 65535, 0] in slot 3. 
         let alpha = vec![0.0f32, 0.5, 1.0, 1.5, -0.1];
         let mut rgba = vec![1u16; 20];
-        copy_alpha_plane_f32_to_u16(&alpha, &mut rgba, 5);
+        copy_alpha_plane_f32_to_u16::<false>(&alpha, &mut rgba, 5);
         assert_eq!(rgba[3], 0, "alpha[0]=0.0 → 0");
         assert_eq!(rgba[7], 32768, "alpha[1]=0.5 → 32768");
         assert_eq!(rgba[11], 65535, "alpha[2]=1.0 → 65535");
@@ -625,11 +693,12 @@ mod tests {
     }
 
     #[test]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_plane_f32_lossless_passthrough() {
         // HDR (2.5), NaN, Inf, negative all preserved bit-exact.
         let alpha = vec![2.5f32, f32::NAN, f32::INFINITY, -1.0];
         let mut rgba = vec![0.0f32; 16];
-        copy_alpha_plane_f32(&alpha, &mut rgba, 4);
+        copy_alpha_plane_f32::<false>(&alpha, &mut rgba, 4);
         assert_eq!(rgba[3], 2.5, "HDR 2.5 preserved");
         assert!(rgba[7].is_nan(), "NaN preserved");
         assert!(rgba[11].is_infinite() && rgba[11] > 0.0, "+Inf preserved");
@@ -640,6 +709,71 @@ mod tests {
         assert_eq!(rgba[2], 0.0);
     }
 
+    /// BE parity for Gbrapf32 → u8 RGBA: byte-swapping the bits of every
+    /// f32 in the source α plane and toggling `BE` must produce identical
+    /// output. Locks down the codex 3rd-pass finding where a BE host
+    /// processing the LE-encoded `Gbrapf32Frame` would clamp byte-swapped
+    /// garbage values (typical result: α = 0 or α = 255 regardless of intent).
+    #[test]
+    fn copy_alpha_plane_f32_to_u8_be_parity_with_swapped_buffer() {
+        let alpha_le: std::vec::Vec<f32> = std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 1.5, -0.1, 0.123];
+        let alpha_be: std::vec::Vec<f32> = alpha_le
+            .iter()
+            .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+            .collect();
+        let mut rgba_le = std::vec![1u8; 32];
+        let mut rgba_be = std::vec![1u8; 32];
+        copy_alpha_plane_f32_to_u8::<false>(&alpha_le, &mut rgba_le, 8);
+        copy_alpha_plane_f32_to_u8::<true>(&alpha_be, &mut rgba_be, 8);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + bit-swapped buffer must match LE path"
+        );
+    }
+
+    /// BE parity for Gbrapf32 → u16 RGBA.
+    #[test]
+    fn copy_alpha_plane_f32_to_u16_be_parity_with_swapped_buffer() {
+        let alpha_le: std::vec::Vec<f32> = std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 1.5, -0.1, 0.123];
+        let alpha_be: std::vec::Vec<f32> = alpha_le
+            .iter()
+            .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+            .collect();
+        let mut rgba_le = std::vec![7u16; 32];
+        let mut rgba_be = std::vec![7u16; 32];
+        copy_alpha_plane_f32_to_u16::<false>(&alpha_le, &mut rgba_le, 8);
+        copy_alpha_plane_f32_to_u16::<true>(&alpha_be, &mut rgba_be, 8);
+        assert_eq!(
+            rgba_le, rgba_be,
+            "BE flag + bit-swapped buffer must match LE path"
+        );
+    }
+
+    /// BE parity for Gbrapf32 → f32 RGBA (lossless α pass-through). The
+    /// output α must equal the host-native f32 bit-pattern of the LE source
+    /// regardless of the host's byte order. NaN bit-patterns may differ
+    /// across hardware after a `from_bits → to_bits` round-trip, so we
+    /// compare on the bit representation of finite, non-NaN samples only.
+    #[test]
+    fn copy_alpha_plane_f32_be_parity_with_swapped_buffer() {
+        let alpha_le: std::vec::Vec<f32> =
+            std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 2.5, -1.0, f32::INFINITY];
+        let alpha_be: std::vec::Vec<f32> = alpha_le
+            .iter()
+            .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+            .collect();
+        let mut rgba_le = std::vec![0.0f32; 32];
+        let mut rgba_be = std::vec![0.0f32; 32];
+        copy_alpha_plane_f32::<false>(&alpha_le, &mut rgba_le, 8);
+        copy_alpha_plane_f32::<true>(&alpha_be, &mut rgba_be, 8);
+        let bits_le: std::vec::Vec<u32> = rgba_le.iter().map(|v| v.to_bits()).collect();
+        let bits_be: std::vec::Vec<u32> = rgba_be.iter().map(|v| v.to_bits()).collect();
+        assert_eq!(
+            bits_le, bits_be,
+            "BE flag + bit-swapped buffer must match LE path bit-for-bit"
+        );
+    }
+
     // ---- copy_alpha_packed_u16x4_to_u8_at_3 / copy_alpha_packed_u16x4_at_3 --
 
     /// Alpha at slot 3 is depth-converted >> 8 and written to rgba_out[3 + 4*n].
diff --git a/src/row/scalar/planar_gbr_f16.rs b/src/row/scalar/planar_gbr_f16.rs index 3b9ba779..766c310b 100644 --- a/src/row/scalar/planar_gbr_f16.rs +++ b/src/row/scalar/planar_gbr_f16.rs @@ -198,16 +198,34 @@ pub(crate) fn gbrapf16_to_rgba_f16_row( /// Only slot 3 of every 4-element tuple is written; R, G, B slots are /// untouched. Lossless — HDR, NaN, and Inf in the α plane are preserved /// bit-exact. +/// +/// `BE` selects the **byte order** of the encoded source α plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_GBRAPF16LE` per the +/// `Gbrapf16Frame` contract; `true` = BE on disk/wire). Each raw f16 is +/// bit-normalised to host-native order via `u16::from_le` / `u16::from_be` +/// BEFORE the slot-3 write so the output buffer always carries host-native +/// `half::f16` (matching the rest of the f16 row kernels). Without this a +/// BE host processing the LE-encoded Frame would emit byte-reversed α bits. // Only called from the `mod tests` block which is gated on `feature = "std"`. // Under `cargo test --no-default-features` the test module is compiled out, // leaving the function without callers; suppress the resulting lint there. 
 #[cfg_attr(not(feature = "std"), expect(dead_code))]
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn copy_alpha_plane_f16(alpha: &[half::f16], rgba_out: &mut [half::f16], width: usize) {
+pub(crate) fn copy_alpha_plane_f16<const BE: bool>(
+    alpha: &[half::f16],
+    rgba_out: &mut [half::f16],
+    width: usize,
+) {
     debug_assert!(alpha.len() >= width, "alpha plane too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
     for n in 0..width {
-        rgba_out[n * 4 + 3] = alpha[n];
+        let raw = alpha[n].to_bits();
+        let host_bits = if BE {
+            u16::from_be(raw)
+        } else {
+            u16::from_le(raw)
+        };
+        rgba_out[n * 4 + 3] = half::f16::from_bits(host_bits);
     }
 }
 
@@ -414,11 +432,12 @@ mod tests {
         miri,
         ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
     )]
+    #[cfg(target_endian = "little")]
     fn copy_alpha_plane_f16_only_writes_alpha_slot() {
         let alpha = vec![half::f16::from_f32(0.7), half::f16::from_f32(0.3)];
         let sentinel = half::f16::from_f32(0.1);
         let mut rgba = vec![sentinel; 8];
-        copy_alpha_plane_f16(&alpha, &mut rgba, 2);
+        copy_alpha_plane_f16::<false>(&alpha, &mut rgba, 2);
         // Only slot 3 written; R, G, B slots (0, 1, 2) must be untouched.
         assert_eq!(rgba[0], sentinel, "R slot 0 untouched");
         assert_eq!(rgba[1], sentinel, "G slot 0 untouched");
@@ -429,4 +448,34 @@ mod tests {
         assert_eq!(rgba[6], sentinel, "B slot 1 untouched");
         assert_eq!(rgba[7], half::f16::from_f32(0.3), "A slot 1");
     }
+
+    /// BE parity for `copy_alpha_plane_f16`: byte-swapping the bits of every
+    /// f16 in the source α plane and toggling `BE` must produce identical
+    /// output. Mirrors the f32 alpha-patch endian-aware fix.
+ #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn copy_alpha_plane_f16_be_parity_with_swapped_buffer() { + let alpha_le = vec![ + half::f16::from_f32(0.0), + half::f16::from_f32(0.25), + half::f16::from_f32(0.5), + half::f16::from_f32(1.0), + half::f16::from_f32(2.5), + half::f16::from_f32(-1.0), + ]; + let alpha_be = be_encode_f16(&alpha_le); + let mut rgba_le = vec![half::f16::ZERO; 24]; + let mut rgba_be = vec![half::f16::ZERO; 24]; + copy_alpha_plane_f16::(&alpha_le, &mut rgba_le, 6); + copy_alpha_plane_f16::(&alpha_be, &mut rgba_be, 6); + let bits_le: std::vec::Vec = rgba_le.iter().map(|v| v.to_bits()).collect(); + let bits_be: std::vec::Vec = rgba_be.iter().map(|v| v.to_bits()).collect(); + assert_eq!( + bits_le, bits_be, + "BE flag + bit-swapped buffer must match LE path bit-for-bit" + ); + } } diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 12435e52..5e7c0dda 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -919,6 +919,16 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { /// /// Used by `Gbrapf16` Strategy A+ and standalone-RGBA paths to overwrite /// the per-pixel alpha byte from the f16 source α plane. +/// +/// Endian routing: `widen_f16_be_to_host_f32::` converts the +/// LE-encoded `Gbrapf16Frame` α plane bits into **host-native f32** +/// scratch. The downstream `copy_alpha_plane_f32_to_u8` therefore receives +/// host-native f32 input, not LE-encoded f32, and must be invoked with +/// `BE = HOST_NATIVE_BE` so the kernel's `from_le` / `from_be` is a no-op +/// on every host (no second byte-swap). This is the **post-widen** routing +/// pattern; contrast with `planar_gbr_float.rs` which calls the same +/// helper with `BE = false` because it consumes the **direct** LE-encoded +/// `Gbrapf32Frame` α plane. 
#[cfg_attr(not(tarpaulin), inline(always))] fn widen_and_scatter_f16_alpha_to_u8(alpha_f16: &[half::f16], rgba_out: &mut [u8], width: usize) { let mut af_chunk = [0.0f32; WIDEN_CHUNK]; @@ -928,7 +938,7 @@ fn widen_and_scatter_f16_alpha_to_u8(alpha_f16: &[half::f16], rgba_out: &mut [u8 // Bit-normalise LE-encoded f16 α bits → host-native f32 before clamping // and scaling to u8 — correct on both LE and BE hosts. widen_f16_be_to_host_f32::(alpha_f16, offset, &mut af_chunk, n); - copy_alpha_plane_f32_to_u8(&af_chunk[..n], &mut rgba_out[offset * 4..], n); + copy_alpha_plane_f32_to_u8::(&af_chunk[..n], &mut rgba_out[offset * 4..], n); offset += n; } } diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index cb71de5f..ddee135b 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -809,10 +809,20 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from // the source f32 α plane (clamped × 255 → u8). + // + // `BE = false`: `a_in` is the **direct** Gbrapf32Frame α plane, which + // is LE-encoded f32 per the Phase-1 unified Frame contract. The helper + // bit-normalises each f32 to host-native order before clamp/scale, so + // the conversion compiles to a no-op on LE hosts and a `swap_bytes` on + // BE hosts (e.g., s390x). Without this BE hosts would clamp byte- + // swapped garbage and emit α = 0 / 255 regardless of intent. Distinct + // from the **post-widen** routing in `planar_gbr_f16.rs` + // (`widen_and_scatter_f16_alpha_to_u8`), which feeds host-native f32 + // scratch into the same helper with `BE = HOST_NATIVE_BE`. 
if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - copy_alpha_plane_f32_to_u8(a_in, rgba_row, w); + copy_alpha_plane_f32_to_u8::(a_in, rgba_row, w); } Ok(()) diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index faf8a2e1..4549ac34 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -1397,6 +1397,274 @@ fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { ); } +// ---- LE-encoded Strategy A+ alpha-patch regressions (codex 3rd-pass) ------ +// +// The `copy_alpha_plane_f32_to_u8` (and `copy_alpha_plane_f32_to_u16`, +// `copy_alpha_plane_f32`) helper used to read each f32 α sample as +// host-native, which silently corrupted the α slot on BE hosts processing +// the LE-encoded `Gbrapf32Frame` α plane (the byte-swapped bits clamp to +// near-zero or near-one, producing α = 0 or 255 regardless of intent). +// Same bug class as the u16 alpha-patch helpers fixed in cf26058. +// +// These regressions trigger the **Strategy A+ combo path** (`with_rgb` + +// `with_rgba`, `with_rgb_u16` + `with_rgba_u16`) on a Frame whose α plane +// is built from explicit LE-encoded f32 bit-patterns. On a LE host the +// `to_le` on f32 bits is identity so the test reduces to the original +// semantics; on a BE host the kernel without `from_le` would clamp +// byte-swapped garbage and the assertion would fail. The non-multiple-of- +// SIMD widths (15, 17) exercise scalar-tail correctness in addition to +// any vectorized body. + +/// Codex 3rd-pass regression: Gbrapf32 Strategy A+ (`with_rgb` + `with_rgba`) +/// on a LE-encoded f32 α plane must reproduce standalone `with_rgba` output +/// byte-for-byte. 
The standalone path uses `gbrapf32_to_rgba_row::` +/// (already endian-aware), so any deviation indicates the Strategy A+ +/// alpha-patch path corrupted the α plane. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { + // 15 is non-multiple-of-{4,8,16} — exercises scalar tail in every backend. + let w = 15usize; + let h = 3usize; + let intended_g: Vec = (0..w * h).map(|i| 0.10 + (i as f32) * 0.001).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.20 + (i as f32) * 0.002).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.30 + (i as f32) * 0.003).collect(); + // Deliberately mix in-range, boundary, > 1, and negative α to stress + // clamp/scale correctness *after* the bit-normalize step. + let intended_a: Vec = (0..w * h) + .map(|i| match i % 7 { + 0 => 0.0, + 1 => 0.5, + 2 => 1.0, + 3 => 1.5, + 4 => -0.1, + 5 => 0.123, + _ => 0.876, + }) + .collect(); + + // LE-encode every plane (per the documented `*LE` Frame contract). + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); + + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba`. + let mut rgba_ref = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_ref) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + // Strategy A+: `with_rgb` + `with_rgba` combo (alpha-patch path). 
+ let mut rgb_combo = std::vec![0u8; w * h * 3]; + let mut rgba_combo = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb_combo) + .unwrap() + .with_rgba(&mut rgba_combo) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf32 Strategy A+ alpha-patch must equal standalone `with_rgba`" + ); +} + +/// Codex 3rd-pass regression: Gbrapf32 Strategy A+ (`with_rgb_u16` + +/// `with_rgba_u16`) on a LE-encoded f32 α plane. Defense-in-depth: the +/// current sinker calls `gbrapf32_to_rgba_u16_row::` directly here +/// (no alpha-patch helper invocation), but any future routing change that +/// switches to the alpha-patch helper must keep BE-host correctness. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { + // 17 is non-multiple-of-{4,8,16}. + let w = 17usize; + let h = 3usize; + let intended_g: Vec = (0..w * h).map(|i| 0.11 + (i as f32) * 0.0011).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.22 + (i as f32) * 0.0022).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.33 + (i as f32) * 0.0033).collect(); + let intended_a: Vec = (0..w * h) + .map(|i| match i % 5 { + 0 => 0.0, + 1 => 0.25, + 2 => 1.0, + 3 => 0.5, + _ => 0.75, + }) + .collect(); + + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); + + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba_u16`. 
+ let mut rgba_ref = std::vec![0u16; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgba_u16(&mut rgba_ref) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + // Combo: `with_rgb_u16` + `with_rgba_u16`. + let mut rgb_combo = std::vec![0u16; w * h * 3]; + let mut rgba_combo = std::vec![0u16; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgb_u16(&mut rgb_combo) + .unwrap() + .with_rgba_u16(&mut rgba_combo) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf32 Strategy A+ rgba_u16 must equal standalone `with_rgba_u16`" + ); + + // Independently assert the α slot reflects the intended values + // (clamp × 65535 + 0.5). This catches a hypothetical regression where + // both code paths share the same bug. + let to_u16 = |v: f32| -> u16 { (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16 }; + for i in 0..(w * h) { + assert_eq!( + rgba_combo[i * 4 + 3], + to_u16(intended_a[i]), + "α slot idx {i}" + ); + } +} + +/// Codex 3rd-pass regression: Gbrapf16 Strategy A+ (`with_rgb` + `with_rgba`) +/// on a LE-encoded f16 α plane. This exercises the **post-widen** routing +/// pattern in `widen_and_scatter_f16_alpha_to_u8`: the f16 α plane is +/// widened to host-native f32 scratch via `widen_f16_be_to_host_f32::`, +/// then the alpha-patch helper must consume that scratch with +/// `BE = HOST_NATIVE_BE` (no double byte-swap). The test compares the +/// Strategy A+ combo output against the standalone `with_rgba` path, which +/// uses the `gbrpf16_to_rgba_row::` direct kernel + the same +/// `widen_and_scatter_f16_alpha_to_u8` helper (both paths share the +/// `widen_and_scatter` helper, so this test guards against the +/// post-widen routing flag being wrong). 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { + let w = 15usize; + let h = 3usize; + let intended_g: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.10 + (i as f32) * 0.001)) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.20 + (i as f32) * 0.002)) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.30 + (i as f32) * 0.003)) + .collect(); + let intended_a: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 5 { + 0 => 0.0, + 1 => 0.5, + 2 => 1.0, + 3 => 0.25, + _ => 0.75, + }) + }) + .collect(); + + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); + + let src = Gbrapf16Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba` (uses `gbrpf16_to_rgba_row` then + // `widen_and_scatter_f16_alpha_to_u8` → exercises the post-widen helper + // as well, with the same routing as the combo path). + let mut rgba_ref = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_ref) + .unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + } + + // Strategy A+: `with_rgb` + `with_rgba`. 
+ let mut rgb_combo = std::vec![0u8; w * h * 3]; + let mut rgba_combo = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb_combo) + .unwrap() + .with_rgba(&mut rgba_combo) + .unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf16 Strategy A+ post-widen alpha-patch must equal standalone `with_rgba`" + ); + + // Independently assert α slot reflects the intended values + // (widen → clamp × 255 + 0.5). + let to_u8 = |v: f32| -> u8 { (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8 }; + for i in 0..(w * h) { + assert_eq!( + rgba_combo[i * 4 + 3], + to_u8(intended_a[i].to_f32()), + "α slot idx {i}" + ); + } +} + // ---- 32-bit overflow guards ------------------------------------------------ // // Feeding width = usize::MAX / 2 + 1 to a dispatcher must panic with a message From 75cf8652d9d0a6d359b1208756703600e47fc79d Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 21:47:07 +1200 Subject: [PATCH 07/10] test(sinker): drop miri ignore on BE-contract regressions so BE CI exercises them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 4th-pass review of #92 found the BE-sensitive sinker regressions skip on miri targets — but BE CI runs via miri (powerpc64 / s390x), so the tests effectively did not validate the BE path. The `cfg_attr(miri, ignore = ...)` gate was added defensively, but it skipped exactly the host where BE corruption would surface. Audit summary: every BE-sensitive regression added in this PR calls the high-level `MixedSinker` builder + `gbrpf32_to` / `gbrapf32_to` / `gbrpf16_to` / `gbrapf16_to` / `rgbf32_to` / `rgbf16_to`. The dispatch layer honours the sinker's `simd` flag — passing `with_simd(false)` bypasses every SIMD branch and calls the scalar kernels directly. 
The post-widen helpers (`widen_f16_be_to_host_f32`, `expand_rgb_to_rgba_row`, the `copy_alpha_plane_f32*` family) are pure scalar. None of these tests construct or invoke arch-specific SIMD intrinsics directly. Tests un-gated (12): * `src/sinker/mixed/tests/packed_rgb_float.rs` - `rgbf32_sinker_le_encoded_frame_decodes_correctly` * `src/sinker/mixed/tests/packed_rgb_f16.rs` - `rgbf16_sinker_le_encoded_frame_decodes_correctly` * `src/sinker/mixed/tests/planar_gbr_float.rs` - `gbrpf32_sinker_le_encoded_frame_decodes_correctly` - `gbrapf32_sinker_le_encoded_frame_decodes_correctly` - `gbrpf16_sinker_le_encoded_frame_decodes_correctly` - `gbrapf16_sinker_le_encoded_frame_decodes_correctly` - `gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly` - `gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly` - `gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly` - `gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly` - `gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly` - `gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly` Each test now starts the builder chain with `.with_simd(false)`, forcing scalar dispatch on every chained `with_rgb*` / `with_rgba*` / `with_luma*` sink. SIMD-using tests in `planar_gbr_high_bit.rs` (8 pre-existing miri ignores, untouched in this PR) retain the gate; those test SIMD-dispatched kernels with no scalar bypass and genuinely cannot run under miri. Local miri verification (where the f16 setup path's portable software fallback applies — i.e. 
all f32 tests, plus those f16 tests whose fixtures do not call `half::f16::from_f32` on aarch64+fp16): * `cargo miri test --lib gbrpf32_sinker_le_encoded` — 1 passed * `cargo miri test --lib gbrapf32_sinker_le_encoded` — 1 passed * `cargo miri test --lib gbrapf32_strategy_a_plus_le_encoded_frame` — 1 passed * `cargo miri test --lib gbrapf32_strategy_a_plus_le_encoded_u16` — 1 passed * `cargo miri test --lib rgbf32_sinker_le_encoded` — 1 passed The f16 regression tests build their fixtures via `half::f16::from_f32`, which on aarch64-apple-darwin (miri's local triple) compiles to an inline asm path keyed on `target_feature = "fp16"` — unsupported by miri locally. On the BE CI hosts (s390x / powerpc64), `half-2.7.1` falls through to its portable software fallback (no asm, no SIMD), so all f16 tests will execute correctly under `cargo miri test` on those targets — which is the entire point of removing the `miri,ignore` gate. Verified: * `cargo test --target aarch64-apple-darwin --lib` (2285 pass, 0 fail). * `cargo build --target x86_64-apple-darwin --tests` (0 warnings). * `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests`. * `cargo build --no-default-features`. * `cargo fmt --check`. * `cargo clippy --all-targets --all-features -- -D warnings`. * `cargo check --target s390x-unknown-linux-gnu --lib` (BE-host smoke). * `cargo miri test --lib gbrpf32_sinker_le_encoded` (and the four sibling f32 tests). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/tests/packed_rgb_f16.rs | 11 ++- src/sinker/mixed/tests/packed_rgb_float.rs | 11 ++- src/sinker/mixed/tests/planar_gbr_float.rs | 95 +++++++++++++--------- 3 files changed, 69 insertions(+), 48 deletions(-) diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 0f227c14..574e482c 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -324,11 +324,13 @@ fn rgbf16_simd_matches_scalar_with_random_input() { /// from the LE-encoded bytes. /// /// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. +/// +/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn rgbf16_sinker_le_encoded_frame_decodes_correctly() { let vals_f32 = [0.5f32, 1.5, -0.25, 100.0]; let intended: Vec = (0..16 * 4 * 3) @@ -345,6 +347,7 @@ fn rgbf16_sinker_le_encoded_frame_decodes_correctly() { let mut rgb_f16_out = std::vec![half::f16::ZERO; 16 * 4 * 3]; let mut sink = MixedSinker::::new(16, 4) + .with_simd(false) .with_rgb_f16(&mut rgb_f16_out) .unwrap(); rgbf16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs index 38f23f28..8bd01266 100644 --- a/src/sinker/mixed/tests/packed_rgb_float.rs +++ b/src/sinker/mixed/tests/packed_rgb_float.rs @@ -260,11 +260,13 @@ fn rgbf32_simd_matches_scalar_with_random_input() { /// BE), the output would be byte-swapped relative to `intended`. /// /// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. 
+/// +/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn rgbf32_sinker_le_encoded_frame_decodes_correctly() { // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. @@ -288,6 +290,7 @@ fn rgbf32_sinker_le_encoded_frame_decodes_correctly() { let mut rgb_f32_out = std::vec![0.0f32; 16 * 4 * 3]; let mut sink = MixedSinker::::new(16, 4) + .with_simd(false) .with_rgb_f32(&mut rgb_f32_out) .unwrap(); rgbf32_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index 4549ac34..a8132480 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -881,11 +881,13 @@ fn gbrapf32_rgba_f16_strategy_a_plus_matches_independent_kernel() { // Mirrors the `Grayf32` regression added in PR #85's `52f8191`. /// LE-encoded byte contract regression for [`Gbrpf32`]. +/// +/// Forces `with_simd(false)` so the test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -935,6 +937,7 @@ fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f32(&mut rgb_f32) .unwrap(); gbrpf32_to(&src, &mut sink).unwrap(); @@ -960,11 +963,11 @@ fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrapf32`] (lossless RGBA /// pass-through, including the α plane). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -990,6 +993,7 @@ fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f32(&mut rgba_f32) .unwrap(); gbrapf32_to(&src, &mut sink).unwrap(); @@ -1019,11 +1023,11 @@ fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { } /// LE-encoded byte contract regression for [`Gbrpf16`]. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1073,6 +1077,7 @@ fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let mut rgb_f16 = std::vec![half::f16::ZERO; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f16(&mut rgb_f16) .unwrap(); gbrpf16_to(&src, &mut sink).unwrap(); @@ -1098,11 +1103,11 @@ fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrapf16`] (lossless RGBA /// pass-through, including the α plane). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1135,6 +1140,7 @@ fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let mut rgba_f16 = std::vec![half::f16::ZERO; w * h * 4]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f16(&mut rgba_f16) .unwrap(); gbrapf16_to(&src, &mut sink).unwrap(); @@ -1171,11 +1177,11 @@ fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { /// regression that drops the bit-normalize-first step in /// `widen_f16_be_to_host_f32::` would interpret byte-swapped bits as /// host-native f16 and decode to wildly wrong f32 values. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1225,6 +1231,7 @@ fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f32(&mut rgb_f32) .unwrap(); gbrpf16_to(&src, &mut sink).unwrap(); @@ -1239,11 +1246,11 @@ fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrapf16`] **widening path** /// (`with_rgba_f32`, including the α plane). Exercises the four-plane f16 → /// f32 widen step — same bit-normalise-first contract as the no-α variant. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1276,6 +1283,7 @@ fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f32(&mut rgba_f32) .unwrap(); gbrapf16_to(&src, &mut sink).unwrap(); @@ -1301,11 +1309,11 @@ fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { /// through `::` (`true` on BE, `false` on LE), which makes /// the kernel byte-swap a no-op on every host. Vacuous on LE; would catch /// the double-swap on BE. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1361,6 +1369,7 @@ fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { let mut luma_u16 = std::vec![0u16; w * h]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_u16(&mut rgb_u16) .unwrap() .with_luma_u16(&mut luma_u16) @@ -1420,11 +1429,11 @@ fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { /// byte-for-byte. The standalone path uses `gbrapf32_to_rgba_row::` /// (already endian-aware), so any deviation indicates the Strategy A+ /// alpha-patch path corrupted the α plane. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { // 15 is non-multiple-of-{4,8,16} — exercises scalar tail in every backend. 
let w = 15usize; @@ -1466,6 +1475,7 @@ fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { let mut rgba_ref = std::vec![0u8; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba(&mut rgba_ref) .unwrap(); gbrapf32_to(&src, &mut sink).unwrap(); @@ -1476,6 +1486,7 @@ fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { let mut rgba_combo = std::vec![0u8; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb(&mut rgb_combo) .unwrap() .with_rgba(&mut rgba_combo) @@ -1494,11 +1505,11 @@ fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() { /// current sinker calls `gbrapf32_to_rgba_u16_row::` directly here /// (no alpha-patch helper invocation), but any future routing change that /// switches to the alpha-patch helper must keep BE-host correctness. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { // 17 is non-multiple-of-{4,8,16}. 
let w = 17usize; @@ -1535,6 +1546,7 @@ fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { let mut rgba_ref = std::vec![0u16; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_u16(&mut rgba_ref) .unwrap(); gbrapf32_to(&src, &mut sink).unwrap(); @@ -1545,6 +1557,7 @@ fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { let mut rgba_combo = std::vec![0u16; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_u16(&mut rgb_combo) .unwrap() .with_rgba_u16(&mut rgba_combo) @@ -1581,11 +1594,11 @@ fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { /// `widen_and_scatter_f16_alpha_to_u8` helper (both paths share the /// `widen_and_scatter` helper, so this test guards against the /// post-widen routing flag being wrong). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { let w = 15usize; let h = 3usize; @@ -1631,6 +1644,7 @@ fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { let mut rgba_ref = std::vec![0u8; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba(&mut rgba_ref) .unwrap(); gbrapf16_to(&src, &mut sink).unwrap(); @@ -1641,6 +1655,7 @@ fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { let mut rgba_combo = std::vec![0u8; w * h * 4]; { let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb(&mut rgb_combo) .unwrap() .with_rgba(&mut rgba_combo) From fd797f94a052d107007e54a0032b07464d7ecfee Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 22:01:15 +1200 Subject: [PATCH 08/10] docs(frame): correct AV_PIX_FMT_RGBF32 binding statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 5th-pass review of PR #92 caught: the prior doc said FFmpeg "does not define AV_PIX_FMT_RGBF32" — incorrect. FFmpeg actually defines AV_PIX_FMT_RGBF32BE, AV_PIX_FMT_RGBF32LE, and an unsuffixed alias AV_PIX_FMT_RGBF32 that is target-endian (resolves to LE on LE hosts and BE on BE hosts). Corrected: Rgbf32Frame maps to AV_PIX_FMT_RGBF32LE explicitly, with a warning that BE-host callers holding target-endian RGBF32 bytes must convert to LE before constructing the frame (otherwise the LE-decode contract would re-interpret BE bytes as LE → byte-swapped float data). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/packed_rgb_float.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/frame/packed_rgb_float.rs b/src/frame/packed_rgb_float.rs index f002ecd2..7ff0b660 100644 --- a/src/frame/packed_rgb_float.rs +++ b/src/frame/packed_rgb_float.rs @@ -65,16 +65,18 @@ pub enum Rgbf32FrameError { /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. /// -/// # Endian contract — **LE-encoded bytes** +/// # Endian contract — **LE-encoded bytes** (`AV_PIX_FMT_RGBF32LE`) /// /// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as -/// `f32`. Note that, unlike RGBA float and the f16 variants, FFmpeg does -/// **not** define an `AV_PIX_FMT_RGBF32` constant for non-α 3-channel f32 -/// (only the 4-channel `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` -/// pair exists). `colconv` pins this frame to the canonical FFmpeg `*LE` -/// byte-order convention — i.e. callers should pass LE-encoded bytes -/// regardless of host endianness, exactly as they would for the existing -/// `AV_PIX_FMT_RGBAF32LE` flavour. +/// `f32`. This frame maps to FFmpeg `AV_PIX_FMT_RGBF32LE`. FFmpeg also +/// defines `AV_PIX_FMT_RGBF32BE` and an unsuffixed `AV_PIX_FMT_RGBF32` +/// alias that is **target-endian** (resolves to `RGBF32LE` on LE hosts and +/// `RGBF32BE` on BE hosts). **Callers on a BE host who hold target-endian +/// `AV_PIX_FMT_RGBF32` bytes must convert them to LE before constructing +/// this frame** — otherwise the LE-decode contract here would re-interpret +/// the BE bytes as LE and produce byte-swapped float data. The 4-channel +/// `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` pair follows the same +/// `*LE` convention; this frame uses the analogous LE binding. 
/// /// On a little-endian host (every CI runner today) LE bytes _are_ /// host-native, so `&[f32]` is also a host-native float slice; on a From fbb49a199232e0a9d0cbe5bd4c91538b6284669a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 22:21:57 +1200 Subject: [PATCH 09/10] test: gate or force-scalar Ya16 sinker tests that hit vld2q_u16 under miri MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #92's CI miri jobs (aarch64-apple-darwin) failed with `unsupported operation: can't call foreign function llvm.aarch64.neon.ld2.v8i16.p0`. Stack pointed at the `ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded` test calling `MixedSinker::::process` → `dispatch::ya16::ya16_to_rgb_row` → `arch::neon::gray::ya16_to_rgb_row::` → `vld2q_u16` (NEON intrinsic Miri does not implement). Both `ya16_combined_*` tests added in `cf26058` (this PR) use width=8 — the exact threshold where the NEON kernel kicks in (`while x + 8 <= width`) — so on aarch64 hosts the SIMD body executes. Other Ya16 sinker tests in this module use width 1 / 2 which never reaches the `vld2q_u16` block. These two tests are BE-contract regressions: they verify combined (`with_rgb` + `with_rgba`) and standalone (`with_rgba`) paths agree on the LE-encoded `Ya16Frame`, locking down the endian-aware `copy_alpha_ya_u16_to_u8::` and `copy_alpha_ya_u16::` helpers. They must execute on BE-host miri (s390x / powerpc64) — gating them with `cfg_attr(miri, ignore)` would skip exactly the host where BE corruption surfaces. So they get the same Option A treatment 75cf865 applied to the planar / packed-rgb BE-contract regressions: prepend `.with_simd(false)` to every `MixedSinker::new(..)` builder chain so the sinker dispatch routes to the scalar kernel — no SIMD intrinsic, miri clean, BE validation preserved. 
Audit summary (other tests in `sinker::mixed::gray::tests`): * All `gray*` / `gray*_walker_smoke_test` / `gray*_limited_range_*` / `grayf32_with_*` integration tests use width <= 4 — far below the NEON SIMD threshold (8 lanes) — so they never invoke `vld2q_u16`. * `gray*_walker_smoke_test` and similar use the gray-family kernels (`gray9` / `gray12` / `gray14`) which dispatch through different row kernels with different SIMD-eligibility thresholds; none hit `vld2q_u16` at width=4. * `grayf32_sinker_le_encoded_frame_decodes_correctly` retains its existing `cfg_attr(miri, ignore = "...")` (added in `cf26058`) — left untouched in this commit; if a follow-up wants to align it with the 75cf865 un-gating pattern it can be done independently. * `ya8_*` tests use the Ya8 kernels (no `vld2q_u16` — Ya8 deinterleave uses `vld2_u8` which Miri *does* support) and existing width=128/130 smoke is already `cfg_attr(miri, ignore)`. * `ya16_with_*` tests (rgba_u16 / rgba_u8 / rgb_and_rgba / hsv) all use width 1 / 2 — below the SIMD threshold — so they execute the scalar tail unconditionally on every host. * `ya16_width_128_and_130_smoke` is already `cfg_attr(miri, ignore)` (Option B) and stays so — it explicitly validates the SIMD-dispatched smoke path. Tests fixed (2 — both via Option A): * `ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded` * `ya16_combined_rgb_u16_and_rgba_u16_alpha_matches_standalone_le_encoded` Each gains `.with_simd(false)` on both the combined sinker (with_rgb + with_rgba builder chain) and the standalone sinker (with_rgba only), matching the pattern in `tests/packed_rgb_float.rs::rgbf32_sinker_le_encoded_frame_decodes_correctly`. Verified: * `cargo test --target aarch64-apple-darwin --lib` (2285 pass, 0 fail). * `cargo build --target x86_64-apple-darwin --tests` (0 warnings). * `cargo build --no-default-features`. * `cargo fmt --check`. * `cargo clippy --all-targets --all-features -- -D warnings`. 
* `cargo miri test --lib sinker::mixed::gray::tests::ya16_combined` (2 pass — confirms the fix on aarch64-apple-darwin miri, the exact CI host that previously failed). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/gray.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/sinker/mixed/gray.rs b/src/sinker/mixed/gray.rs index 4cd03d13..b07b9191 100644 --- a/src/sinker/mixed/gray.rs +++ b/src/sinker/mixed/gray.rs @@ -2453,11 +2453,16 @@ mod tests { let frame = Ya16Frame::new(&packed, w, h, w * 2); // Run combined (with_rgb + with_rgba) — exercises Strategy A+ with the - // newly endian-aware `copy_alpha_ya_u16_to_u8::`. + // newly endian-aware `copy_alpha_ya_u16_to_u8::`. Forces + // `with_simd(false)` so the test runs purely scalar — no SIMD intrinsics + // — which lets it execute under `cargo miri test`. BE CI is driven by + // miri on s390x / powerpc64; gating it out of miri would skip exactly + // the host where BE corruption would surface. let mut rgb_combined = std::vec![0u8; (w * h * 3) as usize]; let mut rgba_combined = std::vec![0u8; (w * h * 4) as usize]; { let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) .with_rgb(&mut rgb_combined) .unwrap() .with_rgba(&mut rgba_combined) @@ -2466,10 +2471,11 @@ mod tests { } // Run standalone (with_rgba only) — exercises the endian-aware - // `ya16_to_rgba_row::` kernel. + // `ya16_to_rgba_row::` kernel. Same scalar-only rationale. let mut rgba_standalone = std::vec![0u8; (w * h * 4) as usize]; { let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) .with_rgba(&mut rgba_standalone) .unwrap(); ya16_to(&frame, FR, M, &mut sink).unwrap(); @@ -2506,10 +2512,15 @@ mod tests { .collect(); let frame = Ya16Frame::new(&packed, w, h, w * 2); + // Forces `with_simd(false)` so this test runs purely scalar — no SIMD + // intrinsics — which lets it execute under `cargo miri test`. 
BE CI is + // driven by miri on s390x / powerpc64; gating it out of miri would skip + // exactly the host where BE corruption would surface. let mut rgb_combined = std::vec![0u16; (w * h * 3) as usize]; let mut rgba_combined = std::vec![0u16; (w * h * 4) as usize]; { let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) .with_rgb_u16(&mut rgb_combined) .unwrap() .with_rgba_u16(&mut rgba_combined) @@ -2520,6 +2531,7 @@ mod tests { let mut rgba_standalone = std::vec![0u16; (w * h * 4) as usize]; { let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) .with_rgba_u16(&mut rgba_standalone) .unwrap(); ya16_to(&frame, FR, M, &mut sink).unwrap(); From cd73821046161afb4bed241550c1c3421ff42475 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 22:23:57 +1200 Subject: [PATCH 10/10] docs(sinker): correct stale post-widen routing comment in planar_gbr_f16 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot review on PR #92 caught: the comment at planar_gbr_f16.rs:359 said the downstream gbrpf32_to_* kernel was invoked with `BE = false`, but cf26058 changed the routing to `BE = HOST_NATIVE_BE` (correct for host-native f32 scratch produced by the bit-normalising widen helper). Updated comment to reflect actual routing + reinforced the distinction between post-widen scratch (host-native → HOST_NATIVE_BE) and direct- Frame paths (LE-encoded → ::) so future readers don't conflate the two contracts. Copilot findings 2 + 3 (Ya16 tests at gray.rs:2470,2528 missing .with_simd(false)) were already addressed in fbb49a1 — Copilot reviewed pre-fix state. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sinker/mixed/planar_gbr_f16.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 5e7c0dda..22159b92 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -355,8 +355,10 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { while offset < w { let n = (w - offset).min(WIDEN_CHUNK); // Bit-normalise LE-encoded f16 plane bits → host-native f32 so the - // downstream `gbrpf32_to_*` kernel (invoked with `BE = false`) sees - // host-native f32 on every host. See the canonical helper's docs. + // downstream `gbrpf32_to_*` kernel (invoked with `BE = HOST_NATIVE_BE` + // — see module-scope constant) sees host-native f32 on every host. + // The post-widen scratch is host-native, distinct from the direct- + // Frame paths which use `` per the LE-encoded byte contract. widen_f16_be_to_host_f32::(g_in, offset, &mut gf_chunk, n); widen_f16_be_to_host_f32::(b_in, offset, &mut bf_chunk, n); widen_f16_be_to_host_f32::(r_in, offset, &mut rf_chunk, n);