diff --git a/src/frame/packed_rgb_f16.rs b/src/frame/packed_rgb_f16.rs index 1469df86..d1decc2b 100644 --- a/src/frame/packed_rgb_f16.rs +++ b/src/frame/packed_rgb_f16.rs @@ -50,7 +50,7 @@ pub enum Rgbf16FrameError { }, } -/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`). +/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16LE`). /// One plane, 3 × `f16` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -65,6 +65,25 @@ pub enum Rgbf16FrameError { /// `stride` is in **`f16` elements** (≥ `3 * width`), matching the /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The `&[half::f16]` plane is the **LE-encoded byte layout** reinterpreted +/// as `f16`, matching the FFmpeg **`AV_PIX_FMT_RGBF16LE`** pixel-format +/// convention. (FFmpeg's unsuffixed `AV_PIX_FMT_RGBF16` is a *target-endian* +/// alias — `RGBF16LE` on a little-endian host, `RGBF16BE` on a big-endian +/// host — so this contract pins the canonical `*LE` byte order regardless +/// of host endianness.) +/// +/// On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so `&[half::f16]` is also a host-native f16 slice; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding a byte buffer +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgbf16Frame<'a> { rgb: &'a [half::f16], diff --git a/src/frame/packed_rgb_float.rs b/src/frame/packed_rgb_float.rs index d1f79cef..7ff0b660 100644 --- a/src/frame/packed_rgb_float.rs +++ b/src/frame/packed_rgb_float.rs @@ -50,7 +50,7 @@ pub enum Rgbf32FrameError { }, } -/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`). +/// A validated packed **RGBF32** frame. /// One plane, 3 × `f32` per pixel, channel order `R, G, B`. /// /// Values are **linear** RGB by convention — no gamma / OETF handling @@ -64,6 +64,29 @@ pub enum Rgbf32FrameError { /// `stride` is in **`f32` elements** (≥ `3 * width`), matching the /// per-format convention that stride aligns with the underlying slice /// element type. No width parity constraint. +/// +/// # Endian contract — **LE-encoded bytes** (`AV_PIX_FMT_RGBF32LE`) +/// +/// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as +/// `f32`. This frame maps to FFmpeg `AV_PIX_FMT_RGBF32LE`. FFmpeg also +/// defines `AV_PIX_FMT_RGBF32BE` and an unsuffixed `AV_PIX_FMT_RGBF32` +/// alias that is **target-endian** (resolves to `RGBF32LE` on LE hosts and +/// `RGBF32BE` on BE hosts). **Callers on a BE host who hold target-endian +/// `AV_PIX_FMT_RGBF32` bytes must convert them to LE before constructing +/// this frame** — otherwise the LE-decode contract here would re-interpret +/// the BE bytes as LE and produce byte-swapped float data. The 4-channel +/// `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` pair follows the same +/// `*LE` convention; this frame uses the analogous LE binding. 
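+///
+/// A minimal sketch of that BE-host pre-conversion (buffer names are
+/// illustrative; the element-wise swap mirrors the byte-swap used by the
+/// parity tests added in this change):
+///
+/// ```ignore
+/// // On a big-endian host, target-endian AV_PIX_FMT_RGBF32 bytes read back as
+/// // host-native f32 values; re-encode each sample as LE before constructing.
+/// let le_plane: Vec<f32> = native_plane
+///     .iter()
+///     .map(|v| f32::from_bits(v.to_bits().swap_bytes()))
+///     .collect();
+/// ```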
+/// +/// On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so `&[f32]` is also a host-native float slice; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding a byte buffer +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgbf32Frame<'a> { rgb: &'a [f32], diff --git a/src/frame/planar_gbr_float.rs b/src/frame/planar_gbr_float.rs index d612e715..505a7bd7 100644 --- a/src/frame/planar_gbr_float.rs +++ b/src/frame/planar_gbr_float.rs @@ -147,6 +147,20 @@ const fn check_plane( /// `f32` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are /// preserved bit-exact on lossless pass-through outputs and clamped to /// `[0.0, 1.0]` on integer-output paths. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The three `&[f32]` planes are the **LE-encoded byte layout** reinterpreted +/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format +/// name. On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so the slices are also host-native float slices; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrpf32Frame<'a> { g: &'a [f32], @@ -250,6 +264,20 @@ impl<'a> Gbrpf32Frame<'a> { /// Four full-resolution `f32` planes in **G, B, R, A** order. Alpha is /// real per-pixel; nominal range `[0.0, 1.0]` (opaque = 1.0). Stride is /// in `f32` elements. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The four `&[f32]` planes are the **LE-encoded byte layout** reinterpreted +/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format +/// name. On a little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so the slices are also host-native float slices; on a +/// big-endian host the bytes have to be byte-swapped back to host-native +/// before arithmetic. Downstream row kernels handle this byte-swap (or +/// no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 4 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrapf32Frame<'a> { g: &'a [f32], @@ -372,6 +400,20 @@ impl<'a> Gbrapf32Frame<'a> { /// Three full-resolution [`half::f16`] planes in **G, B, R** order. Stride /// is in `f16` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are /// permitted (saturation to `+Inf` occurs on f16→f32 narrowing paths). +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The three `&[half::f16]` planes are the **LE-encoded byte layout** +/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in +/// the format name. 
On a little-endian host (every CI runner today) LE +/// bytes _are_ host-native, so the slices are also host-native f16 slices; +/// on a big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle this +/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrpf16Frame<'a> { g: &'a [half::f16], @@ -475,6 +517,20 @@ impl<'a> Gbrpf16Frame<'a> { /// Four full-resolution [`half::f16`] planes in **G, B, R, A** order. /// Alpha is real per-pixel; nominal range `[0.0, 1.0]`. Stride is in /// `f16` elements. +/// +/// # Endian contract — **LE-encoded bytes** +/// +/// The four `&[half::f16]` planes are the **LE-encoded byte layout** +/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in +/// the format name. On a little-endian host (every CI runner today) LE +/// bytes _are_ host-native, so the slices are also host-native f16 slices; +/// on a big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle this +/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap. +/// +/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers +/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each +/// `linesize[i]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Gbrapf16Frame<'a> { g: &'a [half::f16], diff --git a/src/row/arch/neon/alpha_extract.rs b/src/row/arch/neon/alpha_extract.rs index ffb04e6a..5135b3d6 100644 --- a/src/row/arch/neon/alpha_extract.rs +++ b/src/row/arch/neon/alpha_extract.rs @@ -116,7 +116,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -154,7 +154,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -357,7 +357,7 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xFEED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -375,7 +375,7 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0x1337); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/wasm_simd128/alpha_extract.rs b/src/row/arch/wasm_simd128/alpha_extract.rs index b999b618..bee7633d 100644 --- a/src/row/arch/wasm_simd128/alpha_extract.rs +++ b/src/row/arch/wasm_simd128/alpha_extract.rs @@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + 
scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -226,7 +226,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -518,7 +518,7 @@ mod tests { unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w); } - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -538,7 +538,7 @@ mod tests { unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w); } - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_avx2/alpha_extract.rs b/src/row/arch/x86_avx2/alpha_extract.rs index 1ebe97c1..2c58d3e9 100644 --- a/src/row/arch/x86_avx2/alpha_extract.rs +++ b/src/row/arch/x86_avx2/alpha_extract.rs @@ -213,7 +213,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -636,7 +636,7 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xFEED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -657,7 +657,7 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0x1337); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_avx512/alpha_extract.rs b/src/row/arch/x86_avx512/alpha_extract.rs index 203e08e3..45743fb5 100644 --- a/src/row/arch/x86_avx512/alpha_extract.rs +++ b/src/row/arch/x86_avx512/alpha_extract.rs @@ -206,7 +206,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -604,7 +604,7 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xFEED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut 
rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -627,7 +627,7 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0x1337); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/arch/x86_sse41/alpha_extract.rs b/src/row/arch/x86_sse41/alpha_extract.rs index d327e299..4b1d800b 100644 --- a/src/row/arch/x86_sse41/alpha_extract.rs +++ b/src/row/arch/x86_sse41/alpha_extract.rs @@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_to_u8_at_0( + scalar::copy_alpha_packed_u16x4_to_u8_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -227,7 +227,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0( } if x < width { - scalar::copy_alpha_packed_u16x4_at_0( + scalar::copy_alpha_packed_u16x4_at_0::( &packed[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -521,7 +521,7 @@ mod tests { pseudo_random_u8(&mut rgba_simd, 0xFEED); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } @@ -542,7 +542,7 @@ mod tests { pseudo_random_u16(&mut rgba_simd, 0x1337); let mut rgba_scalar = rgba_simd.clone(); unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) }; - scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w); + scalar::copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba_scalar, w); assert_eq!(rgba_simd, rgba_scalar, "width={w}"); } } diff --git a/src/row/dispatch/alpha_extract.rs b/src/row/dispatch/alpha_extract.rs index 00ecb61e..1e0351c5 100644 --- a/src/row/dispatch/alpha_extract.rs +++ b/src/row/dispatch/alpha_extract.rs @@ -95,17 +95,26 @@ pub(crate) fn copy_alpha_packed_u8x4_at_3( /// Runtime-dispatched α-extract for AYUV64 → u8 RGBA: gather α from /// `packed[0 + 4*n]` (u16) into `rgba_out[3 + 4*n]` (u8) via `>> 8`. /// -/// Selects the highest available SIMD backend; falls back to scalar. -/// When `use_simd` is `false`, calls scalar directly. +/// `BE` selects the source `packed` plane byte order (`false` = LE on +/// disk/wire — matching the LE-encoded `Ayuv64Frame` contract; +/// `true` = BE). Like [`copy_alpha_plane_u16_to_u8`], the existing SIMD +/// helpers use host-native u16 loads with no `from_le` / `from_be` +/// normalisation, so SIMD is only correct on LE host processing LE +/// source. The dispatcher computes +/// `safe_for_simd = !BE && cfg!(target_endian = "little")` and falls +/// back to the target-endian-aware scalar in every other quadrant. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( +pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( packed: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool, ) { - if !use_simd { - return scalar::copy_alpha_packed_u16x4_to_u8_at_0(packed, rgba_out, width); + // SIMD α-extract helpers use host-native u16 loads. Force scalar in + // any quadrant where source byte order doesn't match host byte order. 
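+    // Quadrants (source byte order × host byte order): only (LE, LE) may take
+    // the SIMD path; (LE, BE), (BE, LE), and (BE, BE) all fall back to the
+    // endian-aware scalar kernel below.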
+ let safe_for_simd = !BE && cfg!(target_endian = "little"); + if !safe_for_simd || !use_simd { + return scalar::copy_alpha_packed_u16x4_to_u8_at_0::<BE>(packed, rgba_out, width); } cfg_select! { target_arch = "aarch64" => { @@ -141,7 +150,7 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( }, _ => {} } - scalar::copy_alpha_packed_u16x4_to_u8_at_0(packed, rgba_out, width); + scalar::copy_alpha_packed_u16x4_to_u8_at_0::<BE>(packed, rgba_out, width); } // --------------------------------------------------------------------------- @@ -152,17 +161,19 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( /// `packed[0 + 4*n]` (u16) into `rgba_out[3 + 4*n]` (u16). No depth /// conversion. /// -/// Selects the highest available SIMD backend; falls back to scalar. -/// When `use_simd` is `false`, calls scalar directly. +/// `BE` selects the source `packed` plane byte order. See +/// [`copy_alpha_packed_u16x4_to_u8_at_0`] for the rationale: SIMD is +/// only correct on LE host with LE source; scalar is target-endian-aware. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_packed_u16x4_at_0( +pub(crate) fn copy_alpha_packed_u16x4_at_0<const BE: bool>( packed: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool, ) { - if !use_simd { - return scalar::copy_alpha_packed_u16x4_at_0(packed, rgba_out, width); + let safe_for_simd = !BE && cfg!(target_endian = "little"); + if !safe_for_simd || !use_simd { + return scalar::copy_alpha_packed_u16x4_at_0::<BE>(packed, rgba_out, width); } cfg_select! { target_arch = "aarch64" => { @@ -198,7 +209,7 @@ pub(crate) fn copy_alpha_packed_u16x4_at_0( }, _ => {} } - scalar::copy_alpha_packed_u16x4_at_0(packed, rgba_out, width); + scalar::copy_alpha_packed_u16x4_at_0::<BE>(packed, rgba_out, width); } // --------------------------------------------------------------------------- diff --git a/src/row/scalar/alpha_extract.rs b/src/row/scalar/alpha_extract.rs index 6c77346a..6fc664ac 100644 --- a/src/row/scalar/alpha_extract.rs +++ b/src/row/scalar/alpha_extract.rs @@ -27,7 +27,16 @@ pub(crate) fn copy_alpha_packed_u8x4_at_3(packed: &[u8], rgba_out: &mut [u8], wi /// into `rgba_out[3 + 4*n]` (u8 element) with depth-conv `>> 8`. /// /// AYUV64 layout per pixel: `[A(16), Y(16), U(16), V(16)]` — α is at slot 0. -pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_AYUV64LE` per the Frame +/// contract; `true` = BE on disk/wire). Each raw u16 is normalised to +/// host-native order via `u16::from_le` / `u16::from_be` before the +/// `>> 8` depth conversion. On a host whose endianness matches the +/// source the conversion compiles to a no-op; otherwise it is a +/// `swap_bytes`. Without this a BE host (e.g., s390x) processing the +/// LE-encoded Frame would emit a byte-reversed α byte. +pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0<const BE: bool>( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -35,17 +44,34 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_0( debug_assert!(packed.len() >= width * 4, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (packed[n * 4] >> 8) as u8; + let raw = if BE { + u16::from_be(packed[n * 4]) + } else { + u16::from_le(packed[n * 4]) + }; + rgba_out[n * 4 + 3] = (raw >> 8) as u8; } } /// AYUV64 → u16 RGBA: gather α from `packed[0 + 4*n]` (u16) into /// `rgba_out[3 + 4*n]` (u16). No depth conversion.
-pub(crate) fn copy_alpha_packed_u16x4_at_0(packed: &[u16], rgba_out: &mut [u16], width: usize) { +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane. +/// See [`copy_alpha_packed_u16x4_to_u8_at_0`] for the full rationale. +pub(crate) fn copy_alpha_packed_u16x4_at_0( + packed: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = packed[n * 4]; + let raw = if BE { + u16::from_be(packed[n * 4]) + } else { + u16::from_le(packed[n * 4]) + }; + rgba_out[n * 4 + 3] = raw; } } @@ -58,8 +84,15 @@ pub(crate) fn copy_alpha_packed_u16x4_at_0(packed: &[u16], rgba_out: &mut [u16], /// Used in Strategy A+: after `expand_rgb_to_rgba_row` fills the RGBA buffer /// with a forced-opaque alpha, this helper overwrites only the α slot with the /// real source alpha, depth-converted to u8. +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_RGBA64LE` / +/// `AV_PIX_FMT_BGRA64LE` per the Frame contract; `true` = BE). Each raw +/// u16 is normalised to host-native order via `u16::from_le` / +/// `u16::from_be` before the `>> 8` depth conversion. Without this a BE +/// host processing the LE-encoded Frame would emit a byte-reversed α byte. #[allow(dead_code)] // wired in sinker Task 10 -pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3( +pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -67,7 +100,12 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3( debug_assert!(packed.len() >= width * 4, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (packed[n * 4 + 3] >> 8) as u8; + let raw = if BE { + u16::from_be(packed[n * 4 + 3]) + } else { + u16::from_le(packed[n * 4 + 3]) + }; + rgba_out[n * 4 + 3] = (raw >> 8) as u8; } } @@ -77,12 +115,24 @@ pub(crate) fn copy_alpha_packed_u16x4_to_u8_at_3( /// Used in Strategy A+: after `expand_rgb_u16_to_rgba_u16_row` fills the /// RGBA buffer, this helper overwrites only the α slot with the real source /// alpha at native 16-bit depth. +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane. +/// See [`copy_alpha_packed_u16x4_to_u8_at_3`] for the full rationale. #[allow(dead_code)] // wired in sinker Task 10 -pub(crate) fn copy_alpha_packed_u16x4_at_3(packed: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn copy_alpha_packed_u16x4_at_3( + packed: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed too short"); debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short"); for n in 0..width { - rgba_u16_out[n * 4 + 3] = packed[n * 4 + 3]; + let raw = if BE { + u16::from_be(packed[n * 4 + 3]) + } else { + u16::from_le(packed[n * 4 + 3]) + }; + rgba_u16_out[n * 4 + 3] = raw; } } @@ -195,21 +245,49 @@ pub(crate) fn copy_alpha_ya_u8(packed: &[u8], rgba_out: &mut [u8], width: usize) /// into `rgba_out[3 + 4*n]` (u8). /// /// Ya16 layout per pixel: `[Y(16), A(16)]` — α is at odd u16 offsets (slot 1). -pub(crate) fn copy_alpha_ya_u16_to_u8(packed: &[u16], rgba_out: &mut [u8], width: usize) { +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_YA16LE` per the +/// `Ya16Frame` contract; `true` = BE). 
Each raw u16 is normalised to +/// host-native order via `u16::from_le` / `u16::from_be` before the +/// `>> 8` depth conversion. Without this a BE host processing the +/// LE-encoded Frame would emit a byte-reversed α byte. +pub(crate) fn copy_alpha_ya_u16_to_u8( + packed: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (packed[n * 2 + 1] >> 8) as u8; + let raw = if BE { + u16::from_be(packed[n * 2 + 1]) + } else { + u16::from_le(packed[n * 2 + 1]) + }; + rgba_out[n * 4 + 3] = (raw >> 8) as u8; } } /// Ya16 → u16 RGBA: gather A from `packed[1 + 2*n]` (u16) into /// `rgba_out[3 + 4*n]` (u16). No depth conversion. -pub(crate) fn copy_alpha_ya_u16(packed: &[u16], rgba_out: &mut [u16], width: usize) { +/// +/// `BE` selects the **byte order** of the encoded source `packed` plane. +/// See [`copy_alpha_ya_u16_to_u8`] for the full rationale. +pub(crate) fn copy_alpha_ya_u16( + packed: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 2, "packed too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = packed[n * 2 + 1]; + let raw = if BE { + u16::from_be(packed[n * 2 + 1]) + } else { + u16::from_le(packed[n * 2 + 1]) + }; + rgba_out[n * 4 + 3] = raw; } } @@ -218,13 +296,42 @@ pub(crate) fn copy_alpha_ya_u16(packed: &[u16], rgba_out: &mut [u16], width: usi /// Each α sample is clamped to `[0.0, 1.0]`, multiplied by 255, and rounded /// with round-half-up (`+ 0.5` then truncate). Only slot 3 of every 4-element /// tuple is written; R, G, B slots are untouched. +/// +/// `BE` selects the **byte order** of the encoded source α plane: +/// `false` = LE on disk/wire (e.g., `AV_PIX_FMT_GBRAPF32LE` per the +/// `Gbrapf32Frame` contract; this also matches the case where the f32 +/// scratch is already host-native and the host is little-endian); +/// `true` = BE on disk/wire (or host-native scratch on a BE host). Each +/// raw f32 is bit-normalised to host-native order via +/// `f32::from_bits(u32::from_le(bits))` (or `from_be`) BEFORE the clamp / +/// scale / round-half-up. Without this a BE host (e.g., s390x) processing +/// the LE-encoded Frame would clamp byte-swapped garbage values, typically +/// producing α = 0 or α = 255 regardless of intent. Mirrors the +/// `copy_alpha_plane_u16_to_u8::` endian pattern. +/// +/// Routing pattern at the sinker layer: +/// - **Direct-Frame paths** (e.g., `Gbrapf32Frame` → α plane consumed directly) +/// pass `BE = false` (data is LE-encoded per the unified Frame contract). +/// - **Post-widen paths** (e.g., `Gbrapf16Frame` widened-to-f32 scratch) pass +/// `BE = HOST_NATIVE_BE` (scratch is host-native f32 after widen). // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). 
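+/// Illustrative call-site sketch of that routing (buffer names are
+/// placeholders; the const-generic values are the point):
+///
+/// ```ignore
+/// // Direct `Gbrapf32Frame` α plane — LE-encoded per the Frame contract:
+/// copy_alpha_plane_f32_to_u8::<false>(alpha_row, rgba_row, w);
+/// // f16→f32 widened scratch is already host-native, so pass the host's order:
+/// copy_alpha_plane_f32_to_u8::<{ cfg!(target_endian = "big") }>(widened_alpha, rgba_row, w);
+/// ```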
#[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32_to_u8(alpha: &[f32], rgba_out: &mut [u8], width: usize) { +pub(crate) fn copy_alpha_plane_f32_to_u8( + alpha: &[f32], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (alpha[n].clamp(0.0, 1.0) * 255.0 + 0.5) as u8; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + let v = f32::from_bits(host_bits); + rgba_out[n * 4 + 3] = (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8; } } @@ -232,13 +339,28 @@ pub(crate) fn copy_alpha_plane_f32_to_u8(alpha: &[f32], rgba_out: &mut [u8], wid /// /// Each α sample is clamped to `[0.0, 1.0]`, multiplied by 65535, and rounded /// with round-half-up. Only slot 3 of every 4-element tuple is written. +/// +/// `BE` selects the **byte order** of the encoded source α plane. +/// See [`copy_alpha_plane_f32_to_u8`] for the full rationale and the +/// direct-Frame vs post-widen routing pattern. // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). #[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32_to_u16(alpha: &[f32], rgba_out: &mut [u16], width: usize) { +pub(crate) fn copy_alpha_plane_f32_to_u16( + alpha: &[f32], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = (alpha[n].clamp(0.0, 1.0) * 65535.0 + 0.5) as u16; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + let v = f32::from_bits(host_bits); + rgba_out[n * 4 + 3] = (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16; } } @@ -247,13 +369,30 @@ pub(crate) fn copy_alpha_plane_f32_to_u16(alpha: &[f32], rgba_out: &mut [u16], w /// /// No clamping, no rounding — HDR values, NaN, and Inf in the α plane are /// preserved bit-exact. Only slot 3 of every 4-element tuple is written. +/// The output α is always written in **host-native** byte order (the +/// downstream consumer of `&[f32]` expects host-native floats); this helper's +/// `BE` only describes the **input** plane. +/// +/// `BE` selects the **byte order** of the encoded source α plane. +/// See [`copy_alpha_plane_f32_to_u8`] for the full rationale and the +/// direct-Frame vs post-widen routing pattern. // Not yet consumed by any sinker (Task 8 wires MixedSinker impls). 
#[allow(dead_code)] -pub(crate) fn copy_alpha_plane_f32(alpha: &[f32], rgba_out: &mut [f32], width: usize) { +pub(crate) fn copy_alpha_plane_f32( + alpha: &[f32], + rgba_out: &mut [f32], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = alpha[n]; + let bits = alpha[n].to_bits(); + let host_bits = if BE { + u32::from_be(bits) + } else { + u32::from_le(bits) + }; + rgba_out[n * 4 + 3] = f32::from_bits(host_bits); } } @@ -270,21 +409,56 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_to_u8_at_0_depth_converts_correctly() { let packed: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301,]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba, 2); + copy_alpha_packed_u16x4_to_u8_at_0::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0x12, 1, 1, 1, 0xAB]); } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_at_0_preserves_native_u16() { let packed: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301,]; let mut rgba = std::vec![1u16; 8]; - copy_alpha_packed_u16x4_at_0(&packed, &mut rgba, 2); + copy_alpha_packed_u16x4_at_0::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0x1234, 1, 1, 1, 0xABCD]); } + /// BE parity for AYUV64 alpha-at-slot-0 → u8 RGBA: byte-swapping the + /// packed source and toggling the `BE` flag must yield byte-for-byte + /// identical output. Locks down the corruption where a BE host + /// processing the LE-encoded Frame contract would emit a byte-reversed α. + #[test] + fn copy_alpha_packed_u16x4_to_u8_at_0_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 8]; + let mut rgba_be = std::vec![1u8; 8]; + copy_alpha_packed_u16x4_to_u8_at_0::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_to_u8_at_0::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for AYUV64 alpha-at-slot-0 → u16 RGBA. + #[test] + fn copy_alpha_packed_u16x4_at_0_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 100, 200, 300, 0xABCD, 101, 201, 301]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 8]; + let mut rgba_be = std::vec![7u16; 8]; + copy_alpha_packed_u16x4_at_0::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_at_0::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + #[test] fn copy_alpha_plane_u8_scatters_into_rgba_alpha_slot() { let alpha = std::vec![50u8, 60, 70, 80]; @@ -423,28 +597,71 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_ya_u16_to_u8_depth_converts_via_high_byte() { // Ya16 packed → u8 RGBA: α >> 8 selects the high byte. 
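+        // On a LE host `::<false>` makes `u16::from_le` a no-op, so host-native
+        // literals stand in for LE-encoded samples — hence the
+        // `#[cfg(target_endian = "little")]` gate on this test.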
let packed: std::vec::Vec = std::vec![0x1234, 0xABCD, 0x5678, 0xFF00]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_ya_u16_to_u8(&packed, &mut rgba, 2); + copy_alpha_ya_u16_to_u8::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xAB, 1, 1, 1, 0xFF]); } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_ya_u16_preserves_native_u16() { let packed: std::vec::Vec = std::vec![0x1234, 0xABCD, 0x5678, 0x9ABC]; let mut rgba = std::vec![1u16; 8]; - copy_alpha_ya_u16(&packed, &mut rgba, 2); + copy_alpha_ya_u16::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xABCD, 1, 1, 1, 0x9ABC]); } + /// BE parity for Ya16 → u8 RGBA: byte-swapping the packed source and + /// toggling the `BE` flag must yield byte-for-byte identical output. + /// Locks down the codex-flagged corruption where a BE host (e.g. + /// s390x) processing the LE-encoded `Ya16Frame` would otherwise emit + /// a byte-reversed α byte under the combined `with_rgb + with_rgba` + /// Strategy A+ path. #[test] + fn copy_alpha_ya_u16_to_u8_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 0xABCD, 0x5678, 0xFF00, 0x0001, 0x00FF]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 12]; + let mut rgba_be = std::vec![1u8; 12]; + copy_alpha_ya_u16_to_u8::(&packed_le, &mut rgba_le, 3); + copy_alpha_ya_u16_to_u8::(&packed_be, &mut rgba_be, 3); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for Ya16 → u16 RGBA (16-bit α path). + #[test] + fn copy_alpha_ya_u16_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![0x1234, 0xABCD, 0x5678, 0x9ABC, 0x0001, 0x00FF]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 12]; + let mut rgba_be = std::vec![7u16; 12]; + copy_alpha_ya_u16::(&packed_le, &mut rgba_le, 3); + copy_alpha_ya_u16::(&packed_be, &mut rgba_be, 3); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// On a LE host, `BE = false` makes the bit-normalize a no-op, so passing + /// host-native `f32` literals as if they were already LE-encoded reproduces + /// the original (pre-endian-aware) clamp+scale semantics. BE-host scalar + /// correctness is locked down by the `*_be_parity_with_swapped_buffer` + /// tests below. + #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_to_u8_clamps_and_scales() { // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 128, 255, 255, 0] in slot 3. let alpha = vec![0.0f32, 0.5, 1.0, 1.5, -0.1]; let mut rgba = vec![1u8; 20]; - copy_alpha_plane_f32_to_u8(&alpha, &mut rgba, 5); + copy_alpha_plane_f32_to_u8::(&alpha, &mut rgba, 5); // R, G, B slots (0, 1, 2) must be untouched; slot 3 has the alpha. assert_eq!(rgba[3], 0, "alpha[0]=0.0 → 0"); assert_eq!(rgba[7], 128, "alpha[1]=0.5 → 128"); @@ -458,11 +675,12 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_to_u16_clamps_and_scales() { // Values [0.0, 0.5, 1.0, 1.5, -0.1] → [0, 32768, 65535, 65535, 0] in slot 3. 
let alpha = vec![0.0f32, 0.5, 1.0, 1.5, -0.1]; let mut rgba = vec![1u16; 20]; - copy_alpha_plane_f32_to_u16(&alpha, &mut rgba, 5); + copy_alpha_plane_f32_to_u16::(&alpha, &mut rgba, 5); assert_eq!(rgba[3], 0, "alpha[0]=0.0 → 0"); assert_eq!(rgba[7], 32768, "alpha[1]=0.5 → 32768"); assert_eq!(rgba[11], 65535, "alpha[2]=1.0 → 65535"); @@ -475,11 +693,12 @@ mod tests { } #[test] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f32_lossless_passthrough() { // HDR (2.5), NaN, Inf, negative all preserved bit-exact. let alpha = vec![2.5f32, f32::NAN, f32::INFINITY, -1.0]; let mut rgba = vec![0.0f32; 16]; - copy_alpha_plane_f32(&alpha, &mut rgba, 4); + copy_alpha_plane_f32::(&alpha, &mut rgba, 4); assert_eq!(rgba[3], 2.5, "HDR 2.5 preserved"); assert!(rgba[7].is_nan(), "NaN preserved"); assert!(rgba[11].is_infinite() && rgba[11] > 0.0, "+Inf preserved"); @@ -490,43 +709,142 @@ mod tests { assert_eq!(rgba[2], 0.0); } + /// BE parity for Gbrapf32 → u8 RGBA: byte-swapping the bits of every + /// f32 in the source α plane and toggling `BE` must produce identical + /// output. Locks down the codex 3rd-pass finding where a BE host + /// processing the LE-encoded `Gbrapf32Frame` would clamp byte-swapped + /// garbage values (typical result: α = 0 or α = 255 regardless of intent). + #[test] + fn copy_alpha_plane_f32_to_u8_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 1.5, -0.1, 0.123]; + let alpha_be: std::vec::Vec = alpha_le + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect(); + let mut rgba_le = std::vec![1u8; 32]; + let mut rgba_be = std::vec![1u8; 32]; + copy_alpha_plane_f32_to_u8::(&alpha_le, &mut rgba_le, 8); + copy_alpha_plane_f32_to_u8::(&alpha_be, &mut rgba_be, 8); + assert_eq!( + rgba_le, rgba_be, + "BE flag + bit-swapped buffer must match LE path" + ); + } + + /// BE parity for Gbrapf32 → u16 RGBA. + #[test] + fn copy_alpha_plane_f32_to_u16_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 1.5, -0.1, 0.123]; + let alpha_be: std::vec::Vec = alpha_le + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect(); + let mut rgba_le = std::vec![7u16; 32]; + let mut rgba_be = std::vec![7u16; 32]; + copy_alpha_plane_f32_to_u16::(&alpha_le, &mut rgba_le, 8); + copy_alpha_plane_f32_to_u16::(&alpha_be, &mut rgba_be, 8); + assert_eq!( + rgba_le, rgba_be, + "BE flag + bit-swapped buffer must match LE path" + ); + } + + /// BE parity for Gbrapf32 → f32 RGBA (lossless α pass-through). The + /// output α must equal the host-native f32 bit-pattern of the LE source + /// regardless of the host's byte order. NaN bit-patterns may differ + /// across hardware after a `from_bits → to_bits` round-trip, so we + /// compare on the bit representation of finite, non-NaN samples only. 
+ #[test] + fn copy_alpha_plane_f32_be_parity_with_swapped_buffer() { + let alpha_le: std::vec::Vec = + std::vec![0.0f32, 0.25, 0.5, 0.75, 1.0, 2.5, -1.0, f32::INFINITY]; + let alpha_be: std::vec::Vec = alpha_le + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect(); + let mut rgba_le = std::vec![0.0f32; 32]; + let mut rgba_be = std::vec![0.0f32; 32]; + copy_alpha_plane_f32::(&alpha_le, &mut rgba_le, 8); + copy_alpha_plane_f32::(&alpha_be, &mut rgba_be, 8); + let bits_le: std::vec::Vec = rgba_le.iter().map(|v| v.to_bits()).collect(); + let bits_be: std::vec::Vec = rgba_be.iter().map(|v| v.to_bits()).collect(); + assert_eq!( + bits_le, bits_be, + "BE flag + bit-swapped buffer must match LE path bit-for-bit" + ); + } + // ---- copy_alpha_packed_u16x4_to_u8_at_3 / copy_alpha_packed_u16x4_at_3 -- /// Alpha at slot 3 is depth-converted >> 8 and written to rgba_out[3 + 4*n]. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_to_u8_at_3_narrows_correctly() { let packed: std::vec::Vec = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234]; let mut rgba = std::vec![1u8; 8]; - copy_alpha_packed_u16x4_to_u8_at_3(&packed, &mut rgba, 2); + copy_alpha_packed_u16x4_to_u8_at_3::(&packed, &mut rgba, 2); assert_eq!(rgba, std::vec![1, 1, 1, 0xAB, 1, 1, 1, 0x12]); } /// Alpha at slot 3 is copied verbatim (no depth conversion). #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_at_3_copies_verbatim() { let packed: std::vec::Vec = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234]; let mut rgba_u16 = std::vec![1u16; 8]; - copy_alpha_packed_u16x4_at_3(&packed, &mut rgba_u16, 2); + copy_alpha_packed_u16x4_at_3::(&packed, &mut rgba_u16, 2); assert_eq!(rgba_u16, std::vec![1, 1, 1, 0xABFF, 1, 1, 1, 0x1234]); } /// Only the alpha slot (index 3) is overwritten; RGB slots [0..3] are untouched. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_to_u8_at_3_touches_only_alpha_slot() { let packed: std::vec::Vec = std::vec![0, 0, 0, 0xFFFF]; let mut rgba = std::vec![42u8; 4]; - copy_alpha_packed_u16x4_to_u8_at_3(&packed, &mut rgba, 1); + copy_alpha_packed_u16x4_to_u8_at_3::(&packed, &mut rgba, 1); assert_eq!(rgba[..3], [42, 42, 42]); assert_eq!(rgba[3], 0xFF); } /// Only the alpha slot (index 3) is overwritten; RGB slots [0..3] are untouched. #[test] + #[cfg(target_endian = "little")] fn copy_alpha_packed_u16x4_at_3_touches_only_alpha_slot() { let packed: std::vec::Vec = std::vec![0, 0, 0, 0xBEEF]; let mut rgba_u16 = std::vec![99u16; 4]; - copy_alpha_packed_u16x4_at_3(&packed, &mut rgba_u16, 1); + copy_alpha_packed_u16x4_at_3::(&packed, &mut rgba_u16, 1); assert_eq!(rgba_u16[..3], [99, 99, 99]); assert_eq!(rgba_u16[3], 0xBEEF); } + + /// BE parity for Rgba64 / Bgra64 alpha-at-slot-3 → u8 RGBA. + #[test] + fn copy_alpha_packed_u16x4_to_u8_at_3_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![1u8; 8]; + let mut rgba_be = std::vec![1u8; 8]; + copy_alpha_packed_u16x4_to_u8_at_3::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_to_u8_at_3::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } + + /// BE parity for Rgba64 / Bgra64 alpha-at-slot-3 → u16 RGBA. 
+ #[test] + fn copy_alpha_packed_u16x4_at_3_be_parity_with_swapped_buffer() { + let packed_le: std::vec::Vec = std::vec![100, 200, 300, 0xABFF, 101, 201, 301, 0x1234]; + let packed_be: std::vec::Vec = packed_le.iter().map(|x| x.swap_bytes()).collect(); + let mut rgba_le = std::vec![7u16; 8]; + let mut rgba_be = std::vec![7u16; 8]; + copy_alpha_packed_u16x4_at_3::(&packed_le, &mut rgba_le, 2); + copy_alpha_packed_u16x4_at_3::(&packed_be, &mut rgba_be, 2); + assert_eq!( + rgba_le, rgba_be, + "BE flag + byte-swapped buffer must match LE path" + ); + } } diff --git a/src/row/scalar/planar_gbr_f16.rs b/src/row/scalar/planar_gbr_f16.rs index 3b9ba779..766c310b 100644 --- a/src/row/scalar/planar_gbr_f16.rs +++ b/src/row/scalar/planar_gbr_f16.rs @@ -198,16 +198,34 @@ pub(crate) fn gbrapf16_to_rgba_f16_row( /// Only slot 3 of every 4-element tuple is written; R, G, B slots are /// untouched. Lossless — HDR, NaN, and Inf in the α plane are preserved /// bit-exact. +/// +/// `BE` selects the **byte order** of the encoded source α plane +/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_GBRAPF16LE` per the +/// `Gbrapf16Frame` contract; `true` = BE on disk/wire). Each raw f16 is +/// bit-normalised to host-native order via `u16::from_le` / `u16::from_be` +/// BEFORE the slot-3 write so the output buffer always carries host-native +/// `half::f16` (matching the rest of the f16 row kernels). Without this a +/// BE host processing the LE-encoded Frame would emit byte-reversed α bits. // Only called from the `mod tests` block which is gated on `feature = "std"`. // Under `cargo test --no-default-features` the test module is compiled out, // leaving the function without callers; suppress the resulting lint there. #[cfg_attr(not(feature = "std"), expect(dead_code))] #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn copy_alpha_plane_f16(alpha: &[half::f16], rgba_out: &mut [half::f16], width: usize) { +pub(crate) fn copy_alpha_plane_f16( + alpha: &[half::f16], + rgba_out: &mut [half::f16], + width: usize, +) { debug_assert!(alpha.len() >= width, "alpha plane too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short"); for n in 0..width { - rgba_out[n * 4 + 3] = alpha[n]; + let raw = alpha[n].to_bits(); + let host_bits = if BE { + u16::from_be(raw) + } else { + u16::from_le(raw) + }; + rgba_out[n * 4 + 3] = half::f16::from_bits(host_bits); } } @@ -414,11 +432,12 @@ mod tests { miri, ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" )] + #[cfg(target_endian = "little")] fn copy_alpha_plane_f16_only_writes_alpha_slot() { let alpha = vec![half::f16::from_f32(0.7), half::f16::from_f32(0.3)]; let sentinel = half::f16::from_f32(0.1); let mut rgba = vec![sentinel; 8]; - copy_alpha_plane_f16(&alpha, &mut rgba, 2); + copy_alpha_plane_f16::(&alpha, &mut rgba, 2); // Only slot 3 written; R, G, B slots (0, 1, 2) must be untouched. assert_eq!(rgba[0], sentinel, "R slot 0 untouched"); assert_eq!(rgba[1], sentinel, "G slot 0 untouched"); @@ -429,4 +448,34 @@ mod tests { assert_eq!(rgba[6], sentinel, "B slot 1 untouched"); assert_eq!(rgba[7], half::f16::from_f32(0.3), "A slot 1"); } + + /// BE parity for `copy_alpha_plane_f16`: byte-swapping the bits of every + /// f16 in the source α plane and toggling `BE` must produce identical + /// output. Mirrors the f32 alpha-patch endian-aware fix. 
+ #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn copy_alpha_plane_f16_be_parity_with_swapped_buffer() { + let alpha_le = vec![ + half::f16::from_f32(0.0), + half::f16::from_f32(0.25), + half::f16::from_f32(0.5), + half::f16::from_f32(1.0), + half::f16::from_f32(2.5), + half::f16::from_f32(-1.0), + ]; + let alpha_be = be_encode_f16(&alpha_le); + let mut rgba_le = vec![half::f16::ZERO; 24]; + let mut rgba_be = vec![half::f16::ZERO; 24]; + copy_alpha_plane_f16::(&alpha_le, &mut rgba_le, 6); + copy_alpha_plane_f16::(&alpha_be, &mut rgba_be, 6); + let bits_le: std::vec::Vec = rgba_le.iter().map(|v| v.to_bits()).collect(); + let bits_be: std::vec::Vec = rgba_be.iter().map(|v| v.to_bits()).collect(); + assert_eq!( + bits_le, bits_be, + "BE flag + bit-swapped buffer must match LE path bit-for-bit" + ); + } } diff --git a/src/sinker/mixed/ayuv64.rs b/src/sinker/mixed/ayuv64.rs index 7782994d..6ab55456 100644 --- a/src/sinker/mixed/ayuv64.rs +++ b/src/sinker/mixed/ayuv64.rs @@ -350,7 +350,8 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_0( + // `Ayuv64Frame` is LE-encoded per the unified Frame contract → `BE = false`. + crate::row::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_0::( packed, rgba_row, w, use_simd, ); } @@ -404,7 +405,13 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::alpha_extract::copy_alpha_packed_u16x4_at_0(packed, rgba_u16_row, w, use_simd); + // `Ayuv64Frame` is LE-encoded per the unified Frame contract → `BE = false`. + crate::row::alpha_extract::copy_alpha_packed_u16x4_at_0::( + packed, + rgba_u16_row, + w, + use_simd, + ); } } diff --git a/src/sinker/mixed/gray.rs b/src/sinker/mixed/gray.rs index 5d6c44e1..b07b9191 100644 --- a/src/sinker/mixed/gray.rs +++ b/src/sinker/mixed/gray.rs @@ -1499,8 +1499,9 @@ impl PixelSink for MixedSinker<'_, Ya16> { let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - // Patch α from source (native u16 depth). - copy_alpha_ya_u16(packed, rgba_u16_row, w); + // Patch α from source (native u16 depth). `Ya16Frame` is LE-encoded + // per the unified Frame contract → `BE = false`. + copy_alpha_ya_u16::(packed, rgba_u16_row, w); } } @@ -1562,7 +1563,8 @@ impl PixelSink for MixedSinker<'_, Ya16> { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); // Overwrite the α channel with real source α (>> 8 for u8 output). - copy_alpha_ya_u16_to_u8(packed, rgba_row, w); + // `Ya16Frame` is LE-encoded per the unified Frame contract → `BE = false`. + copy_alpha_ya_u16_to_u8::(packed, rgba_row, w); } Ok(()) @@ -2408,6 +2410,139 @@ mod tests { assert_eq!(v, [0x80, 0xFF]); } + /// Strategy A+ (combined `with_rgb` + `with_rgba`) must produce α bytes + /// byte-identical to the standalone `with_rgba` path. 
Locks down the + /// codex-flagged corruption where a BE host processing the LE-encoded + /// `Ya16Frame` would otherwise diverge between the two paths: standalone + /// uses the endian-aware `ya16_to_rgba_row::` kernel; combined + /// expanded RGB → RGBA then patched α via `copy_alpha_ya_u16_to_u8` which + /// previously read raw `packed[n*2+1]` host-native and so emitted a + /// byte-reversed α byte on BE. After the fix, `copy_alpha_ya_u16_to_u8` + /// is target-endian-aware (`` for the LE Frame contract) and the + /// two paths agree on every host. + /// + /// To exercise the LE-encoded byte contract on every host we build the + /// `&[u16]` plane by bit-casting LE bytes — `u16::from_le_bytes` per + /// sample. On LE hosts that's a no-op; on BE hosts it byte-swaps so the + /// in-memory bytes match the FFmpeg `AV_PIX_FMT_YA16LE` layout. + #[test] + fn ya16_combined_rgb_and_rgba_alpha_matches_standalone_le_encoded() { + let w: u32 = 8; + let h: u32 = 1; + // Logical samples (Y, A) per pixel. + let samples: [(u16, u16); 8] = [ + (0x0000, 0xFFFF), + (0x8000, 0x4000), + (0xFFFF, 0x0000), + (0x1234, 0xABCD), + (0x00FF, 0xFF00), + (0x5A5A, 0xA5A5), + (0x7FFF, 0x8000), + (0xC000, 0x3FFF), + ]; + // Build the `&[u16]` plane such that its in-memory bytes match the + // FFmpeg `AV_PIX_FMT_YA16LE` byte layout on every host. We want a + // host-native u16 whose underlying bytes spell `[low, high]` (LE): + // `u16::from_ne_bytes(x.to_le_bytes())` is `x` on LE and `x.swap_bytes()` + // on BE — the right value to store in either case. + let le_encoded = |x: u16| -> u16 { u16::from_ne_bytes(x.to_le_bytes()) }; + let packed: std::vec::Vec = samples + .iter() + .flat_map(|&(y, a)| [le_encoded(y), le_encoded(a)]) + .collect(); + let frame = Ya16Frame::new(&packed, w, h, w * 2); + + // Run combined (with_rgb + with_rgba) — exercises Strategy A+ with the + // newly endian-aware `copy_alpha_ya_u16_to_u8::`. Forces + // `with_simd(false)` so the test runs purely scalar — no SIMD intrinsics + // — which lets it execute under `cargo miri test`. BE CI is driven by + // miri on s390x / powerpc64; gating it out of miri would skip exactly + // the host where BE corruption would surface. + let mut rgb_combined = std::vec![0u8; (w * h * 3) as usize]; + let mut rgba_combined = std::vec![0u8; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) + .with_rgb(&mut rgb_combined) + .unwrap() + .with_rgba(&mut rgba_combined) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + // Run standalone (with_rgba only) — exercises the endian-aware + // `ya16_to_rgba_row::` kernel. Same scalar-only rationale. + let mut rgba_standalone = std::vec![0u8; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) + .with_rgba(&mut rgba_standalone) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combined, rgba_standalone, + "combined (with_rgb+with_rgba) RGBA must equal standalone with_rgba" + ); + } + + /// u16 RGBA variant of the combined-vs-standalone parity check. Locks + /// down `copy_alpha_ya_u16::` (the u16 alpha-patch helper for + /// 16-bit RGBA outputs). 
+ #[test] + fn ya16_combined_rgb_u16_and_rgba_u16_alpha_matches_standalone_le_encoded() { + let w: u32 = 8; + let h: u32 = 1; + let samples: [(u16, u16); 8] = [ + (0x0000, 0xFFFF), + (0x8000, 0x4000), + (0xFFFF, 0x0000), + (0x1234, 0xABCD), + (0x00FF, 0xFF00), + (0x5A5A, 0xA5A5), + (0x7FFF, 0x8000), + (0xC000, 0x3FFF), + ]; + // See sibling test for the `le_encoded` rationale. + let le_encoded = |x: u16| -> u16 { u16::from_ne_bytes(x.to_le_bytes()) }; + let packed: std::vec::Vec = samples + .iter() + .flat_map(|&(y, a)| [le_encoded(y), le_encoded(a)]) + .collect(); + let frame = Ya16Frame::new(&packed, w, h, w * 2); + + // Forces `with_simd(false)` so this test runs purely scalar — no SIMD + // intrinsics — which lets it execute under `cargo miri test`. BE CI is + // driven by miri on s390x / powerpc64; gating it out of miri would skip + // exactly the host where BE corruption would surface. + let mut rgb_combined = std::vec![0u16; (w * h * 3) as usize]; + let mut rgba_combined = std::vec![0u16; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) + .with_rgb_u16(&mut rgb_combined) + .unwrap() + .with_rgba_u16(&mut rgba_combined) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + let mut rgba_standalone = std::vec![0u16; (w * h * 4) as usize]; + { + let mut sink = MixedSinker::::new(w as usize, h as usize) + .with_simd(false) + .with_rgba_u16(&mut rgba_standalone) + .unwrap(); + ya16_to(&frame, FR, M, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combined, rgba_standalone, + "combined (with_rgb_u16+with_rgba_u16) RGBA u16 must equal standalone" + ); + } + #[test] #[cfg_attr( miri, diff --git a/src/sinker/mixed/packed_rgb_16bit.rs b/src/sinker/mixed/packed_rgb_16bit.rs index 837b8246..49e34d84 100644 --- a/src/sinker/mixed/packed_rgb_16bit.rs +++ b/src/sinker/mixed/packed_rgb_16bit.rs @@ -726,7 +726,11 @@ impl PixelSink for MixedSinker<'_, Rgba64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3(in64, rgba_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3::( + in64, rgba_row, w, + ); } } @@ -759,7 +763,13 @@ impl PixelSink for MixedSinker<'_, Rgba64> { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3(in64, rgba_u16_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3::( + in64, + rgba_u16_row, + w, + ); } } @@ -995,7 +1005,11 @@ impl PixelSink for MixedSinker<'_, Bgra64> { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3(in64, rgba_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. 
+ crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_to_u8_at_3::( + in64, rgba_row, w, + ); } } @@ -1024,7 +1038,13 @@ impl PixelSink for MixedSinker<'_, Bgra64> { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w); - crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3(in64, rgba_u16_row, w); + // `Rgba64Frame` / `Bgra64Frame` are LE-encoded per the unified Frame + // contract → `BE = false`. + crate::row::scalar::alpha_extract::copy_alpha_packed_u16x4_at_3::( + in64, + rgba_u16_row, + w, + ); } } diff --git a/src/sinker/mixed/packed_rgb_f16.rs b/src/sinker/mixed/packed_rgb_f16.rs index 62ab1cc8..e349f130 100644 --- a/src/sinker/mixed/packed_rgb_f16.rs +++ b/src/sinker/mixed/packed_rgb_f16.rs @@ -34,25 +34,6 @@ use crate::{ yuv::{Rgbf16, Rgbf16Row, Rgbf16Sink}, }; -/// `BE` value that makes the `rgbf16_to_*` row dispatchers treat their input as -/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf16Frame`] -/// exposes a `&[half::f16]` row in **host-native** layout — the API contract is that the -/// caller hands us already-decoded half-floats. The kernel `BE` parameter, -/// however, names the **encoded** byte order (so `BE = false` means "decode -/// LE-encoded bytes" via `u16::from_le`). On a LE host the host-native layout -/// is LE, so `BE = false` is correct; on a BE host the host-native layout is -/// BE, so we must request `BE = true` to make `u16::from_be` no-op the swap. -/// Without this routing the loaders would byte-swap an already-decoded host- -/// native `f16` on BE hosts, corrupting every output path. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` — same truth table, different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Rgbf16 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf16> { @@ -253,27 +234,27 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if let Some(buf) = rgb_f16.as_deref_mut() { let f16_start = one_plane_start * 3; let f16_end = one_plane_end * 3; - rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); + rgbf16_to_rgb_f16_row::(rgb_in, &mut buf[f16_start..f16_end], w, use_simd); } // Lossless f32 widen — also independent of integer conversion paths. if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf16_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct half-float → u16 conversion (no staging). if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf16_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct half-float → u16 conversion (no staging). 
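+        // (`Rgbf16Frame` is LE-encoded per the unified Frame contract, so the
+        // direct-conversion kernels below take the LE decode path — `BE = false`.)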
if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float → u8 when no RGB / luma / @@ -288,7 +269,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -307,7 +288,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { w, h, )?; - rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf16_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -347,7 +328,7 @@ impl PixelSink for MixedSinker<'_, Rgbf16> { // over `rgb_row` via `expand_rgb_to_rgba_row`. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf16_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_float.rs b/src/sinker/mixed/packed_rgb_float.rs index f189e5ab..e1c17a39 100644 --- a/src/sinker/mixed/packed_rgb_float.rs +++ b/src/sinker/mixed/packed_rgb_float.rs @@ -31,25 +31,6 @@ use crate::{ yuv::{Rgbf32, Rgbf32Row, Rgbf32Sink}, }; -/// `BE` value that makes the `rgbf32_to_*` row dispatchers treat their input as -/// host-native (a no-op byte-swap). Used here because [`crate::frame::Rgbf32Frame`] -/// exposes a `&[f32]` row in **host-native** layout — the API contract is that the caller -/// hands us already-decoded floats. The kernel `BE` parameter, however, names -/// the **encoded** byte order (so `BE = false` means "decode LE-encoded bytes" -/// via `u32::from_le`). On a LE host the host-native layout is LE, so -/// `BE = false` is correct; on a BE host the host-native layout is BE, so we -/// must request `BE = true` to make `u32::from_be` no-op the swap. Without this -/// routing the loaders would byte-swap an already-decoded host-native `f32` on -/// BE hosts, corrupting every output path. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` — same truth table, different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Rgbf32 impl ------------------------------------------------------- impl<'a> MixedSinker<'a, Rgbf32> { @@ -228,20 +209,20 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if let Some(buf) = rgb_f32.as_deref_mut() { let f32_start = one_plane_start * 3; let f32_end = one_plane_end * 3; - rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); + rgbf32_to_rgb_f32_row::(rgb_in, &mut buf[f32_start..f32_end], w, use_simd); } // u16 RGB output — direct float→u16 conversion (no staging). 
if let Some(buf) = rgb_u16.as_deref_mut() { let u16_start = one_plane_start * 3; let u16_end = one_plane_end * 3; - rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); + rgbf32_to_rgb_u16_row::(rgb_in, &mut buf[u16_start..u16_end], w, use_simd); } // u16 RGBA output — direct float→u16 conversion (no staging). if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_u16_row::(rgb_in, rgba_row, w, use_simd); } // u8 RGBA standalone fast path — direct float→u8 conversion when @@ -256,7 +237,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { if want_rgba_u8 && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); return Ok(()); } @@ -276,7 +257,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { w, h, )?; - rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); + rgbf32_to_rgb_row::(rgb_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -318,7 +299,7 @@ impl PixelSink for MixedSinker<'_, Rgbf32> { // less memory pass for combined `with_rgb + with_rgba` callers. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); + rgbf32_to_rgba_row::(rgb_in, rgba_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index c041b901..22159b92 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -51,7 +51,8 @@ use crate::{ gbrapf32_to_rgba_u16_row, gbrpf16_to_rgb_f16_row, gbrpf16_to_rgb_row, gbrpf16_to_rgba_f16_row, gbrpf16_to_rgba_row, gbrpf32_to_hsv_row, gbrpf32_to_luma_row, gbrpf32_to_luma_u16_row, gbrpf32_to_rgb_f32_row, gbrpf32_to_rgb_u16_row, gbrpf32_to_rgba_f32_row, - gbrpf32_to_rgba_u16_row, scalar::alpha_extract::copy_alpha_plane_f32_to_u8, + gbrpf32_to_rgba_u16_row, + scalar::{alpha_extract::copy_alpha_plane_f32_to_u8, planar_gbr_f16::widen_f16_be_to_host_f32}, }, yuv::{Gbrapf16, Gbrapf16Row, Gbrapf16Sink, Gbrpf16, Gbrpf16Row, Gbrpf16Sink}, }; @@ -64,56 +65,22 @@ const GBR_F16_FULL_RANGE: bool = true; // Chunk size for the inline f16→f32 widening scratch arrays (stack-allocated). const WIDEN_CHUNK: usize = 64; -/// `BE` value that makes the `gbrpf16_to_*` / `gbrapf16_to_*` row dispatchers -/// (and the widened `gbrpf32_to_*` chain after `widen_f16_to_f32`) treat -/// their input as **host-native** (a no-op byte-swap). -/// -/// [`crate::frame::Gbrpf16Frame`] / [`crate::frame::Gbrapf16Frame`] expose -/// `&[half::f16]` plane rows in **host-native** layout — the API contract -/// is that the caller hands us already-decoded half-floats. The kernel `BE` -/// parameter, however, names the **encoded** byte order (so `BE = false` -/// means "decode LE-encoded bytes" via `u16::from_le`). On a LE host the -/// host-native layout is LE, so `BE = false` is correct; on a BE host the -/// host-native layout is BE, so we must request `BE = true` to make -/// `u16::from_be` no-op the swap. Without this routing the loaders would -/// byte-swap an already-decoded host-native `f16` on BE hosts, corrupting -/// every output path (codex PR #84 Finding 3). 
-/// -/// Crucially, the **widened f32 chain** must also use `HOST_NATIVE_BE`: -/// after [`widen_f16_to_f32`] (which calls `half::f16::to_f32` on host-native -/// f16 bits) the scratch is host-native f32, so the downstream -/// `gbrpf32_to_*` kernel's `from_le`/`from_be` loader must be a no-op — -/// achieved by routing with `HOST_NATIVE_BE`. -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced in `c3a6478` and the `Rgbf16` sinker fix in -/// `dcf40a3`. Same truth table: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -/// -/// The α-plane scatter for [`Gbrapf16`] (Strategy A+ / standalone-RGBA) -/// widens the host-native f16 α plane to host-native f32 via -/// [`widen_f16_to_f32`] then calls `copy_alpha_plane_f32_to_u8` — both -/// operations are endian-agnostic. Mix-mode corruption (LE-decoded RGB + -/// host-native α) is therefore eliminated by routing the RGB chain via -/// `HOST_NATIVE_BE`. +// Endianness routing for **post-widen** `gbrpf32_to_*` calls. +// +// `widen_f16_be_to_host_f32::` produces **host-native f32 scratch** from +// LE-encoded f16 plane bits (it normalises bits before widening), so the +// downstream `gbrpf32_to_*::` kernel sees input that already +// matches the host's byte order. The kernel's `from_le` / `from_be` then +// becomes a no-op on every host — correct. +// +// Distinct from the **direct** Frame-to-row-kernel pattern elsewhere in this +// file (the `gbrpf16_to_*::` u8/f16 calls): those receive raw LE-encoded +// `&[half::f16]` plane bytes per the unified Frame contract, so they pass +// `BE = false` to tell the kernel to apply `from_le`. Post-widen scratch is +// already host-native, so it must use `BE = HOST_NATIVE_BE` to keep the kernel +// byte-swap a no-op on every host. const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); -/// Widen `width` `half::f16` values from `src` into `dst` (f32 elements). -/// -/// The source slice is `&[half::f16]` in **host-native** layout (per the -/// `Gbrpf16Frame` / `Gbrapf16Frame` API contract); `to_f32` interprets the -/// bits as host-native and emits host-native `f32`. Downstream `gbrpf32_to_*` -/// callers must therefore route with [`HOST_NATIVE_BE`] (not the encoded -/// `BE`) to avoid double byte-swapping. 
-#[cfg_attr(not(tarpaulin), inline(always))] -fn widen_f16_to_f32(src: &[half::f16], dst: &mut [f32], count: usize) { - for i in 0..count { - dst[i] = src[i].to_f32(); - } -} - // ---- Gbrpf16 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf16> { @@ -351,7 +318,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -363,14 +330,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { height: h, channels: 4, })?; - gbrpf16_to_rgba_f16_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf16_to_rgba_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- Paths that require widening f16 → f32 --------------------------- @@ -394,9 +354,14 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { let mut offset = 0; while offset < w { let n = (w - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&g_in[offset..], &mut gf_chunk, n); - widen_f16_to_f32(&b_in[offset..], &mut bf_chunk, n); - widen_f16_to_f32(&r_in[offset..], &mut rf_chunk, n); + // Bit-normalise LE-encoded f16 plane bits → host-native f32 so the + // downstream `gbrpf32_to_*` kernel (invoked with `BE = HOST_NATIVE_BE` + // — see module-scope constant) sees host-native f32 on every host. + // The post-widen scratch is host-native, distinct from the direct- + // Frame paths which use `` per the LE-encoded byte contract. + widen_f16_be_to_host_f32::(g_in, offset, &mut gf_chunk, n); + widen_f16_be_to_host_f32::(b_in, offset, &mut bf_chunk, n); + widen_f16_be_to_host_f32::(r_in, offset, &mut rf_chunk, n); let gf = &gf_chunk[..n]; let bf = &bf_chunk[..n]; let rf = &rf_chunk[..n]; @@ -480,7 +445,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf16_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -504,7 +469,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { w, h, )?; - gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf16_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); // Strategy A: expand RGB → RGBA (constant α = 0xFF). if let Some(buf) = rgba.as_deref_mut() { @@ -761,7 +726,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // rgb_f16: no source α — use the no-α kernel (lossless scatter). 
             let start = one_plane_start * 3;
             let end = one_plane_end * 3;
-            gbrpf16_to_rgb_f16_row::<HOST_NATIVE_BE>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd);
+            gbrpf16_to_rgb_f16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd);
         }

         if let Some(buf) = self.rgba_f16.as_deref_mut() {
@@ -774,15 +739,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
                 height: h,
                 channels: 4,
             })?;
-            gbrapf16_to_rgba_f16_row::<HOST_NATIVE_BE>(
-                g_in,
-                b_in,
-                r_in,
-                a_in,
-                &mut buf[start..end],
-                w,
-                use_simd,
-            );
+            gbrapf16_to_rgba_f16_row::<false>(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd);
         }

         // ---- Paths that require widening f16 → f32 ---------------------------
@@ -806,10 +763,12 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
         let mut offset = 0;
         while offset < w {
             let n = (w - offset).min(WIDEN_CHUNK);
-            widen_f16_to_f32(&g_in[offset..], &mut gf_chunk, n);
-            widen_f16_to_f32(&b_in[offset..], &mut bf_chunk, n);
-            widen_f16_to_f32(&r_in[offset..], &mut rf_chunk, n);
-            widen_f16_to_f32(&a_in[offset..], &mut af_chunk, n);
+            // Bit-normalise LE-encoded f16 plane bits → host-native f32 (see the
+            // canonical helper's docs); downstream kernel uses `BE = HOST_NATIVE_BE`.
+            widen_f16_be_to_host_f32::<false>(g_in, offset, &mut gf_chunk, n);
+            widen_f16_be_to_host_f32::<false>(b_in, offset, &mut bf_chunk, n);
+            widen_f16_be_to_host_f32::<false>(r_in, offset, &mut rf_chunk, n);
+            widen_f16_be_to_host_f32::<false>(a_in, offset, &mut af_chunk, n);
             let gf = &gf_chunk[..n];
             let bf = &bf_chunk[..n];
             let rf = &rf_chunk[..n];
@@ -918,7 +877,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
             let rgba_buf = self.rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
             // Write opaque RGB → RGBA (α = 0xFF), then overwrite α from source.
-            gbrpf16_to_rgba_row::<HOST_NATIVE_BE>(g_in, b_in, r_in, rgba_row, w, use_simd);
+            gbrpf16_to_rgba_row::<false>(g_in, b_in, r_in, rgba_row, w, use_simd);
             // Scatter f16 α → u8 slot 3: widen + clamp + scale.
             widen_and_scatter_f16_alpha_to_u8(a_in, rgba_row, w);
             return Ok(());
@@ -944,7 +903,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
             w,
             h,
         )?;
-        gbrpf16_to_rgb_row::<HOST_NATIVE_BE>(g_in, b_in, r_in, rgb_row, w, use_simd);
+        gbrpf16_to_rgb_row::<false>(g_in, b_in, r_in, rgb_row, w, use_simd);

         // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from source.
         if let Some(buf) = rgba.as_deref_mut() {
@@ -962,14 +921,26 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> {
 ///
 /// Used by `Gbrapf16` Strategy A+ and standalone-RGBA paths to overwrite
 /// the per-pixel alpha byte from the f16 source α plane.
+///
+/// Endian routing: `widen_f16_be_to_host_f32::<false>` converts the
+/// LE-encoded `Gbrapf16Frame` α plane bits into **host-native f32**
+/// scratch. The downstream `copy_alpha_plane_f32_to_u8` therefore receives
+/// host-native f32 input, not LE-encoded f32, and must be invoked with
+/// `BE = HOST_NATIVE_BE` so the kernel's `from_le` / `from_be` is a no-op
+/// on every host (no second byte-swap). This is the **post-widen** routing
+/// pattern; contrast with `planar_gbr_float.rs` which calls the same
+/// helper with `BE = false` because it consumes the **direct** LE-encoded
+/// `Gbrapf32Frame` α plane.
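+///
+/// A minimal per-element sketch of that post-widen step (illustrative only;
+/// `raw_le` stands for a hypothetical LE-encoded `half::f16` element, not an
+/// identifier from this crate):
+///
+/// ```ignore
+/// // LE-encoded f16 bits → host-native f16 → host-native f32.
+/// // `u16::from_le` is a no-op on an LE host and a byte swap on a BE host.
+/// let host_f32 = half::f16::from_bits(u16::from_le(raw_le.to_bits())).to_f32();
+/// ```
+///
+/// After that step the scratch is host-native, which is why the clamp/scale
+/// helper below must not be asked to swap a second time.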
#[cfg_attr(not(tarpaulin), inline(always))] fn widen_and_scatter_f16_alpha_to_u8(alpha_f16: &[half::f16], rgba_out: &mut [u8], width: usize) { let mut af_chunk = [0.0f32; WIDEN_CHUNK]; let mut offset = 0; while offset < width { let n = (width - offset).min(WIDEN_CHUNK); - widen_f16_to_f32(&alpha_f16[offset..], &mut af_chunk, n); - copy_alpha_plane_f32_to_u8(&af_chunk[..n], &mut rgba_out[offset * 4..], n); + // Bit-normalise LE-encoded f16 α bits → host-native f32 before clamping + // and scaling to u8 — correct on both LE and BE hosts. + widen_f16_be_to_host_f32::(alpha_f16, offset, &mut af_chunk, n); + copy_alpha_plane_f32_to_u8::(&af_chunk[..n], &mut rgba_out[offset * 4..], n); offset += n; } } diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index 7b979049..ddee135b 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -53,36 +53,6 @@ use crate::{ const GBR_FLOAT_LUMA_MATRIX: ColorMatrix = ColorMatrix::Bt709; const GBR_FLOAT_FULL_RANGE: bool = true; -/// `BE` value that makes the `gbrpf32_to_*` / `gbrapf32_to_*` row dispatchers -/// treat their input as **host-native** (a no-op byte-swap). -/// -/// [`crate::frame::Gbrpf32Frame`] / [`crate::frame::Gbrapf32Frame`] expose -/// `&[f32]` plane rows in **host-native** layout — the API contract is that -/// the caller hands us already-decoded floats. The kernel `BE` parameter, -/// however, names the **encoded** byte order (so `BE = false` means "decode -/// LE-encoded bytes" via `u32::from_le`). On a LE host the host-native layout -/// is LE, so `BE = false` is correct; on a BE host the host-native layout is -/// BE, so we must request `BE = true` to make `u32::from_be` no-op the swap. -/// Without this routing the loaders would byte-swap an already-decoded host- -/// native `f32` on BE hosts, corrupting every output path (codex PR #84 -/// Finding 2). -/// -/// This is the **sinker-layer** complement to the SIMD-backend-internal -/// `HOST_NATIVE_BE` introduced for the f16→f32 widen-then-convert paths in -/// `c3a6478` and the `Rgbf32` sinker fix in `dcf40a3`. Same truth table, -/// different layer: -/// -/// • LE host: `HOST_NATIVE_BE = false` → `from_le` (no-op on LE) → correct. -/// • BE host: `HOST_NATIVE_BE = true` → `from_be` (no-op on BE) → correct. -/// -/// The α-plane scatter (Strategy A+ / standalone-RGBA) consumes the host- -/// native `&[f32]` α plane via `copy_alpha_plane_f32_to_u8`, which is endian- -/// agnostic — there's no BE branching needed for the α path because it does -/// not byte-load through `from_le`/`from_be`. Mix-mode corruption (LE-decoded -/// RGB + host-native α) is therefore eliminated by routing the RGB chain via -/// `HOST_NATIVE_BE`. 
-const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - // ---- Gbrpf32 accessor impl block ---------------------------------------- impl<'a> MixedSinker<'a, Gbrpf32> { @@ -321,7 +291,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -333,14 +303,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f32_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf32_to_rgba_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -348,7 +311,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -360,14 +323,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f16_row::( - g_in, - b_in, - r_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrpf32_to_rgba_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB / RGBA path (direct float → u16, no staging) ----------- @@ -375,12 +331,12 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_u16_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf32_to_rgba_u16_row::(g_in, b_in, r_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path (no RGB / luma / HSV needed) ------- @@ -395,7 +351,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf32_to_rgba_row::(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -422,10 +378,10 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { w, h, )?; - gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( g_in, b_in, r_in, @@ -438,7 +394,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( g_in, b_in, r_in, @@ -451,7 +407,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( g_in, b_in, r_in, @@ -721,7 +677,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = 
one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -733,15 +689,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f32_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf32_to_rgba_f32_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -749,7 +697,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -761,15 +709,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f16_row::( - g_in, - b_in, - r_in, - a_in, - &mut buf[start..end], - w, - use_simd, - ); + gbrapf32_to_rgba_f16_row::(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB path (direct, no staging) ------------------------------ @@ -777,14 +717,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGBA path (direct — source α clamped + scaled) ------------- if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_u16_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_u16_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path ------------------------------------ @@ -799,7 +739,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_row::(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); return Ok(()); } @@ -826,10 +766,10 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { w, h, )?; - gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row::( + gbrpf32_to_luma_row::( g_in, b_in, r_in, @@ -842,7 +782,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row::( + gbrpf32_to_luma_u16_row::( g_in, b_in, r_in, @@ -855,7 +795,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row::( + gbrpf32_to_hsv_row::( g_in, b_in, r_in, @@ -869,10 +809,20 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from // the source f32 α plane (clamped × 255 → u8). 
+ // + // `BE = false`: `a_in` is the **direct** Gbrapf32Frame α plane, which + // is LE-encoded f32 per the Phase-1 unified Frame contract. The helper + // bit-normalises each f32 to host-native order before clamp/scale, so + // the conversion compiles to a no-op on LE hosts and a `swap_bytes` on + // BE hosts (e.g., s390x). Without this BE hosts would clamp byte- + // swapped garbage and emit α = 0 / 255 regardless of intent. Distinct + // from the **post-widen** routing in `planar_gbr_f16.rs` + // (`widen_and_scatter_f16_alpha_to_u8`), which feeds host-native f32 + // scratch into the same helper with `BE = HOST_NATIVE_BE`. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; expand_rgb_to_rgba_row(rgb_row, rgba_row, w); - copy_alpha_plane_f32_to_u8(a_in, rgba_row, w); + copy_alpha_plane_f32_to_u8::(a_in, rgba_row, w); } Ok(()) diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 1cd09a1f..574e482c 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -311,131 +311,49 @@ fn rgbf16_simd_matches_scalar_with_random_input() { assert_eq!(rgb_f16_simd, pix, "RGB f16 output is not lossless"); } -/// Sinker-layer host-native-`f16` regression for the bug fixed alongside -/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf16`] sinker used to -/// hardcode `::` when calling the row dispatchers, telling them to -/// "decode LE-encoded input". Because [`Rgbf16Frame`] hands us a host-native -/// `&[half::f16]` row, that routing was a no-op on LE hosts but corrupted -/// every output path on BE hosts (the `u16` loaders would byte-swap an -/// already-decoded f16 bit-pattern). The fix replaces those `::` with -/// `::`, which is `false` on LE and `true` on BE — a no-op -/// byte-swap on either host. +/// LE-encoded byte contract regression: builds an [`Rgbf16Frame`] from a +/// `&[half::f16]` plane explicitly encoded as LE bytes (per the FFmpeg +/// `AV_PIX_FMT_*LE` convention documented on `Rgbf16Frame`), runs it +/// through the sinker's `with_rgb_f16` lossless pass-through, and asserts +/// the output equals the host-native intended values. /// -/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run), -/// `HOST_NATIVE_BE = false` and `::` is byte-for-byte -/// identical to `::`, so this test cannot distinguish the broken vs -/// fixed code on LE. It instead documents the equivalence at the **kernel -/// dispatch** layer — calling each `rgbf16_to_*` dispatcher with both -/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`) -/// must produce identical output on the active host. +/// Vacuous on LE hosts (where `to_le` on a `u16` is a no-op so the LE- +/// encoded plane *is* host-native), but on a BE host this would fail fast +/// for any regression that drops the `::` kernel routing — the +/// kernel must apply `u16::from_le` to recover host-native f16 bit-patterns +/// from the LE-encoded bytes. /// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the -/// equality `::` ≡ `::` is _false_ — `::` -/// decodes the host-native fixture as if it were LE-encoded (byte-swap), -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. 
The dispatch-equivalence claim is specifically -/// about the LE host-routing pattern; the BE-host correctness of the routing -/// change is verified instead by -/// [`rgbf16_sinker_host_native_contract_lossless_passthrough`] and the -/// row-kernel BE parity tests in `src/row/arch/*/tests/`. -#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf16_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - rgbf16_to_rgb_f16_row, rgbf16_to_rgb_f32_row, rgbf16_to_rgb_row, rgbf16_to_rgb_u16_row, - rgbf16_to_rgba_row, rgbf16_to_rgba_u16_row, - }; - - // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. - // Compute it locally so the test asserts the same condition without taking - // a dependency on a private const. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33 covers SIMD main loop + scalar tail across every backend. - let w = 33usize; - let f32_inputs = [0.0f32, 0.5, 1.0, 1.75, -0.25]; - let pix: std::vec::Vec = (0..w * 3) - .map(|i| half::f16::from_f32(f32_inputs[i % f32_inputs.len()])) - .collect(); - - // u8 RGB. - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - rgbf16_to_rgb_row::(&pix, &mut rgb_false, w, true); - rgbf16_to_rgb_row::(&pix, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); - - // u8 RGBA. - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - rgbf16_to_rgba_row::(&pix, &mut rgba_false, w, true); - rgbf16_to_rgba_row::(&pix, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); - - // u16 RGB. - let mut rgb_u16_false = std::vec![0u16; w * 3]; - let mut rgb_u16_host = std::vec![0u16; w * 3]; - rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); - rgbf16_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); - assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); - - // u16 RGBA. - let mut rgba_u16_false = std::vec![0u16; w * 4]; - let mut rgba_u16_host = std::vec![0u16; w * 4]; - rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); - rgbf16_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); - assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); - - // f16 lossless pass-through. - let mut f16_false = std::vec![half::f16::ZERO; w * 3]; - let mut f16_host = std::vec![half::f16::ZERO; w * 3]; - rgbf16_to_rgb_f16_row::(&pix, &mut f16_false, w, true); - rgbf16_to_rgb_f16_row::(&pix, &mut f16_host, w, true); - assert_eq!(f16_false, f16_host, "f16 RGB diverges"); - if !HOST_NATIVE_BE { - assert_eq!( - f16_host, pix, - "f16 lossless pass-through corrupted on LE host" - ); - } - - // f32 lossless widen. - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - rgbf16_to_rgb_f32_row::(&pix, &mut f32_false, w, true); - rgbf16_to_rgb_f32_row::(&pix, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 widen diverges"); -} - -/// End-to-end sinker contract test: feeding host-native `half::f16` through -/// [`MixedSinker`] must round-trip the f16 input bit-exact via -/// `with_rgb_f16` on every host. Documents the public-API contract that the -/// [`HOST_NATIVE_BE`] routing fix preserves. Pairs with the kernel-level -/// test above; together they cover both the dispatch boundary and the public -/// sinker boundary. +/// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. 
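+///
+/// The fixture below derives the LE-encoded plane from the intended values
+/// one element at a time; as a sketch of the pattern (nothing here beyond
+/// `half::f16` and the standard `u16::to_le`):
+///
+/// ```ignore
+/// // Identity on an LE host; a byte swap of the bit pattern on a BE host.
+/// let le_encoded = half::f16::from_bits(intended.to_bits().to_le());
+/// ```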
+///
+/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD
+/// intrinsics — which lets it execute under `cargo miri test`. BE CI is
+/// driven by miri on s390x / powerpc64; gating it out of miri (per the
+/// codex 4th-pass finding) would skip exactly the host where BE corruption
+/// would surface.
 #[test]
-#[cfg_attr(
-    miri,
-    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
-)]
-fn rgbf16_sinker_host_native_contract_lossless_passthrough() {
+fn rgbf16_sinker_le_encoded_frame_decodes_correctly() {
     let vals_f32 = [0.5f32, 1.5, -0.25, 100.0];
-    let pix: std::vec::Vec<half::f16> = (0..16 * 4 * 3)
+    let intended: Vec<half::f16> = (0..16 * 4 * 3)
         .map(|i| half::f16::from_f32(vals_f32[i % vals_f32.len()]))
         .collect();
+    // Encode the plane as LE bytes reinterpreted as f16 (the documented
+    // `*LE` Frame contract). On LE host: identity. On BE host: byte-swapped
+    // bit-patterns the kernel must `from_le` back to host-native.
+    let pix: Vec<half::f16> = intended
+        .iter()
+        .map(|&v| half::f16::from_bits(v.to_bits().to_le()))
+        .collect();
     let src = Rgbf16Frame::try_new(&pix, 16, 4, 16 * 3).unwrap();
     let mut rgb_f16_out = std::vec![half::f16::ZERO; 16 * 4 * 3];
     let mut sink = MixedSinker::<Rgbf16>::new(16, 4)
+        .with_simd(false)
         .with_rgb_f16(&mut rgb_f16_out)
         .unwrap();
     rgbf16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
-    // Bit-exact pass-through on every host — broken `::<false>` routing
-    // would byte-swap on a BE host; the fixed routing keeps the f16 intact.
-    assert_eq!(rgb_f16_out, pix, "Rgbf16 sinker f16 pass-through corrupted");
+    assert_eq!(
+        rgb_f16_out, intended,
+        "Rgbf16 sinker LE-encoded plane decoded incorrectly"
+    );
 }
diff --git a/src/sinker/mixed/tests/packed_rgb_float.rs b/src/sinker/mixed/tests/packed_rgb_float.rs
index fd4df2da..8bd01266 100644
--- a/src/sinker/mixed/tests/packed_rgb_float.rs
+++ b/src/sinker/mixed/tests/packed_rgb_float.rs
@@ -246,140 +246,59 @@ fn rgbf32_simd_matches_scalar_with_random_input() {
     assert_eq!(rgb_f32_simd, pix, "RGB f32 output is not lossless");
 }

-/// Sinker-layer host-native-`f32` regression for the bug fixed alongside
-/// `c3a6478` (PR #83 codex 2nd-pass review): the [`Rgbf32`] sinker used to
-/// hardcode `::<false>` when calling the row dispatchers, telling them to
-/// "decode LE-encoded input". Because [`Rgbf32Frame`] hands us a host-native
-/// `&[f32]` row, that routing was a no-op on LE hosts but corrupted every
-/// output path on BE hosts (the loaders would byte-swap an already-decoded
-/// f32). The fix replaces those `::<false>` with `::<HOST_NATIVE_BE>`, which
-/// is `false` on LE and `true` on BE — a no-op byte-swap on either host.
+/// LE-encoded byte contract regression: builds an [`Rgbf32Frame`] from a
+/// `&[f32]` plane explicitly encoded as LE bytes (per the FFmpeg
+/// `AV_PIX_FMT_*LE` convention documented on `Rgbf32Frame`), runs it
+/// through the sinker's `with_rgb_f32` lossless pass-through, and asserts
+/// the output equals the host-native intended values.
 ///
-/// On a LE host (the only target Apple-Silicon and x86_64 macOS can run),
-/// `HOST_NATIVE_BE = false` and `::<HOST_NATIVE_BE>` is byte-for-byte
-/// identical to `::<false>`, so this test cannot distinguish the broken vs
-/// fixed code on LE. It instead documents the equivalence at the **kernel
-/// dispatch** layer — calling each `rgbf32_to_*` dispatcher with both
-/// `BE = false` and `BE = HOST_NATIVE_BE` (= `cfg!(target_endian = "big")`)
-/// must produce identical output on the active host.
+/// Vacuous on LE hosts (where `f32::to_le_bytes` is a no-op so the LE- +/// encoded plane *is* host-native), but on a BE host this would fail fast +/// for any regression that drops the `::` kernel routing — the +/// kernel must apply `u32::from_le` to recover host-native f32 from the +/// LE-encoded bytes; if it skipped the swap (e.g. `::` on +/// BE), the output would be byte-swapped relative to `intended`. /// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host the -/// equality `::` ≡ `::` is _false_ — `::` -/// decodes the host-native fixture as if it were LE-encoded (byte-swap), -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. The dispatch-equivalence claim is specifically -/// about the LE host-routing pattern; the BE-host correctness of the routing -/// change is verified instead by -/// [`rgbf32_sinker_host_native_contract_lossless_passthrough`] and the -/// row-kernel BE parity tests in `src/row/arch/*/tests/`. -#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf32_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - rgbf32_to_rgb_f32_row, rgbf32_to_rgb_row, rgbf32_to_rgb_u16_row, rgbf32_to_rgba_row, - rgbf32_to_rgba_u16_row, - }; - - // The sinker layer's `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`. - // Compute it locally so the test asserts the same condition without taking - // a dependency on a private const. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33 covers SIMD main loop + scalar tail across every backend. - let w = 33usize; - let mut pix = std::vec![0.0f32; w * 3]; - for (i, v) in pix.iter_mut().enumerate() { - // Mix in-range, HDR, and negative values to exercise every clamp branch. - *v = match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }; - } - - // u8 RGB. - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - rgbf32_to_rgb_row::(&pix, &mut rgb_false, w, true); - rgbf32_to_rgb_row::(&pix, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges"); - - // u8 RGBA. - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - rgbf32_to_rgba_row::(&pix, &mut rgba_false, w, true); - rgbf32_to_rgba_row::(&pix, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges"); - - // u16 RGB. - let mut rgb_u16_false = std::vec![0u16; w * 3]; - let mut rgb_u16_host = std::vec![0u16; w * 3]; - rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_false, w, true); - rgbf32_to_rgb_u16_row::(&pix, &mut rgb_u16_host, w, true); - assert_eq!(rgb_u16_false, rgb_u16_host, "u16 RGB diverges"); - - // u16 RGBA. - let mut rgba_u16_false = std::vec![0u16; w * 4]; - let mut rgba_u16_host = std::vec![0u16; w * 4]; - rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_false, w, true); - rgbf32_to_rgba_u16_row::(&pix, &mut rgba_u16_host, w, true); - assert_eq!(rgba_u16_false, rgba_u16_host, "u16 RGBA diverges"); - - // f32 lossless pass-through. - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - rgbf32_to_rgb_f32_row::(&pix, &mut f32_false, w, true); - rgbf32_to_rgb_f32_row::(&pix, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 RGB diverges"); - // And on the host (LE on every CI runner) both must equal `pix` bit-exact. 
- if !HOST_NATIVE_BE { - assert_eq!( - f32_host, pix, - "f32 lossless pass-through corrupted on LE host" - ); - } -} - -/// End-to-end sinker contract test: feeding host-native `f32` through -/// [`MixedSinker`] must produce the same output every other sinker -/// would expect from a host-native source — specifically, `with_rgb_f32` -/// must be bit-exact identical to the input on every host. Documents the -/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. -/// Pairs with the kernel-level test above; together they cover both the -/// dispatch boundary and the public sinker boundary. +/// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. +/// +/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn rgbf32_sinker_host_native_contract_lossless_passthrough() { +fn rgbf32_sinker_le_encoded_frame_decodes_correctly() { // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. - let mut pix = std::vec![0.0f32; 16 * 4 * 3]; - for (i, v) in pix.iter_mut().enumerate() { - *v = match i % 4 { + let intended: Vec = (0..16 * 4 * 3) + .map(|i| match i % 4 { 0 => 0.5, 1 => 1.5, 2 => -0.25, _ => 100.0, - }; - } + }) + .collect(); + // Construct the plane as LE-encoded bytes reinterpreted as f32 (the + // documented `*LE` Frame contract). On LE host this is identity; on BE + // host the bit-pattern is byte-swapped so the kernel must `from_le` it + // back to host-native. + let pix: Vec = intended + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); let src = Rgbf32Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); let mut rgb_f32_out = std::vec![0.0f32; 16 * 4 * 3]; let mut sink = MixedSinker::::new(16, 4) + .with_simd(false) .with_rgb_f32(&mut rgb_f32_out) .unwrap(); rgbf32_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); - // Bit-exact pass-through on every host. On the buggy `::` routing - // a BE host would see byte-swapped output here; on the fixed routing the - // assertion holds on both LE and BE. - assert_eq!(rgb_f32_out, pix, "Rgbf32 sinker f32 pass-through corrupted"); + // Output must be host-native intended values. On a BE host with a + // regressed `::` routing this would be byte-swapped. + assert_eq!( + rgb_f32_out, intended, + "Rgbf32 sinker LE-encoded plane decoded incorrectly" + ); } diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index e6420e59..a8132480 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -862,291 +862,176 @@ fn gbrapf32_rgba_f16_strategy_a_plus_matches_independent_kernel() { ); } -// ---- HOST_NATIVE_BE routing parity (codex PR #84 Findings 1-3) ------------- +// ---- LE-encoded byte contract regressions (post-#83/#84/#85 audit) -------- // -// LE-host routing-equivalence and host-native sinker-contract tests for the -// `Gbrpf32` / `Gbrapf32` / `Gbrpf16` / `Gbrapf16` sinkers. Mirrors the -// `Rgbf32` / `Rgbf16` sinker tests added for PR #83's `dcf40a3` (sinker -// HOST_NATIVE_BE routing) and `c3a6478` (dispatch f16-widen HOST_NATIVE_BE -// routing). 
+// Each of the four float planar GBR Frame types is documented as +// LE-encoded bytes reinterpreted as `f32` / `half::f16` (FFmpeg `*LE` +// pixel-format convention). The sinker row-kernel dispatch must apply +// `u32::from_le` / `u16::from_le` (kernel `BE = false`) to recover host- +// native arithmetic from those bytes. These tests build a plane explicitly +// from LE-encoded bit patterns (`f32::from_bits(intended.to_bits().to_le())` +// and the f16 analogue) and assert the lossless pass-through output equals +// the host-native intended values. // -// On a LE host `HOST_NATIVE_BE = false`, so the kernel-level test below is -// a routing sanity check (proving the dispatcher / sinker substitute the -// correct `BE` template parameter); BE-host correctness of the routing is -// verified by the existing row-kernel BE parity tests in -// `src/row/arch/*/tests/` and by the contract tests below (which assert -// host-native pass-through end-to-end on every host). - -/// Kernel-level test: on a LE host, `gbrpf32_to_*::` and -/// `gbrpf32_to_*::` must produce byte-identical output for -/// every Tier 10 float planar GBR dispatcher across every output type -/// (u8 RGB / u8 RGBA / u16 RGB / u16 RGBA / f32 lossless). Width 33 covers -/// SIMD main loop + scalar tail across every backend; width 5 covers tail- -/// only paths. -/// -/// **LE-host-only**: gated on `target_endian = "little"`. On a BE host -/// `::` decodes the host-native fixture as LE-encoded (byte-swap) -/// while `:: == ::` decodes as BE (no swap), so the -/// outputs diverge by design. This sinker-routing-equivalence claim is -/// specifically about the LE host pattern; BE-host correctness of the -/// routing change is verified by the contract tests below and the row- -/// kernel BE parity tests in `src/row/arch/*/tests/`. -#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf32_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - gbrpf32_to_rgb_f32_row, gbrpf32_to_rgb_row, gbrpf32_to_rgb_u16_row, gbrpf32_to_rgba_row, - gbrpf32_to_rgba_u16_row, - }; - - // Sinker-layer `HOST_NATIVE_BE` mirrors `cfg!(target_endian = "big")`; on - // the LE-host gate this evaluates to `false`. - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - // Width 33: SIMD main loop + scalar tail. Width 5: tail-only path. Run both - // to cover SIMD-tail-aware backends. 
- for w in [5usize, 7usize, 33usize] { - let mut gp = std::vec![0.0f32; w]; - let mut bp = std::vec![0.0f32; w]; - let mut rp = std::vec![0.0f32; w]; - for (i, (g, (b, r))) in gp - .iter_mut() - .zip(bp.iter_mut().zip(rp.iter_mut())) - .enumerate() - { - *g = match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }; - *b = match i % 5 { - 0 => 0.25, - 1 => 0.75, - 2 => 1.5, - 3 => 0.0, - _ => -0.5, - }; - *r = match i % 5 { - 0 => 1.0, - 1 => 0.5, - 2 => 0.0, - 3 => -0.25, - _ => 1.25, - }; - } - - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - gbrpf32_to_rgb_row::(&gp, &bp, &rp, &mut rgb_false, w, true); - gbrpf32_to_rgb_row::(&gp, &bp, &rp, &mut rgb_host, w, true); - assert_eq!(rgb_false, rgb_host, "u8 RGB diverges (w = {w})"); - - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - gbrpf32_to_rgba_row::(&gp, &bp, &rp, &mut rgba_false, w, true); - gbrpf32_to_rgba_row::(&gp, &bp, &rp, &mut rgba_host, w, true); - assert_eq!(rgba_false, rgba_host, "u8 RGBA diverges (w = {w})"); - - let mut u16_false = std::vec![0u16; w * 3]; - let mut u16_host = std::vec![0u16; w * 3]; - gbrpf32_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_false, w, true); - gbrpf32_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_host, w, true); - assert_eq!(u16_false, u16_host, "u16 RGB diverges (w = {w})"); - - let mut u16a_false = std::vec![0u16; w * 4]; - let mut u16a_host = std::vec![0u16; w * 4]; - gbrpf32_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_false, w, true); - gbrpf32_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_host, w, true); - assert_eq!(u16a_false, u16a_host, "u16 RGBA diverges (w = {w})"); - - let mut f32_false = std::vec![0.0f32; w * 3]; - let mut f32_host = std::vec![0.0f32; w * 3]; - gbrpf32_to_rgb_f32_row::(&gp, &bp, &rp, &mut f32_false, w, true); - gbrpf32_to_rgb_f32_row::(&gp, &bp, &rp, &mut f32_host, w, true); - assert_eq!(f32_false, f32_host, "f32 RGB diverges (w = {w})"); - } -} +// Vacuous on LE host (where `to_le` is identity so the LE-encoded plane is +// host-native already), but on a BE host any regression that drops the +// `::` routing would be caught here — kernel without `from_le` would +// emit byte-swapped bit-patterns, failing the bit-exact assertion below. +// +// Mirrors the `Grayf32` regression added in PR #85's `52f8191`. -/// Sinker contract test: feeding host-native `f32` planes through -/// [`MixedSinker`] must produce the same output every other sinker -/// would expect from a host-native source — specifically, `with_rgb_f32` -/// must be bit-exact identical to the source on every host. Documents the -/// public-API contract that the [`HOST_NATIVE_BE`] routing fix preserves. -/// Pairs with the kernel-level test above; together they cover both the -/// dispatch boundary and the public sinker boundary. +/// LE-encoded byte contract regression for [`Gbrpf32`]. +/// +/// Forces `with_simd(false)` so the test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri (per the +/// codex 4th-pass finding) would skip exactly the host where BE corruption +/// would surface. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf32_sinker_host_native_contract_lossless_passthrough() { +fn gbrpf32_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; // Mix HDR, in-range, and negative values — the f32 lossless path must // round-trip them bit-exact on every host. - let mut gp = std::vec![0.0f32; w * h]; - let mut bp = std::vec![0.0f32; w * h]; - let mut rp = std::vec![0.0f32; w * h]; - for (i, (g, (b, r))) in gp - .iter_mut() - .zip(bp.iter_mut().zip(rp.iter_mut())) - .enumerate() - { - *g = match i % 4 { + let intended_g: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 0.5, 1 => 1.5, 2 => -0.25, _ => 100.0, - }; - *b = match i % 4 { + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 0.0, 1 => 0.25, 2 => 1.0, _ => f32::INFINITY, - }; - *r = match i % 4 { + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| match i % 4 { 0 => 1.0, 1 => -1.0, 2 => 65505.0, _ => 0.5, - }; - } + }) + .collect(); + // LE-encode each plane (per the documented `*LE` Frame contract). + let gp: Vec = intended_g + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); + let bp: Vec = intended_b + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); + let rp: Vec = intended_r + .iter() + .map(|&v| f32::from_bits(v.to_bits().to_le())) + .collect(); let src = Gbrpf32Frame::try_new( &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, ) .unwrap(); - // rgb_f32 lossless: each pixel `(R, G, B)` packed in source plane order. let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f32(&mut rgb_f32) .unwrap(); gbrpf32_to(&src, &mut sink).unwrap(); - // The lossless scatter writes `(R, G, B)` per pixel in plane-index order. - // Bit-exact equality on every host. Buggy `::` routing on a BE host - // would byte-swap the output here; the fix keeps it bit-exact. for i in 0..(w * h) { - assert_eq!(rgb_f32[i * 3], rp[i], "R mismatch at idx {i}"); - assert_eq!(rgb_f32[i * 3 + 1], gp[i], "G mismatch at idx {i}"); - assert_eq!(rgb_f32[i * 3 + 2], bp[i], "B mismatch at idx {i}"); + assert_eq!( + rgb_f32[i * 3].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgb_f32[i * 3 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgb_f32[i * 3 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); } } -/// Same as [`gbrpf32_kernel_host_native_be_matches_false_on_le_host`] but -/// for the `Gbrpf16` family — covers both `use_simd = false` (dispatch's -/// scalar widen-fallback) and `use_simd = true` (SIMD widen path) at tail -/// widths 5, 7, 33 to exercise every backend's main loop + scalar tail. +/// LE-encoded byte contract regression for [`Gbrapf32`] (lossless RGBA +/// pass-through, including the α plane). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
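+///
+/// For orientation, a per-element sketch of the decode that the `BE = false`
+/// routing asks of the row kernels (`le_value` stands for a hypothetical
+/// LE-encoded `f32` element, not a helper exported by this crate):
+///
+/// ```ignore
+/// // LE-encoded f32 bits → host-native f32; no-op on LE, byte swap on BE.
+/// let host = f32::from_bits(u32::from_le(le_value.to_bits()));
+/// ```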
#[test] -#[cfg(target_endian = "little")] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf16_kernel_host_native_be_matches_false_on_le_host() { - use crate::row::{ - gbrpf16_to_rgb_row, gbrpf16_to_rgb_u16_row, gbrpf16_to_rgba_row, gbrpf16_to_rgba_u16_row, +fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h).map(|i| 0.1 + (i as f32) * 0.001).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.2 + (i as f32) * 0.002).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.3 + (i as f32) * 0.003).collect(); + let intended_a: Vec = (0..w * h).map(|i| 0.5 + (i as f32) * 0.0005).collect(); + + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); - const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); - - for w in [5usize, 7usize, 33usize] { - let gp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 0.0, - 1 => 0.5, - 2 => 1.0, - 3 => 1.75, - _ => -0.25, - }) - }) - .collect(); - let bp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 0.25, - 1 => 0.75, - 2 => 1.5, - 3 => 0.0, - _ => -0.5, - }) - }) - .collect(); - let rp: Vec = (0..w) - .map(|i| { - half::f16::from_f32(match i % 5 { - 0 => 1.0, - 1 => 0.5, - 2 => 0.0, - 3 => -0.25, - _ => 1.25, - }) - }) - .collect(); - - // Both `use_simd = false` and `use_simd = true` to cover dispatch's - // scalar widen-fallback and the SIMD widen path on every backend. - for use_simd in [false, true] { - let mut rgb_false = std::vec![0u8; w * 3]; - let mut rgb_host = std::vec![0u8; w * 3]; - gbrpf16_to_rgb_row::(&gp, &bp, &rp, &mut rgb_false, w, use_simd); - gbrpf16_to_rgb_row::(&gp, &bp, &rp, &mut rgb_host, w, use_simd); - assert_eq!( - rgb_false, rgb_host, - "u8 RGB diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut rgba_false = std::vec![0u8; w * 4]; - let mut rgba_host = std::vec![0u8; w * 4]; - gbrpf16_to_rgba_row::(&gp, &bp, &rp, &mut rgba_false, w, use_simd); - gbrpf16_to_rgba_row::(&gp, &bp, &rp, &mut rgba_host, w, use_simd); - assert_eq!( - rgba_false, rgba_host, - "u8 RGBA diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut u16_false = std::vec![0u16; w * 3]; - let mut u16_host = std::vec![0u16; w * 3]; - gbrpf16_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_false, w, use_simd); - gbrpf16_to_rgb_u16_row::(&gp, &bp, &rp, &mut u16_host, w, use_simd); - assert_eq!( - u16_false, u16_host, - "u16 RGB diverges (w = {w}, use_simd = {use_simd})" - ); - - let mut u16a_false = std::vec![0u16; w * 4]; - let mut u16a_host = std::vec![0u16; w * 4]; - gbrpf16_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_false, w, use_simd); - gbrpf16_to_rgba_u16_row::(&gp, &bp, &rp, &mut u16a_host, w, use_simd); - assert_eq!( - u16a_false, u16a_host, - "u16 RGBA diverges (w = {w}, use_simd = {use_simd})" - ); - } + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgba_f32(&mut rgba_f32) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!( + rgba_f32[i * 4].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 1].to_bits(), + 
intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); + assert_eq!( + rgba_f32[i * 4 + 3].to_bits(), + intended_a[i].to_bits(), + "A idx {i}" + ); } } -/// Sinker contract: host-native `half::f16` source through [`MixedSinker`] -/// `with_rgb_f16` must round-trip the planes bit-exact on every host. The -/// `::` routing keeps the lossless interleave a no-op in the -/// BE-load layer; the buggy `::` routing on a BE host would byte-swap -/// every f16 element. +/// LE-encoded byte contract regression for [`Gbrpf16`]. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. #[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { +fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let gp: Vec = (0..w * h) + let intended_g: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 0.5, @@ -1156,7 +1041,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); - let bp: Vec = (0..w * h) + let intended_b: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 0.0, @@ -1166,7 +1051,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); - let rp: Vec = (0..w * h) + let intended_r: Vec = (0..w * h) .map(|i| { half::f16::from_f32(match i % 4 { 0 => 1.0, @@ -1176,6 +1061,14 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { }) }) .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); let src = Gbrpf16Frame::try_new( &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, @@ -1184,6 +1077,7 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { let mut rgb_f16 = std::vec![half::f16::ZERO; w * h * 3]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgb_f16(&mut rgb_f16) .unwrap(); gbrpf16_to(&src, &mut sink).unwrap(); @@ -1191,46 +1085,53 @@ fn gbrpf16_sinker_host_native_contract_lossless_passthrough() { for i in 0..(w * h) { assert_eq!( rgb_f16[i * 3].to_bits(), - rp[i].to_bits(), - "R mismatch at idx {i}" + intended_r[i].to_bits(), + "R idx {i}" ); assert_eq!( rgb_f16[i * 3 + 1].to_bits(), - gp[i].to_bits(), - "G mismatch at idx {i}" + intended_g[i].to_bits(), + "G idx {i}" ); assert_eq!( rgb_f16[i * 3 + 2].to_bits(), - bp[i].to_bits(), - "B mismatch at idx {i}" + intended_b[i].to_bits(), + "B idx {i}" ); } } -/// Sinker contract: [`MixedSinker`] `with_rgba_f16` must round-trip -/// the source α plane bit-exact alongside the G/B/R planes, on every host. -/// Validates Strategy A+ alpha consistency under the `HOST_NATIVE_BE` -/// routing — the previous mix-mode (LE-decoded RGB + host-native α) is gone. +/// LE-encoded byte contract regression for [`Gbrapf16`] (lossless RGBA +/// pass-through, including the α plane). +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha() { +fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let gp: Vec = (0..w * h) + let intended_g: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.1 + (i as f32) * 0.001)) .collect(); - let bp: Vec = (0..w * h) + let intended_b: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.2 + (i as f32) * 0.002)) .collect(); - let rp: Vec = (0..w * h) + let intended_r: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.3 + (i as f32) * 0.003)) .collect(); - let ap: Vec = (0..w * h) + let intended_a: Vec = (0..w * h) .map(|i| half::f16::from_f32(0.5 + (i as f32) * 0.001)) .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); let src = Gbrapf16Frame::try_new( &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, @@ -1239,57 +1140,543 @@ fn gbrapf16_sinker_host_native_contract_lossless_passthrough_with_alpha() { let mut rgba_f16 = std::vec![half::f16::ZERO; w * h * 4]; let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f16(&mut rgba_f16) .unwrap(); gbrapf16_to(&src, &mut sink).unwrap(); for i in 0..(w * h) { - assert_eq!(rgba_f16[i * 4].to_bits(), rp[i].to_bits(), "R idx {i}"); - assert_eq!(rgba_f16[i * 4 + 1].to_bits(), gp[i].to_bits(), "G idx {i}"); - assert_eq!(rgba_f16[i * 4 + 2].to_bits(), bp[i].to_bits(), "B idx {i}"); - assert_eq!(rgba_f16[i * 4 + 3].to_bits(), ap[i].to_bits(), "A idx {i}"); + assert_eq!( + rgba_f16[i * 4].to_bits(), + intended_r[i].to_bits(), + "R idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 1].to_bits(), + intended_g[i].to_bits(), + "G idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 2].to_bits(), + intended_b[i].to_bits(), + "B idx {i}" + ); + assert_eq!( + rgba_f16[i * 4 + 3].to_bits(), + intended_a[i].to_bits(), + "A idx {i}" + ); } } -/// Sinker contract: [`MixedSinker`] `with_rgba_f32` lossless -/// pass-through plus α — confirms Strategy A+ alpha consistency when the -/// f32 RGB chain routes via `HOST_NATIVE_BE`. The α plane is host-native -/// f32, also routed via `HOST_NATIVE_BE`, eliminating any mix-mode. +/// LE-encoded byte contract regression for [`Gbrpf16`] **widening path** +/// (`with_rgb_f32`). Exercises the f16 → f32 widen step in the sinker — which +/// must bit-normalise LE-encoded f16 plane bits before converting to f32. +/// +/// Vacuous on LE hosts (where `to_le` is identity); on a BE host any +/// regression that drops the bit-normalize-first step in +/// `widen_f16_be_to_host_f32::` would interpret byte-swapped bits as +/// host-native f16 and decode to wildly wrong f32 values. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
#[test] -#[cfg_attr( - miri, - ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" -)] -fn gbrapf32_sinker_host_native_contract_lossless_passthrough_with_alpha() { +fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; - let mut gp = std::vec![0.0f32; w * h]; - let mut bp = std::vec![0.0f32; w * h]; - let mut rp = std::vec![0.0f32; w * h]; - let mut ap = std::vec![0.0f32; w * h]; + let intended_g: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.5, + 1 => 0.25, + 2 => 0.0, + _ => 1.0, + }) + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.125, + 1 => 0.75, + 2 => 0.0625, + _ => 0.875, + }) + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.375, + 1 => 0.625, + 2 => 0.9375, + _ => 0.03125, + }) + }) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + + let src = Gbrpf16Frame::try_new( + &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + let mut rgb_f32 = std::vec![0.0f32; w * h * 3]; + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgb_f32(&mut rgb_f32) + .unwrap(); + gbrpf16_to(&src, &mut sink).unwrap(); + for i in 0..(w * h) { - gp[i] = 0.1 + (i as f32) * 0.001; - bp[i] = 0.2 + (i as f32) * 0.002; - rp[i] = 0.3 + (i as f32) * 0.003; - ap[i] = 0.5 + (i as f32) * 0.0005; + assert_eq!(rgb_f32[i * 3], intended_r[i].to_f32(), "R idx {i}"); + assert_eq!(rgb_f32[i * 3 + 1], intended_g[i].to_f32(), "G idx {i}"); + assert_eq!(rgb_f32[i * 3 + 2], intended_b[i].to_f32(), "B idx {i}"); } +} - let src = Gbrapf32Frame::try_new( +/// LE-encoded byte contract regression for [`Gbrapf16`] **widening path** +/// (`with_rgba_f32`, including the α plane). Exercises the four-plane f16 → +/// f32 widen step — same bit-normalise-first contract as the no-α variant. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. 
+#[test] +fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.1 + (i as f32) * 0.001)) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.2 + (i as f32) * 0.002)) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.3 + (i as f32) * 0.003)) + .collect(); + let intended_a: Vec = (0..w * h) + .map(|i| half::f16::from_f32(0.5 + (i as f32) * 0.001)) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + let ap = le_f16(&intended_a); + + let src = Gbrapf16Frame::try_new( &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, ) .unwrap(); let mut rgba_f32 = std::vec![0.0f32; w * h * 4]; - let mut sink = MixedSinker::::new(w, h) + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) .with_rgba_f32(&mut rgba_f32) .unwrap(); - gbrapf32_to(&src, &mut sink).unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba_f32[i * 4], intended_r[i].to_f32(), "R idx {i}"); + assert_eq!(rgba_f32[i * 4 + 1], intended_g[i].to_f32(), "G idx {i}"); + assert_eq!(rgba_f32[i * 4 + 2], intended_b[i].to_f32(), "B idx {i}"); + assert_eq!(rgba_f32[i * 4 + 3], intended_a[i].to_f32(), "A idx {i}"); + } +} + +/// LE-encoded byte contract regression for [`Gbrpf16`] **widening → narrow +/// chain** (`with_rgb_u16` and `with_rgba`). Covers the post-widen routing +/// where `gbrpf32_to_rgb_u16_row` / `gbrpf32_to_rgba_u16_row` / +/// `gbrpf32_to_rgb_row` are invoked on **host-native f32 scratch** produced +/// by `widen_f16_be_to_host_f32::`. +/// +/// On a BE host this would have been corrupted under the prior +/// `gbrpf32_to_*::` post-widen routing — that kernel applied +/// `from_le` to scratch that was already host-native, byte-swapping the +/// f32 representation before scaling. Fixed by routing post-widen calls +/// through `::` (`true` on BE, `false` on LE), which makes +/// the kernel byte-swap a no-op on every host. Vacuous on LE; would catch +/// the double-swap on BE. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. +#[test] +fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { + let w = 16usize; + let h = 4usize; + let intended_g: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.5, + 1 => 0.25, + 2 => 0.0, + _ => 1.0, + }) + }) + .collect(); + let intended_b: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.125, + 1 => 0.75, + 2 => 0.0625, + _ => 0.875, + }) + }) + .collect(); + let intended_r: Vec = (0..w * h) + .map(|i| { + half::f16::from_f32(match i % 4 { + 0 => 0.375, + 1 => 0.625, + 2 => 0.9375, + _ => 0.03125, + }) + }) + .collect(); + let le_f16 = |v: &Vec| -> Vec { + v.iter() + .map(|&x| half::f16::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le_f16(&intended_g); + let bp = le_f16(&intended_b); + let rp = le_f16(&intended_r); + + let src = Gbrpf16Frame::try_new( + &gp, &bp, &rp, w as u32, h as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + // Exercise the u16 narrow path (post-widen → gbrpf32_to_rgb_u16_row). 
+    let mut rgb_u16 = std::vec![0u16; w * h * 3];
+    // The u8 `with_rgba` path would not exercise the post-widen scratch here
+    // (Strategy A emits opaque α directly), so instead trigger a second
+    // post-widen consumer by attaching `with_luma_u16` alongside the u16 output.
+    let mut luma_u16 = std::vec![0u16; w * h];
+    {
+        let mut sink = MixedSinker::::new(w, h)
+            .with_simd(false)
+            .with_rgb_u16(&mut rgb_u16)
+            .unwrap()
+            .with_luma_u16(&mut luma_u16)
+            .unwrap();
+        gbrpf16_to(&src, &mut sink).unwrap();
+    }
+
+    // Assert RGB u16 output matches the intended (clamp+scale × 65535) values.
+    let to_u16 = |v: f32| -> u16 { (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16 };
     for i in 0..(w * h) {
-        assert_eq!(rgba_f32[i * 4], rp[i], "R idx {i}");
-        assert_eq!(rgba_f32[i * 4 + 1], gp[i], "G idx {i}");
-        assert_eq!(rgba_f32[i * 4 + 2], bp[i], "B idx {i}");
-        assert_eq!(rgba_f32[i * 4 + 3], ap[i], "A idx {i}");
+        assert_eq!(
+            rgb_u16[i * 3],
+            to_u16(intended_r[i].to_f32()),
+            "RGB u16 R idx {i}"
+        );
+        assert_eq!(
+            rgb_u16[i * 3 + 1],
+            to_u16(intended_g[i].to_f32()),
+            "RGB u16 G idx {i}"
+        );
+        assert_eq!(
+            rgb_u16[i * 3 + 2],
+            to_u16(intended_b[i].to_f32()),
+            "RGB u16 B idx {i}"
+        );
+    }
+    // Sanity: luma_u16 (post-widen narrow) is non-zero — locks down that
+    // the post-widen luma kernel also sees host-native f32 scratch.
+    assert!(
+        luma_u16.iter().any(|&v| v > 0),
+        "luma_u16 must contain non-zero samples — \
+         a corrupted byte-swap would still emit non-zero output but the rgb_u16 \
+         assertion above is the primary guard"
+    );
+}
+
+// ---- LE-encoded Strategy A+ alpha-patch regressions (codex 3rd-pass) ------
+//
+// The `copy_alpha_plane_f32_to_u8` (and `copy_alpha_plane_f32_to_u16`,
+// `copy_alpha_plane_f32`) helper used to read each f32 α sample as
+// host-native, which silently corrupted the α slot on BE hosts processing
+// the LE-encoded `Gbrapf32Frame` α plane (the byte-swapped bits clamp to
+// near-zero or near-one, producing α = 0 or 255 regardless of intent).
+// Same bug class as the u16 alpha-patch helpers fixed in cf26058.
+//
+// These regressions trigger the **Strategy A+ combo path** (`with_rgb` +
+// `with_rgba`, `with_rgb_u16` + `with_rgba_u16`) on a Frame whose α plane
+// is built from explicit LE-encoded f32 bit-patterns. On a LE host the
+// `to_le` on f32 bits is identity so the test reduces to the original
+// semantics; on a BE host the kernel without `from_le` would clamp
+// byte-swapped garbage and the assertion would fail. The non-multiple-of-
+// SIMD widths (15, 17) exercise scalar-tail correctness in addition to
+// any vectorized body.
+
+/// Codex 3rd-pass regression: Gbrapf32 Strategy A+ (`with_rgb` + `with_rgba`)
+/// on a LE-encoded f32 α plane must reproduce standalone `with_rgba` output
+/// byte-for-byte. The standalone path uses `gbrapf32_to_rgba_row::<false>`
+/// (already endian-aware), so any deviation indicates the Strategy A+
+/// alpha-patch path corrupted the α plane.
+///
+/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host
+/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly`
+/// docstring for the rationale.
+#[test]
+fn gbrapf32_strategy_a_plus_le_encoded_frame_alpha_decodes_correctly() {
+    // 15 is non-multiple-of-{4,8,16} — exercises scalar tail in every backend.
+ let w = 15usize; + let h = 3usize; + let intended_g: Vec = (0..w * h).map(|i| 0.10 + (i as f32) * 0.001).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.20 + (i as f32) * 0.002).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.30 + (i as f32) * 0.003).collect(); + // Deliberately mix in-range, boundary, > 1, and negative α to stress + // clamp/scale correctness *after* the bit-normalize step. + let intended_a: Vec = (0..w * h) + .map(|i| match i % 7 { + 0 => 0.0, + 1 => 0.5, + 2 => 1.0, + 3 => 1.5, + 4 => -0.1, + 5 => 0.123, + _ => 0.876, + }) + .collect(); + + // LE-encode every plane (per the documented `*LE` Frame contract). + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); + + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba`. + let mut rgba_ref = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgba(&mut rgba_ref) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + // Strategy A+: `with_rgb` + `with_rgba` combo (alpha-patch path). + let mut rgb_combo = std::vec![0u8; w * h * 3]; + let mut rgba_combo = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgb(&mut rgb_combo) + .unwrap() + .with_rgba(&mut rgba_combo) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf32 Strategy A+ alpha-patch must equal standalone `with_rgba`" + ); +} + +/// Codex 3rd-pass regression: Gbrapf32 Strategy A+ (`with_rgb_u16` + +/// `with_rgba_u16`) on a LE-encoded f32 α plane. Defense-in-depth: the +/// current sinker calls `gbrapf32_to_rgba_u16_row::` directly here +/// (no alpha-patch helper invocation), but any future routing change that +/// switches to the alpha-patch helper must keep BE-host correctness. +/// +/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host +/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` +/// docstring for the rationale. +#[test] +fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { + // 17 is non-multiple-of-{4,8,16}. + let w = 17usize; + let h = 3usize; + let intended_g: Vec = (0..w * h).map(|i| 0.11 + (i as f32) * 0.0011).collect(); + let intended_b: Vec = (0..w * h).map(|i| 0.22 + (i as f32) * 0.0022).collect(); + let intended_r: Vec = (0..w * h).map(|i| 0.33 + (i as f32) * 0.0033).collect(); + let intended_a: Vec = (0..w * h) + .map(|i| match i % 5 { + 0 => 0.0, + 1 => 0.25, + 2 => 1.0, + 3 => 0.5, + _ => 0.75, + }) + .collect(); + + let le = |v: &Vec| -> Vec { + v.iter() + .map(|&x| f32::from_bits(x.to_bits().to_le())) + .collect() + }; + let gp = le(&intended_g); + let bp = le(&intended_b); + let rp = le(&intended_r); + let ap = le(&intended_a); + + let src = Gbrapf32Frame::try_new( + &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32, + ) + .unwrap(); + + // Reference: standalone `with_rgba_u16`. + let mut rgba_ref = std::vec![0u16; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgba_u16(&mut rgba_ref) + .unwrap(); + gbrapf32_to(&src, &mut sink).unwrap(); + } + + // Combo: `with_rgb_u16` + `with_rgba_u16`. 
+    let mut rgb_combo = std::vec![0u16; w * h * 3];
+    let mut rgba_combo = std::vec![0u16; w * h * 4];
+    {
+        let mut sink = MixedSinker::::new(w, h)
+            .with_simd(false)
+            .with_rgb_u16(&mut rgb_combo)
+            .unwrap()
+            .with_rgba_u16(&mut rgba_combo)
+            .unwrap();
+        gbrapf32_to(&src, &mut sink).unwrap();
+    }
+
+    assert_eq!(
+        rgba_combo, rgba_ref,
+        "Gbrapf32 Strategy A+ rgba_u16 must equal standalone `with_rgba_u16`"
+    );
+
+    // Independently assert the α slot reflects the intended values
+    // (clamp × 65535 + 0.5). This catches a hypothetical regression where
+    // both code paths share the same bug.
+    let to_u16 = |v: f32| -> u16 { (v.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16 };
+    for i in 0..(w * h) {
+        assert_eq!(
+            rgba_combo[i * 4 + 3],
+            to_u16(intended_a[i]),
+            "α slot idx {i}"
+        );
+    }
+}
+
+/// Codex 3rd-pass regression: Gbrapf16 Strategy A+ (`with_rgb` + `with_rgba`)
+/// on a LE-encoded f16 α plane. This exercises the **post-widen** routing
+/// pattern in `widen_and_scatter_f16_alpha_to_u8`: the f16 α plane is
+/// widened to host-native f32 scratch via `widen_f16_be_to_host_f32::<false>`,
+/// then the alpha-patch helper must consume that scratch with
+/// `BE = HOST_NATIVE_BE` (no double byte-swap). The test compares the
+/// Strategy A+ combo output against the standalone `with_rgba` path, which
+/// uses the `gbrpf16_to_rgba_row::<false>` direct kernel + the same
+/// `widen_and_scatter_f16_alpha_to_u8` helper (both paths share the
+/// `widen_and_scatter` helper, so this test guards against the
+/// post-widen routing flag being wrong).
+///
+/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host
+/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly`
+/// docstring for the rationale.
+#[test]
+fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() {
+    let w = 15usize;
+    let h = 3usize;
+    let intended_g: Vec<half::f16> = (0..w * h)
+        .map(|i| half::f16::from_f32(0.10 + (i as f32) * 0.001))
+        .collect();
+    let intended_b: Vec<half::f16> = (0..w * h)
+        .map(|i| half::f16::from_f32(0.20 + (i as f32) * 0.002))
+        .collect();
+    let intended_r: Vec<half::f16> = (0..w * h)
+        .map(|i| half::f16::from_f32(0.30 + (i as f32) * 0.003))
+        .collect();
+    let intended_a: Vec<half::f16> = (0..w * h)
+        .map(|i| {
+            half::f16::from_f32(match i % 5 {
+                0 => 0.0,
+                1 => 0.5,
+                2 => 1.0,
+                3 => 0.25,
+                _ => 0.75,
+            })
+        })
+        .collect();
+
+    let le_f16 = |v: &Vec<half::f16>| -> Vec<half::f16> {
+        v.iter()
+            .map(|&x| half::f16::from_bits(x.to_bits().to_le()))
+            .collect()
+    };
+    let gp = le_f16(&intended_g);
+    let bp = le_f16(&intended_b);
+    let rp = le_f16(&intended_r);
+    let ap = le_f16(&intended_a);
+
+    let src = Gbrapf16Frame::try_new(
+        &gp, &bp, &rp, &ap, w as u32, h as u32, w as u32, w as u32, w as u32, w as u32,
+    )
+    .unwrap();
+
+    // Reference: standalone `with_rgba` (uses `gbrpf16_to_rgba_row` then
+    // `widen_and_scatter_f16_alpha_to_u8` → exercises the post-widen helper
+    // as well, with the same routing as the combo path).
+    let mut rgba_ref = std::vec![0u8; w * h * 4];
+    {
+        let mut sink = MixedSinker::::new(w, h)
+            .with_simd(false)
+            .with_rgba(&mut rgba_ref)
+            .unwrap();
+        gbrapf16_to(&src, &mut sink).unwrap();
+    }
+
+    // Strategy A+: `with_rgb` + `with_rgba`.
+ let mut rgb_combo = std::vec![0u8; w * h * 3]; + let mut rgba_combo = std::vec![0u8; w * h * 4]; + { + let mut sink = MixedSinker::::new(w, h) + .with_simd(false) + .with_rgb(&mut rgb_combo) + .unwrap() + .with_rgba(&mut rgba_combo) + .unwrap(); + gbrapf16_to(&src, &mut sink).unwrap(); + } + + assert_eq!( + rgba_combo, rgba_ref, + "Gbrapf16 Strategy A+ post-widen alpha-patch must equal standalone `with_rgba`" + ); + + // Independently assert α slot reflects the intended values + // (widen → clamp × 255 + 0.5). + let to_u8 = |v: f32| -> u8 { (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8 }; + for i in 0..(w * h) { + assert_eq!( + rgba_combo[i * 4 + 3], + to_u8(intended_a[i].to_f32()), + "α slot idx {i}" + ); } }
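+
+// ---- Reviewer note (illustrative only, not crate API) ---------------------
+//
+// A minimal scalar model of the contract the regressions above pin down,
+// assuming only `half` and core. The helper names below are invented for
+// this sketch and are NOT the crate's kernels — `widen_f16_be_to_host_f32`
+// and the `gbrpf32_to_*` rows are SIMD-dispatched and take a routing const
+// generic. Marked `dead_code` so the sketch documents intent without
+// affecting builds.
+#[allow(dead_code)]
+fn le_encoded_f32_to_host(le_encoded: f32) -> f32 {
+    // The plane stores LE byte order; `from_le` swaps to host order on a
+    // BE host and is a no-op on a LE host. Arithmetic must only ever see
+    // the host-native value.
+    f32::from_bits(u32::from_le(le_encoded.to_bits()))
+}
+
+#[allow(dead_code)]
+fn le_encoded_f16_widen_to_host_f32(le_encoded: half::f16) -> f32 {
+    // Bit-normalize FIRST, then widen: widening byte-swapped f16 bits is
+    // exactly the "wildly wrong f32 values" failure the widen-path tests
+    // guard against.
+    half::f16::from_bits(u16::from_le(le_encoded.to_bits())).to_f32()
+}
+
+#[allow(dead_code)]
+fn alpha_f32_le_to_u8(le_encoded_alpha: f32) -> u8 {
+    // The alpha-patch bug class: clamping/scaling the raw (still LE-encoded)
+    // bits instead of the normalized value collapses α to 0 or 255 on a BE
+    // host, which is what the Strategy A+ combo regressions detect.
+    (le_encoded_f32_to_host(le_encoded_alpha).clamp(0.0, 1.0) * 255.0 + 0.5) as u8
+}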