21 changes: 20 additions & 1 deletion src/frame/packed_rgb_f16.rs
@@ -50,7 +50,7 @@ pub enum Rgbf16FrameError {
},
}

/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16`).
/// A validated packed **RGBF16** frame (FFmpeg `AV_PIX_FMT_RGBF16LE`).
/// One plane, 3 × `f16` per pixel, channel order `R, G, B`.
///
/// Values are **linear** RGB by convention — no gamma / OETF handling
@@ -65,6 +65,25 @@ pub enum Rgbf16FrameError {
/// `stride` is in **`f16` elements** (≥ `3 * width`), matching the
/// per-format convention that stride aligns with the underlying slice
/// element type. No width parity constraint.
///
/// # Endian contract — **LE-encoded bytes**
///
/// The `&[half::f16]` plane is the **LE-encoded byte layout** reinterpreted
/// as `f16`, matching the FFmpeg **`AV_PIX_FMT_RGBF16LE`** pixel-format
/// convention. (FFmpeg's unsuffixed `AV_PIX_FMT_RGBF16` is a *target-endian*
/// alias — `RGBF16LE` on a little-endian host, `RGBF16BE` on a big-endian
/// host — so this contract pins the canonical `*LE` byte order regardless
/// of host endianness.)
///
/// On a little-endian host (every CI runner today) LE bytes _are_
/// host-native, so `&[half::f16]` is also a host-native f16 slice; on a
/// big-endian host the bytes have to be byte-swapped back to host-native
/// before arithmetic. Downstream row kernels handle this byte-swap (or
/// no-op on LE) under the hood — callers do **not** pre-swap.
///
/// Stride is in **f16 elements** (not bytes). Callers holding a byte buffer
/// from FFmpeg should cast via `bytemuck::cast_slice` and divide
/// `linesize[0]` by 2 before constructing.
#[derive(Debug, Clone, Copy)]
pub struct Rgbf16Frame<'a> {
rgb: &'a [half::f16],
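
The caller-side cast the new docs prescribe is mechanical. A minimal sketch, assuming the `half` crate is built with its `bytemuck` feature (so `f16` is `Pod`); the helper name and the `data`/`linesize` parameters are illustrative, not crate API:

```rust
use half::f16;

/// Hypothetical glue: reinterpret an FFmpeg RGBF16LE byte plane as
/// `&[f16]` and convert the byte stride to f16 elements, per the
/// documented contract. `data` / `linesize` stand in for
/// `AVFrame.data[0]` / `AVFrame.linesize[0]`.
fn rgbf16_plane(data: &[u8], linesize: usize) -> (&[f16], usize) {
    // cast_slice checks length and alignment at runtime and panics on
    // a misaligned buffer; FFmpeg allocations comfortably exceed f16's
    // 2-byte alignment in practice.
    let plane: &[f16] = bytemuck::cast_slice(data);
    (plane, linesize / 2) // stride contract: f16 elements, not bytes
}
```
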
25 changes: 24 additions & 1 deletion src/frame/packed_rgb_float.rs
@@ -50,7 +50,7 @@ pub enum Rgbf32FrameError {
},
}

/// A validated packed **RGBF32** frame (FFmpeg `AV_PIX_FMT_RGBF32`).
/// A validated packed **RGBF32** frame.
/// One plane, 3 × `f32` per pixel, channel order `R, G, B`.
///
/// Values are **linear** RGB by convention — no gamma / OETF handling
@@ -64,6 +64,29 @@ pub enum Rgbf32FrameError {
/// `stride` is in **`f32` elements** (≥ `3 * width`), matching the
/// per-format convention that stride aligns with the underlying slice
/// element type. No width parity constraint.
///
/// # Endian contract — **LE-encoded bytes** (`AV_PIX_FMT_RGBF32LE`)
///
/// The `&[f32]` plane is the **LE-encoded byte layout** reinterpreted as
/// `f32`. This frame maps to FFmpeg `AV_PIX_FMT_RGBF32LE`. FFmpeg also
/// defines `AV_PIX_FMT_RGBF32BE` and an unsuffixed `AV_PIX_FMT_RGBF32`
/// alias that is **target-endian** (resolves to `RGBF32LE` on LE hosts and
/// `RGBF32BE` on BE hosts). **Callers on a BE host who hold target-endian
/// `AV_PIX_FMT_RGBF32` bytes must convert them to LE before constructing
/// this frame** — otherwise the LE-decode contract here would re-interpret
/// the BE bytes as LE and produce byte-swapped float data. The 4-channel
/// `AV_PIX_FMT_RGBAF32LE` / `AV_PIX_FMT_RGBAF32BE` pair follows the same
/// `*LE` convention; this frame uses the analogous LE binding.
///
/// On a little-endian host (every CI runner today) LE bytes _are_
/// host-native, so `&[f32]` is also a host-native float slice; on a
/// big-endian host the bytes have to be byte-swapped back to host-native
/// before arithmetic. Downstream row kernels handle this byte-swap (or
/// no-op on LE) under the hood — callers do **not** pre-swap.
///
/// Stride is in **f32 elements** (not bytes). Callers holding a byte buffer
/// from FFmpeg should cast via `bytemuck::cast_slice` and divide
/// `linesize[0]` by 4 before constructing.
#[derive(Debug, Clone, Copy)]
pub struct Rgbf32Frame<'a> {
rgb: &'a [f32],
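
The BE-host warning above is the one place a caller must act. A sketch of that pre-pass under the stated contract; `rgbf32_bytes_to_le` is a hypothetical helper, not part of this crate:

```rust
/// Hypothetical pre-pass for a big-endian host that decoded with the
/// unsuffixed, target-endian `AV_PIX_FMT_RGBF32` (which resolves to
/// RGBF32BE there): rewrite every 4-byte sample to the canonical LE
/// order this frame type expects. No-op on little-endian hosts, where
/// target-endian already *is* LE.
fn rgbf32_bytes_to_le(bytes: &mut [u8]) {
    if cfg!(target_endian = "big") {
        for sample in bytes.chunks_exact_mut(4) {
            sample.reverse(); // BE <-> LE byte order
        }
    }
}
```
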
56 changes: 56 additions & 0 deletions src/frame/planar_gbr_float.rs
@@ -147,6 +147,20 @@ const fn check_plane(
/// `f32` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are
/// preserved bit-exact on lossless pass-through outputs and clamped to
/// `[0.0, 1.0]` on integer-output paths.
///
/// # Endian contract — **LE-encoded bytes**
///
/// The three `&[f32]` planes are the **LE-encoded byte layout** reinterpreted
/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format
/// name. On a little-endian host (every CI runner today) LE bytes _are_
/// host-native, so the slices are also host-native float slices; on a
/// big-endian host the bytes have to be byte-swapped back to host-native
/// before arithmetic. Downstream row kernels handle this byte-swap (or
/// no-op on LE) under the hood — callers do **not** pre-swap.
///
/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers
/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each
/// `linesize[i]` by 4 before constructing.
#[derive(Debug, Clone, Copy)]
pub struct Gbrpf32Frame<'a> {
g: &'a [f32],
@@ -250,6 +264,20 @@ impl&lt;'a&gt; Gbrpf32Frame&lt;'a&gt; {
/// Four full-resolution `f32` planes in **G, B, R, A** order. Alpha is
/// real per-pixel; nominal range `[0.0, 1.0]` (opaque = 1.0). Stride is
/// in `f32` elements.
///
/// # Endian contract — **LE-encoded bytes**
///
/// The four `&[f32]` planes are the **LE-encoded byte layout** reinterpreted
/// as `f32`, matching the FFmpeg `*LE` pixel-format suffix in the format
/// name. On a little-endian host (every CI runner today) LE bytes _are_
/// host-native, so the slices are also host-native float slices; on a
/// big-endian host the bytes have to be byte-swapped back to host-native
/// before arithmetic. Downstream row kernels handle this byte-swap (or
/// no-op on LE) under the hood — callers do **not** pre-swap.
///
/// Stride is in **f32 elements** (not bytes). Callers holding byte buffers
/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each
/// `linesize[i]` by 4 before constructing.
#[derive(Debug, Clone, Copy)]
pub struct Gbrapf32Frame<'a> {
g: &'a [f32],
@@ -372,6 +400,20 @@ impl&lt;'a&gt; Gbrapf32Frame&lt;'a&gt; {
/// Three full-resolution [`half::f16`] planes in **G, B, R** order. Stride
/// is in `f16` elements. Nominal range `[0.0, 1.0]`; HDR values > 1.0 are
/// permitted (saturation to `+Inf` occurs on f16→f32 narrowing paths).
///
/// # Endian contract — **LE-encoded bytes**
///
/// The three `&[half::f16]` planes are the **LE-encoded byte layout**
/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in
/// the format name. On a little-endian host (every CI runner today) LE
/// bytes _are_ host-native, so the slices are also host-native f16 slices;
/// on a big-endian host the bytes have to be byte-swapped back to
/// host-native before arithmetic. Downstream row kernels handle this
/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap.
///
/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers
/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each
/// `linesize[i]` by 2 before constructing.
#[derive(Debug, Clone, Copy)]
pub struct Gbrpf16Frame<'a> {
g: &'a [half::f16],
@@ -475,6 +517,20 @@ impl&lt;'a&gt; Gbrpf16Frame&lt;'a&gt; {
/// Four full-resolution [`half::f16`] planes in **G, B, R, A** order.
/// Alpha is real per-pixel; nominal range `[0.0, 1.0]`. Stride is in
/// `f16` elements.
///
/// # Endian contract — **LE-encoded bytes**
///
/// The four `&[half::f16]` planes are the **LE-encoded byte layout**
/// reinterpreted as `f16`, matching the FFmpeg `*LE` pixel-format suffix in
/// the format name. On a little-endian host (every CI runner today) LE
/// bytes _are_ host-native, so the slices are also host-native f16 slices;
/// on a big-endian host the bytes have to be byte-swapped back to
/// host-native before arithmetic. Downstream row kernels handle this
/// byte-swap (or no-op on LE) under the hood — callers do **not** pre-swap.
///
/// Stride is in **f16 elements** (not bytes). Callers holding byte buffers
/// from FFmpeg should cast via `bytemuck::cast_slice` and divide each
/// `linesize[i]` by 2 before constructing.
#[derive(Debug, Clone, Copy)]
pub struct Gbrapf16Frame<'a> {
g: &'a [half::f16],
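
All four structs promise that downstream row kernels absorb the LE-to-host normalization. As a mental model only (the crate's actual kernels are not shown in this diff), the per-element decode looks like:

```rust
/// Sketch of the per-element decode a row kernel would perform on an
/// LE-encoded `f32` plane: take the stored bits, normalize LE -> host,
/// rebuild the float. `u32::from_le` is a no-op on little-endian hosts
/// and a byte swap on big-endian ones. (Round-tripping through `f32`
/// can quieten signaling-NaN payloads on some targets.)
#[inline]
fn load_le_f32(stored: f32) -> f32 {
    f32::from_bits(u32::from_le(stored.to_bits()))
}
```
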
8 changes: 4 additions & 4 deletions src/row/arch/neon/alpha_extract.rs
@@ -116,7 +116,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_to_u8_at_0(
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -154,7 +154,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_at_0(
scalar::copy_alpha_packed_u16x4_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -357,7 +357,7 @@ mod tests {
pseudo_random_u8(&mut rgba_simd, 0xFEED);
let mut rgba_scalar = rgba_simd.clone();
unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
@@ -375,7 +375,7 @@
pseudo_random_u16(&mut rgba_simd, 0x1337);
let mut rgba_scalar = rgba_simd.clone();
unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
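
The `::<false>` turbofish repeated on every SIMD tail call here and in the four arch files below suggests the scalar kernels gained a const-generic byte-swap flag in this PR. A guessed shape: the parameter name `SWAP`, the channel positions, and the body are assumptions, not taken from the diff:

```rust
/// Assumed post-PR signature: the scalar fallback is parameterized over
/// a const swap flag, and the SIMD entry points pass `false` because on
/// the little-endian hosts they target, stored LE samples are already
/// host-native.
pub(crate) fn copy_alpha_packed_u16x4_at_0<const SWAP: bool>(
    packed: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    for x in 0..width {
        let a = packed[x * 4]; // assumed: alpha in lane 0 ("at_0")
        let a = if SWAP { a.swap_bytes() } else { a };
        rgba_out[x * 4 + 3] = a; // assumed: RGBA output, alpha last
    }
}
```
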
8 changes: 4 additions & 4 deletions src/row/arch/wasm_simd128/alpha_extract.rs
@@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_to_u8_at_0(
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -226,7 +226,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_at_0(
scalar::copy_alpha_packed_u16x4_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -518,7 +518,7 @@ mod tests {
unsafe {
super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w);
}
scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
@@ -538,7 +538,7 @@
unsafe {
super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w);
}
scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
8 changes: 4 additions & 4 deletions src/row/arch/x86_avx2/alpha_extract.rs
@@ -213,7 +213,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_to_u8_at_0(
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_at_0(
scalar::copy_alpha_packed_u16x4_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -636,7 +636,7 @@ mod tests {
pseudo_random_u8(&mut rgba_simd, 0xFEED);
let mut rgba_scalar = rgba_simd.clone();
unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
@@ -657,7 +657,7 @@
pseudo_random_u16(&mut rgba_simd, 0x1337);
let mut rgba_scalar = rgba_simd.clone();
unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
8 changes: 4 additions & 4 deletions src/row/arch/x86_avx512/alpha_extract.rs
@@ -206,7 +206,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_to_u8_at_0(
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -294,7 +294,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_at_0(
scalar::copy_alpha_packed_u16x4_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -604,7 +604,7 @@ mod tests {
pseudo_random_u8(&mut rgba_simd, 0xFEED);
let mut rgba_scalar = rgba_simd.clone();
unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
@@ -627,7 +627,7 @@
pseudo_random_u16(&mut rgba_simd, 0x1337);
let mut rgba_scalar = rgba_simd.clone();
unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
8 changes: 4 additions & 4 deletions src/row/arch/x86_sse41/alpha_extract.rs
@@ -152,7 +152,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_to_u8_at_0(
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -227,7 +227,7 @@ pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
}

if x < width {
scalar::copy_alpha_packed_u16x4_at_0(
scalar::copy_alpha_packed_u16x4_at_0::<false>(
&packed[x * 4..width * 4],
&mut rgba_out[x * 4..width * 4],
width - x,
@@ -521,7 +521,7 @@ mod tests {
pseudo_random_u8(&mut rgba_simd, 0xFEED);
let mut rgba_scalar = rgba_simd.clone();
unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
scalar::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}
@@ -542,7 +542,7 @@
pseudo_random_u16(&mut rgba_simd, 0x1337);
let mut rgba_scalar = rgba_simd.clone();
unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
scalar::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_scalar, w);
scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
assert_eq!(rgba_simd, rgba_scalar, "width={w}");
}
}