From f0803b7bd49f675dbe3bf9d8994e7c062a56b5ec Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Thu, 7 May 2026 23:35:41 +1200
Subject: [PATCH 1/3] feat(be-tier10-float): BE support for
 Gbrpf32/Gbrapf32/Gbrpf16/Gbrapf16 row kernels

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/row/arch/neon/endian.rs                    |  44 ++
 src/row/arch/neon/planar_gbr_float.rs          | 295 ++++++-----
 src/row/arch/neon/tests/planar_gbr_float.rs    | 271 +++++++---
 src/row/arch/wasm_simd128/planar_gbr_float.rs  | 132 ++---
 .../wasm_simd128/tests/planar_gbr_float.rs     | 147 ++++--
 src/row/arch/x86_avx2/planar_gbr_float.rs      | 256 ++++-----
 .../arch/x86_avx2/tests/planar_gbr_float.rs    | 297 +++++++----
 src/row/arch/x86_avx512/endian.rs              |  61 +++
 src/row/arch/x86_avx512/planar_gbr_float.rs    | 256 ++++-----
 .../arch/x86_avx512/tests/planar_gbr_float.rs  | 297 +++++++----
 src/row/arch/x86_sse41/endian.rs               |  52 ++
 src/row/arch/x86_sse41/planar_gbr_float.rs     | 254 ++++-----
 .../arch/x86_sse41/tests/planar_gbr_float.rs   | 237 ++++---
 src/row/dispatch/planar_gbr_float.rs           | 324 ++++++------
 src/row/scalar/planar_gbr_f16.rs               | 183 ++++++-
 src/row/scalar/planar_gbr_float.rs             | 485 ++++++++++++++----
 src/sinker/mixed/planar_gbr_f16.rs             |  44 +-
 src/sinker/mixed/planar_gbr_float.rs           |  44 +-
 18 files changed, 2441 insertions(+), 1238 deletions(-)

diff --git a/src/row/arch/neon/endian.rs b/src/row/arch/neon/endian.rs
index 55ac4ad5..e3b48606 100644
--- a/src/row/arch/neon/endian.rs
+++ b/src/row/arch/neon/endian.rs
@@ -17,6 +17,50 @@ use core::arch::aarch64::*;
 
+// ---- u16x4 loaders ---------------------------------------------------------
+
+/// Loads 4 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 8 readable bytes; no alignment is required.
+/// Caller must have NEON enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x4(ptr: *const u8) -> uint16x4_t {
+    let v = unsafe { vld1_u16(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(v))) };
+    v
+}
+
+/// Loads 4 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 8 readable bytes; no alignment is required.
+/// Caller must have NEON enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x4(ptr: *const u8) -> uint16x4_t {
+    let v = unsafe { vld1_u16(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(v))) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x4` or `load_be_u16x4`.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x4` / `load_be_u16x4`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x4<const BE: bool>(ptr: *const u8) -> uint16x4_t {
+    if BE {
+        unsafe { load_be_u16x4(ptr) }
+    } else {
+        unsafe { load_le_u16x4(ptr) }
+    }
+}
+
 // ---- u16x8 loaders ---------------------------------------------------------
 
 /// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
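
// NOTE: a minimal scalar model (not part of the patch) of the contract the
// loaders above implement: planes are LE- or BE-encoded on disk/wire, and a
// loader must hand back host-native values. `BE` is a const generic, so the
// branch folds away at monomorphization; the NEON versions add only the
// vrev16-based per-lane byte swap on top of this.
fn load_endian_u16_model<const BE: bool>(bytes: [u8; 2]) -> u16 {
    if BE { u16::from_be_bytes(bytes) } else { u16::from_le_bytes(bytes) }
}

#[test]
fn endian_u16_model_roundtrip() {
    // 0x3C00 (the f16 bit pattern for 1.0) encoded either way decodes to
    // the same native value.
    assert_eq!(load_endian_u16_model::<false>([0x00, 0x3C]), 0x3C00);
    assert_eq!(load_endian_u16_model::<true>([0x3C, 0x00]), 0x3C00);
}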
diff --git a/src/row/arch/neon/planar_gbr_float.rs b/src/row/arch/neon/planar_gbr_float.rs
index df682d83..536f7c02 100644
--- a/src/row/arch/neon/planar_gbr_float.rs
+++ b/src/row/arch/neon/planar_gbr_float.rs
@@ -24,7 +24,10 @@ use core::arch::aarch64::*;
 
 use crate::{
     ColorMatrix,
-    row::scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar},
+    row::{
+        arch::neon::endian::{load_endian_u16x4, load_endian_u32x4},
+        scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar},
+    },
 };
 
 // ---- shared helpers ---------------------------------------------------------
@@ -35,6 +38,16 @@ unsafe fn clamp01(v: float32x4_t, zero: float32x4_t, one: float32x4_t) -> float3
     unsafe { vminq_f32(vmaxq_f32(v, zero), one) }
 }
 
+/// Load 4 f32 values with an optional BE byte-swap via `load_endian_u32x4`.
+/// This is the endian-aware replacement for `vld1q_f32(ptr.add(x))`.
+#[inline(always)]
+unsafe fn load_f32x4<const BE: bool>(ptr: *const f32, x: usize) -> float32x4_t {
+    unsafe {
+        let u = load_endian_u32x4::<BE>(ptr.add(x).cast::<u8>());
+        vreinterpretq_f32_u32(u)
+    }
+}
+
 /// Scale, add 0.5, truncate → `uint32x4_t` (round-half-up).
 #[inline(always)]
 unsafe fn scale_round_u32(v: float32x4_t, scale: float32x4_t, half: float32x4_t) -> uint32x4_t {
@@ -58,7 +71,7 @@ unsafe fn narrow_to_u8(v: uint32x4_t) -> uint8x8_t {
 /// 3. `out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn gbrpf32_to_rgb_row(
+pub(crate) unsafe fn gbrpf32_to_rgb_row<const BE: bool>(
     g: &[f32],
     b: &[f32],
     r: &[f32],
@@ -78,9 +91,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row(
 
         let mut x = 0usize;
         while x + 4 <= width {
-            let gv = clamp01(vld1q_f32(g.as_ptr().add(x)), zero, one);
-            let bv = clamp01(vld1q_f32(b.as_ptr().add(x)), zero, one);
-            let rv = clamp01(vld1q_f32(r.as_ptr().add(x)), zero, one);
+            let gv = clamp01(load_f32x4::<BE>(g.as_ptr(), x), zero, one);
+            let bv = clamp01(load_f32x4::<BE>(b.as_ptr(), x), zero, one);
+            let rv = clamp01(load_f32x4::<BE>(r.as_ptr(), x), zero, one);
             let gi = narrow_to_u8(scale_round_u32(gv, scale, half));
             let bi = narrow_to_u8(scale_round_u32(bv, scale, half));
             let ri = narrow_to_u8(scale_round_u32(rv, scale, half));
@@ -93,7 +106,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row(
             x += 4;
         }
         if x < width {
-            scalar::gbrpf32_to_rgb_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x);
+            scalar::gbrpf32_to_rgb_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x);
         }
     }
 }
@@ -109,7 +122,7 @@
 /// 3. `out.len()` ≥ `4 * width`.
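
// NOTE: a per-lane scalar model (not part of the patch) of the u8 quantizer
// used above: clamp01 → scale_round_u32 → narrow_to_u8 is exactly
// clamp-to-[0, 1], multiply by 255, add 0.5, truncate (round-half-up).
fn quantize_u8_model(v: f32) -> u8 {
    let c = v.clamp(0.0, 1.0);
    // Truncation is safe here: c * 255.0 + 0.5 lies in [0.5, 255.5), so the
    // cast never saturates, matching vcvtq_u32_f32's truncating conversion.
    (c * 255.0 + 0.5) as u8
}

#[test]
fn quantize_u8_model_half_rounds_up() {
    assert_eq!(quantize_u8_model(0.5), 128); // 127.5 + 0.5 = 128.0
    assert_eq!(quantize_u8_model(-1.0), 0);  // clamped to 0.0
    assert_eq!(quantize_u8_model(2.0), 255); // clamped to 1.0
}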
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrpf32_to_rgba_row( +pub(crate) unsafe fn gbrpf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -130,9 +143,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(vld1q_f32(g.as_ptr().add(x)), zero, one); - let bv = clamp01(vld1q_f32(b.as_ptr().add(x)), zero, one); - let rv = clamp01(vld1q_f32(r.as_ptr().add(x)), zero, one); + let gv = clamp01(load_f32x4::(g.as_ptr(), x), zero, one); + let bv = clamp01(load_f32x4::(b.as_ptr(), x), zero, one); + let rv = clamp01(load_f32x4::(r.as_ptr(), x), zero, one); let gi = narrow_to_u8(scale_round_u32(gv, scale, half)); let bi = narrow_to_u8(scale_round_u32(bv, scale, half)); let ri = narrow_to_u8(scale_round_u32(rv, scale, half)); @@ -145,7 +158,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -161,7 +174,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -181,9 +194,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(vld1q_f32(g.as_ptr().add(x)), zero, one); - let bv = clamp01(vld1q_f32(b.as_ptr().add(x)), zero, one); - let rv = clamp01(vld1q_f32(r.as_ptr().add(x)), zero, one); + let gv = clamp01(load_f32x4::(g.as_ptr(), x), zero, one); + let bv = clamp01(load_f32x4::(b.as_ptr(), x), zero, one); + let rv = clamp01(load_f32x4::(r.as_ptr(), x), zero, one); let gu = vqmovn_u32(scale_round_u32(gv, scale, half)); let bu = vqmovn_u32(scale_round_u32(bv, scale, half)); let ru = vqmovn_u32(scale_round_u32(rv, scale, half)); @@ -192,7 +205,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgb_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -208,7 +221,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -229,9 +242,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(vld1q_f32(g.as_ptr().add(x)), zero, one); - let bv = clamp01(vld1q_f32(b.as_ptr().add(x)), zero, one); - let rv = clamp01(vld1q_f32(r.as_ptr().add(x)), zero, one); + let gv = clamp01(load_f32x4::(g.as_ptr(), x), zero, one); + let bv = clamp01(load_f32x4::(b.as_ptr(), x), zero, one); + let rv = clamp01(load_f32x4::(r.as_ptr(), x), zero, one); let gu = vqmovn_u32(scale_round_u32(gv, scale, half)); let bu = vqmovn_u32(scale_round_u32(bv, scale, half)); let ru = vqmovn_u32(scale_round_u32(rv, scale, half)); @@ -240,7 +253,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -258,7 +271,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( /// 3. `out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -273,14 +286,14 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = vld1q_f32(g.as_ptr().add(x)); - let bv = vld1q_f32(b.as_ptr().add(x)); - let rv = vld1q_f32(r.as_ptr().add(x)); + let gv = load_f32x4::(g.as_ptr(), x); + let bv = load_f32x4::(b.as_ptr(), x); + let rv = load_f32x4::(r.as_ptr(), x); vst3q_f32(out.as_mut_ptr().add(x * 3), float32x4x3_t(rv, gv, bv)); x += 4; } if x < width { - scalar::gbrpf32_to_rgb_f32_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_f32_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -298,7 +311,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -314,9 +327,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( let one_v = vdupq_n_f32(1.0); let mut x = 0usize; while x + 4 <= width { - let gv = vld1q_f32(g.as_ptr().add(x)); - let bv = vld1q_f32(b.as_ptr().add(x)); - let rv = vld1q_f32(r.as_ptr().add(x)); + let gv = load_f32x4::(g.as_ptr(), x); + let bv = load_f32x4::(b.as_ptr(), x); + let rv = load_f32x4::(r.as_ptr(), x); vst4q_f32( out.as_mut_ptr().add(x * 4), float32x4x4_t(rv, gv, bv, one_v), @@ -324,7 +337,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_f32_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f32_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -343,7 +356,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_fp16( +pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_fp16( g: &[f32], b: &[f32], r: &[f32], @@ -358,9 +371,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_fp16( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = vld1q_f32(g.as_ptr().add(x)); - let bv = vld1q_f32(b.as_ptr().add(x)); - let rv = vld1q_f32(r.as_ptr().add(x)); + let gv = load_f32x4::(g.as_ptr(), x); + let bv = load_f32x4::(b.as_ptr(), x); + let rv = load_f32x4::(r.as_ptr(), x); // IEEE-754 RNE narrow via vcvt_f16_f32. let gh = vcvt_f16_f32(gv); let bh = vcvt_f16_f32(bv); @@ -377,7 +390,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_fp16( x += 4; } if x < width { - scalar::gbrpf32_to_rgb_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -393,7 +406,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_fp16( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_fp16( +pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_fp16( g: &[f32], b: &[f32], r: &[f32], @@ -412,9 +425,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_fp16( let _ = one_h; // computed above; use constant for clarity let mut x = 0usize; while x + 4 <= width { - let gv = vld1q_f32(g.as_ptr().add(x)); - let bv = vld1q_f32(b.as_ptr().add(x)); - let rv = vld1q_f32(r.as_ptr().add(x)); + let gv = load_f32x4::(g.as_ptr(), x); + let bv = load_f32x4::(b.as_ptr(), x); + let rv = load_f32x4::(r.as_ptr(), x); let gh = vreinterpret_u16_f16(vcvt_f16_f32(gv)); let bh = vreinterpret_u16_f16(vcvt_f16_f32(bv)); let rh = vreinterpret_u16_f16(vcvt_f16_f32(rv)); @@ -425,7 +438,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_fp16( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -445,7 +458,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_fp16( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_row( +pub(crate) unsafe fn gbrpf32_to_luma_row( g: &[f32], b: &[f32], r: &[f32], @@ -467,7 +480,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -498,7 +511,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_u16_row( +pub(crate) unsafe fn gbrpf32_to_luma_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -517,7 +530,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -547,7 +560,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( /// 3. `h_out.len()`, `s_out.len()`, `v_out.len()` ≥ `width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrpf32_to_hsv_row( +pub(crate) unsafe fn gbrpf32_to_hsv_row( g: &[f32], b: &[f32], r: &[f32], @@ -568,7 +581,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -598,7 +611,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( /// 3. `out.len()` ≥ `4 * width`. 
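
// NOTE: a sketch (not part of the patch) of the staging pattern the
// luma/HSV kernels above share: rather than duplicating the color math,
// each CHUNK-pixel slice is first run through the SIMD GBR→RGB8 kernel into
// a stack buffer, then a scalar RGB→{luma, HSV} pass consumes it. The two
// closure parameters are stand-ins for those stages.
fn staged_luma_model(
    width: usize,
    mut gbr_to_rgb_chunk: impl FnMut(usize, usize, &mut [u8]),
    mut rgb_to_luma_chunk: impl FnMut(&[u8], &mut [u8]),
    out: &mut [u8],
) {
    const CHUNK: usize = 64;
    let mut stage = [0u8; CHUNK * 3];
    let mut offset = 0usize;
    while offset < width {
        let n = (width - offset).min(CHUNK);
        gbr_to_rgb_chunk(offset, n, &mut stage[..n * 3]); // SIMD first stage
        rgb_to_luma_chunk(&stage[..n * 3], &mut out[offset..offset + n]); // scalar second stage
        offset += n;
    }
}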
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrapf32_to_rgba_row( +pub(crate) unsafe fn gbrapf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -620,10 +633,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(vld1q_f32(g.as_ptr().add(x)), zero, one); - let bv = clamp01(vld1q_f32(b.as_ptr().add(x)), zero, one); - let rv = clamp01(vld1q_f32(r.as_ptr().add(x)), zero, one); - let av = clamp01(vld1q_f32(a.as_ptr().add(x)), zero, one); + let gv = clamp01(load_f32x4::(g.as_ptr(), x), zero, one); + let bv = clamp01(load_f32x4::(b.as_ptr(), x), zero, one); + let rv = clamp01(load_f32x4::(r.as_ptr(), x), zero, one); + let av = clamp01(load_f32x4::(a.as_ptr(), x), zero, one); let gi = narrow_to_u8(scale_round_u32(gv, scale, half)); let bi = narrow_to_u8(scale_round_u32(bv, scale, half)); let ri = narrow_to_u8(scale_round_u32(rv, scale, half)); @@ -636,7 +649,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( x += 4; } if x < width { - scalar::gbrapf32_to_rgba_row( + scalar::gbrapf32_to_rgba_row::( &g[x..], &b[x..], &r[x..], @@ -659,7 +672,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -681,10 +694,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(vld1q_f32(g.as_ptr().add(x)), zero, one); - let bv = clamp01(vld1q_f32(b.as_ptr().add(x)), zero, one); - let rv = clamp01(vld1q_f32(r.as_ptr().add(x)), zero, one); - let av = clamp01(vld1q_f32(a.as_ptr().add(x)), zero, one); + let gv = clamp01(load_f32x4::(g.as_ptr(), x), zero, one); + let bv = clamp01(load_f32x4::(b.as_ptr(), x), zero, one); + let rv = clamp01(load_f32x4::(r.as_ptr(), x), zero, one); + let av = clamp01(load_f32x4::(a.as_ptr(), x), zero, one); let gu = vqmovn_u32(scale_round_u32(gv, scale, half)); let bu = vqmovn_u32(scale_round_u32(bv, scale, half)); let ru = vqmovn_u32(scale_round_u32(rv, scale, half)); @@ -693,7 +706,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( x += 4; } if x < width { - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::( &g[x..], &b[x..], &r[x..], @@ -718,7 +731,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -735,15 +748,15 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = vld1q_f32(g.as_ptr().add(x)); - let bv = vld1q_f32(b.as_ptr().add(x)); - let rv = vld1q_f32(r.as_ptr().add(x)); - let av = vld1q_f32(a.as_ptr().add(x)); + let gv = load_f32x4::(g.as_ptr(), x); + let bv = load_f32x4::(b.as_ptr(), x); + let rv = load_f32x4::(r.as_ptr(), x); + let av = load_f32x4::(a.as_ptr(), x); vst4q_f32(out.as_mut_ptr().add(x * 4), float32x4x4_t(rv, gv, bv, av)); x += 4; } if x < width { - scalar::gbrapf32_to_rgba_f32_row( + scalar::gbrapf32_to_rgba_f32_row::( &g[x..], &b[x..], &r[x..], @@ -766,7 +779,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_fp16( +pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_fp16( g: &[f32], b: &[f32], r: &[f32], @@ -783,10 +796,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_fp16( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = vld1q_f32(g.as_ptr().add(x)); - let bv = vld1q_f32(b.as_ptr().add(x)); - let rv = vld1q_f32(r.as_ptr().add(x)); - let av = vld1q_f32(a.as_ptr().add(x)); + let gv = load_f32x4::(g.as_ptr(), x); + let bv = load_f32x4::(b.as_ptr(), x); + let rv = load_f32x4::(r.as_ptr(), x); + let av = load_f32x4::(a.as_ptr(), x); let gh = vreinterpret_u16_f16(vcvt_f16_f32(gv)); let bh = vreinterpret_u16_f16(vcvt_f16_f32(bv)); let rh = vreinterpret_u16_f16(vcvt_f16_f32(rv)); @@ -798,7 +811,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_fp16( x += 4; } if x < width { - scalar::gbrapf32_to_rgba_f16_row( + scalar::gbrapf32_to_rgba_f16_row::( &g[x..], &b[x..], &r[x..], @@ -821,7 +834,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_fp16( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn gbrpf16_to_rgb_row_fp16( +pub(crate) unsafe fn gbrpf16_to_rgb_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -841,9 +854,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_fp16( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -868,7 +881,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_fp16( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_row( + scalar::gbrpf32_to_rgb_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -890,7 +903,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_fp16( /// 3. `out.len()` ≥ `4 * width`. 
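
// NOTE: a sketch (not part of the patch) of the tail strategy the f16
// kernels above all use: the ragged remainder (< 4 px) is widened f16→f32
// into small stack arrays and handed to the existing scalar f32 kernel, so
// the f16 paths need no scalar reference implementation of their own.
// Widening is exact: every f16 value converts to f32 without rounding.
fn widen_f16_tail(src: &[half::f16], dst: &mut [f32]) {
    for (d, s) in dst.iter_mut().zip(src) {
        *d = s.to_f32();
    }
}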
#[inline] #[target_feature(enable = "neon,fp16")] -pub(crate) unsafe fn gbrpf16_to_rgba_row_fp16( +pub(crate) unsafe fn gbrpf16_to_rgba_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -911,9 +924,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_fp16( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -937,7 +950,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_fp16( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_row( + scalar::gbrpf32_to_rgba_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -960,7 +973,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_fp16( #[inline] #[target_feature(enable = "neon,fp16")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_fp16( +pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -980,9 +993,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_fp16( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1002,7 +1015,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_fp16( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_u16_row( + scalar::gbrpf32_to_rgb_u16_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1025,7 +1038,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_fp16( #[inline] #[target_feature(enable = "neon,fp16")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_fp16( +pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1046,9 +1059,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_fp16( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = 
clamp01(rv, zero, one); @@ -1068,7 +1081,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_fp16( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_u16_row( + scalar::gbrpf32_to_rgba_u16_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1091,7 +1104,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_fp16( #[inline] #[target_feature(enable = "neon,fp16")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_fp16( +pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1106,9 +1119,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_fp16( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); vst3q_f32(out.as_mut_ptr().add(x * 3), float32x4x3_t(rv, gv, bv)); x += 4; } @@ -1122,7 +1135,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_fp16( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_f32_row( + scalar::gbrpf32_to_rgb_f32_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1145,7 +1158,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_fp16( #[inline] #[target_feature(enable = "neon,fp16")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_fp16( +pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1161,9 +1174,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_fp16( let one_v = vdupq_n_f32(1.0); let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); vst4q_f32( out.as_mut_ptr().add(x * 4), float32x4x4_t(rv, gv, bv, one_v), @@ -1180,7 +1193,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_fp16( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_f32_row( + scalar::gbrpf32_to_rgba_f32_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1204,7 +1217,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_fp16( /// 3. `out.len()` ≥ `3 * width`. 
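
// NOTE: a sketch (not part of the patch) of why the f16→f16 kernels below
// are lossless: they shuttle raw u16 bit patterns with no f32 round-trip,
// so NaN payloads and subnormals survive bit-exactly, and the opaque-alpha
// constant 0x3C00 is simply the f16 encoding of 1.0.
#[test]
fn f16_alpha_constant_is_one() {
    assert_eq!(half::f16::from_bits(0x3C00), half::f16::ONE);
    assert_eq!(half::f16::ONE.to_bits(), 0x3C00);
}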
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1219,9 +1232,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( unsafe { let mut x = 0usize; while x + 4 <= width { - let gu = vld1_u16(g.as_ptr().add(x).cast::()); - let bu = vld1_u16(b.as_ptr().add(x).cast::()); - let ru = vld1_u16(r.as_ptr().add(x).cast::()); + let gu = load_endian_u16x4::(g.as_ptr().add(x).cast::()); + let bu = load_endian_u16x4::(b.as_ptr().add(x).cast::()); + let ru = load_endian_u16x4::(r.as_ptr().add(x).cast::()); vst3_u16( out.as_mut_ptr().add(x * 3).cast::(), uint16x4x3_t(ru, gu, bu), @@ -1229,7 +1242,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( x += 4; } if x < width { - scalar_f16::gbrpf16_to_rgb_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar_f16::gbrpf16_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -1245,7 +1258,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1262,9 +1275,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( let alpha = vdup_n_u16(0x3C00u16); let mut x = 0usize; while x + 4 <= width { - let gu = vld1_u16(g.as_ptr().add(x).cast::()); - let bu = vld1_u16(b.as_ptr().add(x).cast::()); - let ru = vld1_u16(r.as_ptr().add(x).cast::()); + let gu = load_endian_u16x4::(g.as_ptr().add(x).cast::()); + let bu = load_endian_u16x4::(b.as_ptr().add(x).cast::()); + let ru = load_endian_u16x4::(r.as_ptr().add(x).cast::()); vst4_u16( out.as_mut_ptr().add(x * 4).cast::(), uint16x4x4_t(ru, gu, bu, alpha), @@ -1272,7 +1285,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( x += 4; } if x < width { - scalar_f16::gbrpf16_to_rgba_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar_f16::gbrpf16_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -1290,7 +1303,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( #[target_feature(enable = "neon,fp16")] #[allow(clippy::too_many_arguments)] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_luma_row_fp16( +pub(crate) unsafe fn gbrpf16_to_luma_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1309,7 +1322,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_row_fp16( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_fp16( + gbrpf16_to_rgb_row_fp16::( &g[offset..], &b[offset..], &r[offset..], @@ -1341,7 +1354,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_row_fp16( #[target_feature(enable = "neon,fp16")] #[allow(clippy::too_many_arguments)] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_luma_u16_row_fp16( +pub(crate) unsafe fn gbrpf16_to_luma_u16_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1360,7 +1373,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_u16_row_fp16( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_fp16( + gbrpf16_to_rgb_row_fp16::( &g[offset..], &b[offset..], &r[offset..], @@ -1391,7 +1404,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_u16_row_fp16( #[inline] #[target_feature(enable = "neon,fp16")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn 
gbrpf16_to_hsv_row_fp16( +pub(crate) unsafe fn gbrpf16_to_hsv_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1412,7 +1425,7 @@ pub(crate) unsafe fn gbrpf16_to_hsv_row_fp16( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_fp16( + gbrpf16_to_rgb_row_fp16::( &g[offset..], &b[offset..], &r[offset..], @@ -1443,7 +1456,7 @@ pub(crate) unsafe fn gbrpf16_to_hsv_row_fp16( #[inline] #[target_feature(enable = "neon,fp16")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_row_fp16( +pub(crate) unsafe fn gbrapf16_to_rgba_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1465,10 +1478,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_fp16( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); - let av = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(a.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); + let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(a.as_ptr().add(x).cast::()))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1496,7 +1509,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_fp16( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_row( + scalar::gbrapf32_to_rgba_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1520,7 +1533,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_fp16( #[inline] #[target_feature(enable = "neon,fp16")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_fp16( +pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1542,10 +1555,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_fp16( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); - let av = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(a.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); + let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(a.as_ptr().add(x).cast::()))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1569,7 +1582,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_fp16( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1593,7 +1606,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_fp16( #[inline] #[target_feature(enable = "neon,fp16")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) 
-pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_fp16( +pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_fp16( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1610,10 +1623,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_fp16( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(g.as_ptr().add(x).cast()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(b.as_ptr().add(x).cast()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(r.as_ptr().add(x).cast()))); - let av = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(a.as_ptr().add(x).cast()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(g.as_ptr().add(x).cast::()))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(b.as_ptr().add(x).cast::()))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(r.as_ptr().add(x).cast::()))); + let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::(a.as_ptr().add(x).cast::()))); vst4q_f32(out.as_mut_ptr().add(x * 4), float32x4x4_t(rv, gv, bv, av)); x += 4; } @@ -1629,7 +1642,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_fp16( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_f32_row( + scalar::gbrapf32_to_rgba_f32_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1654,7 +1667,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_fp16( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1671,10 +1684,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( unsafe { let mut x = 0usize; while x + 4 <= width { - let gu = vld1_u16(g.as_ptr().add(x).cast::()); - let bu = vld1_u16(b.as_ptr().add(x).cast::()); - let ru = vld1_u16(r.as_ptr().add(x).cast::()); - let au = vld1_u16(a.as_ptr().add(x).cast::()); + let gu = load_endian_u16x4::(g.as_ptr().add(x).cast::()); + let bu = load_endian_u16x4::(b.as_ptr().add(x).cast::()); + let ru = load_endian_u16x4::(r.as_ptr().add(x).cast::()); + let au = load_endian_u16x4::(a.as_ptr().add(x).cast::()); vst4_u16( out.as_mut_ptr().add(x * 4).cast::(), uint16x4x4_t(ru, gu, bu, au), @@ -1682,7 +1695,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( x += 4; } if x < width { - scalar_f16::gbrapf16_to_rgba_f16_row( + scalar_f16::gbrapf16_to_rgba_f16_row::( &g[x..], &b[x..], &r[x..], diff --git a/src/row/arch/neon/tests/planar_gbr_float.rs b/src/row/arch/neon/tests/planar_gbr_float.rs index 0e591f1f..1f35cdbc 100644 --- a/src/row/arch/neon/tests/planar_gbr_float.rs +++ b/src/row/arch/neon/tests/planar_gbr_float.rs @@ -39,9 +39,9 @@ fn neon_gbrpf32_to_rgb_matches_scalar() { let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; unsafe { - gbrpf32_to_rgb_row(&g, &b, &r, &mut simd, w); + gbrpf32_to_rgb_row::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb width={w}"); } } @@ -61,9 +61,9 @@ fn neon_gbrpf32_to_rgba_matches_scalar() { let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; unsafe { - gbrpf32_to_rgba_row(&g, &b, &r, &mut simd, w); + gbrpf32_to_rgba_row::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_row::(&g, &b, &r, &mut scal, w); 
assert_eq!(simd, scal, "gbrpf32_to_rgba width={w}"); } } @@ -83,9 +83,9 @@ fn neon_gbrpf32_to_rgb_u16_matches_scalar() { let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; unsafe { - gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut simd, w); + gbrpf32_to_rgb_u16_row::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_u16 width={w}"); } } @@ -105,9 +105,9 @@ fn neon_gbrpf32_to_rgba_u16_matches_scalar() { let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; unsafe { - gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut simd, w); + gbrpf32_to_rgba_u16_row::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_u16 width={w}"); } } @@ -127,9 +127,9 @@ fn neon_gbrpf32_to_rgb_f32_matches_scalar() { let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; unsafe { - gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut simd, w); + gbrpf32_to_rgb_f32_row::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_f32 width={w}"); } } @@ -149,9 +149,9 @@ fn neon_gbrpf32_to_rgba_f32_matches_scalar() { let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; unsafe { - gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut simd, w); + gbrpf32_to_rgba_f32_row::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_f32 width={w}"); } } @@ -174,9 +174,9 @@ fn neon_gbrpf32_to_rgb_f16_matches_scalar() { let mut simd = std::vec![half::f16::ZERO; w * 3]; let mut scal = std::vec![half::f16::ZERO; w * 3]; unsafe { - gbrpf32_to_rgb_f16_row_fp16(&g, &b, &r, &mut simd, w); + gbrpf32_to_rgb_f16_row_fp16::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_f16 width={w}"); } } @@ -199,9 +199,9 @@ fn neon_gbrpf32_to_rgba_f16_matches_scalar() { let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; unsafe { - gbrpf32_to_rgba_f16_row_fp16(&g, &b, &r, &mut simd, w); + gbrpf32_to_rgba_f16_row_fp16::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_f16 width={w}"); } } @@ -222,9 +222,9 @@ fn neon_gbrpf32_to_luma_matches_scalar() { let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; unsafe { - gbrpf32_to_luma_row(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true); + gbrpf32_to_luma_row::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true); } - scalar::planar_gbr_float::gbrpf32_to_luma_row( + scalar::planar_gbr_float::gbrpf32_to_luma_row::( &g, &b, &r, @@ -253,9 +253,9 @@ fn neon_gbrpf32_to_luma_u16_matches_scalar() { let mut simd = std::vec![0u16; w]; let mut scal = 
std::vec![0u16; w]; unsafe { - gbrpf32_to_luma_u16_row(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true); + gbrpf32_to_luma_u16_row::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true); } - scalar::planar_gbr_float::gbrpf32_to_luma_u16_row( + scalar::planar_gbr_float::gbrpf32_to_luma_u16_row::( &g, &b, &r, @@ -287,9 +287,9 @@ fn neon_gbrpf32_to_hsv_matches_scalar() { let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; unsafe { - gbrpf32_to_hsv_row(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w); + gbrpf32_to_hsv_row::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w); } - scalar::planar_gbr_float::gbrpf32_to_hsv_row( + scalar::planar_gbr_float::gbrpf32_to_hsv_row::( &g, &b, &r, @@ -321,9 +321,9 @@ fn neon_gbrapf32_to_rgba_matches_scalar() { let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; unsafe { - gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut simd, w); + gbrapf32_to_rgba_row::(&g, &b, &r, &a, &mut simd, w); } - scalar::planar_gbr_float::gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba width={w}"); } } @@ -345,9 +345,9 @@ fn neon_gbrapf32_to_rgba_u16_matches_scalar() { let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; unsafe { - gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut simd, w); + gbrapf32_to_rgba_u16_row::(&g, &b, &r, &a, &mut simd, w); } - scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_u16 width={w}"); } } @@ -369,9 +369,9 @@ fn neon_gbrapf32_to_rgba_f32_matches_scalar() { let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; unsafe { - gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut simd, w); + gbrapf32_to_rgba_f32_row::(&g, &b, &r, &a, &mut simd, w); } - scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_f32 width={w}"); } } @@ -396,9 +396,9 @@ fn neon_gbrapf32_to_rgba_f16_matches_scalar() { let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; unsafe { - gbrapf32_to_rgba_f16_row_fp16(&g, &b, &r, &a, &mut simd, w); + gbrapf32_to_rgba_f16_row_fp16::(&g, &b, &r, &a, &mut simd, w); } - scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_f16 width={w}"); } } @@ -421,13 +421,13 @@ fn neon_gbrpf16_to_rgb_matches_scalar() { let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; unsafe { - gbrpf16_to_rgb_row_fp16(&g, &b, &r, &mut simd, w); + gbrpf16_to_rgb_row_fp16::(&g, &b, &r, &mut simd, w); } // Scalar reference: widen f16→f32, then scalar gbrpf32_to_rgb_row. 
let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb width={w}"); } } @@ -450,12 +450,12 @@ fn neon_gbrpf16_to_rgba_matches_scalar() { let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; unsafe { - gbrpf16_to_rgba_row_fp16(&g, &b, &r, &mut simd, w); + gbrpf16_to_rgba_row_fp16::(&g, &b, &r, &mut simd, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba width={w}"); } } @@ -478,12 +478,12 @@ fn neon_gbrpf16_to_rgb_u16_matches_scalar() { let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; unsafe { - gbrpf16_to_rgb_u16_row_fp16(&g, &b, &r, &mut simd, w); + gbrpf16_to_rgb_u16_row_fp16::(&g, &b, &r, &mut simd, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_u16 width={w}"); } } @@ -506,12 +506,12 @@ fn neon_gbrpf16_to_rgba_u16_matches_scalar() { let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; unsafe { - gbrpf16_to_rgba_u16_row_fp16(&g, &b, &r, &mut simd, w); + gbrpf16_to_rgba_u16_row_fp16::(&g, &b, &r, &mut simd, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_u16 width={w}"); } } @@ -534,12 +534,12 @@ fn neon_gbrpf16_to_rgb_f32_matches_scalar() { let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; unsafe { - gbrpf16_to_rgb_f32_row_fp16(&g, &b, &r, &mut simd, w); + gbrpf16_to_rgb_f32_row_fp16::(&g, &b, &r, &mut simd, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_f32 width={w}"); } } @@ -562,12 +562,12 @@ fn neon_gbrpf16_to_rgba_f32_matches_scalar() { let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; unsafe { - gbrpf16_to_rgba_f32_row_fp16(&g, &b, &r, &mut simd, w); + gbrpf16_to_rgba_f32_row_fp16::(&g, &b, &r, &mut simd, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| 
v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_f32 width={w}"); } } @@ -587,9 +587,9 @@ fn neon_gbrpf16_to_rgb_f16_lossless_matches_scalar() { let mut simd = std::vec![half::f16::ZERO; w * 3]; let mut scal = std::vec![half::f16::ZERO; w * 3]; unsafe { - gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut simd, w); + gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_f16 width={w}"); } } @@ -609,9 +609,9 @@ fn neon_gbrpf16_to_rgba_f16_lossless_matches_scalar() { let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; unsafe { - gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut simd, w); + gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut simd, w); } - scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut scal, w); + scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_f16 width={w}"); } } @@ -635,12 +635,12 @@ fn neon_gbrpf16_to_luma_matches_scalar() { let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; unsafe { - gbrpf16_to_luma_row_fp16(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true); + gbrpf16_to_luma_row_fp16::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_luma_row( + scalar::planar_gbr_float::gbrpf32_to_luma_row::( &gf, &bf, &rf, @@ -672,12 +672,12 @@ fn neon_gbrpf16_to_luma_u16_matches_scalar() { let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; unsafe { - gbrpf16_to_luma_u16_row_fp16(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true); + gbrpf16_to_luma_u16_row_fp16::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_luma_u16_row( + scalar::planar_gbr_float::gbrpf32_to_luma_u16_row::( &gf, &bf, &rf, @@ -712,12 +712,12 @@ fn neon_gbrpf16_to_hsv_matches_scalar() { let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; unsafe { - gbrpf16_to_hsv_row_fp16(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w); + gbrpf16_to_hsv_row_fp16::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_hsv_row( + scalar::planar_gbr_float::gbrpf32_to_hsv_row::( &gf, &bf, &rf, @@ -752,13 +752,13 @@ fn neon_gbrapf16_to_rgba_matches_scalar() { let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; unsafe { - gbrapf16_to_rgba_row_fp16(&g, &b, &r, &a, &mut simd, w); + gbrapf16_to_rgba_row_fp16::(&g, &b, &r, &a, &mut simd, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec 
= b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_row::(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba width={w}"); } } @@ -783,13 +783,13 @@ fn neon_gbrapf16_to_rgba_u16_matches_scalar() { let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; unsafe { - gbrapf16_to_rgba_u16_row_fp16(&g, &b, &r, &a, &mut simd, w); + gbrapf16_to_rgba_u16_row_fp16::(&g, &b, &r, &a, &mut simd, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_u16 width={w}"); } } @@ -814,13 +814,13 @@ fn neon_gbrapf16_to_rgba_f32_matches_scalar() { let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; unsafe { - gbrapf16_to_rgba_f32_row_fp16(&g, &b, &r, &a, &mut simd, w); + gbrapf16_to_rgba_f32_row_fp16::(&g, &b, &r, &a, &mut simd, w); } let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row::(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_f32 width={w}"); } } @@ -842,9 +842,160 @@ fn neon_gbrapf16_to_rgba_f16_lossless_matches_scalar() { let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; unsafe { - gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut simd, w); + gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut simd, w); } - scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w); + scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_f16 width={w}"); } } + +// ---- BE parity helpers ------------------------------------------------------ + +fn be_encode_f32(src: &[f32]) -> std::vec::Vec { + src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() +} + +fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec { + src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() +} + +// ---- BE parity: Gbrpf32 → u8 RGB ------------------------------------------- + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbrpf32_to_rgb_be_parity() { + for &w in WIDTHS { + let mut g = std::vec![0.0f32; w]; + let mut b = std::vec![0.0f32; w]; + let mut r = std::vec![0.0f32; w]; + prng_f32(&mut g, 0xBE01_0001); + prng_f32(&mut b, 0xBE01_0002); + prng_f32(&mut r, 0xBE01_0003); + let mut le_out = std::vec![0u8; w * 3]; + let mut be_out = std::vec![0u8; w * 3]; + unsafe { gbrpf32_to_rgb_row::(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f32(&g); + let b_be = be_encode_f32(&b); + let r_be = be_encode_f32(&r); + unsafe { 
gbrpf32_to_rgb_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf32_to_rgb BE parity width={w}"); + } +} + +// ---- BE parity: Gbrpf32 → u8 RGBA ------------------------------------------ + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbrpf32_to_rgba_be_parity() { + for &w in WIDTHS { + let mut g = std::vec![0.0f32; w]; + let mut b = std::vec![0.0f32; w]; + let mut r = std::vec![0.0f32; w]; + prng_f32(&mut g, 0xBE02_0001); + prng_f32(&mut b, 0xBE02_0002); + prng_f32(&mut r, 0xBE02_0003); + let mut le_out = std::vec![0u8; w * 4]; + let mut be_out = std::vec![0u8; w * 4]; + unsafe { gbrpf32_to_rgba_row::(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f32(&g); + let b_be = be_encode_f32(&b); + let r_be = be_encode_f32(&r); + unsafe { gbrpf32_to_rgba_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf32_to_rgba BE parity width={w}"); + } +} + +// ---- BE parity: Gbrpf32 → f32 RGB (lossless) -------------------------------- + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbrpf32_to_rgb_f32_be_parity() { + for &w in WIDTHS { + let mut g = std::vec![0.0f32; w]; + let mut b = std::vec![0.0f32; w]; + let mut r = std::vec![0.0f32; w]; + prng_f32(&mut g, 0xBE05_0001); + prng_f32(&mut b, 0xBE05_0002); + prng_f32(&mut r, 0xBE05_0003); + let mut le_out = std::vec![0.0f32; w * 3]; + let mut be_out = std::vec![0.0f32; w * 3]; + unsafe { gbrpf32_to_rgb_f32_row::(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f32(&g); + let b_be = be_encode_f32(&b); + let r_be = be_encode_f32(&r); + unsafe { gbrpf32_to_rgb_f32_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf32_to_rgb_f32 BE parity width={w}"); + } +} + +// ---- BE parity: Gbrpf16 → f16 RGB (lossless) -------------------------------- + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbrpf16_to_rgb_f16_be_parity() { + for &w in WIDTHS { + let mut g = std::vec![half::f16::ZERO; w]; + let mut b = std::vec![half::f16::ZERO; w]; + let mut r = std::vec![half::f16::ZERO; w]; + prng_f16(&mut g, 0xBE07_0001); + prng_f16(&mut b, 0xBE07_0002); + prng_f16(&mut r, 0xBE07_0003); + let mut le_out = std::vec![half::f16::ZERO; w * 3]; + let mut be_out = std::vec![half::f16::ZERO; w * 3]; + unsafe { gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + unsafe { gbrpf16_to_rgb_f16_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf16_to_rgb_f16 BE parity width={w}"); + } +} + +// ---- BE parity: Gbrpf16 → f16 RGBA (lossless) ------------------------------- + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbrpf16_to_rgba_f16_be_parity() { + for &w in WIDTHS { + let mut g = std::vec![half::f16::ZERO; w]; + let mut b = std::vec![half::f16::ZERO; w]; + let mut r = std::vec![half::f16::ZERO; w]; + prng_f16(&mut g, 0xBE08_0001); + prng_f16(&mut b, 0xBE08_0002); + prng_f16(&mut r, 0xBE08_0003); + let mut le_out = std::vec![half::f16::ZERO; w * 4]; + let mut be_out = std::vec![half::f16::ZERO; w * 4]; + unsafe { gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + unsafe { gbrpf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, 
"gbrpf16_to_rgba_f16 BE parity width={w}"); + } +} + +// ---- BE parity: Gbrapf16 → f16 RGBA (lossless) ------------------------------ + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_gbrapf16_to_rgba_f16_be_parity() { + for &w in WIDTHS { + let mut g = std::vec![half::f16::ZERO; w]; + let mut b = std::vec![half::f16::ZERO; w]; + let mut r = std::vec![half::f16::ZERO; w]; + let mut a = std::vec![half::f16::ZERO; w]; + prng_f16(&mut g, 0xBE0F_0001); + prng_f16(&mut b, 0xBE0F_0002); + prng_f16(&mut r, 0xBE0F_0003); + prng_f16(&mut a, 0xBE0F_0004); + let mut le_out = std::vec![half::f16::ZERO; w * 4]; + let mut be_out = std::vec![half::f16::ZERO; w * 4]; + unsafe { gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut le_out, w); } + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + let a_be = be_encode_f16(&a); + unsafe { gbrapf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrapf16_to_rgba_f16 BE parity width={w}"); + } +} diff --git a/src/row/arch/wasm_simd128/planar_gbr_float.rs b/src/row/arch/wasm_simd128/planar_gbr_float.rs index 911da0cb..2dc6a42f 100644 --- a/src/row/arch/wasm_simd128/planar_gbr_float.rs +++ b/src/row/arch/wasm_simd128/planar_gbr_float.rs @@ -27,7 +27,7 @@ use core::arch::wasm32::*; use crate::{ ColorMatrix, - row::scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}, + row::{arch::wasm_simd128::endian, scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}}, }; // ---- shared helpers ---------------------------------------------------------- @@ -57,7 +57,7 @@ fn scale_round_i32(v: v128, scale: v128, half: v128) -> v128 { /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf32_to_rgb_row( +pub(crate) unsafe fn gbrpf32_to_rgb_row( g: &[f32], b: &[f32], r: &[f32], @@ -77,9 +77,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(v128_load(g.as_ptr().add(x).cast()), zero, one); - let bv = clamp01(v128_load(b.as_ptr().add(x).cast()), zero, one); - let rv = clamp01(v128_load(r.as_ptr().add(x).cast()), zero, one); + let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); + let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); + let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -106,7 +106,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgb_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } @@ -122,7 +122,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf32_to_rgba_row( +pub(crate) unsafe fn gbrpf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -142,9 +142,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(v128_load(g.as_ptr().add(x).cast()), zero, one); - let bv = clamp01(v128_load(b.as_ptr().add(x).cast()), zero, one); - let rv = clamp01(v128_load(r.as_ptr().add(x).cast()), zero, one); + let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); + let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); + let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -171,7 +171,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } @@ -186,7 +186,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -206,9 +206,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(v128_load(g.as_ptr().add(x).cast()), zero, one); - let bv = clamp01(v128_load(b.as_ptr().add(x).cast()), zero, one); - let rv = clamp01(v128_load(r.as_ptr().add(x).cast()), zero, one); + let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); + let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); + let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -232,7 +232,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgb_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } @@ -248,7 +248,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -268,9 +268,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(v128_load(g.as_ptr().add(x).cast()), zero, one); - let bv = clamp01(v128_load(b.as_ptr().add(x).cast()), zero, one); - let rv = clamp01(v128_load(r.as_ptr().add(x).cast()), zero, one); + let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); + let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); + let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -294,7 +294,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } @@ -313,7 +313,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( #[inline] #[target_feature(enable = "simd128")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -325,7 +325,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( debug_assert!(r.len() >= width, "r row too short"); debug_assert!(out.len() >= width * 3, "out row too short"); - scalar::gbrpf32_to_rgb_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f32 RGBA (lossless, α = 1.0) -------------------------------- @@ -342,7 +342,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( #[inline] #[target_feature(enable = "simd128")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -354,7 +354,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( debug_assert!(r.len() >= width, "r row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar::gbrpf32_to_rgba_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f16 RGB (scalar narrow) --------------------------------------- @@ -372,7 +372,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf32_to_rgb_f16_row( +pub(crate) unsafe fn gbrpf32_to_rgb_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -385,7 +385,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row( debug_assert!(out.len() >= width * 3, "out row too short"); // Scalar narrow: IEEE-754 round-to-nearest-even via half::f16::from_f32. - scalar::gbrpf32_to_rgb_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f16_row::(g, b, r, out, width); } // ---- Gbrpf32 → f16 RGBA (scalar narrow, α = f16(1.0)) ---------------------- @@ -399,7 +399,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf32_to_rgba_f16_row( +pub(crate) unsafe fn gbrpf32_to_rgba_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -411,7 +411,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row( debug_assert!(r.len() >= width, "r row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar::gbrpf32_to_rgba_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f16_row::(g, b, r, out, width); } // ---- Gbrpf32 → u8 luma (staged via RGB scratch) ----------------------------- @@ -426,7 +426,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_row( +pub(crate) unsafe fn gbrpf32_to_luma_row( g: &[f32], b: &[f32], r: &[f32], @@ -446,7 +446,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -477,7 +477,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_u16_row( +pub(crate) unsafe fn gbrpf32_to_luma_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -497,7 +497,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -527,7 +527,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( /// 3. `h_out.len()`, `s_out.len()`, `v_out.len()` ≥ `width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf32_to_hsv_row( +pub(crate) unsafe fn gbrpf32_to_hsv_row( g: &[f32], b: &[f32], r: &[f32], @@ -549,7 +549,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -580,7 +580,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrapf32_to_rgba_row( +pub(crate) unsafe fn gbrapf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -602,10 +602,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(v128_load(g.as_ptr().add(x).cast()), zero, one); - let bv = clamp01(v128_load(b.as_ptr().add(x).cast()), zero, one); - let rv = clamp01(v128_load(r.as_ptr().add(x).cast()), zero, one); - let av = clamp01(v128_load(a.as_ptr().add(x).cast()), zero, one); + let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); + let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); + let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); + let av = clamp01(endian::load_endian_u32x4::(a.as_ptr().add(x).cast::()), zero, one); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -637,7 +637,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( x += 4; } if x < width { - scalar::gbrapf32_to_rgba_row( + scalar::gbrapf32_to_rgba_row::( &g[x..], &b[x..], &r[x..], @@ -660,7 +660,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -682,10 +682,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(v128_load(g.as_ptr().add(x).cast()), zero, one); - let bv = clamp01(v128_load(b.as_ptr().add(x).cast()), zero, one); - let rv = clamp01(v128_load(r.as_ptr().add(x).cast()), zero, one); - let av = clamp01(v128_load(a.as_ptr().add(x).cast()), zero, one); + let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); + let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); + let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); + let av = clamp01(endian::load_endian_u32x4::(a.as_ptr().add(x).cast::()), zero, one); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -713,7 +713,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( x += 4; } if x < width { - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::( &g[x..], &b[x..], &r[x..], @@ -738,7 +738,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( #[inline] #[target_feature(enable = "simd128")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -752,7 +752,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( debug_assert!(a.len() >= width, "a row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar::gbrapf32_to_rgba_f32_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f32_row::(g, b, r, a, out, width); } // ---- Gbrapf32 → f16 RGBA (scalar narrow, source α) -------------------------- @@ -768,7 +768,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrapf32_to_rgba_f16_row( +pub(crate) unsafe fn gbrapf32_to_rgba_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -782,7 +782,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row( debug_assert!(a.len() >= width, "a row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar::gbrapf32_to_rgba_f16_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f16_row::(g, b, r, a, out, width); } // ---- Gbrpf16 → f16 RGB (lossless, f16-native) -------------------------------- @@ -799,7 +799,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -811,7 +811,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( debug_assert!(r.len() >= width, "r row too short"); debug_assert!(out.len() >= width * 3, "out row too short"); - scalar_f16::gbrpf16_to_rgb_f16_row(g, b, r, out, width); + scalar_f16::gbrpf16_to_rgb_f16_row::(g, b, r, out, width); } // ---- Gbrpf16 → f16 RGBA (lossless, opaque α = f16(1.0)) --------------------- @@ -826,7 +826,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -838,7 +838,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( debug_assert!(r.len() >= width, "r row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar_f16::gbrpf16_to_rgba_f16_row(g, b, r, out, width); + scalar_f16::gbrpf16_to_rgba_f16_row::(g, b, r, out, width); } // ---- Gbrapf16 → f16 RGBA (lossless, source α) -------------------------------- @@ -853,7 +853,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -867,7 +867,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( debug_assert!(a.len() >= width, "a row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar_f16::gbrapf16_to_rgba_f16_row(g, b, r, a, out, width); + scalar_f16::gbrapf16_to_rgba_f16_row::(g, b, r, a, out, width); } // ---- Gbrpf16 widen helpers -------------------------------------------------- @@ -895,7 +895,7 @@ fn widen_f16_plane(src: &[half::f16], offset: usize, n: usize, dst: &mut [f32]) /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf16_to_rgb_row( +pub(crate) unsafe fn gbrpf16_to_rgb_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -917,7 +917,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row( widen_f16_plane(b, x, CHUNK, &mut bf); widen_f16_plane(r, x, CHUNK, &mut rf); unsafe { - gbrpf32_to_rgb_row(&gf, &bf, &rf, &mut out[x * 3..(x + CHUNK) * 3], CHUNK); + gbrpf32_to_rgb_row::(&gf, &bf, &rf, &mut out[x * 3..(x + CHUNK) * 3], CHUNK); } x += CHUNK; } @@ -926,7 +926,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row( widen_f16_plane(g, x, n, &mut gf); widen_f16_plane(b, x, n, &mut bf); widen_f16_plane(r, x, n, &mut rf); - scalar::gbrpf32_to_rgb_row(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 3..width * 3], n); + scalar::gbrpf32_to_rgb_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 3..width * 3], n); } } @@ -941,7 +941,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf16_to_rgba_row( +pub(crate) unsafe fn gbrpf16_to_rgba_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -972,7 +972,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row( widen_f16_plane(g, x, n, &mut gf); widen_f16_plane(b, x, n, &mut bf); widen_f16_plane(r, x, n, &mut rf); - scalar::gbrpf32_to_rgba_row(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 4..width * 4], n); + scalar::gbrpf32_to_rgba_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 4..width * 4], n); } } @@ -987,7 +987,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row( /// 3. `out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf16_to_rgb_u16_row( +pub(crate) unsafe fn gbrpf16_to_rgb_u16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1018,7 +1018,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row( widen_f16_plane(g, x, n, &mut gf); widen_f16_plane(b, x, n, &mut bf); widen_f16_plane(r, x, n, &mut rf); - scalar::gbrpf32_to_rgb_u16_row(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 3..width * 3], n); + scalar::gbrpf32_to_rgb_u16_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 3..width * 3], n); } } @@ -1033,7 +1033,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn gbrpf16_to_rgba_u16_row( +pub(crate) unsafe fn gbrpf16_to_rgba_u16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1064,6 +1064,6 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row( widen_f16_plane(g, x, n, &mut gf); widen_f16_plane(b, x, n, &mut bf); widen_f16_plane(r, x, n, &mut rf); - scalar::gbrpf32_to_rgba_u16_row(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 4..width * 4], n); + scalar::gbrpf32_to_rgba_u16_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 4..width * 4], n); } } diff --git a/src/row/arch/wasm_simd128/tests/planar_gbr_float.rs b/src/row/arch/wasm_simd128/tests/planar_gbr_float.rs index 4625726d..3a1e3fd1 100644 --- a/src/row/arch/wasm_simd128/tests/planar_gbr_float.rs +++ b/src/row/arch/wasm_simd128/tests/planar_gbr_float.rs @@ -46,9 +46,9 @@ fn wasm_gbrpf32_to_rgb_matches_scalar() { let r = gbr_plane_f32(w, 0xDEAD_BEEF); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_simd = std::vec![0u8; w * 3]; - scalar::gbrpf32_to_rgb_row(&g, &b, &r, &mut out_scalar, w); + scalar::gbrpf32_to_rgb_row::(&g, &b, &r, &mut out_scalar, w); unsafe { - gbrpf32_to_rgb_row(&g, &b, &r, &mut out_simd, w); + gbrpf32_to_rgb_row::(&g, &b, &r, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_rgb width={w}"); } @@ -64,9 +64,9 @@ fn wasm_gbrpf32_to_rgba_matches_scalar() { let r = gbr_plane_f32(w, 0xF0E1_D2C3); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_simd = std::vec![0u8; w * 4]; - scalar::gbrpf32_to_rgba_row(&g, &b, &r, &mut out_scalar, w); + scalar::gbrpf32_to_rgba_row::(&g, &b, &r, &mut out_scalar, w); unsafe { - gbrpf32_to_rgba_row(&g, &b, &r, &mut out_simd, w); + gbrpf32_to_rgba_row::(&g, &b, &r, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_rgba width={w}"); } @@ -82,9 +82,9 @@ fn wasm_gbrpf32_to_rgb_u16_matches_scalar() { let r = gbr_plane_f32(w, 0x99AA_BBCC); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_simd = std::vec![0u16; w * 3]; - scalar::gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut out_scalar, w); + scalar::gbrpf32_to_rgb_u16_row::(&g, &b, &r, &mut out_scalar, w); unsafe { - gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut out_simd, w); + gbrpf32_to_rgb_u16_row::(&g, &b, &r, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_rgb_u16 width={w}"); } @@ -100,9 +100,9 @@ fn wasm_gbrpf32_to_rgba_u16_matches_scalar() { let r = gbr_plane_f32(w, 0x0F0E_0D0C); let mut out_scalar = std::vec![0u16; w * 4]; let mut out_simd = std::vec![0u16; w * 4]; - scalar::gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut out_scalar, w); + scalar::gbrpf32_to_rgba_u16_row::(&g, &b, &r, &mut out_scalar, w); unsafe { - gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut out_simd, w); + gbrpf32_to_rgba_u16_row::(&g, &b, &r, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_rgba_u16 width={w}"); } @@ -118,9 
@@ -118,9 +118,9 @@ fn wasm_gbrpf32_to_rgb_f32_matches_scalar() {
         let r = gbr_plane_f32(w, 0x5555_6666);
         let mut out_scalar = std::vec![0.0f32; w * 3];
         let mut out_simd = std::vec![0.0f32; w * 3];
-        scalar::gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut out_scalar, w);
+        scalar::gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut out_scalar, w);
         unsafe {
-            gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut out_simd, w);
+            gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_rgb_f32 width={w}");
     }
@@ -136,9 +136,9 @@ fn wasm_gbrpf32_to_rgba_f32_matches_scalar() {
         let r = gbr_plane_f32(w, 0xBBBB_CCCC);
         let mut out_scalar = std::vec![0.0f32; w * 4];
         let mut out_simd = std::vec![0.0f32; w * 4];
-        scalar::gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut out_scalar, w);
+        scalar::gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut out_scalar, w);
         unsafe {
-            gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut out_simd, w);
+            gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_rgba_f32 width={w}");
     }
@@ -154,9 +154,9 @@ fn wasm_gbrpf32_to_rgb_f16_matches_scalar() {
         let r = gbr_plane_f32(w, 0x2233_4455);
         let mut out_scalar = std::vec![half::f16::ZERO; w * 3];
         let mut out_simd = std::vec![half::f16::ZERO; w * 3];
-        scalar::gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut out_scalar, w);
+        scalar::gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut out_scalar, w);
         unsafe {
-            gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut out_simd, w);
+            gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_rgb_f16 width={w}");
     }
@@ -172,9 +172,9 @@ fn wasm_gbrpf32_to_rgba_f16_matches_scalar() {
         let r = gbr_plane_f32(w, 0xEEFF_1122);
         let mut out_scalar = std::vec![half::f16::ZERO; w * 4];
         let mut out_simd = std::vec![half::f16::ZERO; w * 4];
-        scalar::gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut out_scalar, w);
+        scalar::gbrpf32_to_rgba_f16_row::<false>(&g, &b, &r, &mut out_scalar, w);
         unsafe {
-            gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut out_simd, w);
+            gbrpf32_to_rgba_f16_row::<false>(&g, &b, &r, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_rgba_f16 width={w}");
     }
@@ -191,9 +191,9 @@ fn wasm_gbrpf32_to_luma_matches_scalar() {
         let r = gbr_plane_f32(w, 0x9CAD_BEEF);
         let mut out_scalar = std::vec![0u8; w];
         let mut out_simd = std::vec![0u8; w];
-        scalar::gbrpf32_to_luma_row(&g, &b, &r, &mut out_scalar, w, ColorMatrix::Bt709, true);
+        scalar::gbrpf32_to_luma_row::<false>(&g, &b, &r, &mut out_scalar, w, ColorMatrix::Bt709, true);
         unsafe {
-            gbrpf32_to_luma_row(&g, &b, &r, &mut out_simd, w, ColorMatrix::Bt709, true);
+            gbrpf32_to_luma_row::<false>(&g, &b, &r, &mut out_simd, w, ColorMatrix::Bt709, true);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_luma width={w}");
     }
@@ -210,9 +210,9 @@ fn wasm_gbrpf32_to_luma_u16_matches_scalar() {
         let r = gbr_plane_f32(w, 0x7968_5748);
         let mut out_scalar = std::vec![0u16; w];
         let mut out_simd = std::vec![0u16; w];
-        scalar::gbrpf32_to_luma_u16_row(&g, &b, &r, &mut out_scalar, w, ColorMatrix::Bt709, true);
+        scalar::gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut out_scalar, w, ColorMatrix::Bt709, true);
         unsafe {
-            gbrpf32_to_luma_u16_row(&g, &b, &r, &mut out_simd, w, ColorMatrix::Bt709, true);
+            gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut out_simd, w, ColorMatrix::Bt709, true);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrpf32_to_luma_u16 width={w}");
     }
@@ -232,9 +232,9 @@ fn wasm_gbrpf32_to_hsv_matches_scalar() {
         let mut h_simd = std::vec![0u8; w];
         let mut s_simd = std::vec![0u8; w];
         let mut v_simd = std::vec![0u8; w];
-        scalar::gbrpf32_to_hsv_row(&g, &b, &r, &mut h_scalar, &mut s_scalar, &mut v_scalar, w);
+        scalar::gbrpf32_to_hsv_row::<false>(&g, &b, &r, &mut h_scalar, &mut s_scalar, &mut v_scalar, w);
         unsafe {
-            gbrpf32_to_hsv_row(&g, &b, &r, &mut h_simd, &mut s_simd, &mut v_simd, w);
+            gbrpf32_to_hsv_row::<false>(&g, &b, &r, &mut h_simd, &mut s_simd, &mut v_simd, w);
         }
         assert_eq!(h_scalar, h_simd, "wasm gbrpf32_to_hsv H width={w}");
         assert_eq!(s_scalar, s_simd, "wasm gbrpf32_to_hsv S width={w}");
@@ -253,9 +253,9 @@ fn wasm_gbrapf32_to_rgba_matches_scalar() {
         let a = gbr_plane_f32(w, 0x6D7E_8F90);
         let mut out_scalar = std::vec![0u8; w * 4];
         let mut out_simd = std::vec![0u8; w * 4];
-        scalar::gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut out_scalar, w);
+        scalar::gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut out_scalar, w);
         unsafe {
-            gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut out_simd, w);
+            gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrapf32_to_rgba width={w}");
     }
@@ -272,9 +272,9 @@ fn wasm_gbrapf32_to_rgba_u16_matches_scalar() {
         let a = gbr_plane_f32(w, 0xFEDC_BA98);
         let mut out_scalar = std::vec![0u16; w * 4];
         let mut out_simd = std::vec![0u16; w * 4];
-        scalar::gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut out_scalar, w);
+        scalar::gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut out_scalar, w);
         unsafe {
-            gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut out_simd, w);
+            gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrapf32_to_rgba_u16 width={w}");
     }
@@ -291,9 +291,9 @@ fn wasm_gbrapf32_to_rgba_f32_matches_scalar() {
         let a = gbr_plane_f32(w, 0x4B5A_6978);
         let mut out_scalar = std::vec![0.0f32; w * 4];
         let mut out_simd = std::vec![0.0f32; w * 4];
-        scalar::gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut out_scalar, w);
+        scalar::gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut out_scalar, w);
         unsafe {
-            gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut out_simd, w);
+            gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrapf32_to_rgba_f32 width={w}");
     }
@@ -310,9 +310,9 @@ fn wasm_gbrapf32_to_rgba_f16_matches_scalar() {
         let a = gbr_plane_f32(w, 0x7654_3210);
         let mut out_scalar = std::vec![half::f16::ZERO; w * 4];
         let mut out_simd = std::vec![half::f16::ZERO; w * 4];
-        scalar::gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut out_scalar, w);
+        scalar::gbrapf32_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut out_scalar, w);
         unsafe {
-            gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut out_simd, w);
+            gbrapf32_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrapf32_to_rgba_f16 width={w}");
     }
@@ -330,7 +330,7 @@ fn wasm_gbrpf16_to_rgb_f16_matches_scalar() {
         let mut out_simd = std::vec![half::f16::ZERO; w * 3];
         scalar_f16::gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut out_scalar, w);
         unsafe {
-            gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut out_simd, w);
+            gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrpf16_to_rgb_f16 width={w}");
     }
@@ -348,7 +348,7 @@ fn wasm_gbrpf16_to_rgba_f16_matches_scalar() {
         let mut out_simd = std::vec![half::f16::ZERO; w * 4];
         scalar_f16::gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut out_scalar, w);
         unsafe {
-            gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut out_simd, w);
+            gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrpf16_to_rgba_f16 width={w}");
     }
@@ -367,7 +367,7 @@ fn wasm_gbrapf16_to_rgba_f16_matches_scalar() {
         let mut out_simd = std::vec![half::f16::ZERO; w * 4];
         scalar_f16::gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut out_scalar, w);
         unsafe {
-            gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut out_simd, w);
+            gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut out_simd, w);
         }
         assert_eq!(out_scalar, out_simd, "wasm gbrapf16_to_rgba_f16 width={w}");
     }
@@ -383,7 +383,7 @@ fn wasm_gbrpf16_to_rgb_matches_scalar() {
         let r = gbr_plane_f16(w, 0xE9EA_FBFC);
         let mut out_scalar = std::vec![0u8; w * 3];
         let mut out_simd = std::vec![0u8; w * 3];
-        scalar::gbrpf32_to_rgb_row(
+        scalar::gbrpf32_to_rgb_row::<false>(
             &g.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
             &b.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
             &r.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
@@ -407,7 +407,7 @@ fn wasm_gbrpf16_to_rgba_matches_scalar() {
         let r = gbr_plane_f16(w, 0x6789_CDEF);
         let mut out_scalar = std::vec![0u8; w * 4];
         let mut out_simd = std::vec![0u8; w * 4];
-        scalar::gbrpf32_to_rgba_row(
+        scalar::gbrpf32_to_rgba_row::<false>(
             &g.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
             &b.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
             &r.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
@@ -431,7 +431,7 @@ fn wasm_gbrpf16_to_rgb_u16_matches_scalar() {
         let r = gbr_plane_f16(w, 0x8899_AABB);
         let mut out_scalar = std::vec![0u16; w * 3];
         let mut out_simd = std::vec![0u16; w * 3];
-        scalar::gbrpf32_to_rgb_u16_row(
+        scalar::gbrpf32_to_rgb_u16_row::<false>(
             &g.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
             &b.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
             &r.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
@@ -455,7 +455,7 @@ fn wasm_gbrpf16_to_rgba_u16_matches_scalar() {
         let r = gbr_plane_f16(w, 0xCC22_4466);
         let mut out_scalar = std::vec![0u16; w * 4];
         let mut out_simd = std::vec![0u16; w * 4];
-        scalar::gbrpf32_to_rgba_u16_row(
+        scalar::gbrpf32_to_rgba_u16_row::<false>(
             &g.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
             &b.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
             &r.iter().map(|v| v.to_f32()).collect::<std::vec::Vec<f32>>(),
@@ -484,9 +484,9 @@ fn wasm_gbrpf32_to_rgb_round_half_up() {
     let w = 4;
     let mut out_scalar = std::vec![0u8; w * 3];
     let mut out_simd = std::vec![0u8; w * 3];
-    scalar::gbrpf32_to_rgb_row(&g, &b, &r, &mut out_scalar, w);
+    scalar::gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut out_scalar, w);
     unsafe {
-        gbrpf32_to_rgb_row(&g, &b, &r, &mut out_simd, w);
+        gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut out_simd, w);
     }
     // Both scalar and SIMD must use round-half-up (1, 2, 3, 4).
     assert_eq!(out_scalar, out_simd, "round-half-up: scalar vs SIMD");
@@ -495,3 +495,72 @@ fn wasm_gbrpf32_to_rgb_round_half_up() {
     assert_eq!(out_simd[6], 3, "2.5/255 → 3 (round-half-up)");
     assert_eq!(out_simd[9], 4, "3.5/255 → 4 (round-half-up)");
 }
+
+// ---- BE parity helpers -------------------------------------------------------
+
+fn be_encode_f32(src: &[f32]) -> std::vec::Vec<f32> {
+    src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect()
+}
+
+fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec<half::f16> {
+    src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect()
+}
+
+// ---- BE parity: Gbrpf32 → u8 RGB -------------------------------------------
+
+#[test]
+fn wasm_gbrpf32_to_rgb_be_parity() {
+    for w in [1usize, 4, 5, 7, 8, 16, 17, 32] {
+        let g = gbr_plane_f32(w, 0xBE01_0001);
+        let b = gbr_plane_f32(w, 0xBE01_0002);
+        let r = gbr_plane_f32(w, 0xBE01_0003);
+        let mut le_out = std::vec![0u8; w * 3];
+        let mut be_out = std::vec![0u8; w * 3];
+        unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut le_out, w); }
+        let g_be = be_encode_f32(&g);
+        let b_be = be_encode_f32(&b);
+        let r_be = be_encode_f32(&r);
+        unsafe { gbrpf32_to_rgb_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); }
+        assert_eq!(le_out, be_out, "wasm gbrpf32_to_rgb BE parity width={w}");
+    }
+}
+
+// ---- BE parity: Gbrpf16 → f16 RGB (lossless) --------------------------------
+
+#[test]
+fn wasm_gbrpf16_to_rgb_f16_be_parity() {
+    for w in [1usize, 4, 5, 7, 8, 16, 17, 32] {
+        let g = gbr_plane_f16(w, 0xBE07_0001);
+        let b = gbr_plane_f16(w, 0xBE07_0002);
+        let r = gbr_plane_f16(w, 0xBE07_0003);
+        let mut le_out = std::vec![half::f16::ZERO; w * 3];
+        let mut be_out = std::vec![half::f16::ZERO; w * 3];
+        unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut le_out, w); }
+        let g_be = be_encode_f16(&g);
+        let b_be = be_encode_f16(&b);
+        let r_be = be_encode_f16(&r);
+        unsafe { gbrpf16_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); }
+        assert_eq!(le_out, be_out, "wasm gbrpf16_to_rgb_f16 BE parity width={w}");
+    }
+}
+
+// ---- BE parity: Gbrapf16 → f16 RGBA (lossless) ------------------------------
+
+#[test]
+fn wasm_gbrapf16_to_rgba_f16_be_parity() {
+    for w in [1usize, 4, 5, 7, 8, 16, 17, 32] {
+        let g = gbr_plane_f16(w, 0xBE0F_0001);
+        let b = gbr_plane_f16(w, 0xBE0F_0002);
+        let r = gbr_plane_f16(w, 0xBE0F_0003);
+        let a = gbr_plane_f16(w, 0xBE0F_0004);
+        let mut le_out = std::vec![half::f16::ZERO; w * 4];
+        let mut be_out = std::vec![half::f16::ZERO; w * 4];
+        unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, w); }
+        let g_be = be_encode_f16(&g);
+        let b_be = be_encode_f16(&b);
+        let r_be = be_encode_f16(&r);
+        let a_be = be_encode_f16(&a);
+        unsafe { gbrapf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); }
+        assert_eq!(le_out, be_out, "wasm gbrapf16_to_rgba_f16 BE parity width={w}");
+    }
+}
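[The round-half-up contract pinned down by wasm_gbrpf32_to_rgb_round_half_up is easy to state in scalar form. A reference quantizer — illustrative, not this crate's implementation:

    /// Round-half-up quantize to u8: the behavior every u8/u16 path must match.
    fn quantize_u8(v: f32) -> u8 {
        // `as` truncates toward zero, so adding 0.5 first rounds half-way cases up.
        (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8
    }

    #[test]
    fn quantize_u8_rounds_half_up() {
        assert_eq!(quantize_u8(0.5 / 255.0), 1);
        assert_eq!(quantize_u8(2.5 / 255.0), 3); // ties round up, not to nearest-even
    }

This is deliberately different from the f16 narrowing paths, which use IEEE round-to-nearest-even (see the F16C section below).]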
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrpf32_to_rgb_row( +pub(crate) unsafe fn gbrpf32_to_rgb_row( g: &[f32], b: &[f32], r: &[f32], @@ -122,9 +122,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm256_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm256_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); let g8 = narrow_i32x8_to_u8x8(scale_round_i32(gv, scale)); let b8 = narrow_i32x8_to_u8x8(scale_round_i32(bv, scale)); let r8 = narrow_i32x8_to_u8x8(scale_round_i32(rv, scale)); @@ -143,7 +143,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( x += 8; } if x < width { - scalar::gbrpf32_to_rgb_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -159,7 +159,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrpf32_to_rgba_row( +pub(crate) unsafe fn gbrpf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -178,9 +178,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm256_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm256_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); let g8 = narrow_i32x8_to_u8x8(scale_round_i32(gv, scale)); let b8 = narrow_i32x8_to_u8x8(scale_round_i32(bv, scale)); let r8 = narrow_i32x8_to_u8x8(scale_round_i32(rv, scale)); @@ -200,7 +200,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( x += 8; } if x < width { - scalar::gbrpf32_to_rgba_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -216,7 +216,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( /// 3. `out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -235,9 +235,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm256_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm256_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); let gw = narrow_i32x8_to_u16x8(scale_round_i32(gv, scale)); let bw = narrow_i32x8_to_u16x8(scale_round_i32(bv, scale)); let rw = narrow_i32x8_to_u16x8(scale_round_i32(rv, scale)); @@ -256,7 +256,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( x += 8; } if x < width { - scalar::gbrpf32_to_rgb_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -272,7 +272,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -291,9 +291,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm256_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm256_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); let gw = narrow_i32x8_to_u16x8(scale_round_i32(gv, scale)); let bw = narrow_i32x8_to_u16x8(scale_round_i32(bv, scale)); let rw = narrow_i32x8_to_u16x8(scale_round_i32(rv, scale)); @@ -313,7 +313,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( x += 8; } if x < width { - scalar::gbrpf32_to_rgba_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -333,7 +333,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx2")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -345,7 +345,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( debug_assert!(r.len() >= width, "r row too short"); debug_assert!(out.len() >= width * 3, "out row too short"); - scalar::gbrpf32_to_rgb_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f32 RGBA (lossless, α = 1.0) --------------------------------- @@ -362,7 +362,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( #[inline] #[target_feature(enable = "avx2")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn 
+pub(crate) unsafe fn gbrpf32_to_rgba_f32_row<const BE: bool>(
     g: &[f32],
     b: &[f32],
     r: &[f32],
@@ -374,7 +374,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row(
     debug_assert!(r.len() >= width, "r row too short");
     debug_assert!(out.len() >= width * 4, "out row too short");

-    scalar::gbrpf32_to_rgba_f32_row(g, b, r, out, width);
+    scalar::gbrpf32_to_rgba_f32_row::<BE>(g, b, r, out, width);
 }

 // ---- Gbrpf32 → f16 RGB (F16C narrow) ----------------------------------------
@@ -392,7 +392,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row(
 /// 3. `out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "avx2,f16c")]
-pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c(
+pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c<const BE: bool>(
     g: &[f32],
     b: &[f32],
     r: &[f32],
@@ -407,9 +407,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c(
     unsafe {
         let mut x = 0usize;
         while x + 8 <= width {
-            let gv = _mm256_loadu_ps(g.as_ptr().add(x));
-            let bv = _mm256_loadu_ps(b.as_ptr().add(x));
-            let rv = _mm256_loadu_ps(r.as_ptr().add(x));
+            let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::<BE>(g.as_ptr().add(x).cast::<u8>()));
+            let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::<BE>(b.as_ptr().add(x).cast::<u8>()));
+            let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::<BE>(r.as_ptr().add(x).cast::<u8>()));
             // F16C narrow: IEEE-754 round-to-nearest-even (NOT round-half-up).
             let gh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv);
             let bh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv);
             let rh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv);
@@ -431,7 +431,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c(
             x += 8;
         }
         if x < width {
-            scalar::gbrpf32_to_rgb_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x);
+            scalar::gbrpf32_to_rgb_f16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x);
         }
     }
 }
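[The RNE-versus-round-half-up distinction called out in the comment above is observable with the half crate directly, since half::f16::from_f32 documents round-to-nearest-even — the same mode as _mm256_cvtps_ph with _MM_FROUND_TO_NEAREST_INT. A two-line demonstration (illustrative):

    // 1.0 + eps/2 is exactly half-way between f16 1.0 and the next f16 up;
    // ties-to-even picks the even mantissa, i.e. 1.0 (round-half-up would not).
    let v = half::f16::from_f32(1.0 + f32::from(half::f16::EPSILON) / 2.0);
    assert_eq!(v, half::f16::ONE);

This is why the f16 tail paths delegate to the scalar f16 narrowers rather than reusing the integer quantize helpers.]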
#[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( +pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( g: &[f32], b: &[f32], r: &[f32], @@ -463,9 +463,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_loadu_ps(g.as_ptr().add(x)); - let bv = _mm256_loadu_ps(b.as_ptr().add(x)); - let rv = _mm256_loadu_ps(r.as_ptr().add(x)); + let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())); + let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())); + let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())); let gh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -486,7 +486,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( x += 8; } if x < width { - scalar::gbrpf32_to_rgba_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -503,7 +503,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_row( +pub(crate) unsafe fn gbrpf32_to_luma_row( g: &[f32], b: &[f32], r: &[f32], @@ -523,7 +523,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -554,7 +554,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_u16_row( +pub(crate) unsafe fn gbrpf32_to_luma_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -574,7 +574,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -604,7 +604,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( /// 3. `h_out.len()`, `s_out.len()`, `v_out.len()` ≥ `width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrpf32_to_hsv_row( +pub(crate) unsafe fn gbrpf32_to_hsv_row( g: &[f32], b: &[f32], r: &[f32], @@ -626,7 +626,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -656,7 +656,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrapf32_to_rgba_row( +pub(crate) unsafe fn gbrapf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -677,10 +677,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm256_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm256_loadu_ps(r.as_ptr().add(x)), zero, one); - let av = clamp01(_mm256_loadu_ps(a.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); + let av = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(a.as_ptr().add(x).cast::())), zero, one); let g8 = narrow_i32x8_to_u8x8(scale_round_i32(gv, scale)); let b8 = narrow_i32x8_to_u8x8(scale_round_i32(bv, scale)); let r8 = narrow_i32x8_to_u8x8(scale_round_i32(rv, scale)); @@ -703,7 +703,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( x += 8; } if x < width { - scalar::gbrapf32_to_rgba_row( + scalar::gbrapf32_to_rgba_row::( &g[x..], &b[x..], &r[x..], @@ -726,7 +726,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -747,10 +747,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm256_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm256_loadu_ps(r.as_ptr().add(x)), zero, one); - let av = clamp01(_mm256_loadu_ps(a.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); + let av = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(a.as_ptr().add(x).cast::())), zero, one); let gw = narrow_i32x8_to_u16x8(scale_round_i32(gv, scale)); let bw = narrow_i32x8_to_u16x8(scale_round_i32(bv, scale)); let rw = narrow_i32x8_to_u16x8(scale_round_i32(rv, scale)); @@ -773,7 +773,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( x += 8; } if x < width { - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::( &g[x..], &b[x..], &r[x..], @@ -799,7 +799,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx2")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -813,7 +813,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( debug_assert!(a.len() >= width, "a row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar::gbrapf32_to_rgba_f32_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f32_row::(g, b, r, a, out, width); } // ---- Gbrapf32 → f16 RGBA (F16C narrow, source α) ---------------------------- @@ -828,7 +828,7 @@ pub(crate) unsafe fn 
 /// 3. `out.len()` ≥ `4 * width`.
 #[inline]
 #[target_feature(enable = "avx2,f16c")]
-pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c(
+pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c<const BE: bool>(
     g: &[f32],
     b: &[f32],
     r: &[f32],
@@ -845,10 +845,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c(
     unsafe {
         let mut x = 0usize;
         while x + 8 <= width {
-            let gv = _mm256_loadu_ps(g.as_ptr().add(x));
-            let bv = _mm256_loadu_ps(b.as_ptr().add(x));
-            let rv = _mm256_loadu_ps(r.as_ptr().add(x));
-            let av = _mm256_loadu_ps(a.as_ptr().add(x));
+            let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::<BE>(g.as_ptr().add(x).cast::<u8>()));
+            let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::<BE>(b.as_ptr().add(x).cast::<u8>()));
+            let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::<BE>(r.as_ptr().add(x).cast::<u8>()));
+            let av = _mm256_castsi256_ps(endian::load_endian_u32x8::<BE>(a.as_ptr().add(x).cast::<u8>()));
             let gh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv);
             let bh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv);
             let rh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv);
@@ -872,7 +872,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c(
             x += 8;
         }
         if x < width {
-            scalar::gbrapf32_to_rgba_f16_row(
+            scalar::gbrapf32_to_rgba_f16_row::<BE>(
                 &g[x..],
                 &b[x..],
                 &r[x..],
@@ -896,7 +896,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c(
 /// 3. `out.len()` ≥ `3 * width`.
 #[inline]
 #[target_feature(enable = "avx2,f16c")]
-pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c(
+pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c<const BE: bool>(
     g: &[half::f16],
     b: &[half::f16],
     r: &[half::f16],
@@ -916,9 +916,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c(
         let mut x = 0usize;
         while x + 8 <= width {
             // Load 8 f16 lanes (16 bytes) per plane and widen to f32x8.
-            let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast()));
-            let bv = _mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast()));
-            let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast()));
+            let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(g.as_ptr().add(x).cast::<u8>()));
+            let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(b.as_ptr().add(x).cast::<u8>()));
+            let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(r.as_ptr().add(x).cast::<u8>()));
             let gc = clamp01(gv, zero, one);
             let bc = clamp01(bv, zero, one);
             let rc = clamp01(rv, zero, one);
@@ -950,7 +950,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c(
                 bf[i] = b[x + i].to_f32();
                 rf[i] = r[x + i].to_f32();
             }
-            scalar::gbrpf32_to_rgb_row(
+            scalar::gbrpf32_to_rgb_row::<BE>(
                 &gf[..tail],
                 &bf[..tail],
                 &rf[..tail],
@@ -973,7 +973,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c(
 /// 3. `out.len()` ≥ `4 * width`.
#[inline] #[target_feature(enable = "avx2,f16c")] -pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -992,9 +992,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast())); - let bv = _mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast())); - let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1026,7 +1026,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_row( + scalar::gbrpf32_to_rgba_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1050,7 +1050,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( #[inline] #[target_feature(enable = "avx2,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1069,9 +1069,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast())); - let bv = _mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast())); - let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1102,7 +1102,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_u16_row( + scalar::gbrpf32_to_rgb_u16_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1126,7 +1126,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( #[inline] #[target_feature(enable = "avx2,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1145,9 +1145,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast())); - let bv = _mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast())); - let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1179,7 +1179,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_u16_row( + 
+            scalar::gbrpf32_to_rgba_u16_row::<BE>(
                 &gf[..tail],
                 &bf[..tail],
                 &rf[..tail],
@@ -1203,7 +1203,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c(
 #[inline]
 #[target_feature(enable = "avx2,f16c")]
 #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker)
-pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c(
+pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c<const BE: bool>(
     g: &[half::f16],
     b: &[half::f16],
     r: &[half::f16],
@@ -1218,9 +1218,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c(
     unsafe {
         let mut x = 0usize;
         while x + 8 <= width {
-            let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast()));
-            let bv = _mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast()));
-            let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast()));
+            let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(g.as_ptr().add(x).cast::<u8>()));
+            let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(b.as_ptr().add(x).cast::<u8>()));
+            let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(r.as_ptr().add(x).cast::<u8>()));
             // No 3-channel interleave intrinsic in AVX2 — scatter via scalar loop.
             let mut gf = [0.0f32; 8];
             let mut bf = [0.0f32; 8];
@@ -1246,7 +1246,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c(
                 bf[i] = b[x + i].to_f32();
                 rf[i] = r[x + i].to_f32();
             }
-            scalar::gbrpf32_to_rgb_f32_row(
+            scalar::gbrpf32_to_rgb_f32_row::<BE>(
                 &gf[..tail],
                 &bf[..tail],
                 &rf[..tail],
@@ -1270,7 +1270,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c(
 #[inline]
 #[target_feature(enable = "avx2,f16c")]
 #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker)
-pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c(
+pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c<const BE: bool>(
     g: &[half::f16],
     b: &[half::f16],
     r: &[half::f16],
@@ -1285,9 +1285,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c(
     unsafe {
         let mut x = 0usize;
         while x + 8 <= width {
-            let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast()));
-            let bv = _mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast()));
-            let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast()));
+            let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(g.as_ptr().add(x).cast::<u8>()));
+            let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(b.as_ptr().add(x).cast::<u8>()));
+            let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::<BE>(r.as_ptr().add(x).cast::<u8>()));
             let mut gf = [0.0f32; 8];
             let mut bf = [0.0f32; 8];
             let mut rf = [0.0f32; 8];
@@ -1313,7 +1313,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c(
                 bf[i] = b[x + i].to_f32();
                 rf[i] = r[x + i].to_f32();
             }
-            scalar::gbrpf32_to_rgba_f32_row(
+            scalar::gbrpf32_to_rgba_f32_row::<BE>(
                 &gf[..tail],
                 &bf[..tail],
                 &rf[..tail],
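[The lossless f16 → f16 interleaves that follow never convert: each sample's 16-bit pattern is moved as-is (after the optional BE byte swap), so NaN payloads and denormals survive exactly. The invariant in scalar form — an illustrative sketch, not the kernel itself:

    /// Lossless pass-through: interleave one G/B/R triple without any float op.
    fn interleave_f16<const BE: bool>(g: half::f16, b: half::f16, r: half::f16) -> [half::f16; 3] {
        let fix = |v: half::f16| {
            // Byte swap only when wire and host order differ — bits otherwise untouched.
            if BE != cfg!(target_endian = "big") {
                half::f16::from_bits(v.to_bits().swap_bytes())
            } else {
                v
            }
        };
        [fix(r), fix(g), fix(b)]
    }

This is also why the f16-lossless parity tests can demand bit-identical output rather than an epsilon comparison.]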
- let gu = _mm_loadu_si128(g.as_ptr().add(x).cast::<__m128i>()); - let bu = _mm_loadu_si128(b.as_ptr().add(x).cast::<__m128i>()); - let ru = _mm_loadu_si128(r.as_ptr().add(x).cast::<__m128i>()); + let gu = endian::load_endian_u16x8::(g.as_ptr().add(x).cast::()); + let bu = endian::load_endian_u16x8::(b.as_ptr().add(x).cast::()); + let ru = endian::load_endian_u16x8::(r.as_ptr().add(x).cast::()); let mut g_buf = [0u16; 8]; let mut b_buf = [0u16; 8]; let mut r_buf = [0u16; 8]; @@ -1374,7 +1374,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( x += 8; } if x < width { - scalar_f16::gbrpf16_to_rgb_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar_f16::gbrpf16_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -1393,7 +1393,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1408,9 +1408,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( unsafe { let mut x = 0usize; while x + 8 <= width { - let gu = _mm_loadu_si128(g.as_ptr().add(x).cast::<__m128i>()); - let bu = _mm_loadu_si128(b.as_ptr().add(x).cast::<__m128i>()); - let ru = _mm_loadu_si128(r.as_ptr().add(x).cast::<__m128i>()); + let gu = endian::load_endian_u16x8::(g.as_ptr().add(x).cast::()); + let bu = endian::load_endian_u16x8::(b.as_ptr().add(x).cast::()); + let ru = endian::load_endian_u16x8::(r.as_ptr().add(x).cast::()); let mut g_buf = [0u16; 8]; let mut b_buf = [0u16; 8]; let mut r_buf = [0u16; 8]; @@ -1428,7 +1428,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( x += 8; } if x < width { - scalar_f16::gbrpf16_to_rgba_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar_f16::gbrpf16_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -1446,7 +1446,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( #[target_feature(enable = "avx2,f16c")] #[allow(clippy::too_many_arguments)] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( +pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1498,7 +1498,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( #[target_feature(enable = "avx2,f16c")] #[allow(clippy::too_many_arguments)] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1549,7 +1549,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( #[inline] #[target_feature(enable = "avx2,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( +pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1603,7 +1603,7 @@ pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( #[inline] #[target_feature(enable = "avx2,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1624,10 +1624,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast())); - let bv = 
_mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast())); - let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast())); - let av = _mm256_cvtph_ps(_mm_loadu_si128(a.as_ptr().add(x).cast())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let av = _mm256_cvtph_ps(endian::load_endian_u16x8::(a.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1665,7 +1665,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_row( + scalar::gbrapf32_to_rgba_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1690,7 +1690,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( #[inline] #[target_feature(enable = "avx2,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1711,10 +1711,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast())); - let bv = _mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast())); - let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast())); - let av = _mm256_cvtph_ps(_mm_loadu_si128(a.as_ptr().add(x).cast())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let av = _mm256_cvtph_ps(endian::load_endian_u16x8::(a.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1752,7 +1752,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1777,7 +1777,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( #[inline] #[target_feature(enable = "avx2,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1794,10 +1794,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(_mm_loadu_si128(g.as_ptr().add(x).cast())); - let bv = _mm256_cvtph_ps(_mm_loadu_si128(b.as_ptr().add(x).cast())); - let rv = _mm256_cvtph_ps(_mm_loadu_si128(r.as_ptr().add(x).cast())); - let av = _mm256_cvtph_ps(_mm_loadu_si128(a.as_ptr().add(x).cast())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let av = _mm256_cvtph_ps(endian::load_endian_u16x8::(a.as_ptr().add(x).cast::())); let mut gf = [0.0f32; 8]; let mut bf = [0.0f32; 8]; let mut rf = [0.0f32; 8]; @@ -1827,7 +1827,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( rf[i] = r[x 
+ i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_f32_row( + scalar::gbrapf32_to_rgba_f32_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1852,7 +1852,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1869,10 +1869,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( unsafe { let mut x = 0usize; while x + 8 <= width { - let gu = _mm_loadu_si128(g.as_ptr().add(x).cast::<__m128i>()); - let bu = _mm_loadu_si128(b.as_ptr().add(x).cast::<__m128i>()); - let ru = _mm_loadu_si128(r.as_ptr().add(x).cast::<__m128i>()); - let au = _mm_loadu_si128(a.as_ptr().add(x).cast::<__m128i>()); + let gu = endian::load_endian_u16x8::(g.as_ptr().add(x).cast::()); + let bu = endian::load_endian_u16x8::(b.as_ptr().add(x).cast::()); + let ru = endian::load_endian_u16x8::(r.as_ptr().add(x).cast::()); + let au = endian::load_endian_u16x8::(a.as_ptr().add(x).cast::()); let mut g_buf = [0u16; 8]; let mut b_buf = [0u16; 8]; let mut r_buf = [0u16; 8]; @@ -1892,7 +1892,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( x += 8; } if x < width { - scalar_f16::gbrapf16_to_rgba_f16_row( + scalar_f16::gbrapf16_to_rgba_f16_row::( &g[x..], &b[x..], &r[x..], diff --git a/src/row/arch/x86_avx2/tests/planar_gbr_float.rs b/src/row/arch/x86_avx2/tests/planar_gbr_float.rs index b122f8bf..2423247b 100644 --- a/src/row/arch/x86_avx2/tests/planar_gbr_float.rs +++ b/src/row/arch/x86_avx2/tests/planar_gbr_float.rs @@ -95,8 +95,8 @@ fn avx2_gbrpf32_to_rgb_matches_scalar() { prng_f32(&mut r, 0xA001_0003); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb width={w}"); } } @@ -117,8 +117,8 @@ fn avx2_gbrpf32_to_rgb_lane_order() { asym_ramp_f32(&mut g, &mut b, &mut r); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb lane-order width={w}"); } } @@ -140,8 +140,8 @@ fn avx2_gbrpf32_to_rgba_matches_scalar() { prng_f32(&mut r, 0xA002_0003); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba width={w}"); } } @@ -162,8 +162,8 @@ fn avx2_gbrpf32_to_rgba_lane_order() { asym_ramp_f32(&mut g, &mut b, &mut r); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_row::(&g, &b, &r, &mut simd, w) }; + 
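Aside (not part of the patch): the AVX2 hunks above lean on `endian::load_endian_u16x8`, whose definition is outside this excerpt. A minimal sketch of what such a loader presumably looks like, modeled on the NEON `load_endian_u16x4` and the AVX-512 `load_endian_u16x16` elsewhere in this patch; the mask constant, the function body, and the single `BE != host` test (instead of the patch's LE/BE function pair plus dispatcher) are illustrative assumptions:

```rust
use core::arch::x86_64::*;

/// `_mm_shuffle_epi8` mask that swaps bytes within each 2-byte (u16) lane.
const BYTESWAP_MASK_U16X8: __m128i =
    unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) };

/// Loads 8 × u16 (16 bytes) from `ptr`, byte-swapping only when the encoded
/// order (`BE`) differs from the host's endianness.
///
/// # Safety
///
/// `ptr` must point to at least 16 readable bytes; caller must have SSSE3
/// (implied by AVX2) enabled.
#[inline(always)]
unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> __m128i {
    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
    if BE != cfg!(target_endian = "big") {
        // Encoded order disagrees with host order: swap within every u16 lane.
        unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16X8) }
    } else {
        v
    }
}
```

Because `BE` is a const generic and `cfg!(target_endian)` is a compile-time constant, the branch folds away entirely, so the hot path is either a bare load or a load plus one shuffle.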
diff --git a/src/row/arch/x86_avx2/tests/planar_gbr_float.rs b/src/row/arch/x86_avx2/tests/planar_gbr_float.rs
index b122f8bf..2423247b 100644
--- a/src/row/arch/x86_avx2/tests/planar_gbr_float.rs
+++ b/src/row/arch/x86_avx2/tests/planar_gbr_float.rs
@@ -95,8 +95,8 @@ fn avx2_gbrpf32_to_rgb_matches_scalar() {
         prng_f32(&mut r, 0xA001_0003);
         let mut simd = std::vec![0u8; w * 3];
         let mut scal = std::vec![0u8; w * 3];
-        unsafe { gbrpf32_to_rgb_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgb_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgb width={w}");
     }
 }
@@ -117,8 +117,8 @@ fn avx2_gbrpf32_to_rgb_lane_order() {
         asym_ramp_f32(&mut g, &mut b, &mut r);
         let mut simd = std::vec![0u8; w * 3];
         let mut scal = std::vec![0u8; w * 3];
-        unsafe { gbrpf32_to_rgb_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgb_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgb lane-order width={w}");
     }
 }
@@ -140,8 +140,8 @@ fn avx2_gbrpf32_to_rgba_matches_scalar() {
         prng_f32(&mut r, 0xA002_0003);
         let mut simd = std::vec![0u8; w * 4];
         let mut scal = std::vec![0u8; w * 4];
-        unsafe { gbrpf32_to_rgba_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgba_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgba width={w}");
     }
 }
@@ -162,8 +162,8 @@ fn avx2_gbrpf32_to_rgba_lane_order() {
         asym_ramp_f32(&mut g, &mut b, &mut r);
         let mut simd = std::vec![0u8; w * 4];
         let mut scal = std::vec![0u8; w * 4];
-        unsafe { gbrpf32_to_rgba_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgba_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgba lane-order width={w}");
     }
 }
@@ -185,8 +185,8 @@ fn avx2_gbrpf32_to_rgb_u16_matches_scalar() {
         prng_f32(&mut r, 0xA003_0003);
         let mut simd = std::vec![0u16; w * 3];
         let mut scal = std::vec![0u16; w * 3];
-        unsafe { gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgb_u16 width={w}");
     }
 }
@@ -207,8 +207,8 @@ fn avx2_gbrpf32_to_rgb_u16_lane_order() {
         asym_ramp_f32(&mut g, &mut b, &mut r);
         let mut simd = std::vec![0u16; w * 3];
         let mut scal = std::vec![0u16; w * 3];
-        unsafe { gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgb_u16 lane-order width={w}");
     }
 }
@@ -230,8 +230,8 @@ fn avx2_gbrpf32_to_rgba_u16_matches_scalar() {
         prng_f32(&mut r, 0xA004_0003);
         let mut simd = std::vec![0u16; w * 4];
         let mut scal = std::vec![0u16; w * 4];
-        unsafe { gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgba_u16 width={w}");
     }
 }
@@ -252,8 +252,8 @@ fn avx2_gbrpf32_to_rgba_u16_lane_order() {
         asym_ramp_f32(&mut g, &mut b, &mut r);
         let mut simd = std::vec![0u16; w * 4];
         let mut scal = std::vec![0u16; w * 4];
-        unsafe { gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgba_u16 lane-order width={w}");
     }
 }
@@ -275,8 +275,8 @@ fn avx2_gbrpf32_to_rgb_f32_matches_scalar() {
         prng_f32(&mut r, 0xA005_0003);
         let mut simd = std::vec![0.0f32; w * 3];
         let mut scal = std::vec![0.0f32; w * 3];
-        unsafe { gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgb_f32 width={w}");
     }
 }
@@ -298,8 +298,8 @@ fn avx2_gbrpf32_to_rgba_f32_matches_scalar() {
         prng_f32(&mut r, 0xA006_0003);
         let mut simd = std::vec![0.0f32; w * 4];
         let mut scal = std::vec![0.0f32; w * 4];
-        unsafe { gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgba_f32 width={w}");
     }
 }
@@ -321,8 +321,8 @@ fn avx2_gbrpf32_to_rgb_f16_f16c_matches_scalar() {
         prng_f32(&mut r, 0xA007_0003);
         let mut simd = std::vec![half::f16::ZERO; w * 3];
         let mut scal = std::vec![half::f16::ZERO; w * 3];
-        unsafe { gbrpf32_to_rgb_f16_row_f16c(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgb_f16_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgb_f16 (F16C) width={w}");
     }
 }
@@ -343,8 +343,8 @@ fn avx2_gbrpf32_to_rgb_f16_lane_order() {
         asym_ramp_f32(&mut g, &mut b, &mut r);
         let mut simd = std::vec![half::f16::ZERO; w * 3];
         let mut scal = std::vec![half::f16::ZERO; w * 3];
-        unsafe { gbrpf32_to_rgb_f16_row_f16c(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgb_f16_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgb_f16 lane-order width={w}");
     }
 }
@@ -366,8 +366,8 @@ fn avx2_gbrpf32_to_rgba_f16_f16c_matches_scalar() {
         prng_f32(&mut r, 0xA008_0003);
         let mut simd = std::vec![half::f16::ZERO; w * 4];
         let mut scal = std::vec![half::f16::ZERO; w * 4];
-        unsafe { gbrpf32_to_rgba_f16_row_f16c(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgba_f16_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgba_f16 (F16C) width={w}");
     }
 }
@@ -388,8 +388,8 @@ fn avx2_gbrpf32_to_rgba_f16_lane_order() {
         asym_ramp_f32(&mut g, &mut b, &mut r);
         let mut simd = std::vec![half::f16::ZERO; w * 4];
         let mut scal = std::vec![half::f16::ZERO; w * 4];
-        unsafe { gbrpf32_to_rgba_f16_row_f16c(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf32_to_rgba_f16_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf32_to_rgba_f16 lane-order width={w}");
     }
 }
@@ -412,8 +412,8 @@ fn avx2_gbrpf32_to_luma_matches_scalar() {
         prng_f32(&mut r, 0xA009_0003);
         let mut simd = std::vec![0u8; w];
         let mut scal = std::vec![0u8; w];
-        unsafe { gbrpf32_to_luma_row(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) };
-        scalar::planar_gbr_float::gbrpf32_to_luma_row(
+        unsafe { gbrpf32_to_luma_row::<false>(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) };
+        scalar::planar_gbr_float::gbrpf32_to_luma_row::<false>(
             &g,
             &b,
             &r,
@@ -444,8 +444,8 @@ fn avx2_gbrpf32_to_luma_u16_matches_scalar() {
         prng_f32(&mut r, 0xA00A_0003);
         let mut simd = std::vec![0u16; w];
         let mut scal = std::vec![0u16; w];
-        unsafe { gbrpf32_to_luma_u16_row(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) };
-        scalar::planar_gbr_float::gbrpf32_to_luma_u16_row(
+        unsafe { gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) };
+        scalar::planar_gbr_float::gbrpf32_to_luma_u16_row::<false>(
            &g,
            &b,
            &r,
@@ -479,8 +479,8 @@ fn avx2_gbrpf32_to_hsv_matches_scalar() {
         let mut scal_h = std::vec![0u8; w];
         let mut scal_s = std::vec![0u8; w];
         let mut scal_v = std::vec![0u8; w];
-        unsafe { gbrpf32_to_hsv_row(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) };
-        scalar::planar_gbr_float::gbrpf32_to_hsv_row(
+        unsafe { gbrpf32_to_hsv_row::<false>(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) };
+        scalar::planar_gbr_float::gbrpf32_to_hsv_row::<false>(
             &g,
             &b,
             &r,
@@ -514,8 +514,8 @@ fn avx2_gbrapf32_to_rgba_matches_scalar() {
         prng_f32(&mut a, 0xA00C_0004);
         let mut simd = std::vec![0u8; w * 4];
         let mut scal = std::vec![0u8; w * 4];
-        unsafe { gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_float::gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf32_to_rgba width={w}");
     }
 }
@@ -537,8 +537,8 @@ fn avx2_gbrapf32_to_rgba_lane_order() {
         asym_ramp_f32_a(&mut g, &mut b, &mut r, &mut a);
         let mut simd = std::vec![0u8; w * 4];
         let mut scal = std::vec![0u8; w * 4];
-        unsafe { gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_float::gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf32_to_rgba lane-order width={w}");
     }
 }
@@ -562,8 +562,8 @@ fn avx2_gbrapf32_to_rgba_u16_matches_scalar() {
         prng_f32(&mut a, 0xA00D_0004);
         let mut simd = std::vec![0u16; w * 4];
         let mut scal = std::vec![0u16; w * 4];
-        unsafe { gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf32_to_rgba_u16 width={w}");
     }
 }
@@ -585,8 +585,8 @@ fn avx2_gbrapf32_to_rgba_u16_lane_order() {
         asym_ramp_f32_a(&mut g, &mut b, &mut r, &mut a);
         let mut simd = std::vec![0u16; w * 4];
         let mut scal = std::vec![0u16; w * 4];
-        unsafe { gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf32_to_rgba_u16 lane-order width={w}");
     }
 }
@@ -610,8 +610,8 @@ fn avx2_gbrapf32_to_rgba_f32_matches_scalar() {
         prng_f32(&mut a, 0xA00E_0004);
         let mut simd = std::vec![0.0f32; w * 4];
         let mut scal = std::vec![0.0f32; w * 4];
-        unsafe { gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf32_to_rgba_f32 width={w}");
     }
 }
@@ -635,8 +635,8 @@ fn avx2_gbrapf32_to_rgba_f16_f16c_matches_scalar() {
         prng_f32(&mut a, 0xA00F_0004);
         let mut simd = std::vec![half::f16::ZERO; w * 4];
         let mut scal = std::vec![half::f16::ZERO; w * 4];
-        unsafe { gbrapf32_to_rgba_f16_row_f16c(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf32_to_rgba_f16_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf32_to_rgba_f16 (F16C) width={w}");
     }
 }
@@ -658,8 +658,8 @@ fn avx2_gbrapf32_to_rgba_f16_lane_order() {
         asym_ramp_f32_a(&mut g, &mut b, &mut r, &mut a);
         let mut simd = std::vec![half::f16::ZERO; w * 4];
         let mut scal = std::vec![half::f16::ZERO; w * 4];
-        unsafe { gbrapf32_to_rgba_f16_row_f16c(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf32_to_rgba_f16_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf32_to_rgba_f16 lane-order width={w}");
     }
 }
@@ -681,11 +681,11 @@ fn avx2_gbrpf16_to_rgb_f16c_matches_scalar() {
         prng_f16(&mut r, 0xB001_0003);
         let mut simd = std::vec![0u8; w * 3];
         let mut scal = std::vec![0u8; w * 3];
-        unsafe { gbrpf16_to_rgb_row_f16c(&g, &b, &r, &mut simd, w) };
+        unsafe { gbrpf16_to_rgb_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_rgb_row(&gf, &bf, &rf, &mut scal, w);
+        scalar::planar_gbr_float::gbrpf32_to_rgb_row::<false>(&gf, &bf, &rf, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgb (F16C widen) width={w}");
     }
 }
@@ -706,11 +706,11 @@ fn avx2_gbrpf16_to_rgb_lane_order() {
         asym_ramp_f16(&mut g, &mut b, &mut r);
         let mut simd = std::vec![0u8; w * 3];
         let mut scal = std::vec![0u8; w * 3];
-        unsafe { gbrpf16_to_rgb_row_f16c(&g, &b, &r, &mut simd, w) };
+        unsafe { gbrpf16_to_rgb_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_rgb_row(&gf, &bf, &rf, &mut scal, w);
+        scalar::planar_gbr_float::gbrpf32_to_rgb_row::<false>(&gf, &bf, &rf, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgb lane-order width={w}");
     }
 }
@@ -732,11 +732,11 @@ fn avx2_gbrpf16_to_rgba_f16c_matches_scalar() {
         prng_f16(&mut r, 0xB002_0003);
         let mut simd = std::vec![0u8; w * 4];
         let mut scal = std::vec![0u8; w * 4];
-        unsafe { gbrpf16_to_rgba_row_f16c(&g, &b, &r, &mut simd, w) };
+        unsafe { gbrpf16_to_rgba_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_rgba_row(&gf, &bf, &rf, &mut scal, w);
+        scalar::planar_gbr_float::gbrpf32_to_rgba_row::<false>(&gf, &bf, &rf, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgba (F16C widen) width={w}");
     }
 }
@@ -757,11 +757,11 @@ fn avx2_gbrpf16_to_rgba_lane_order() {
         asym_ramp_f16(&mut g, &mut b, &mut r);
         let mut simd = std::vec![0u8; w * 4];
         let mut scal = std::vec![0u8; w * 4];
-        unsafe { gbrpf16_to_rgba_row_f16c(&g, &b, &r, &mut simd, w) };
+        unsafe { gbrpf16_to_rgba_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_rgba_row(&gf, &bf, &rf, &mut scal, w);
+        scalar::planar_gbr_float::gbrpf32_to_rgba_row::<false>(&gf, &bf, &rf, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgba lane-order width={w}");
     }
 }
@@ -783,11 +783,11 @@ fn avx2_gbrpf16_to_rgb_u16_f16c_matches_scalar() {
         prng_f16(&mut r, 0xB003_0003);
         let mut simd = std::vec![0u16; w * 3];
         let mut scal = std::vec![0u16; w * 3];
-        unsafe { gbrpf16_to_rgb_u16_row_f16c(&g, &b, &r, &mut simd, w) };
+        unsafe { gbrpf16_to_rgb_u16_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&gf, &bf, &rf, &mut scal, w);
+        scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::<false>(&gf, &bf, &rf, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgb_u16 (F16C widen) width={w}");
     }
 }
@@ -809,11 +809,11 @@ fn avx2_gbrpf16_to_rgba_u16_f16c_matches_scalar() {
         prng_f16(&mut r, 0xB004_0003);
         let mut simd = std::vec![0u16; w * 4];
         let mut scal = std::vec![0u16; w * 4];
-        unsafe { gbrpf16_to_rgba_u16_row_f16c(&g, &b, &r, &mut simd, w) };
+        unsafe { gbrpf16_to_rgba_u16_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&gf, &bf, &rf, &mut scal, w);
+        scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::<false>(&gf, &bf, &rf, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgba_u16 (F16C widen) width={w}");
     }
 }
@@ -835,11 +835,11 @@ fn avx2_gbrpf16_to_rgb_f32_f16c_matches_scalar() {
         prng_f16(&mut r, 0xB005_0003);
         let mut simd = std::vec![0.0f32; w * 3];
         let mut scal = std::vec![0.0f32; w * 3];
-        unsafe { gbrpf16_to_rgb_f32_row_f16c(&g, &b, &r, &mut simd, w) };
+        unsafe { gbrpf16_to_rgb_f32_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row(&gf, &bf, &rf, &mut scal, w);
+        scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row::<false>(&gf, &bf, &rf, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgb_f32 (F16C widen) width={w}");
     }
 }
@@ -861,11 +861,11 @@ fn avx2_gbrpf16_to_rgba_f32_f16c_matches_scalar() {
         prng_f16(&mut r, 0xB006_0003);
         let mut simd = std::vec![0.0f32; w * 4];
         let mut scal = std::vec![0.0f32; w * 4];
-        unsafe { gbrpf16_to_rgba_f32_row_f16c(&g, &b, &r, &mut simd, w) };
+        unsafe { gbrpf16_to_rgba_f32_row_f16c::<false>(&g, &b, &r, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row(&gf, &bf, &rf, &mut scal, w);
+        scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row::<false>(&gf, &bf, &rf, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgba_f32 (F16C widen) width={w}");
     }
 }
@@ -887,8 +887,8 @@ fn avx2_gbrpf16_to_rgb_f16_lossless_matches_scalar() {
         prng_f16(&mut r, 0xB007_0003);
         let mut simd = std::vec![half::f16::ZERO; w * 3];
         let mut scal = std::vec![half::f16::ZERO; w * 3];
-        unsafe { gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgb_f16 lossless width={w}");
     }
 }
@@ -909,8 +909,8 @@ fn avx2_gbrpf16_to_rgb_f16_lane_order() {
         asym_ramp_f16(&mut g, &mut b, &mut r);
         let mut simd = std::vec![half::f16::ZERO; w * 3];
         let mut scal = std::vec![half::f16::ZERO; w * 3];
-        unsafe { gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgb_f16 lane-order width={w}");
     }
 }
@@ -932,8 +932,8 @@ fn avx2_gbrpf16_to_rgba_f16_lossless_matches_scalar() {
         prng_f16(&mut r, 0xB008_0003);
         let mut simd = std::vec![half::f16::ZERO; w * 4];
         let mut scal = std::vec![half::f16::ZERO; w * 4];
-        unsafe { gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgba_f16 lossless width={w}");
     }
 }
@@ -954,8 +954,8 @@ fn avx2_gbrpf16_to_rgba_f16_lane_order() {
         asym_ramp_f16(&mut g, &mut b, &mut r);
         let mut simd = std::vec![half::f16::ZERO; w * 4];
         let mut scal = std::vec![half::f16::ZERO; w * 4];
-        unsafe { gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut simd, w) };
-        scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut scal, w);
+        unsafe { gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut simd, w) };
+        scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut scal, w);
         assert_eq!(simd, scal, "gbrpf16_to_rgba_f16 lane-order width={w}");
     }
 }
@@ -978,11 +978,11 @@ fn avx2_gbrpf16_to_luma_f16c_matches_scalar() {
         prng_f16(&mut r, 0xB009_0003);
         let mut simd = std::vec![0u8; w];
         let mut scal = std::vec![0u8; w];
-        unsafe { gbrpf16_to_luma_row_f16c(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) };
+        unsafe { gbrpf16_to_luma_row_f16c::<false>(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_luma_row(
+        scalar::planar_gbr_float::gbrpf32_to_luma_row::<false>(
             &gf,
             &bf,
             &rf,
@@ -1013,11 +1013,11 @@ fn avx2_gbrpf16_to_luma_u16_f16c_matches_scalar() {
         prng_f16(&mut r, 0xB00A_0003);
         let mut simd = std::vec![0u16; w];
         let mut scal = std::vec![0u16; w];
-        unsafe { gbrpf16_to_luma_u16_row_f16c(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) };
+        unsafe { gbrpf16_to_luma_u16_row_f16c::<false>(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_luma_u16_row(
+        scalar::planar_gbr_float::gbrpf32_to_luma_u16_row::<false>(
             &gf,
             &bf,
             &rf,
@@ -1051,11 +1051,11 @@ fn avx2_gbrpf16_to_hsv_f16c_matches_scalar() {
         let mut scal_h = std::vec![0u8; w];
         let mut scal_s = std::vec![0u8; w];
         let mut scal_v = std::vec![0u8; w];
-        unsafe { gbrpf16_to_hsv_row_f16c(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) };
+        unsafe { gbrpf16_to_hsv_row_f16c::<false>(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrpf32_to_hsv_row(
+        scalar::planar_gbr_float::gbrpf32_to_hsv_row::<false>(
             &gf,
             &bf,
             &rf,
@@ -1089,12 +1089,12 @@ fn avx2_gbrapf16_to_rgba_f16c_matches_scalar() {
         prng_f16(&mut a, 0xB00C_0004);
         let mut simd = std::vec![0u8; w * 4];
         let mut scal = std::vec![0u8; w * 4];
-        unsafe { gbrapf16_to_rgba_row_f16c(&g, &b, &r, &a, &mut simd, w) };
+        unsafe { gbrapf16_to_rgba_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
         let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrapf32_to_rgba_row(&gf, &bf, &rf, &af, &mut scal, w);
+        scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&gf, &bf, &rf, &af, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf16_to_rgba (F16C widen) width={w}");
     }
 }
@@ -1116,12 +1116,12 @@ fn avx2_gbrapf16_to_rgba_lane_order() {
         asym_ramp_f16_a(&mut g, &mut b, &mut r, &mut a);
         let mut simd = std::vec![0u8; w * 4];
         let mut scal = std::vec![0u8; w * 4];
-        unsafe { gbrapf16_to_rgba_row_f16c(&g, &b, &r, &a, &mut simd, w) };
+        unsafe { gbrapf16_to_rgba_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
         let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrapf32_to_rgba_row(&gf, &bf, &rf, &af, &mut scal, w);
+        scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&gf, &bf, &rf, &af, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf16_to_rgba lane-order width={w}");
     }
 }
@@ -1145,12 +1145,12 @@ fn avx2_gbrapf16_to_rgba_u16_f16c_matches_scalar() {
         prng_f16(&mut a, 0xB00D_0004);
         let mut simd = std::vec![0u16; w * 4];
         let mut scal = std::vec![0u16; w * 4];
-        unsafe { gbrapf16_to_rgba_u16_row_f16c(&g, &b, &r, &a, &mut simd, w) };
+        unsafe { gbrapf16_to_rgba_u16_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
         let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&gf, &bf, &rf, &af, &mut scal, w);
+        scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::<false>(&gf, &bf, &rf, &af, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf16_to_rgba_u16 (F16C widen) width={w}");
     }
 }
@@ -1174,12 +1174,12 @@ fn avx2_gbrapf16_to_rgba_f32_f16c_matches_scalar() {
         prng_f16(&mut a, 0xB00E_0004);
         let mut simd = std::vec![0.0f32; w * 4];
         let mut scal = std::vec![0.0f32; w * 4];
-        unsafe { gbrapf16_to_rgba_f32_row_f16c(&g, &b, &r, &a, &mut simd, w) };
+        unsafe { gbrapf16_to_rgba_f32_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) };
         let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect();
         let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect();
         let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect();
         let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect();
-        scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row(&gf, &bf, &rf, &af, &mut scal, w);
+        scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row::<false>(&gf, &bf, &rf, &af, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf16_to_rgba_f32 (F16C widen) width={w}");
     }
 }
@@ -1203,8 +1203,8 @@ fn avx2_gbrapf16_to_rgba_f16_lossless_matches_scalar() {
         prng_f16(&mut a, 0xB00F_0004);
         let mut simd = std::vec![half::f16::ZERO; w * 4];
         let mut scal = std::vec![half::f16::ZERO; w * 4];
-        unsafe { gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf16_to_rgba_f16 lossless width={w}");
     }
 }
@@ -1226,8 +1226,125 @@ fn avx2_gbrapf16_to_rgba_f16_lane_order() {
         asym_ramp_f16_a(&mut g, &mut b, &mut r, &mut a);
         let mut simd = std::vec![half::f16::ZERO; w * 4];
         let mut scal = std::vec![half::f16::ZERO; w * 4];
-        unsafe { gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut simd, w) };
-        scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w);
+        unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut simd, w) };
+        scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut scal, w);
         assert_eq!(simd, scal, "gbrapf16_to_rgba_f16 lane-order width={w}");
     }
 }
+
+// ---- BE parity helpers -------------------------------------------------------
+
+fn be_encode_f32(src: &[f32]) -> std::vec::Vec<f32> {
+    src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect()
+}
+
+fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec<half::f16> {
+    src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect()
+}
+
+// ---- BE parity: Gbrpf32 → u8 RGB -------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "AVX2 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbrpf32_to_rgb_be_parity() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    for &w in WIDTHS {
+        let mut g = std::vec![0.0f32; w];
+        let mut b = std::vec![0.0f32; w];
+        let mut r = std::vec![0.0f32; w];
+        prng_f32(&mut g, 0xBE01_0001);
+        prng_f32(&mut b, 0xBE01_0002);
+        prng_f32(&mut r, 0xBE01_0003);
+        let mut le_out = std::vec![0u8; w * 3];
+        let mut be_out = std::vec![0u8; w * 3];
+        unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut le_out, w); }
+        let g_be = be_encode_f32(&g);
+        let b_be = be_encode_f32(&b);
+        let r_be = be_encode_f32(&r);
+        unsafe { gbrpf32_to_rgb_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); }
+        assert_eq!(le_out, be_out, "gbrpf32_to_rgb BE parity width={w}");
+    }
+}
+
+// ---- BE parity: Gbrpf32 → u8 RGBA ------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "AVX2 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbrpf32_to_rgba_be_parity() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    for &w in WIDTHS {
+        let mut g = std::vec![0.0f32; w];
+        let mut b = std::vec![0.0f32; w];
+        let mut r = std::vec![0.0f32; w];
+        prng_f32(&mut g, 0xBE02_0001);
+        prng_f32(&mut b, 0xBE02_0002);
+        prng_f32(&mut r, 0xBE02_0003);
+        let mut le_out = std::vec![0u8; w * 4];
+        let mut be_out = std::vec![0u8; w * 4];
+        unsafe { gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut le_out, w); }
+        let g_be = be_encode_f32(&g);
+        let b_be = be_encode_f32(&b);
+        let r_be = be_encode_f32(&r);
+        unsafe { gbrpf32_to_rgba_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); }
+        assert_eq!(le_out, be_out, "gbrpf32_to_rgba BE parity width={w}");
+    }
+}
+
+// ---- BE parity: Gbrpf16 → f16 RGB (lossless) --------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "AVX2 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbrpf16_to_rgb_f16_be_parity() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    for &w in WIDTHS {
+        let mut g = std::vec![half::f16::ZERO; w];
+        let mut b = std::vec![half::f16::ZERO; w];
+        let mut r = std::vec![half::f16::ZERO; w];
+        prng_f16(&mut g, 0xBE07_0001);
+        prng_f16(&mut b, 0xBE07_0002);
+        prng_f16(&mut r, 0xBE07_0003);
+        let mut le_out = std::vec![half::f16::ZERO; w * 3];
+        let mut be_out = std::vec![half::f16::ZERO; w * 3];
+        unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut le_out, w); }
+        let g_be = be_encode_f16(&g);
+        let b_be = be_encode_f16(&b);
+        let r_be = be_encode_f16(&r);
+        unsafe { gbrpf16_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); }
+        assert_eq!(le_out, be_out, "gbrpf16_to_rgb_f16 BE parity width={w}");
+    }
+}
+
+// ---- BE parity: Gbrapf16 → f16 RGBA (lossless) ------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "AVX2 SIMD intrinsics unsupported by Miri")]
+fn avx2_gbrapf16_to_rgba_f16_be_parity() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    for &w in WIDTHS {
+        let mut g = std::vec![half::f16::ZERO; w];
+        let mut b = std::vec![half::f16::ZERO; w];
+        let mut r = std::vec![half::f16::ZERO; w];
+        let mut a = std::vec![half::f16::ZERO; w];
+        prng_f16(&mut g, 0xBE0F_0001);
+        prng_f16(&mut b, 0xBE0F_0002);
+        prng_f16(&mut r, 0xBE0F_0003);
+        prng_f16(&mut a, 0xBE0F_0004);
+        let mut le_out = std::vec![half::f16::ZERO; w * 4];
+        let mut be_out = std::vec![half::f16::ZERO; w * 4];
+        unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, w); }
+        let g_be = be_encode_f16(&g);
+        let b_be = be_encode_f16(&b);
+        let r_be = be_encode_f16(&r);
+        let a_be = be_encode_f16(&a);
+        unsafe { gbrapf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); }
+        assert_eq!(le_out, be_out, "gbrapf16_to_rgba_f16 BE parity width={w}");
+    }
+}
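Aside (not part of the patch): the parity scheme above works because `be_encode_f32` / `be_encode_f16` are exact bitwise involutions, so `kernel::<true>` over swapped planes must reproduce `kernel::<false>` over the originals byte for byte. A tiny self-contained check of that property; the helper is copied from the patch, the `main` wrapper and the bit-level comparison are illustrative:

```rust
fn be_encode_f32(src: &[f32]) -> Vec<f32> {
    src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect()
}

fn main() {
    let le = [0.25f32, 0.5, 1.0];
    // Swapping twice must recover the original encoding exactly.
    let back = be_encode_f32(&be_encode_f32(&le));
    // Compare raw bits rather than float values: swapped bytes can form NaN
    // patterns, and NaN != NaN under float equality.
    assert_eq!(
        le.iter().map(|v| v.to_bits()).collect::<Vec<_>>(),
        back.iter().map(|v| v.to_bits()).collect::<Vec<_>>(),
    );
}
```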
diff --git a/src/row/arch/x86_avx512/endian.rs b/src/row/arch/x86_avx512/endian.rs
index b886d90a..99bcffaf 100644
--- a/src/row/arch/x86_avx512/endian.rs
+++ b/src/row/arch/x86_avx512/endian.rs
@@ -89,6 +89,67 @@ pub(crate) unsafe fn load_endian_u16x32<const BE: bool>(ptr: *const u8) -> __m512i {
     }
 }
 
+// ---- u16x16 loaders (via _mm256_loadu_si256, for f16 widening) -------------
+//
+// AVX-512 kernels widen 16 × f16 using `_mm512_cvtph_ps(__m256i)`, which
+// requires a 256-bit lane load. The helpers below provide endian-aware
+// loading of that 32-byte (16 × u16) block.
+
+/// AVX2 `_mm256_shuffle_epi8` mask that swaps bytes within every 2-byte (u16)
+/// lane across both 128-bit halves.
+const BYTESWAP_MASK_U16X16: __m256i = unsafe {
+    core::mem::transmute([
+        // low 128-bit lane
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+        // high 128-bit lane
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+    ])
+};
+
+/// Loads 16 × u16 (32 bytes) from `ptr` (LE-encoded) into a `__m256i`,
+/// host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// (implied by AVX-512) enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x16(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U16X16) };
+    v
+}
+
+/// Loads 16 × u16 (32 bytes) from `ptr` (BE-encoded) into a `__m256i`,
+/// host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// (implied by AVX-512) enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x16(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U16X16) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x16` or `load_be_u16x16`.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x16` / `load_be_u16x16`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x16<const BE: bool>(ptr: *const u8) -> __m256i {
+    if BE {
+        unsafe { load_be_u16x16(ptr) }
+    } else {
+        unsafe { load_le_u16x16(ptr) }
+    }
+}
+
 // ---- u32x16 loaders --------------------------------------------------------
 
 /// Loads 16 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
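Aside (not part of the patch): `_mm256_shuffle_epi8` shuffles each 128-bit half of the vector independently, which is why `BYTESWAP_MASK_U16X16` repeats the same 16-byte pattern for both halves. A scalar model of the single-lane behaviour, illustrative only:

```rust
/// Scalar model of `_mm_shuffle_epi8` / one 128-bit lane of
/// `_mm256_shuffle_epi8`: each output byte selects an input byte by the low
/// 4 bits of the mask byte (a set high bit would zero the byte instead; the
/// byteswap mask never sets it).
fn shuffle_epi8_model(lane: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = lane[(mask[i] & 0x0F) as usize];
    }
    out
}

fn main() {
    let mask = [1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14];
    let data: [u8; 16] = core::array::from_fn(|i| i as u8);
    // Bytes are swapped pairwise, i.e. within every u16 lane.
    assert_eq!(shuffle_epi8_model(data, mask)[..4], [1, 0, 3, 2]);
}
```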
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrpf32_to_rgba_row( +pub(crate) unsafe fn gbrpf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -155,9 +155,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm512_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm512_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); let g8 = _mm512_cvtusepi32_epi8(scale_round_i32(gv, scale)); let b8 = _mm512_cvtusepi32_epi8(scale_round_i32(bv, scale)); let r8 = _mm512_cvtusepi32_epi8(scale_round_i32(rv, scale)); @@ -177,7 +177,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( x += 16; } if x < width { - scalar::gbrpf32_to_rgba_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -193,7 +193,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -212,9 +212,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm512_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm512_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); let gw = _mm512_cvtusepi32_epi16(scale_round_i32(gv, scale)); let bw = _mm512_cvtusepi32_epi16(scale_round_i32(bv, scale)); let rw = _mm512_cvtusepi32_epi16(scale_round_i32(rv, scale)); @@ -233,7 +233,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( x += 16; } if x < width { - scalar::gbrpf32_to_rgb_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -249,7 +249,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -268,9 +268,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm512_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm512_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); let gw = _mm512_cvtusepi32_epi16(scale_round_i32(gv, scale)); let bw = _mm512_cvtusepi32_epi16(scale_round_i32(bv, scale)); let rw = _mm512_cvtusepi32_epi16(scale_round_i32(rv, scale)); @@ -290,7 +290,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( x += 16; } if x < width { - scalar::gbrpf32_to_rgba_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -310,7 +310,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -322,7 +322,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( debug_assert!(r.len() >= width, "r row too short"); debug_assert!(out.len() >= width * 3, "out row too short"); - scalar::gbrpf32_to_rgb_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f32 RGBA (lossless, α = 1.0) --------------------------------- @@ -339,7 +339,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -351,7 +351,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( debug_assert!(r.len() >= width, "r row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar::gbrpf32_to_rgba_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f16 RGB (F16C narrow) ---------------------------------------- @@ -369,7 +369,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( /// 3. `out.len()` ≥ `3 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( +pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( g: &[f32], b: &[f32], r: &[f32], @@ -384,9 +384,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_loadu_ps(g.as_ptr().add(x)); - let bv = _mm512_loadu_ps(b.as_ptr().add(x)); - let rv = _mm512_loadu_ps(r.as_ptr().add(x)); + let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())); + let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())); + let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())); // F16C narrow: IEEE-754 round-to-nearest-even (NOT round-half-up). let gh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); @@ -408,7 +408,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( x += 16; } if x < width { - scalar::gbrpf32_to_rgb_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -425,7 +425,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( +pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( g: &[f32], b: &[f32], r: &[f32], @@ -440,9 +440,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_loadu_ps(g.as_ptr().add(x)); - let bv = _mm512_loadu_ps(b.as_ptr().add(x)); - let rv = _mm512_loadu_ps(r.as_ptr().add(x)); + let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())); + let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())); + let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())); let gh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -463,7 +463,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( x += 16; } if x < width { - scalar::gbrpf32_to_rgba_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -480,7 +480,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_row( +pub(crate) unsafe fn gbrpf32_to_luma_row( g: &[f32], b: &[f32], r: &[f32], @@ -500,7 +500,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -531,7 +531,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_u16_row( +pub(crate) unsafe fn gbrpf32_to_luma_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -551,7 +551,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -581,7 +581,7 
@@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( /// 3. `h_out.len()`, `s_out.len()`, `v_out.len()` ≥ `width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrpf32_to_hsv_row( +pub(crate) unsafe fn gbrpf32_to_hsv_row( g: &[f32], b: &[f32], r: &[f32], @@ -603,7 +603,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -634,7 +634,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrapf32_to_rgba_row( +pub(crate) unsafe fn gbrapf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -655,10 +655,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm512_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm512_loadu_ps(r.as_ptr().add(x)), zero, one); - let av = clamp01(_mm512_loadu_ps(a.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); + let av = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(a.as_ptr().add(x).cast::())), zero, one); let g8 = _mm512_cvtusepi32_epi8(scale_round_i32(gv, scale)); let b8 = _mm512_cvtusepi32_epi8(scale_round_i32(bv, scale)); let r8 = _mm512_cvtusepi32_epi8(scale_round_i32(rv, scale)); @@ -681,7 +681,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( x += 16; } if x < width { - scalar::gbrapf32_to_rgba_row( + scalar::gbrapf32_to_rgba_row::( &g[x..], &b[x..], &r[x..], @@ -705,7 +705,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( /// 3. `out.len()` ≥ `4 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -726,10 +726,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm512_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm512_loadu_ps(r.as_ptr().add(x)), zero, one); - let av = clamp01(_mm512_loadu_ps(a.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); + let av = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(a.as_ptr().add(x).cast::())), zero, one); let gw = _mm512_cvtusepi32_epi16(scale_round_i32(gv, scale)); let bw = _mm512_cvtusepi32_epi16(scale_round_i32(bv, scale)); let rw = _mm512_cvtusepi32_epi16(scale_round_i32(rv, scale)); @@ -752,7 +752,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( x += 16; } if x < width { - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::( &g[x..], &b[x..], &r[x..], @@ -778,7 +778,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -792,7 +792,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( debug_assert!(a.len() >= width, "a row too short"); debug_assert!(out.len() >= width * 4, "out row too short"); - scalar::gbrapf32_to_rgba_f32_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f32_row::(g, b, r, a, out, width); } // ---- Gbrapf32 → f16 RGBA (F16C narrow, source α) ---------------------------- @@ -807,7 +807,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( +pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( g: &[f32], b: &[f32], r: &[f32], @@ -824,10 +824,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_loadu_ps(g.as_ptr().add(x)); - let bv = _mm512_loadu_ps(b.as_ptr().add(x)); - let rv = _mm512_loadu_ps(r.as_ptr().add(x)); - let av = _mm512_loadu_ps(a.as_ptr().add(x)); + let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())); + let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())); + let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())); + let av = _mm512_castsi512_ps(endian::load_endian_u32x16::(a.as_ptr().add(x).cast::())); let gh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -851,7 +851,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( x += 16; } if x < width { - scalar::gbrapf32_to_rgba_f16_row( + scalar::gbrapf32_to_rgba_f16_row::( &g[x..], &b[x..], &r[x..], @@ -875,7 +875,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( /// 3. 
`out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -895,9 +895,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( let mut x = 0usize; while x + 16 <= width { // Load 16 f16 lanes (32 bytes) per plane and widen to f32x16. - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -929,7 +929,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_row( + scalar::gbrpf32_to_rgb_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -952,7 +952,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] -pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -971,9 +971,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1005,7 +1005,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_row( + scalar::gbrpf32_to_rgba_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1029,7 +1029,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1048,9 +1048,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1081,7 +1081,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( bf[i]
= b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_u16_row( + scalar::gbrpf32_to_rgb_u16_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1105,7 +1105,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1124,9 +1124,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1158,7 +1158,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_u16_row( + scalar::gbrpf32_to_rgba_u16_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1182,7 +1182,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1197,9 +1197,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); // No 3-channel interleave intrinsic in AVX-512 — scatter via scalar loop.
let mut gf = [0.0f32; 16]; let mut bf = [0.0f32; 16]; @@ -1225,7 +1225,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_f32_row( + scalar::gbrpf32_to_rgb_f32_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1249,7 +1249,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1264,9 +1264,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); let mut gf = [0.0f32; 16]; let mut bf = [0.0f32; 16]; let mut rf = [0.0f32; 16]; @@ -1292,7 +1292,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_f32_row( + scalar::gbrpf32_to_rgba_f32_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1319,7 +1319,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgb_f16_row<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1335,9 +1335,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( let mut x = 0usize; while x + 16 <= width { // Load 16 × u16 (32 bytes) per plane. - let gu = _mm256_loadu_si256(g.as_ptr().add(x).cast::<__m256i>()); - let bu = _mm256_loadu_si256(b.as_ptr().add(x).cast::<__m256i>()); - let ru = _mm256_loadu_si256(r.as_ptr().add(x).cast::<__m256i>()); + let gu = endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>()); + let bu = endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>()); + let ru = endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>()); let mut g_buf = [0u16; 16]; let mut b_buf = [0u16; 16]; let mut r_buf = [0u16; 16]; @@ -1354,7 +1354,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( x += 16; } if x < width { - scalar_f16::gbrpf16_to_rgb_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar_f16::gbrpf16_to_rgb_f16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -1373,7 +1373,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( /// 3. `out.len()` ≥ `4 * width`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgba_f16_row<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1388,9 +1388,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( unsafe { let mut x = 0usize; while x + 16 <= width { - let gu = _mm256_loadu_si256(g.as_ptr().add(x).cast::<__m256i>()); - let bu = _mm256_loadu_si256(b.as_ptr().add(x).cast::<__m256i>()); - let ru = _mm256_loadu_si256(r.as_ptr().add(x).cast::<__m256i>()); + let gu = endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>()); + let bu = endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>()); + let ru = endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>()); let mut g_buf = [0u16; 16]; let mut b_buf = [0u16; 16]; let mut r_buf = [0u16; 16]; @@ -1408,7 +1408,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( x += 16; } if x < width { - scalar_f16::gbrpf16_to_rgba_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar_f16::gbrpf16_to_rgba_f16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -1426,7 +1426,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(clippy::too_many_arguments)] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( +pub(crate) unsafe fn gbrpf16_to_luma_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1478,7 +1478,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(clippy::too_many_arguments)] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1529,7 +1529,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( +pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1583,7 +1583,7 @@ pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1604,10 +1604,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); - let av = _mm512_cvtph_ps(_mm256_loadu_si256(a.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); + let av = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(a.as_ptr().add(x).cast::<u8>())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1645,7 +1645,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_row( +
scalar::gbrapf32_to_rgba_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1670,7 +1670,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1691,10 +1691,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); - let av = _mm512_cvtph_ps(_mm256_loadu_si256(a.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); + let av = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(a.as_ptr().add(x).cast::<u8>())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1732,7 +1732,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1757,7 +1757,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( #[inline] #[target_feature(enable = "avx512f,avx512bw,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1774,10 +1774,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(_mm256_loadu_si256(g.as_ptr().add(x).cast())); - let bv = _mm512_cvtph_ps(_mm256_loadu_si256(b.as_ptr().add(x).cast())); - let rv = _mm512_cvtph_ps(_mm256_loadu_si256(r.as_ptr().add(x).cast())); - let av = _mm512_cvtph_ps(_mm256_loadu_si256(a.as_ptr().add(x).cast())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>())); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>())); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>())); + let av = _mm512_cvtph_ps(endian::load_endian_u16x16::<BE>(a.as_ptr().add(x).cast::<u8>())); let mut gf = [0.0f32; 16]; let mut bf = [0.0f32; 16]; let mut rf = [0.0f32; 16]; @@ -1807,7 +1807,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_f32_row( + scalar::gbrapf32_to_rgba_f32_row::<false>( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1832,7 +1832,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( /// 3. `out.len()` ≥ `4 * width`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrapf16_to_rgba_f16_row<const BE: bool>( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1849,10 +1849,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( unsafe { let mut x = 0usize; while x + 16 <= width { - let gu = _mm256_loadu_si256(g.as_ptr().add(x).cast::<__m256i>()); - let bu = _mm256_loadu_si256(b.as_ptr().add(x).cast::<__m256i>()); - let ru = _mm256_loadu_si256(r.as_ptr().add(x).cast::<__m256i>()); - let au = _mm256_loadu_si256(a.as_ptr().add(x).cast::<__m256i>()); + let gu = endian::load_endian_u16x16::<BE>(g.as_ptr().add(x).cast::<u8>()); + let bu = endian::load_endian_u16x16::<BE>(b.as_ptr().add(x).cast::<u8>()); + let ru = endian::load_endian_u16x16::<BE>(r.as_ptr().add(x).cast::<u8>()); + let au = endian::load_endian_u16x16::<BE>(a.as_ptr().add(x).cast::<u8>()); let mut g_buf = [0u16; 16]; let mut b_buf = [0u16; 16]; let mut r_buf = [0u16; 16]; @@ -1872,7 +1872,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( x += 16; } if x < width { - scalar_f16::gbrapf16_to_rgba_f16_row( + scalar_f16::gbrapf16_to_rgba_f16_row::<BE>( &g[x..], &b[x..], &r[x..], diff --git a/src/row/arch/x86_avx512/tests/planar_gbr_float.rs b/src/row/arch/x86_avx512/tests/planar_gbr_float.rs index 6a5e7561..6650d80b 100644 --- a/src/row/arch/x86_avx512/tests/planar_gbr_float.rs +++ b/src/row/arch/x86_avx512/tests/planar_gbr_float.rs @@ -98,8 +98,8 @@ fn avx512_gbrpf32_to_rgb_matches_scalar() { prng_f32(&mut r, 0xC001_0003); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb width={w}"); } } @@ -120,8 +120,8 @@ fn avx512_gbrpf32_to_rgb_lane_order() { asym_ramp_f32(&mut g, &mut b, &mut r); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb lane-order width={w}"); } } @@ -143,8 +143,8 @@ fn avx512_gbrpf32_to_rgba_matches_scalar() { prng_f32(&mut r, 0xC002_0003); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba width={w}"); } } @@ -165,8 +165,8 @@ fn avx512_gbrpf32_to_rgba_lane_order() { asym_ramp_f32(&mut g, &mut b, &mut r); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba lane-order width={w}"); } } @@ -188,8 +188,8 @@ fn avx512_gbrpf32_to_rgb_u16_matches_scalar() { prng_f32(&mut r, 0xC003_0003); let
mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_u16 width={w}"); } } @@ -210,8 +210,8 @@ fn avx512_gbrpf32_to_rgb_u16_lane_order() { asym_ramp_f32(&mut g, &mut b, &mut r); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_u16 lane-order width={w}"); } } @@ -233,8 +233,8 @@ fn avx512_gbrpf32_to_rgba_u16_matches_scalar() { prng_f32(&mut r, 0xC004_0003); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_u16 width={w}"); } } @@ -255,8 +255,8 @@ fn avx512_gbrpf32_to_rgba_u16_lane_order() { asym_ramp_f32(&mut g, &mut b, &mut r); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_u16 lane-order width={w}"); } } @@ -278,8 +278,8 @@ fn avx512_gbrpf32_to_rgb_f32_matches_scalar() { prng_f32(&mut r, 0xC005_0003); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_f32 width={w}"); } } @@ -301,8 +301,8 @@ fn avx512_gbrpf32_to_rgba_f32_matches_scalar() { prng_f32(&mut r, 0xC006_0003); let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; - unsafe { gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_f32 width={w}"); } } @@ -326,8 +326,8 @@ fn avx512_gbrpf32_to_rgb_f16_f16c_matches_scalar() { prng_f32(&mut r, 0xC007_0003); let mut simd = std::vec![half::f16::ZERO; w * 3]; let mut scal = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf32_to_rgb_f16_row_f16c(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_f16_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; +
scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_f16 (F16C) width={w}"); } } @@ -350,8 +350,8 @@ fn avx512_gbrpf32_to_rgb_f16_lane_order() { asym_ramp_f32(&mut g, &mut b, &mut r); let mut simd = std::vec![half::f16::ZERO; w * 3]; let mut scal = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf32_to_rgb_f16_row_f16c(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_f16_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_f16 lane-order width={w}"); } } @@ -375,8 +375,8 @@ fn avx512_gbrpf32_to_rgba_f16_f16c_matches_scalar() { prng_f32(&mut r, 0xC008_0003); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrpf32_to_rgba_f16_row_f16c(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_f16_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_f16 (F16C) width={w}"); } } @@ -399,8 +399,8 @@ fn avx512_gbrpf32_to_rgba_f16_lane_order() { asym_ramp_f32(&mut g, &mut b, &mut r); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrpf32_to_rgba_f16_row_f16c(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_f16_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_f16 lane-order width={w}"); } } @@ -423,8 +423,8 @@ fn avx512_gbrpf32_to_luma_matches_scalar() { prng_f32(&mut r, 0xC009_0003); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { gbrpf32_to_luma_row(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; - scalar::planar_gbr_float::gbrpf32_to_luma_row( + unsafe { gbrpf32_to_luma_row::<false>(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + scalar::planar_gbr_float::gbrpf32_to_luma_row::<false>( &g, &b, &r, @@ -455,8 +455,8 @@ fn avx512_gbrpf32_to_luma_u16_matches_scalar() { prng_f32(&mut r, 0xC00A_0003); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { gbrpf32_to_luma_u16_row(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; - scalar::planar_gbr_float::gbrpf32_to_luma_u16_row( + unsafe { gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + scalar::planar_gbr_float::gbrpf32_to_luma_u16_row::<false>( &g, &b, &r, @@ -490,8 +490,8 @@ fn avx512_gbrpf32_to_hsv_matches_scalar() { let mut scal_h = std::vec![0u8; w]; let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; - unsafe { gbrpf32_to_hsv_row(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; - scalar::planar_gbr_float::gbrpf32_to_hsv_row( + unsafe { gbrpf32_to_hsv_row::<false>(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; + scalar::planar_gbr_float::gbrpf32_to_hsv_row::<false>( &g, &b, &r, @@ -525,8 +525,8 @@ fn avx512_gbrapf32_to_rgba_matches_scalar() { prng_f32(&mut a, 0xC00C_0004); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut
scal, w); + unsafe { gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba width={w}"); } } @@ -548,8 +548,8 @@ fn avx512_gbrapf32_to_rgba_lane_order() { asym_ramp_f32_a(&mut g, &mut b, &mut r, &mut a); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba lane-order width={w}"); } } @@ -573,8 +573,8 @@ fn avx512_gbrapf32_to_rgba_u16_matches_scalar() { prng_f32(&mut a, 0xC00D_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_u16 width={w}"); } } @@ -596,8 +596,8 @@ fn avx512_gbrapf32_to_rgba_u16_lane_order() { asym_ramp_f32_a(&mut g, &mut b, &mut r, &mut a); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_u16 lane-order width={w}"); } } @@ -621,8 +621,8 @@ fn avx512_gbrapf32_to_rgba_f32_matches_scalar() { prng_f32(&mut a, 0xC00E_0004); let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; - unsafe { gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_f32 width={w}"); } } @@ -648,8 +648,8 @@ fn avx512_gbrapf32_to_rgba_f16_f16c_matches_scalar() { prng_f32(&mut a, 0xC00F_0004); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf32_to_rgba_f16_row_f16c(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_f16_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_f16 (F16C) width={w}"); } } @@ -673,8 +673,8 @@ fn avx512_gbrapf32_to_rgba_f16_lane_order() { asym_ramp_f32_a(&mut g, &mut b, &mut r, &mut a); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf32_to_rgba_f16_row_f16c(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_f16_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row::<false>(&g,
&b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_f16 lane-order width={w}"); } } @@ -698,11 +698,11 @@ fn avx512_gbrpf16_to_rgb_f16c_matches_scalar() { prng_f16(&mut r, 0xD001_0003); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { gbrpf16_to_rgb_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgb_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_row::<false>(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb (F16C widen) width={w}"); } } @@ -725,11 +725,11 @@ fn avx512_gbrpf16_to_rgb_lane_order() { asym_ramp_f16(&mut g, &mut b, &mut r); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { gbrpf16_to_rgb_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgb_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_row::<false>(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb lane-order width={w}"); } } @@ -753,11 +753,11 @@ fn avx512_gbrpf16_to_rgba_f16c_matches_scalar() { prng_f16(&mut r, 0xD002_0003); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrpf16_to_rgba_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgba_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_row::<false>(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba (F16C widen) width={w}"); } } @@ -780,11 +780,11 @@ fn avx512_gbrpf16_to_rgba_lane_order() { asym_ramp_f16(&mut g, &mut b, &mut r); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrpf16_to_rgba_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgba_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_row::<false>(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba lane-order width={w}"); } } @@ -808,11 +808,11 @@ fn avx512_gbrpf16_to_rgb_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xD003_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { gbrpf16_to_rgb_u16_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgb_u16_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); -
scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::<false>(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_u16 (F16C widen) width={w}"); } } @@ -836,11 +836,11 @@ fn avx512_gbrpf16_to_rgba_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xD004_0003); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrpf16_to_rgba_u16_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgba_u16_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::<false>(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_u16 (F16C widen) width={w}"); } } @@ -864,11 +864,11 @@ fn avx512_gbrpf16_to_rgb_f32_f16c_matches_scalar() { prng_f16(&mut r, 0xD005_0003); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { gbrpf16_to_rgb_f32_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgb_f32_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row::<false>(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_f32 (F16C widen) width={w}"); } } @@ -892,11 +892,11 @@ fn avx512_gbrpf16_to_rgba_f32_f16c_matches_scalar() { prng_f16(&mut r, 0xD006_0003); let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; - unsafe { gbrpf16_to_rgba_f32_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgba_f32_row_f16c::<false>(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row::<false>(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_f32 (F16C widen) width={w}"); } } @@ -918,8 +918,8 @@ fn avx512_gbrpf16_to_rgb_f16_lossless_matches_scalar() { prng_f16(&mut r, 0xD007_0003); let mut simd = std::vec![half::f16::ZERO; w * 3]; let mut scal = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_f16 lossless width={w}"); } } @@ -940,8 +940,8 @@ fn avx512_gbrpf16_to_rgb_f16_lane_order() { asym_ramp_f16(&mut g, &mut b, &mut r); let mut simd = std::vec![half::f16::ZERO; w * 3]; let mut scal = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut scal, w);
assert_eq!(simd, scal, "gbrpf16_to_rgb_f16 lane-order width={w}"); } } @@ -963,8 +963,8 @@ fn avx512_gbrpf16_to_rgba_f16_lossless_matches_scalar() { prng_f16(&mut r, 0xD008_0003); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_f16 lossless width={w}"); } } @@ -985,8 +985,8 @@ fn avx512_gbrpf16_to_rgba_f16_lane_order() { asym_ramp_f16(&mut g, &mut b, &mut r); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_f16 lane-order width={w}"); } } @@ -1011,11 +1011,11 @@ fn avx512_gbrpf16_to_luma_f16c_matches_scalar() { prng_f16(&mut r, 0xD009_0003); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { gbrpf16_to_luma_row_f16c(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { gbrpf16_to_luma_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_luma_row( + scalar::planar_gbr_float::gbrpf32_to_luma_row::( &gf, &bf, &rf, @@ -1048,11 +1048,11 @@ fn avx512_gbrpf16_to_luma_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xD00A_0003); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { gbrpf16_to_luma_u16_row_f16c(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { gbrpf16_to_luma_u16_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_luma_u16_row( + scalar::planar_gbr_float::gbrpf32_to_luma_u16_row::( &gf, &bf, &rf, @@ -1088,11 +1088,11 @@ fn avx512_gbrpf16_to_hsv_f16c_matches_scalar() { let mut scal_h = std::vec![0u8; w]; let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; - unsafe { gbrpf16_to_hsv_row_f16c(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; + unsafe { gbrpf16_to_hsv_row_f16c::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_hsv_row( + scalar::planar_gbr_float::gbrpf32_to_hsv_row::( &gf, &bf, &rf, @@ -1128,12 +1128,12 @@ fn avx512_gbrapf16_to_rgba_f16c_matches_scalar() { prng_f16(&mut a, 0xD00C_0004); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrapf16_to_rgba_row_f16c(&g, &b, &r, &a, &mut simd, w) }; + unsafe { gbrapf16_to_rgba_row_f16c::(&g, &b, &r, &a, &mut simd, w) }; let gf: 
std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba (F16C widen) width={w}"); } } @@ -1157,12 +1157,12 @@ fn avx512_gbrapf16_to_rgba_lane_order() { asym_ramp_f16_a(&mut g, &mut b, &mut r, &mut a); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrapf16_to_rgba_row_f16c(&g, &b, &r, &a, &mut simd, w) }; + unsafe { gbrapf16_to_rgba_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba lane-order width={w}"); } } @@ -1188,12 +1188,12 @@ fn avx512_gbrapf16_to_rgba_u16_f16c_matches_scalar() { prng_f16(&mut a, 0xD00D_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrapf16_to_rgba_u16_row_f16c(&g, &b, &r, &a, &mut simd, w) }; + unsafe { gbrapf16_to_rgba_u16_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::<false>(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_u16 (F16C widen) width={w}"); } } @@ -1219,12 +1219,12 @@ fn avx512_gbrapf16_to_rgba_f32_f16c_matches_scalar() { prng_f16(&mut a, 0xD00E_0004); let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; - unsafe { gbrapf16_to_rgba_f32_row_f16c(&g, &b, &r, &a, &mut simd, w) }; + unsafe { gbrapf16_to_rgba_f32_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row::<false>(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_f32 (F16C widen) width={w}"); } } @@ -1248,8 +1248,8 @@ fn avx512_gbrapf16_to_rgba_f16_lossless_matches_scalar() { prng_f16(&mut a, 0xD00F_0004); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd,
scal, "gbrapf16_to_rgba_f16 lossless width={w}"); } } @@ -1271,8 +1271,125 @@ fn avx512_gbrapf16_to_rgba_f16_lane_order() { asym_ramp_f16_a(&mut g, &mut b, &mut r, &mut a); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_f16 lane-order width={w}"); } } + +// ---- BE parity helpers ------------------------------------------------------- + +fn be_encode_f32(src: &[f32]) -> std::vec::Vec { + src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() +} + +fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec { + src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() +} + +// ---- BE parity: Gbrpf32 → u8 RGB ------------------------------------------- + +#[test] +#[cfg_attr(miri, ignore = "AVX-512 SIMD intrinsics unsupported by Miri")] +fn avx512_gbrpf32_to_rgb_be_parity() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for &w in WIDTHS { + let mut g = std::vec![0.0f32; w]; + let mut b = std::vec![0.0f32; w]; + let mut r = std::vec![0.0f32; w]; + prng_f32(&mut g, 0xBE01_0001); + prng_f32(&mut b, 0xBE01_0002); + prng_f32(&mut r, 0xBE01_0003); + let mut le_out = std::vec![0u8; w * 3]; + let mut be_out = std::vec![0u8; w * 3]; + unsafe { gbrpf32_to_rgb_row::(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f32(&g); + let b_be = be_encode_f32(&b); + let r_be = be_encode_f32(&r); + unsafe { gbrpf32_to_rgb_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf32_to_rgb BE parity width={w}"); + } +} + +// ---- BE parity: Gbrpf32 → u8 RGBA ------------------------------------------ + +#[test] +#[cfg_attr(miri, ignore = "AVX-512 SIMD intrinsics unsupported by Miri")] +fn avx512_gbrpf32_to_rgba_be_parity() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for &w in WIDTHS { + let mut g = std::vec![0.0f32; w]; + let mut b = std::vec![0.0f32; w]; + let mut r = std::vec![0.0f32; w]; + prng_f32(&mut g, 0xBE02_0001); + prng_f32(&mut b, 0xBE02_0002); + prng_f32(&mut r, 0xBE02_0003); + let mut le_out = std::vec![0u8; w * 4]; + let mut be_out = std::vec![0u8; w * 4]; + unsafe { gbrpf32_to_rgba_row::(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f32(&g); + let b_be = be_encode_f32(&b); + let r_be = be_encode_f32(&r); + unsafe { gbrpf32_to_rgba_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf32_to_rgba BE parity width={w}"); + } +} + +// ---- BE parity: Gbrpf16 → f16 RGB (lossless) -------------------------------- + +#[test] +#[cfg_attr(miri, ignore = "AVX-512 SIMD intrinsics unsupported by Miri")] +fn avx512_gbrpf16_to_rgb_f16_be_parity() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for &w in WIDTHS { + let mut g = std::vec![half::f16::ZERO; w]; + let mut b = std::vec![half::f16::ZERO; w]; + let mut r = std::vec![half::f16::ZERO; w]; + prng_f16(&mut g, 0xBE07_0001); + prng_f16(&mut b, 0xBE07_0002); + prng_f16(&mut r, 0xBE07_0003); + let mut le_out = std::vec![half::f16::ZERO; w * 3]; + let mut be_out = std::vec![half::f16::ZERO; w * 3]; + unsafe { gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f16(&g); + let 
b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + unsafe { gbrpf16_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf16_to_rgb_f16 BE parity width={w}"); + } +} + +// ---- BE parity: Gbrapf16 → f16 RGBA (lossless) ------------------------------ + +#[test] +#[cfg_attr(miri, ignore = "AVX-512 SIMD intrinsics unsupported by Miri")] +fn avx512_gbrapf16_to_rgba_f16_be_parity() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for &w in WIDTHS { + let mut g = std::vec![half::f16::ZERO; w]; + let mut b = std::vec![half::f16::ZERO; w]; + let mut r = std::vec![half::f16::ZERO; w]; + let mut a = std::vec![half::f16::ZERO; w]; + prng_f16(&mut g, 0xBE0F_0001); + prng_f16(&mut b, 0xBE0F_0002); + prng_f16(&mut r, 0xBE0F_0003); + prng_f16(&mut a, 0xBE0F_0004); + let mut le_out = std::vec![half::f16::ZERO; w * 4]; + let mut be_out = std::vec![half::f16::ZERO; w * 4]; + unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, w); } + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + let a_be = be_encode_f16(&a); + unsafe { gbrapf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrapf16_to_rgba_f16 BE parity width={w}"); + } +} diff --git a/src/row/arch/x86_sse41/endian.rs b/src/row/arch/x86_sse41/endian.rs index 992ca30e..292a44d8 100644 --- a/src/row/arch/x86_sse41/endian.rs +++ b/src/row/arch/x86_sse41/endian.rs @@ -75,6 +75,58 @@ pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> __m128i { } } +// ---- u16x4 loaders (via _mm_loadl_epi64, low 64 bits only) ---------------- + +/// SSSE3 `_mm_shuffle_epi8` mask that swaps bytes within every 2-byte (u16) +/// lane in the LOW 8 bytes of a 128-bit register. Upper bytes are zeroed. +const BYTESWAP_MASK_U16X4: __m128i = unsafe { + core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 0x80u8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80]) +}; + +/// Loads 4 × u16 (8 bytes) from `ptr` (LE-encoded) into the low 64 bits of +/// `__m128i`, host-native order. +/// +/// # Safety +/// +/// `ptr` must point to at least 8 readable bytes. Caller must have SSE4.1 +/// (and SSSE3) enabled. +#[inline(always)] +pub(crate) unsafe fn load_le_u16x4(ptr: *const u8) -> __m128i { + let v = unsafe { _mm_loadl_epi64(ptr.cast()) }; + #[cfg(target_endian = "big")] + let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16X4) }; + v +} + +/// Loads 4 × u16 (8 bytes) from `ptr` (BE-encoded) into the low 64 bits of +/// `__m128i`, host-native order. +/// +/// # Safety +/// +/// `ptr` must point to at least 8 readable bytes. Caller must have SSE4.1 +/// (and SSSE3) enabled. +#[inline(always)] +pub(crate) unsafe fn load_be_u16x4(ptr: *const u8) -> __m128i { + let v = unsafe { _mm_loadl_epi64(ptr.cast()) }; + #[cfg(target_endian = "little")] + let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16X4) }; + v +} + +/// Generic dispatcher: routes to `load_le_u16x4` or `load_be_u16x4`. +/// +/// # Safety +/// +/// Same as `load_le_u16x4` / `load_be_u16x4`. +#[inline(always)] +pub(crate) unsafe fn load_endian_u16x4<const BE: bool>(ptr: *const u8) -> __m128i { + if BE { + unsafe { load_be_u16x4(ptr) } + } else { + unsafe { load_le_u16x4(ptr) } + } +} + // ---- u32x4 loaders --------------------------------------------------------- /// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
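Note for reviewers: the contract shared by all of these loaders is easiest to see in scalar form. The `BE` const generic names the on-wire byte order of the plane; because it is a const generic, the `if BE` branch in each dispatcher folds away at monomorphisation, leaving a bare load plus one byte-swap shuffle only when wire and host order disagree. The sketch below is not part of the patch — `load_endian_u16x4_scalar` and its `main` driver are hypothetical names introduced for illustration — but it models the same semantics in portable Rust and demonstrates the invariant the *_be_parity tests assert.

/// Hypothetical scalar model of the u16x4 loaders (illustration only, not in the patch).
/// `BE` names the on-wire byte order; the host-native result is identical either way.
fn load_endian_u16x4_scalar<const BE: bool>(bytes: &[u8; 8]) -> [u16; 4] {
    let mut out = [0u16; 4];
    for (i, lane) in bytes.chunks_exact(2).enumerate() {
        let pair = [lane[0], lane[1]];
        // Const-generic branch: each instantiation compiles down to a single arm.
        out[i] = if BE { u16::from_be_bytes(pair) } else { u16::from_le_bytes(pair) };
    }
    out
}

fn main() {
    // The same four logical lanes, serialised little-endian and big-endian.
    let wire_le = [0x34u8, 0x12, 0x78, 0x56, 0xBC, 0x9A, 0xF0, 0xDE];
    let wire_be = [0x12u8, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0];
    // Decoding each stream with its matching BE flag yields identical lanes —
    // the exact property the *_be_parity tests above check for the SIMD kernels.
    assert_eq!(
        load_endian_u16x4_scalar::<false>(&wire_le),
        load_endian_u16x4_scalar::<true>(&wire_be)
    );
}

The SSE4.1 version reaches the same result with `_mm_loadl_epi64` followed by the `BYTESWAP_MASK_U16X4` shuffle in the mismatched-order arm; the 0x80 indices in the mask zero the unused upper eight bytes of the register.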
diff --git a/src/row/arch/x86_sse41/planar_gbr_float.rs b/src/row/arch/x86_sse41/planar_gbr_float.rs index 58920abd..a4c5f9b3 100644 --- a/src/row/arch/x86_sse41/planar_gbr_float.rs +++ b/src/row/arch/x86_sse41/planar_gbr_float.rs @@ -95,7 +95,7 @@ unsafe fn i32x4_to_u16x4(i32v: __m128i) -> [u16; 4] { /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrpf32_to_rgb_row( +pub(crate) unsafe fn gbrpf32_to_rgb_row<const BE: bool>( g: &[f32], b: &[f32], r: &[f32], @@ -114,9 +114,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(g.as_ptr().add(x).cast::<u8>())), zero, one); + let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>())), zero, one); + let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>())), zero, one); let gi = i32x4_to_u8x4(scale_round_i32(gv, scale)); let bi = i32x4_to_u8x4(scale_round_i32(bv, scale)); let ri = i32x4_to_u8x4(scale_round_i32(rv, scale)); @@ -129,7 +129,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgb_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -145,7 +145,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrpf32_to_rgba_row( +pub(crate) unsafe fn gbrpf32_to_rgba_row<const BE: bool>( g: &[f32], b: &[f32], r: &[f32], @@ -164,9 +164,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(g.as_ptr().add(x).cast::<u8>())), zero, one); + let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>())), zero, one); + let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>())), zero, one); let gi = i32x4_to_u8x4(scale_round_i32(gv, scale)); let bi = i32x4_to_u8x4(scale_round_i32(bv, scale)); let ri = i32x4_to_u8x4(scale_round_i32(rv, scale)); @@ -180,7 +180,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -196,7 +196,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( /// 3. `out.len()` ≥ `3 * width`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgb_u16_row<const BE: bool>( g: &[f32], b: &[f32], r: &[f32], @@ -215,9 +215,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(g.as_ptr().add(x).cast::<u8>())), zero, one); + let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>())), zero, one); + let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>())), zero, one); let gu = i32x4_to_u16x4(scale_round_i32(gv, scale)); let bu = i32x4_to_u16x4(scale_round_i32(bv, scale)); let ru = i32x4_to_u16x4(scale_round_i32(rv, scale)); @@ -230,7 +230,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgb_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_u16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -246,7 +246,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrpf32_to_rgba_u16_row<const BE: bool>( g: &[f32], b: &[f32], r: &[f32], @@ -265,9 +265,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm_loadu_ps(r.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(g.as_ptr().add(x).cast::<u8>())), zero, one); + let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>())), zero, one); + let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>())), zero, one); let gu = i32x4_to_u16x4(scale_round_i32(gv, scale)); let bu = i32x4_to_u16x4(scale_round_i32(bv, scale)); let ru = i32x4_to_u16x4(scale_round_i32(rv, scale)); @@ -281,7 +281,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_u16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -298,7 +298,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgb_f32_row<const BE: bool>( g: &[f32], b: &[f32], r: &[f32], @@ -311,7 +311,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( debug_assert!(out.len() >= width * 3, "out row too short"); // SSE4.1 has no vst3-style intrinsic; use scalar (well-vectorised by compiler).
- scalar::gbrpf32_to_rgb_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f32 RGBA (lossless, α = 1.0) --------------------------------- @@ -326,7 +326,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f32_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -339,7 +339,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( debug_assert!(out.len() >= width * 4, "out row too short"); // SSE4.1 has no vst4-style intrinsic; use scalar. - scalar::gbrpf32_to_rgba_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f16 RGB (F16C narrow) ---------------------------------------- @@ -356,7 +356,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( +pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( g: &[f32], b: &[f32], r: &[f32], @@ -371,9 +371,9 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_loadu_ps(g.as_ptr().add(x)); - let bv = _mm_loadu_ps(b.as_ptr().add(x)); - let rv = _mm_loadu_ps(r.as_ptr().add(x)); + let gv = _mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())); // F16C narrow: IEEE-754 round-to-nearest-even (NOT round-half-up). let gh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); @@ -396,7 +396,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( x += 4; } if x < width { - scalar::gbrpf32_to_rgb_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar::gbrpf32_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -413,7 +413,7 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( /// 3. `out.len()` ≥ `4 * width`. 
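On the rounding note in `gbrpf32_to_rgb_f16_row_f16c` above: VCVTPS2PH with `_MM_FROUND_TO_NEAREST_INT` rounds ties to even, and `half::f16::from_f32` in the scalar tail does the same, which is why SIMD and scalar stay bit-exact. A standalone illustration (not from the patch) using the `half` crate this file already depends on:

```rust
fn main() {
    // In [2048, 4096) the f16 step is 2.0, so 2049.0 lies exactly between
    // the representable neighbours 2048.0 and 2050.0.
    assert_eq!(half::f16::from_f32(2049.0).to_f32(), 2048.0); // tie → even mantissa
    assert_eq!(half::f16::from_f32(2051.0).to_f32(), 2052.0); // tie → even mantissa
    // Round-half-up would have produced 2050.0 in the first case, which is
    // why the f16 tail must not reuse the integer paths' +0.5 rounding.
}
```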
#[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( +pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( g: &[f32], b: &[f32], r: &[f32], @@ -428,9 +428,9 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_loadu_ps(g.as_ptr().add(x)); - let bv = _mm_loadu_ps(b.as_ptr().add(x)); - let rv = _mm_loadu_ps(r.as_ptr().add(x)); + let gv = _mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())); let gh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -451,7 +451,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -468,7 +468,7 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_row( +pub(crate) unsafe fn gbrpf32_to_luma_row( g: &[f32], b: &[f32], r: &[f32], @@ -488,7 +488,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -519,7 +519,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn gbrpf32_to_luma_u16_row( +pub(crate) unsafe fn gbrpf32_to_luma_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -539,7 +539,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -569,7 +569,7 @@ pub(crate) unsafe fn gbrpf32_to_luma_u16_row( /// 3. `h_out.len()`, `s_out.len()`, `v_out.len()` ≥ `width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrpf32_to_hsv_row( +pub(crate) unsafe fn gbrpf32_to_hsv_row( g: &[f32], b: &[f32], r: &[f32], @@ -591,7 +591,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -621,7 +621,7 @@ pub(crate) unsafe fn gbrpf32_to_hsv_row( /// 3. `out.len()` ≥ `4 * width`. 
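The luma and HSV wrappers above do not reimplement the colour math in SSE; they stage fixed-size chunks through the RGB kernel and then reduce each staged chunk. A scalar sketch of that shape, where the `CHUNK` value, the BT.709 full-range weights, and the round-half-up quantisation are assumptions for illustration:

```rust
const CHUNK: usize = 256; // assumed; the patch's actual CHUNK is not shown here

/// Scalar sketch of the chunked staging pattern (assumed names).
fn gbrpf32_to_luma_chunked(g: &[f32], b: &[f32], r: &[f32], out: &mut [u8], width: usize) {
    let mut rgb = [0u8; CHUNK * 3]; // fixed stack buffer: no per-row heap traffic
    let mut offset = 0usize;
    while offset < width {
        let n = (width - offset).min(CHUNK);
        // Stage 1: planar float GBR → interleaved u8 RGB for n pixels.
        for i in 0..n {
            let q = |v: f32| (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
            rgb[i * 3] = q(r[offset + i]);
            rgb[i * 3 + 1] = q(g[offset + i]);
            rgb[i * 3 + 2] = q(b[offset + i]);
        }
        // Stage 2: reduce the staged RGB chunk to luma (BT.709, full range).
        for i in 0..n {
            let (rr, gg, bb) = (rgb[i * 3] as f32, rgb[i * 3 + 1] as f32, rgb[i * 3 + 2] as f32);
            out[offset + i] = (0.2126 * rr + 0.7152 * gg + 0.0722 * bb + 0.5) as u8;
        }
        offset += n;
    }
}
```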
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrapf32_to_rgba_row( +pub(crate) unsafe fn gbrapf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -642,10 +642,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm_loadu_ps(r.as_ptr().add(x)), zero, one); - let av = clamp01(_mm_loadu_ps(a.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())), zero, one); + let av = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(a.as_ptr().add(x).cast::())), zero, one); let gi = i32x4_to_u8x4(scale_round_i32(gv, scale)); let bi = i32x4_to_u8x4(scale_round_i32(bv, scale)); let ri = i32x4_to_u8x4(scale_round_i32(rv, scale)); @@ -660,7 +660,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( x += 4; } if x < width { - scalar::gbrapf32_to_rgba_row( + scalar::gbrapf32_to_rgba_row::( &g[x..], &b[x..], &r[x..], @@ -683,7 +683,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( +pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -704,10 +704,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_loadu_ps(g.as_ptr().add(x)), zero, one); - let bv = clamp01(_mm_loadu_ps(b.as_ptr().add(x)), zero, one); - let rv = clamp01(_mm_loadu_ps(r.as_ptr().add(x)), zero, one); - let av = clamp01(_mm_loadu_ps(a.as_ptr().add(x)), zero, one); + let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())), zero, one); + let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())), zero, one); + let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())), zero, one); + let av = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(a.as_ptr().add(x).cast::())), zero, one); let gu = i32x4_to_u16x4(scale_round_i32(gv, scale)); let bu = i32x4_to_u16x4(scale_round_i32(bv, scale)); let ru = i32x4_to_u16x4(scale_round_i32(rv, scale)); @@ -722,7 +722,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( x += 4; } if x < width { - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::( &g[x..], &b[x..], &r[x..], @@ -746,7 +746,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(dead_code)] // dispatcher delegates to scalar for lossless f32 interleave -pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( +pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -761,7 +761,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( debug_assert!(out.len() >= width * 4, "out row too short"); // SSE4.1 has no 4-channel interleave store; use scalar. - scalar::gbrapf32_to_rgba_f32_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f32_row::(g, b, r, a, out, width); } // ---- Gbrapf32 → f16 RGBA (F16C narrow, source α) ---------------------------- @@ -776,7 +776,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f32_row( /// 3. `out.len()` ≥ `4 * width`. 
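The lossless f32 paths above delegate to scalar because SSE4.1 has no interleaving store, but the scalar code still has to honour `BE`: when the planes carry big-endian words, each one must be swapped into host order before it is written out. A minimal sketch of what the `::<BE>` scalar interleave plausibly does (the real body is in src/row/scalar/planar_gbr_float.rs, not shown in this excerpt):

```rust
/// Sketch only: assumed shape of the scalar path that
/// gbrapf32_to_rgba_f32_row delegates to.
fn gbrapf32_to_rgba_f32_row_sketch<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    a: &[f32],
    out: &mut [f32],
    width: usize,
) {
    // Reinterpret each stored word; byte-swap into host order on the BE path.
    let host = |v: f32| if BE { f32::from_bits(v.to_bits().swap_bytes()) } else { v };
    for x in 0..width {
        out[x * 4] = host(r[x]);
        out[x * 4 + 1] = host(g[x]);
        out[x * 4 + 2] = host(b[x]);
        out[x * 4 + 3] = host(a[x]);
    }
}
```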
#[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( +pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( g: &[f32], b: &[f32], r: &[f32], @@ -793,10 +793,10 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_loadu_ps(g.as_ptr().add(x)); - let bv = _mm_loadu_ps(b.as_ptr().add(x)); - let rv = _mm_loadu_ps(r.as_ptr().add(x)); - let av = _mm_loadu_ps(a.as_ptr().add(x)); + let gv = _mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())); + let av = _mm_castsi128_ps(endian::load_endian_u32x4::(a.as_ptr().add(x).cast::())); let gh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -820,7 +820,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( x += 4; } if x < width { - scalar::gbrapf32_to_rgba_f16_row( + scalar::gbrapf32_to_rgba_f16_row::( &g[x..], &b[x..], &r[x..], @@ -844,7 +844,7 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -864,9 +864,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( let mut x = 0usize; while x + 4 <= width { // _mm_loadl_epi64: 64-bit load into the low half of __m128i (4 × u16 = 4 × f16). - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = _mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -892,7 +892,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_row( + scalar::gbrpf32_to_rgb_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -915,7 +915,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( /// 3. `out.len()` ≥ `4 * width`. 
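`endian::load_endian_u16x4::<BE>` replaces the bare `_mm_loadl_epi64` in the Gbrpf16 kernels. As with the u32 loader, a hypothetical sketch under the assumption that it mirrors the NEON `*const u8` signature (the real body is in src/row/arch/x86_sse41/endian.rs): a 64-bit load into the low half of `__m128i`, with the byte pair of each u16 swapped on the BE path and the unused high half left zero.

```rust
use core::arch::x86_64::*;

/// Hypothetical sketch of the SSE4.1 u16x4 loader.
#[target_feature(enable = "sse4.1")]
unsafe fn load_endian_u16x4<const BE: bool>(ptr: *const u8) -> __m128i {
    unsafe {
        let v = _mm_loadl_epi64(ptr.cast::<__m128i>()); // 4 × u16 in the low 64 bits
        if BE {
            // Swap bytes within each 16-bit lane; the -1 (0x80) control bytes
            // keep the high half zeroed, matching _mm_loadl_epi64.
            let rev = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 6, 7, 4, 5, 2, 3, 0, 1);
            _mm_shuffle_epi8(v, rev)
        } else {
            v
        }
    }
}
```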
#[inline] #[target_feature(enable = "sse4.1,f16c")] -pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -934,9 +934,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = _mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -962,7 +962,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_row( + scalar::gbrpf32_to_rgba_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -985,7 +985,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( #[inline] #[target_feature(enable = "sse4.1,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1004,9 +1004,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = _mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1031,7 +1031,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_u16_row( + scalar::gbrpf32_to_rgb_u16_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1055,7 +1055,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( #[inline] #[target_feature(enable = "sse4.1,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1074,9 +1074,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = _mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1102,7 +1102,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_u16_row( + scalar::gbrpf32_to_rgba_u16_row::( &gf[..tail], &bf[..tail], 
&rf[..tail], @@ -1126,7 +1126,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( #[inline] #[target_feature(enable = "sse4.1,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1141,9 +1141,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = _mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); // No interleave intrinsic in SSE4.1 — scatter via scalar loop. let mut gf = [0.0f32; 4]; let mut bf = [0.0f32; 4]; @@ -1169,7 +1169,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgb_f32_row( + scalar::gbrpf32_to_rgb_f32_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1193,7 +1193,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( #[inline] #[target_feature(enable = "sse4.1,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( +pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1208,9 +1208,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = _mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); let mut gf = [0.0f32; 4]; let mut bf = [0.0f32; 4]; let mut rf = [0.0f32; 4]; @@ -1236,7 +1236,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( bf[i] = b[x + i].to_f32(); rf[i] = r[x + i].to_f32(); } - scalar::gbrpf32_to_rgba_f32_row( + scalar::gbrpf32_to_rgba_f32_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1261,7 +1261,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( /// 3. `out.len()` ≥ `3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1277,9 +1277,9 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( let mut x = 0usize; while x + 4 <= width { // Load 4 × u16 from each plane into the low 64 bits of __m128i. 
- let gu = _mm_loadl_epi64(g.as_ptr().add(x).cast::<__m128i>()); - let bu = _mm_loadl_epi64(b.as_ptr().add(x).cast::<__m128i>()); - let ru = _mm_loadl_epi64(r.as_ptr().add(x).cast::<__m128i>()); + let gu = endian::load_endian_u16x4::(g.as_ptr().add(x).cast::()); + let bu = endian::load_endian_u16x4::(b.as_ptr().add(x).cast::()); + let ru = endian::load_endian_u16x4::(r.as_ptr().add(x).cast::()); let base = x * 3; for p in 0..4usize { let dst = out.as_mut_ptr().add(base + p * 3); @@ -1308,7 +1308,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( x += 4; } if x < width { - scalar_f16::gbrpf16_to_rgb_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar_f16::gbrpf16_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); } } } @@ -1326,7 +1326,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1341,9 +1341,9 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( unsafe { let mut x = 0usize; while x + 4 <= width { - let gu = _mm_loadl_epi64(g.as_ptr().add(x).cast::<__m128i>()); - let bu = _mm_loadl_epi64(b.as_ptr().add(x).cast::<__m128i>()); - let ru = _mm_loadl_epi64(r.as_ptr().add(x).cast::<__m128i>()); + let gu = endian::load_endian_u16x4::(g.as_ptr().add(x).cast::()); + let bu = endian::load_endian_u16x4::(b.as_ptr().add(x).cast::()); + let ru = endian::load_endian_u16x4::(r.as_ptr().add(x).cast::()); let base = x * 4; for p in 0..4usize { let g_word = match p { @@ -1373,7 +1373,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( x += 4; } if x < width { - scalar_f16::gbrpf16_to_rgba_f16_row(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar_f16::gbrpf16_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); } } } @@ -1391,7 +1391,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( #[target_feature(enable = "sse4.1,f16c")] #[allow(clippy::too_many_arguments)] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( +pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1443,7 +1443,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( #[target_feature(enable = "sse4.1,f16c")] #[allow(clippy::too_many_arguments)] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( +pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1494,7 +1494,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( #[inline] #[target_feature(enable = "sse4.1,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( +pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1548,7 +1548,7 @@ pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( #[inline] #[target_feature(enable = "sse4.1,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1569,10 +1569,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = 
_mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); - let av = _mm_cvtph_ps(_mm_loadl_epi64(a.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let av = _mm_cvtph_ps(endian::load_endian_u16x4::(a.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1602,7 +1602,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_row( + scalar::gbrapf32_to_rgba_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1627,7 +1627,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( #[inline] #[target_feature(enable = "sse4.1,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1648,10 +1648,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = _mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); - let av = _mm_cvtph_ps(_mm_loadl_epi64(a.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let av = _mm_cvtph_ps(endian::load_endian_u16x4::(a.as_ptr().add(x).cast::())); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1681,7 +1681,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - scalar::gbrapf32_to_rgba_u16_row( + scalar::gbrapf32_to_rgba_u16_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1706,7 +1706,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( #[inline] #[target_feature(enable = "sse4.1,f16c")] #[allow(dead_code)] // dispatch wired in Task 8 (MixedSinker) -pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( +pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1723,10 +1723,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(_mm_loadl_epi64(g.as_ptr().add(x).cast())); - let bv = _mm_cvtph_ps(_mm_loadl_epi64(b.as_ptr().add(x).cast())); - let rv = _mm_cvtph_ps(_mm_loadl_epi64(r.as_ptr().add(x).cast())); - let av = _mm_cvtph_ps(_mm_loadl_epi64(a.as_ptr().add(x).cast())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let av = _mm_cvtph_ps(endian::load_endian_u16x4::(a.as_ptr().add(x).cast::())); let mut gf = [0.0f32; 4]; let mut bf = [0.0f32; 4]; let mut rf = [0.0f32; 4]; @@ -1756,7 +1756,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( rf[i] = r[x + i].to_f32(); af[i] = a[x + i].to_f32(); } - 
scalar::gbrapf32_to_rgba_f32_row( + scalar::gbrapf32_to_rgba_f32_row::( &gf[..tail], &bf[..tail], &rf[..tail], @@ -1781,7 +1781,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( /// 3. `out.len()` ≥ `4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( +pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1798,10 +1798,10 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( unsafe { let mut x = 0usize; while x + 4 <= width { - let gu = _mm_loadl_epi64(g.as_ptr().add(x).cast::<__m128i>()); - let bu = _mm_loadl_epi64(b.as_ptr().add(x).cast::<__m128i>()); - let ru = _mm_loadl_epi64(r.as_ptr().add(x).cast::<__m128i>()); - let au = _mm_loadl_epi64(a.as_ptr().add(x).cast::<__m128i>()); + let gu = endian::load_endian_u16x4::(g.as_ptr().add(x).cast::()); + let bu = endian::load_endian_u16x4::(b.as_ptr().add(x).cast::()); + let ru = endian::load_endian_u16x4::(r.as_ptr().add(x).cast::()); + let au = endian::load_endian_u16x4::(a.as_ptr().add(x).cast::()); let base = x * 4; for p in 0..4usize { let g_word = match p { @@ -1837,7 +1837,7 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f16_row( x += 4; } if x < width { - scalar_f16::gbrapf16_to_rgba_f16_row( + scalar_f16::gbrapf16_to_rgba_f16_row::( &g[x..], &b[x..], &r[x..], diff --git a/src/row/arch/x86_sse41/tests/planar_gbr_float.rs b/src/row/arch/x86_sse41/tests/planar_gbr_float.rs index df057ba1..b5cbda18 100644 --- a/src/row/arch/x86_sse41/tests/planar_gbr_float.rs +++ b/src/row/arch/x86_sse41/tests/planar_gbr_float.rs @@ -44,8 +44,8 @@ fn sse41_gbrpf32_to_rgb_matches_scalar() { prng_f32(&mut r, 0xF001_0003); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb width={w}"); } } @@ -67,8 +67,8 @@ fn sse41_gbrpf32_to_rgba_matches_scalar() { prng_f32(&mut r, 0xF002_0003); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba width={w}"); } } @@ -90,8 +90,8 @@ fn sse41_gbrpf32_to_rgb_u16_matches_scalar() { prng_f32(&mut r, 0xF003_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_u16_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_u16 width={w}"); } } @@ -113,8 +113,8 @@ fn sse41_gbrpf32_to_rgba_u16_matches_scalar() { prng_f32(&mut r, 0xF004_0003); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_u16_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::(&g, &b, &r, 
&mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_u16 width={w}"); } } @@ -136,8 +136,8 @@ fn sse41_gbrpf32_to_rgb_f32_matches_scalar() { prng_f32(&mut r, 0xF005_0003); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_f32_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_f32 width={w}"); } } @@ -159,8 +159,8 @@ fn sse41_gbrpf32_to_rgba_f32_matches_scalar() { prng_f32(&mut r, 0xF006_0003); let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; - unsafe { gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_f32_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_f32 width={w}"); } } @@ -183,8 +183,8 @@ fn sse41_gbrpf32_to_rgb_f16_f16c_matches_scalar() { prng_f32(&mut r, 0xF007_0003); let mut simd = std::vec![half::f16::ZERO; w * 3]; let mut scal = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf32_to_rgb_f16_row_f16c(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgb_f16_row_f16c::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgb_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgb_f16 (F16C) width={w}"); } } @@ -207,8 +207,8 @@ fn sse41_gbrpf32_to_rgba_f16_f16c_matches_scalar() { prng_f32(&mut r, 0xF008_0003); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrpf32_to_rgba_f16_row_f16c(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf32_to_rgba_f16_row_f16c::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_float::gbrpf32_to_rgba_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf32_to_rgba_f16 (F16C) width={w}"); } } @@ -231,8 +231,8 @@ fn sse41_gbrpf32_to_luma_matches_scalar() { prng_f32(&mut r, 0xF009_0003); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { gbrpf32_to_luma_row(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; - scalar::planar_gbr_float::gbrpf32_to_luma_row( + unsafe { gbrpf32_to_luma_row::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + scalar::planar_gbr_float::gbrpf32_to_luma_row::( &g, &b, &r, @@ -263,8 +263,8 @@ fn sse41_gbrpf32_to_luma_u16_matches_scalar() { prng_f32(&mut r, 0xF00A_0003); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { gbrpf32_to_luma_u16_row(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; - scalar::planar_gbr_float::gbrpf32_to_luma_u16_row( + unsafe { gbrpf32_to_luma_u16_row::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + scalar::planar_gbr_float::gbrpf32_to_luma_u16_row::( &g, &b, &r, @@ -298,8 +298,8 @@ fn sse41_gbrpf32_to_hsv_matches_scalar() { let mut scal_h = std::vec![0u8; w]; let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; - unsafe { gbrpf32_to_hsv_row(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; - scalar::planar_gbr_float::gbrpf32_to_hsv_row( + unsafe { gbrpf32_to_hsv_row::(&g, &b, &r, &mut simd_h, 
&mut simd_s, &mut simd_v, w) }; + scalar::planar_gbr_float::gbrpf32_to_hsv_row::( &g, &b, &r, @@ -333,8 +333,8 @@ fn sse41_gbrapf32_to_rgba_matches_scalar() { prng_f32(&mut a, 0xF00C_0004); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_row::(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba width={w}"); } } @@ -358,8 +358,8 @@ fn sse41_gbrapf32_to_rgba_u16_matches_scalar() { prng_f32(&mut a, 0xF00D_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_u16_row::(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_u16 width={w}"); } } @@ -383,8 +383,8 @@ fn sse41_gbrapf32_to_rgba_f32_matches_scalar() { prng_f32(&mut a, 0xF00E_0004); let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; - unsafe { gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_f32_row::(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_f32 width={w}"); } } @@ -409,8 +409,8 @@ fn sse41_gbrapf32_to_rgba_f16_f16c_matches_scalar() { prng_f32(&mut a, 0xF00F_0004); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf32_to_rgba_f16_row_f16c(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf32_to_rgba_f16_row_f16c::(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_float::gbrapf32_to_rgba_f16_row::(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf32_to_rgba_f16 (F16C) width={w}"); } } @@ -433,12 +433,12 @@ fn sse41_gbrpf16_to_rgb_f16c_matches_scalar() { prng_f16(&mut r, 0xE001_0003); let mut simd = std::vec![0u8; w * 3]; let mut scal = std::vec![0u8; w * 3]; - unsafe { gbrpf16_to_rgb_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgb_row_f16c::(&g, &b, &r, &mut simd, w) }; // Scalar reference: widen f16→f32, then scalar gbrpf32_to_rgb_row. 
let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb (F16C widen) width={w}"); } } @@ -461,11 +461,11 @@ fn sse41_gbrpf16_to_rgba_f16c_matches_scalar() { prng_f16(&mut r, 0xE002_0003); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrpf16_to_rgba_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgba_row_f16c::(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba (F16C widen) width={w}"); } } @@ -488,11 +488,11 @@ fn sse41_gbrpf16_to_rgb_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xE003_0003); let mut simd = std::vec![0u16; w * 3]; let mut scal = std::vec![0u16; w * 3]; - unsafe { gbrpf16_to_rgb_u16_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgb_u16_row_f16c::(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_u16_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_u16 (F16C widen) width={w}"); } } @@ -515,11 +515,11 @@ fn sse41_gbrpf16_to_rgba_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xE004_0003); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrpf16_to_rgba_u16_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgba_u16_row_f16c::(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_u16_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_u16 (F16C widen) width={w}"); } } @@ -542,11 +542,11 @@ fn sse41_gbrpf16_to_rgb_f32_f16c_matches_scalar() { prng_f16(&mut r, 0xE005_0003); let mut simd = std::vec![0.0f32; w * 3]; let mut scal = std::vec![0.0f32; w * 3]; - unsafe { gbrpf16_to_rgb_f32_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgb_f32_row_f16c::(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgb_f32_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_f32 (F16C widen) width={w}"); } } @@ -569,11 +569,11 @@ fn sse41_gbrpf16_to_rgba_f32_f16c_matches_scalar() { prng_f16(&mut r, 0xE006_0003); let mut simd = std::vec![0.0f32; w * 4]; 
let mut scal = std::vec![0.0f32; w * 4]; - unsafe { gbrpf16_to_rgba_f32_row_f16c(&g, &b, &r, &mut simd, w) }; + unsafe { gbrpf16_to_rgba_f32_row_f16c::(&g, &b, &r, &mut simd, w) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row(&gf, &bf, &rf, &mut scal, w); + scalar::planar_gbr_float::gbrpf32_to_rgba_f32_row::(&gf, &bf, &rf, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_f32 (F16C widen) width={w}"); } } @@ -595,8 +595,8 @@ fn sse41_gbrpf16_to_rgb_f16_lossless_matches_scalar() { prng_f16(&mut r, 0xE007_0003); let mut simd = std::vec![half::f16::ZERO; w * 3]; let mut scal = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_f16::gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgb_f16 lossless width={w}"); } } @@ -618,8 +618,8 @@ fn sse41_gbrpf16_to_rgba_f16_lossless_matches_scalar() { prng_f16(&mut r, 0xE008_0003); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut simd, w) }; - scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut scal, w); + unsafe { gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut simd, w) }; + scalar::planar_gbr_f16::gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut scal, w); assert_eq!(simd, scal, "gbrpf16_to_rgba_f16 lossless width={w}"); } } @@ -643,11 +643,11 @@ fn sse41_gbrpf16_to_luma_f16c_matches_scalar() { prng_f16(&mut r, 0xE009_0003); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { gbrpf16_to_luma_row_f16c(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { gbrpf16_to_luma_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_luma_row( + scalar::planar_gbr_float::gbrpf32_to_luma_row::( &gf, &bf, &rf, @@ -679,11 +679,11 @@ fn sse41_gbrpf16_to_luma_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xE00A_0003); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { gbrpf16_to_luma_u16_row_f16c(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { gbrpf16_to_luma_u16_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_luma_u16_row( + scalar::planar_gbr_float::gbrpf32_to_luma_u16_row::( &gf, &bf, &rf, @@ -718,11 +718,11 @@ fn sse41_gbrpf16_to_hsv_f16c_matches_scalar() { let mut scal_h = std::vec![0u8; w]; let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; - unsafe { gbrpf16_to_hsv_row_f16c(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; + unsafe { gbrpf16_to_hsv_row_f16c::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| 
v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrpf32_to_hsv_row( + scalar::planar_gbr_float::gbrpf32_to_hsv_row::<false>( &gf, &bf, &rf, @@ -757,12 +757,12 @@ fn sse41_gbrapf16_to_rgba_f16c_matches_scalar() { prng_f16(&mut a, 0xE00C_0004); let mut simd = std::vec![0u8; w * 4]; let mut scal = std::vec![0u8; w * 4]; - unsafe { gbrapf16_to_rgba_row_f16c(&g, &b, &r, &a, &mut simd, w) }; + unsafe { gbrapf16_to_rgba_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_row::<false>(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba (F16C widen) width={w}"); } } @@ -787,12 +787,12 @@ fn sse41_gbrapf16_to_rgba_u16_f16c_matches_scalar() { prng_f16(&mut a, 0xE00D_0004); let mut simd = std::vec![0u16; w * 4]; let mut scal = std::vec![0u16; w * 4]; - unsafe { gbrapf16_to_rgba_u16_row_f16c(&g, &b, &r, &a, &mut simd, w) }; + unsafe { gbrapf16_to_rgba_u16_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_u16_row::<false>(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_u16 (F16C widen) width={w}"); } } @@ -817,12 +817,12 @@ fn sse41_gbrapf16_to_rgba_f32_f16c_matches_scalar() { prng_f16(&mut a, 0xE00E_0004); let mut simd = std::vec![0.0f32; w * 4]; let mut scal = std::vec![0.0f32; w * 4]; - unsafe { gbrapf16_to_rgba_f32_row_f16c(&g, &b, &r, &a, &mut simd, w) }; + unsafe { gbrapf16_to_rgba_f32_row_f16c::<false>(&g, &b, &r, &a, &mut simd, w) }; let gf: std::vec::Vec<f32> = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec<f32> = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec<f32> = r.iter().map(|v| v.to_f32()).collect(); let af: std::vec::Vec<f32> = a.iter().map(|v| v.to_f32()).collect(); - scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row(&gf, &bf, &rf, &af, &mut scal, w); + scalar::planar_gbr_float::gbrapf32_to_rgba_f32_row::<false>(&gf, &bf, &rf, &af, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_f32 (F16C widen) width={w}"); } } @@ -846,8 +846,125 @@ fn sse41_gbrapf16_to_rgba_f16_lossless_matches_scalar() { prng_f16(&mut a, 0xE00F_0004); let mut simd = std::vec![half::f16::ZERO; w * 4]; let mut scal = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut simd, w) }; - scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut scal, w); + unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut simd, w) }; + scalar::planar_gbr_f16::gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut scal, w); assert_eq!(simd, scal, "gbrapf16_to_rgba_f16 lossless width={w}"); } } + +// ---- BE parity helpers ------------------------------------------------------- + +fn be_encode_f32(src: &[f32]) -> std::vec::Vec<f32> { + src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() +} + +fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec<half::f16> { + src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() +} + +// ---- BE parity: Gbrpf32 → u8 RGB ------------------------------------------- + +#[test] +#[cfg_attr(miri, ignore = "SSE4.1 SIMD intrinsics unsupported by Miri")] +fn sse41_gbrpf32_to_rgb_be_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for &w in WIDTHS { + let mut g = std::vec![0.0f32; w]; + let mut b = std::vec![0.0f32; w]; + let mut r = std::vec![0.0f32; w]; + prng_f32(&mut g, 0xBE01_0001); + prng_f32(&mut b, 0xBE01_0002); + prng_f32(&mut r, 0xBE01_0003); + let mut le_out = std::vec![0u8; w * 3]; + let mut be_out = std::vec![0u8; w * 3]; + unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f32(&g); + let b_be = be_encode_f32(&b); + let r_be = be_encode_f32(&r); + unsafe { gbrpf32_to_rgb_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf32_to_rgb BE parity width={w}"); + } +} + +// ---- BE parity: Gbrpf32 → u8 RGBA ------------------------------------------ + +#[test] +#[cfg_attr(miri, ignore = "SSE4.1 SIMD intrinsics unsupported by Miri")] +fn sse41_gbrpf32_to_rgba_be_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for &w in WIDTHS { + let mut g = std::vec![0.0f32; w]; + let mut b = std::vec![0.0f32; w]; + let mut r = std::vec![0.0f32; w]; + prng_f32(&mut g, 0xBE02_0001); + prng_f32(&mut b, 0xBE02_0002); + prng_f32(&mut r, 0xBE02_0003); + let mut le_out = std::vec![0u8; w * 4]; + let mut be_out = std::vec![0u8; w * 4]; + unsafe { gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f32(&g); + let b_be = be_encode_f32(&b); + let r_be = be_encode_f32(&r); + unsafe { gbrpf32_to_rgba_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf32_to_rgba BE parity width={w}"); + } +} + +// ---- BE parity: Gbrpf16 → f16 RGB (lossless) -------------------------------- + +#[test] +#[cfg_attr(miri, ignore = "SSE4.1 SIMD intrinsics unsupported by Miri")] +fn sse41_gbrpf16_to_rgb_f16_be_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for &w in WIDTHS { + let mut g = std::vec![half::f16::ZERO; w]; + let mut b = std::vec![half::f16::ZERO; w]; + let mut r = std::vec![half::f16::ZERO; w]; + prng_f16(&mut g, 0xBE07_0001); + prng_f16(&mut b, 0xBE07_0002); + prng_f16(&mut r, 0xBE07_0003); + let mut le_out = std::vec![half::f16::ZERO; w * 3]; + let mut be_out = std::vec![half::f16::ZERO; w * 3]; + unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut le_out, w); } + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + unsafe { gbrpf16_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrpf16_to_rgb_f16 BE parity width={w}"); + } +} + +// ---- BE parity: Gbrapf16 → f16 RGBA (lossless) ------------------------------ + +#[test] +#[cfg_attr(miri, ignore = "SSE4.1 SIMD intrinsics unsupported by Miri")] +fn sse41_gbrapf16_to_rgba_f16_be_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for &w in WIDTHS { + let mut g = std::vec![half::f16::ZERO; w]; + let mut b = std::vec![half::f16::ZERO; w]; + let mut r = std::vec![half::f16::ZERO; w]; + let mut a = std::vec![half::f16::ZERO; w]; + prng_f16(&mut g, 0xBE0F_0001); + prng_f16(&mut b, 0xBE0F_0002); + prng_f16(&mut r, 0xBE0F_0003); + prng_f16(&mut a, 0xBE0F_0004); + let mut le_out = std::vec![half::f16::ZERO; w * 4]; + let mut be_out = std::vec![half::f16::ZERO; w * 4]; + unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, w); } + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + let a_be = be_encode_f16(&a); + unsafe { gbrapf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); } + assert_eq!(le_out, be_out, "gbrapf16_to_rgba_f16 BE parity width={w}"); + } +} diff --git a/src/row/dispatch/planar_gbr_float.rs b/src/row/dispatch/planar_gbr_float.rs index abb78f03..4ac016e4 100644 --- a/src/row/dispatch/planar_gbr_float.rs +++ b/src/row/dispatch/planar_gbr_float.rs @@ -53,7 +53,7 @@ use crate::{ /// Dispatch `gbrpf32_to_rgb_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgb_row( +pub(crate) fn gbrpf32_to_rgb_row<const BE: bool>( g: &[f32], b: &[f32], r: &[f32], @@ -71,45 +71,45 @@ pub(crate) fn gbrpf32_to_rgb_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrpf32_to_rgb_row(g, b, r, out, width); } + unsafe { arch::neon::gbrpf32_to_rgb_row::<BE>(g, b, r, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf32_to_rgb_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf32_to_rgb_row::<BE>(g, b, r, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available. - unsafe { arch::x86_avx512::gbrpf32_to_rgb_row(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf32_to_rgb_row::<BE>(g, b, r, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbrpf32_to_rgb_row(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf32_to_rgb_row::<BE>(g, b, r, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. - unsafe { arch::x86_sse41::gbrpf32_to_rgb_row(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf32_to_rgb_row::<BE>(g, b, r, out, width); } return; } }, _ => {} } } - scalar::gbrpf32_to_rgb_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_row::<BE>(g, b, r, out, width); } // ---- Gbrpf32 → u8 RGBA ------------------------------------------------------ /// Dispatch `gbrpf32_to_rgba_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgba_row( +pub(crate) fn gbrpf32_to_rgba_row<const BE: bool>( g: &[f32], b: &[f32], r: &[f32], @@ -127,45 +127,45 @@ pub(crate) fn gbrpf32_to_rgba_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrpf32_to_rgba_row(g, b, r, out, width); } + unsafe { arch::neon::gbrpf32_to_rgba_row::<BE>(g, b, r, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf32_to_rgba_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf32_to_rgba_row::<BE>(g, b, r, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available. - unsafe { arch::x86_avx512::gbrpf32_to_rgba_row(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf32_to_rgba_row::<BE>(g, b, r, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbrpf32_to_rgba_row(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf32_to_rgba_row::<BE>(g, b, r, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available.
- unsafe { arch::x86_sse41::gbrpf32_to_rgba_row(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf32_to_rgba_row::(g, b, r, out, width); } return; } }, _ => {} } } - scalar::gbrpf32_to_rgba_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_row::(g, b, r, out, width); } // ---- Gbrpf32 → u16 RGB ------------------------------------------------------ /// Dispatch `gbrpf32_to_rgb_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgb_u16_row( +pub(crate) fn gbrpf32_to_rgb_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -183,45 +183,45 @@ pub(crate) fn gbrpf32_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrpf32_to_rgb_u16_row(g, b, r, out, width); } + unsafe { arch::neon::gbrpf32_to_rgb_u16_row::(g, b, r, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf32_to_rgb_u16_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf32_to_rgb_u16_row::(g, b, r, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available. - unsafe { arch::x86_avx512::gbrpf32_to_rgb_u16_row(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf32_to_rgb_u16_row::(g, b, r, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbrpf32_to_rgb_u16_row(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf32_to_rgb_u16_row::(g, b, r, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. - unsafe { arch::x86_sse41::gbrpf32_to_rgb_u16_row(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf32_to_rgb_u16_row::(g, b, r, out, width); } return; } }, _ => {} } } - scalar::gbrpf32_to_rgb_u16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_u16_row::(g, b, r, out, width); } // ---- Gbrpf32 → u16 RGBA ----------------------------------------------------- /// Dispatch `gbrpf32_to_rgba_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgba_u16_row( +pub(crate) fn gbrpf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -239,45 +239,45 @@ pub(crate) fn gbrpf32_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrpf32_to_rgba_u16_row(g, b, r, out, width); } + unsafe { arch::neon::gbrpf32_to_rgba_u16_row::(g, b, r, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf32_to_rgba_u16_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf32_to_rgba_u16_row::(g, b, r, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available. - unsafe { arch::x86_avx512::gbrpf32_to_rgba_u16_row(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf32_to_rgba_u16_row::(g, b, r, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbrpf32_to_rgba_u16_row(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf32_to_rgba_u16_row::(g, b, r, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. 
- unsafe { arch::x86_sse41::gbrpf32_to_rgba_u16_row(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf32_to_rgba_u16_row::(g, b, r, out, width); } return; } }, _ => {} } } - scalar::gbrpf32_to_rgba_u16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_u16_row::(g, b, r, out, width); } // ---- Gbrpf32 → f32 RGB (lossless) ------------------------------------------- /// Dispatch `gbrpf32_to_rgb_f32_row` (lossless interleave). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgb_f32_row( +pub(crate) fn gbrpf32_to_rgb_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -295,7 +295,7 @@ pub(crate) fn gbrpf32_to_rgb_f32_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrpf32_to_rgb_f32_row(g, b, r, out, width); } + unsafe { arch::neon::gbrpf32_to_rgb_f32_row::(g, b, r, out, width); } return; } }, @@ -303,14 +303,14 @@ pub(crate) fn gbrpf32_to_rgb_f32_row( _ => {} } } - scalar::gbrpf32_to_rgb_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f32 RGBA (lossless) ------------------------------------------ /// Dispatch `gbrpf32_to_rgba_f32_row` (lossless). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgba_f32_row( +pub(crate) fn gbrpf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -328,7 +328,7 @@ pub(crate) fn gbrpf32_to_rgba_f32_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrpf32_to_rgba_f32_row(g, b, r, out, width); } + unsafe { arch::neon::gbrpf32_to_rgba_f32_row::(g, b, r, out, width); } return; } }, @@ -336,14 +336,14 @@ pub(crate) fn gbrpf32_to_rgba_f32_row( _ => {} } } - scalar::gbrpf32_to_rgba_f32_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f32_row::(g, b, r, out, width); } // ---- Gbrpf32 → f16 RGB (fused narrow) ---------------------------------------- /// Dispatch `gbrpf32_to_rgb_f16_row` (fused f32→f16 narrow + interleave). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgb_f16_row( +pub(crate) fn gbrpf32_to_rgb_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -363,9 +363,9 @@ pub(crate) fn gbrpf32_to_rgb_f16_row( // fp16 feature needed for vcvt_f16_f32. if fp16_available() { // SAFETY: NEON + fp16 verified available. - unsafe { arch::neon::gbrpf32_to_rgb_f16_row_fp16(g, b, r, out, width); } + unsafe { arch::neon::gbrpf32_to_rgb_f16_row_fp16::(g, b, r, out, width); } } else { - scalar::gbrpf32_to_rgb_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f16_row::(g, b, r, out, width); } return; } @@ -375,9 +375,9 @@ pub(crate) fn gbrpf32_to_rgb_f16_row( // F16C runtime detection for narrow. if f16c_available() { // SAFETY: AVX-512F + BW + F16C verified available. - unsafe { arch::x86_avx512::gbrpf32_to_rgb_f16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf32_to_rgb_f16_row_f16c::(g, b, r, out, width); } } else { - scalar::gbrpf32_to_rgb_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f16_row::(g, b, r, out, width); } return; } @@ -385,9 +385,9 @@ pub(crate) fn gbrpf32_to_rgb_f16_row( // F16C runtime detection for narrow. if f16c_available() { // SAFETY: AVX2 + F16C verified available. 
- unsafe { arch::x86_avx2::gbrpf32_to_rgb_f16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf32_to_rgb_f16_row_f16c::(g, b, r, out, width); } } else { - scalar::gbrpf32_to_rgb_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f16_row::(g, b, r, out, width); } return; } @@ -395,9 +395,9 @@ pub(crate) fn gbrpf32_to_rgb_f16_row( // F16C runtime detection for narrow. if f16c_available() { // SAFETY: SSE4.1 + F16C verified available. - unsafe { arch::x86_sse41::gbrpf32_to_rgb_f16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf32_to_rgb_f16_row_f16c::(g, b, r, out, width); } } else { - scalar::gbrpf32_to_rgb_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f16_row::(g, b, r, out, width); } return; } @@ -406,21 +406,21 @@ pub(crate) fn gbrpf32_to_rgb_f16_row( if simd128_available() { // wasm32 has no native f16 narrowing — delegates to scalar narrow. // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf32_to_rgb_f16_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf32_to_rgb_f16_row::(g, b, r, out, width); } return; } }, _ => {} } } - scalar::gbrpf32_to_rgb_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgb_f16_row::(g, b, r, out, width); } // ---- Gbrpf32 → f16 RGBA (fused narrow) --------------------------------------- /// Dispatch `gbrpf32_to_rgba_f16_row` (fused f32→f16 narrow + interleave). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgba_f16_row( +pub(crate) fn gbrpf32_to_rgba_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -440,9 +440,9 @@ pub(crate) fn gbrpf32_to_rgba_f16_row( // fp16 feature needed for vcvt_f16_f32. if fp16_available() { // SAFETY: NEON + fp16 verified available. - unsafe { arch::neon::gbrpf32_to_rgba_f16_row_fp16(g, b, r, out, width); } + unsafe { arch::neon::gbrpf32_to_rgba_f16_row_fp16::(g, b, r, out, width); } } else { - scalar::gbrpf32_to_rgba_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f16_row::(g, b, r, out, width); } return; } @@ -451,27 +451,27 @@ pub(crate) fn gbrpf32_to_rgba_f16_row( if avx512_available() { if f16c_available() { // SAFETY: AVX-512F + BW + F16C verified available. - unsafe { arch::x86_avx512::gbrpf32_to_rgba_f16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf32_to_rgba_f16_row_f16c::(g, b, r, out, width); } } else { - scalar::gbrpf32_to_rgba_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f16_row::(g, b, r, out, width); } return; } if avx2_available() { if f16c_available() { // SAFETY: AVX2 + F16C verified available. - unsafe { arch::x86_avx2::gbrpf32_to_rgba_f16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf32_to_rgba_f16_row_f16c::(g, b, r, out, width); } } else { - scalar::gbrpf32_to_rgba_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f16_row::(g, b, r, out, width); } return; } if sse41_available() { if f16c_available() { // SAFETY: SSE4.1 + F16C verified available. - unsafe { arch::x86_sse41::gbrpf32_to_rgba_f16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf32_to_rgba_f16_row_f16c::(g, b, r, out, width); } } else { - scalar::gbrpf32_to_rgba_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f16_row::(g, b, r, out, width); } return; } @@ -480,14 +480,14 @@ pub(crate) fn gbrpf32_to_rgba_f16_row( if simd128_available() { // wasm32 has no native f16 narrowing — delegates to scalar narrow. // SAFETY: simd128 verified available at compile time. 
- unsafe { arch::wasm_simd128::gbrpf32_to_rgba_f16_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf32_to_rgba_f16_row::(g, b, r, out, width); } return; } }, _ => {} } } - scalar::gbrpf32_to_rgba_f16_row(g, b, r, out, width); + scalar::gbrpf32_to_rgba_f16_row::(g, b, r, out, width); } // ---- Gbrpf32 → u8 luma ------------------------------------------------------ @@ -495,7 +495,7 @@ pub(crate) fn gbrpf32_to_rgba_f16_row( /// Dispatch `gbrpf32_to_luma_row`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn gbrpf32_to_luma_row( +pub(crate) fn gbrpf32_to_luma_row( g: &[f32], b: &[f32], r: &[f32], @@ -514,38 +514,38 @@ pub(crate) fn gbrpf32_to_luma_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrpf32_to_luma_row(g, b, r, out, width, matrix, full_range); } + unsafe { arch::neon::gbrpf32_to_luma_row::(g, b, r, out, width, matrix, full_range); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf32_to_luma_row(g, b, r, out, width, matrix, full_range); } + unsafe { arch::wasm_simd128::gbrpf32_to_luma_row::(g, b, r, out, width, matrix, full_range); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available. - unsafe { arch::x86_avx512::gbrpf32_to_luma_row(g, b, r, out, width, matrix, full_range); } + unsafe { arch::x86_avx512::gbrpf32_to_luma_row::(g, b, r, out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbrpf32_to_luma_row(g, b, r, out, width, matrix, full_range); } + unsafe { arch::x86_avx2::gbrpf32_to_luma_row::(g, b, r, out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. - unsafe { arch::x86_sse41::gbrpf32_to_luma_row(g, b, r, out, width, matrix, full_range); } + unsafe { arch::x86_sse41::gbrpf32_to_luma_row::(g, b, r, out, width, matrix, full_range); } return; } }, _ => {} } } - scalar::gbrpf32_to_luma_row(g, b, r, out, width, matrix, full_range); + scalar::gbrpf32_to_luma_row::(g, b, r, out, width, matrix, full_range); } // ---- Gbrpf32 → u16 luma ----------------------------------------------------- @@ -553,7 +553,7 @@ pub(crate) fn gbrpf32_to_luma_row( /// Dispatch `gbrpf32_to_luma_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn gbrpf32_to_luma_u16_row( +pub(crate) fn gbrpf32_to_luma_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -573,7 +573,7 @@ pub(crate) fn gbrpf32_to_luma_u16_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbrpf32_to_luma_u16_row(g, b, r, out, width, matrix, full_range); + arch::neon::gbrpf32_to_luma_u16_row::(g, b, r, out, width, matrix, full_range); } return; } @@ -582,7 +582,7 @@ pub(crate) fn gbrpf32_to_luma_u16_row( if simd128_available() { // SAFETY: simd128 verified available at compile time. unsafe { - arch::wasm_simd128::gbrpf32_to_luma_u16_row(g, b, r, out, width, matrix, full_range); + arch::wasm_simd128::gbrpf32_to_luma_u16_row::(g, b, r, out, width, matrix, full_range); } return; } @@ -591,21 +591,21 @@ pub(crate) fn gbrpf32_to_luma_u16_row( if avx512_available() { // SAFETY: AVX-512F + BW verified available. 
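The luma dispatchers forward `matrix` and `full_range` untouched; per the kernel docs later in this patch, the scalar path first quantizes the float planes to u8 RGB in 64-pixel chunks and then applies `rgb_to_luma_row`, and the u16 variant keeps the same 0–255 dynamic range zero-extended into the wider carrier. As a reference for what the weighted-luma step looks like, here is a BT.601 full-range sketch; in the crate the `ColorMatrix` argument selects the actual coefficients, so treat these weights as illustrative only:

    /// Illustrative full-range luma over packed u8 RGB using BT.601 weights;
    /// the crate's `rgb_to_luma_row` plus `ColorMatrix` pick the real
    /// coefficients, so these constants are a stand-in only.
    fn luma_row_bt601(rgb: &[u8], out: &mut [u8]) {
        for (px, y) in rgb.chunks_exact(3).zip(out.iter_mut()) {
            let (r, g, b) = (px[0] as f32, px[1] as f32, px[2] as f32);
            *y = (0.299 * r + 0.587 * g + 0.114 * b + 0.5) as u8; // round-half-up
        }
    }

    fn main() {
        let rgb = [255u8, 0, 0, 0, 0, 0]; // pure red, then black
        let mut y = [0u8; 2];
        luma_row_bt601(&rgb, &mut y);
        assert_eq!(y, [76, 0]); // 0.299 * 255 rounds to 76
    }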
unsafe { - arch::x86_avx512::gbrpf32_to_luma_u16_row(g, b, r, out, width, matrix, full_range); + arch::x86_avx512::gbrpf32_to_luma_u16_row::(g, b, r, out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbrpf32_to_luma_u16_row(g, b, r, out, width, matrix, full_range); + arch::x86_avx2::gbrpf32_to_luma_u16_row::(g, b, r, out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbrpf32_to_luma_u16_row(g, b, r, out, width, matrix, full_range); + arch::x86_sse41::gbrpf32_to_luma_u16_row::(g, b, r, out, width, matrix, full_range); } return; } @@ -613,7 +613,7 @@ pub(crate) fn gbrpf32_to_luma_u16_row( _ => {} } } - scalar::gbrpf32_to_luma_u16_row(g, b, r, out, width, matrix, full_range); + scalar::gbrpf32_to_luma_u16_row::(g, b, r, out, width, matrix, full_range); } // ---- Gbrpf32 → HSV ---------------------------------------------------------- @@ -621,7 +621,7 @@ pub(crate) fn gbrpf32_to_luma_u16_row( /// Dispatch `gbrpf32_to_hsv_row`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn gbrpf32_to_hsv_row( +pub(crate) fn gbrpf32_to_hsv_row( g: &[f32], b: &[f32], r: &[f32], @@ -643,7 +643,7 @@ pub(crate) fn gbrpf32_to_hsv_row( if neon_available() { // SAFETY: NEON verified available. unsafe { - arch::neon::gbrpf32_to_hsv_row(g, b, r, h_out, s_out, v_out, width); + arch::neon::gbrpf32_to_hsv_row::(g, b, r, h_out, s_out, v_out, width); } return; } @@ -652,7 +652,7 @@ pub(crate) fn gbrpf32_to_hsv_row( if simd128_available() { // SAFETY: simd128 verified available at compile time. unsafe { - arch::wasm_simd128::gbrpf32_to_hsv_row(g, b, r, h_out, s_out, v_out, width); + arch::wasm_simd128::gbrpf32_to_hsv_row::(g, b, r, h_out, s_out, v_out, width); } return; } @@ -661,21 +661,21 @@ pub(crate) fn gbrpf32_to_hsv_row( if avx512_available() { // SAFETY: AVX-512F + BW verified available. unsafe { - arch::x86_avx512::gbrpf32_to_hsv_row(g, b, r, h_out, s_out, v_out, width); + arch::x86_avx512::gbrpf32_to_hsv_row::(g, b, r, h_out, s_out, v_out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbrpf32_to_hsv_row(g, b, r, h_out, s_out, v_out, width); + arch::x86_avx2::gbrpf32_to_hsv_row::(g, b, r, h_out, s_out, v_out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. unsafe { - arch::x86_sse41::gbrpf32_to_hsv_row(g, b, r, h_out, s_out, v_out, width); + arch::x86_sse41::gbrpf32_to_hsv_row::(g, b, r, h_out, s_out, v_out, width); } return; } @@ -683,14 +683,14 @@ pub(crate) fn gbrpf32_to_hsv_row( _ => {} } } - scalar::gbrpf32_to_hsv_row(g, b, r, h_out, s_out, v_out, width); + scalar::gbrpf32_to_hsv_row::(g, b, r, h_out, s_out, v_out, width); } // ---- Gbrapf32 → u8 RGBA (source α) ----------------------------------------- /// Dispatch `gbrapf32_to_rgba_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf32_to_rgba_row( +pub(crate) fn gbrapf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -710,45 +710,45 @@ pub(crate) fn gbrapf32_to_rgba_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrapf32_to_rgba_row(g, b, r, a, out, width); } + unsafe { arch::neon::gbrapf32_to_rgba_row::(g, b, r, a, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. 
- unsafe { arch::wasm_simd128::gbrapf32_to_rgba_row(g, b, r, a, out, width); } + unsafe { arch::wasm_simd128::gbrapf32_to_rgba_row::(g, b, r, a, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available. - unsafe { arch::x86_avx512::gbrapf32_to_rgba_row(g, b, r, a, out, width); } + unsafe { arch::x86_avx512::gbrapf32_to_rgba_row::(g, b, r, a, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbrapf32_to_rgba_row(g, b, r, a, out, width); } + unsafe { arch::x86_avx2::gbrapf32_to_rgba_row::(g, b, r, a, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. - unsafe { arch::x86_sse41::gbrapf32_to_rgba_row(g, b, r, a, out, width); } + unsafe { arch::x86_sse41::gbrapf32_to_rgba_row::(g, b, r, a, out, width); } return; } }, _ => {} } } - scalar::gbrapf32_to_rgba_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_row::(g, b, r, a, out, width); } // ---- Gbrapf32 → u16 RGBA (source α) ---------------------------------------- /// Dispatch `gbrapf32_to_rgba_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf32_to_rgba_u16_row( +pub(crate) fn gbrapf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -768,45 +768,45 @@ pub(crate) fn gbrapf32_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. - unsafe { arch::neon::gbrapf32_to_rgba_u16_row(g, b, r, a, out, width); } + unsafe { arch::neon::gbrapf32_to_rgba_u16_row::(g, b, r, a, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrapf32_to_rgba_u16_row(g, b, r, a, out, width); } + unsafe { arch::wasm_simd128::gbrapf32_to_rgba_u16_row::(g, b, r, a, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available. - unsafe { arch::x86_avx512::gbrapf32_to_rgba_u16_row(g, b, r, a, out, width); } + unsafe { arch::x86_avx512::gbrapf32_to_rgba_u16_row::(g, b, r, a, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available. - unsafe { arch::x86_avx2::gbrapf32_to_rgba_u16_row(g, b, r, a, out, width); } + unsafe { arch::x86_avx2::gbrapf32_to_rgba_u16_row::(g, b, r, a, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available. - unsafe { arch::x86_sse41::gbrapf32_to_rgba_u16_row(g, b, r, a, out, width); } + unsafe { arch::x86_sse41::gbrapf32_to_rgba_u16_row::(g, b, r, a, out, width); } return; } }, _ => {} } } - scalar::gbrapf32_to_rgba_u16_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_u16_row::(g, b, r, a, out, width); } // ---- Gbrapf32 → f32 RGBA (lossless source α) -------------------------------- /// Dispatch `gbrapf32_to_rgba_f32_row` (lossless). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf32_to_rgba_f32_row( +pub(crate) fn gbrapf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -826,7 +826,7 @@ pub(crate) fn gbrapf32_to_rgba_f32_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available. 
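The Gbrapf32 dispatchers above differ from their Gbrpf32 counterparts only in threading a fourth `a` plane through to the kernels: the alpha sample gets the same clamp-scale-round treatment as the colour channels, whereas the alpha-less kernels write a constant opaque value. A simplified side-by-side with hypothetical names (the real kernels also perform the BE-aware load shown elsewhere in this patch):

    /// Gbrpf32-style: no alpha plane exists, so alpha is constant opaque.
    fn rgba_opaque(r: &[f32], g: &[f32], b: &[f32], out: &mut [u8]) {
        let q = |v: f32| (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
        for (x, px) in out.chunks_exact_mut(4).enumerate() {
            px[0] = q(r[x]); px[1] = q(g[x]); px[2] = q(b[x]);
            px[3] = 0xFF; // constant opaque
        }
    }

    /// Gbrapf32-style: alpha comes from the `a` plane and is quantised
    /// exactly like a colour channel (clamp, scale, round-half-up).
    fn rgba_source_alpha(r: &[f32], g: &[f32], b: &[f32], a: &[f32], out: &mut [u8]) {
        let q = |v: f32| (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
        for (x, px) in out.chunks_exact_mut(4).enumerate() {
            px[0] = q(r[x]); px[1] = q(g[x]); px[2] = q(b[x]);
            px[3] = q(a[x]); // source alpha
        }
    }

    fn main() {
        let (r, g, b, a) = ([1.0f32], [0.0f32], [0.5f32], [0.5f32]);
        let (mut o1, mut o2) = ([0u8; 4], [0u8; 4]);
        rgba_opaque(&r, &g, &b, &mut o1);
        rgba_source_alpha(&r, &g, &b, &a, &mut o2);
        assert_eq!((o1[3], o2[3]), (0xFF, 128)); // constant vs quantised alpha
    }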
- unsafe { arch::neon::gbrapf32_to_rgba_f32_row(g, b, r, a, out, width); } + unsafe { arch::neon::gbrapf32_to_rgba_f32_row::(g, b, r, a, out, width); } return; } }, @@ -834,14 +834,14 @@ pub(crate) fn gbrapf32_to_rgba_f32_row( _ => {} } } - scalar::gbrapf32_to_rgba_f32_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f32_row::(g, b, r, a, out, width); } // ---- Gbrapf32 → f16 RGBA (fused narrow, source α) --------------------------- /// Dispatch `gbrapf32_to_rgba_f16_row`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf32_to_rgba_f16_row( +pub(crate) fn gbrapf32_to_rgba_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -863,9 +863,9 @@ pub(crate) fn gbrapf32_to_rgba_f16_row( // fp16 feature needed for vcvt_f16_f32. if fp16_available() { // SAFETY: NEON + fp16 verified available. - unsafe { arch::neon::gbrapf32_to_rgba_f16_row_fp16(g, b, r, a, out, width); } + unsafe { arch::neon::gbrapf32_to_rgba_f16_row_fp16::(g, b, r, a, out, width); } } else { - scalar::gbrapf32_to_rgba_f16_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f16_row::(g, b, r, a, out, width); } return; } @@ -874,27 +874,27 @@ pub(crate) fn gbrapf32_to_rgba_f16_row( if avx512_available() { if f16c_available() { // SAFETY: AVX-512F + BW + F16C verified available. - unsafe { arch::x86_avx512::gbrapf32_to_rgba_f16_row_f16c(g, b, r, a, out, width); } + unsafe { arch::x86_avx512::gbrapf32_to_rgba_f16_row_f16c::(g, b, r, a, out, width); } } else { - scalar::gbrapf32_to_rgba_f16_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f16_row::(g, b, r, a, out, width); } return; } if avx2_available() { if f16c_available() { // SAFETY: AVX2 + F16C verified available. - unsafe { arch::x86_avx2::gbrapf32_to_rgba_f16_row_f16c(g, b, r, a, out, width); } + unsafe { arch::x86_avx2::gbrapf32_to_rgba_f16_row_f16c::(g, b, r, a, out, width); } } else { - scalar::gbrapf32_to_rgba_f16_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f16_row::(g, b, r, a, out, width); } return; } if sse41_available() { if f16c_available() { // SAFETY: SSE4.1 + F16C verified available. - unsafe { arch::x86_sse41::gbrapf32_to_rgba_f16_row_f16c(g, b, r, a, out, width); } + unsafe { arch::x86_sse41::gbrapf32_to_rgba_f16_row_f16c::(g, b, r, a, out, width); } } else { - scalar::gbrapf32_to_rgba_f16_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f16_row::(g, b, r, a, out, width); } return; } @@ -903,21 +903,21 @@ pub(crate) fn gbrapf32_to_rgba_f16_row( if simd128_available() { // wasm32 has no native f16 narrowing — delegates to scalar narrow. // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrapf32_to_rgba_f16_row(g, b, r, a, out, width); } + unsafe { arch::wasm_simd128::gbrapf32_to_rgba_f16_row::(g, b, r, a, out, width); } return; } }, _ => {} } } - scalar::gbrapf32_to_rgba_f16_row(g, b, r, a, out, width); + scalar::gbrapf32_to_rgba_f16_row::(g, b, r, a, out, width); } // ---- Gbrpf16 → f16 RGB (lossless, f16-native) -------------------------------- /// Dispatch `gbrpf16_to_rgb_f16_row` (lossless f16 interleave). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf16_to_rgb_f16_row( +pub(crate) fn gbrpf16_to_rgb_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -935,45 +935,45 @@ pub(crate) fn gbrpf16_to_rgb_f16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available (no fp16 needed — lossless u16 reinterpret). 
- unsafe { arch::neon::gbrpf16_to_rgb_f16_row(g, b, r, out, width); } + unsafe { arch::neon::gbrpf16_to_rgb_f16_row::(g, b, r, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time (lossless, delegates scalar). - unsafe { arch::wasm_simd128::gbrpf16_to_rgb_f16_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf16_to_rgb_f16_row::(g, b, r, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available (no F16C needed — lossless u16 reinterpret). - unsafe { arch::x86_avx512::gbrpf16_to_rgb_f16_row(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf16_to_rgb_f16_row::(g, b, r, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available (no F16C needed — lossless u16 reinterpret). - unsafe { arch::x86_avx2::gbrpf16_to_rgb_f16_row(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf16_to_rgb_f16_row::(g, b, r, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available (no F16C needed — lossless u16 reinterpret). - unsafe { arch::x86_sse41::gbrpf16_to_rgb_f16_row(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf16_to_rgb_f16_row::(g, b, r, out, width); } return; } }, _ => {} } } - scalar_f16::gbrpf16_to_rgb_f16_row(g, b, r, out, width); + scalar_f16::gbrpf16_to_rgb_f16_row::(g, b, r, out, width); } // ---- Gbrpf16 → f16 RGBA (lossless, f16-native) ------------------------------ /// Dispatch `gbrpf16_to_rgba_f16_row` (lossless f16 interleave + α = f16(1.0)). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf16_to_rgba_f16_row( +pub(crate) fn gbrpf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -991,45 +991,45 @@ pub(crate) fn gbrpf16_to_rgba_f16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available (no fp16 needed — lossless u16 reinterpret). - unsafe { arch::neon::gbrpf16_to_rgba_f16_row(g, b, r, out, width); } + unsafe { arch::neon::gbrpf16_to_rgba_f16_row::(g, b, r, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time (lossless, delegates scalar). - unsafe { arch::wasm_simd128::gbrpf16_to_rgba_f16_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf16_to_rgba_f16_row::(g, b, r, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available (no F16C needed — lossless u16 reinterpret). - unsafe { arch::x86_avx512::gbrpf16_to_rgba_f16_row(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf16_to_rgba_f16_row::(g, b, r, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available (no F16C needed — lossless u16 reinterpret). - unsafe { arch::x86_avx2::gbrpf16_to_rgba_f16_row(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf16_to_rgba_f16_row::(g, b, r, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available (no F16C needed — lossless u16 reinterpret). 
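The recurring SAFETY note here, "no fp16 / F16C needed, lossless u16 reinterpret", works because a lossless f16 interleave never interprets the floating-point value: the planes can be treated as opaque u16 bit-patterns and shuffled as integers, which every baseline SIMD tier supports. A scalar illustration of the idea, assuming the `half` crate as the patch does:

    use half::f16; // assumes the `half` crate, as in the patch

    /// Lossless interleave via u16 bit-patterns: NaN, Inf, and HDR payloads
    /// survive bit-exact because no float conversion ever happens.
    fn interleave_bits(g: &[f16], b: &[f16], r: &[f16], out: &mut [f16]) {
        for x in 0..g.len() {
            // to_bits()/from_bits() round-trips the raw u16 pattern unchanged.
            out[x * 3] = f16::from_bits(r[x].to_bits());
            out[x * 3 + 1] = f16::from_bits(g[x].to_bits());
            out[x * 3 + 2] = f16::from_bits(b[x].to_bits());
        }
    }

    fn main() {
        let nan = f16::from_bits(0x7E01); // a NaN carrying a payload
        let mut out = [f16::ZERO; 3];
        interleave_bits(&[nan], &[nan], &[nan], &mut out);
        assert_eq!(out[0].to_bits(), 0x7E01); // payload preserved bit-exact
    }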
- unsafe { arch::x86_sse41::gbrpf16_to_rgba_f16_row(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf16_to_rgba_f16_row::(g, b, r, out, width); } return; } }, _ => {} } } - scalar_f16::gbrpf16_to_rgba_f16_row(g, b, r, out, width); + scalar_f16::gbrpf16_to_rgba_f16_row::(g, b, r, out, width); } // ---- Gbrapf16 → f16 RGBA (lossless, source α) -------------------------------- /// Dispatch `gbrapf16_to_rgba_f16_row` (lossless f16 interleave + source α). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf16_to_rgba_f16_row( +pub(crate) fn gbrapf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1049,38 +1049,38 @@ pub(crate) fn gbrapf16_to_rgba_f16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified available (no fp16 needed — lossless u16 reinterpret). - unsafe { arch::neon::gbrapf16_to_rgba_f16_row(g, b, r, a, out, width); } + unsafe { arch::neon::gbrapf16_to_rgba_f16_row::(g, b, r, a, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time (lossless, delegates scalar). - unsafe { arch::wasm_simd128::gbrapf16_to_rgba_f16_row(g, b, r, a, out, width); } + unsafe { arch::wasm_simd128::gbrapf16_to_rgba_f16_row::(g, b, r, a, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512F + BW verified available (no F16C needed — lossless u16 reinterpret). - unsafe { arch::x86_avx512::gbrapf16_to_rgba_f16_row(g, b, r, a, out, width); } + unsafe { arch::x86_avx512::gbrapf16_to_rgba_f16_row::(g, b, r, a, out, width); } return; } if avx2_available() { // SAFETY: AVX2 verified available (no F16C needed — lossless u16 reinterpret). - unsafe { arch::x86_avx2::gbrapf16_to_rgba_f16_row(g, b, r, a, out, width); } + unsafe { arch::x86_avx2::gbrapf16_to_rgba_f16_row::(g, b, r, a, out, width); } return; } if sse41_available() { // SAFETY: SSE4.1 verified available (no F16C needed — lossless u16 reinterpret). - unsafe { arch::x86_sse41::gbrapf16_to_rgba_f16_row(g, b, r, a, out, width); } + unsafe { arch::x86_sse41::gbrapf16_to_rgba_f16_row::(g, b, r, a, out, width); } return; } }, _ => {} } } - scalar_f16::gbrapf16_to_rgba_f16_row(g, b, r, a, out, width); + scalar_f16::gbrapf16_to_rgba_f16_row::(g, b, r, a, out, width); } // ---- Gbrpf16 → u16 RGB (fp16 NEON / F16C x86 widen / wasm simd128 / scalar) -- @@ -1088,7 +1088,7 @@ pub(crate) fn gbrapf16_to_rgba_f16_row( /// Dispatch `gbrpf16_to_rgb_u16_row`: NEON fp16 or F16C x86 widen+SIMD when /// available, wasm-simd128 widen+SIMD on wasm32, else scalar widen fallback. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf16_to_rgb_u16_row( +pub(crate) fn gbrpf16_to_rgb_u16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1106,31 +1106,31 @@ pub(crate) fn gbrpf16_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() && fp16_available() { // SAFETY: NEON + fp16 verified available. - unsafe { arch::neon::gbrpf16_to_rgb_u16_row_fp16(g, b, r, out, width); } + unsafe { arch::neon::gbrpf16_to_rgb_u16_row_fp16::(g, b, r, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf16_to_rgb_u16_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf16_to_rgb_u16_row::(g, b, r, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { // SAFETY: AVX-512F + BW + F16C verified available. 
- unsafe { arch::x86_avx512::gbrpf16_to_rgb_u16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf16_to_rgb_u16_row_f16c::(g, b, r, out, width); } return; } if avx2_available() && f16c_available() { // SAFETY: AVX2 + F16C verified available. - unsafe { arch::x86_avx2::gbrpf16_to_rgb_u16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf16_to_rgb_u16_row_f16c::(g, b, r, out, width); } return; } if sse41_available() && f16c_available() { // SAFETY: SSE4.1 + F16C verified available. - unsafe { arch::x86_sse41::gbrpf16_to_rgb_u16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf16_to_rgb_u16_row_f16c::(g, b, r, out, width); } return; } }, @@ -1150,7 +1150,7 @@ pub(crate) fn gbrpf16_to_rgb_u16_row( bf[i] = b[offset + i].to_f32(); rf[i] = r[offset + i].to_f32(); } - scalar::gbrpf32_to_rgb_u16_row(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 3..], n); + scalar::gbrpf32_to_rgb_u16_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 3..], n); offset += n; } } @@ -1160,7 +1160,7 @@ pub(crate) fn gbrpf16_to_rgb_u16_row( /// Dispatch `gbrpf16_to_rgba_u16_row`: NEON fp16 or F16C x86 widen+SIMD when /// available, wasm-simd128 widen+SIMD on wasm32, else scalar widen fallback. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf16_to_rgba_u16_row( +pub(crate) fn gbrpf16_to_rgba_u16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1178,31 +1178,31 @@ pub(crate) fn gbrpf16_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() && fp16_available() { // SAFETY: NEON + fp16 verified available. - unsafe { arch::neon::gbrpf16_to_rgba_u16_row_fp16(g, b, r, out, width); } + unsafe { arch::neon::gbrpf16_to_rgba_u16_row_fp16::(g, b, r, out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf16_to_rgba_u16_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf16_to_rgba_u16_row::(g, b, r, out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() && f16c_available() { // SAFETY: AVX-512F + BW + F16C verified available. - unsafe { arch::x86_avx512::gbrpf16_to_rgba_u16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf16_to_rgba_u16_row_f16c::(g, b, r, out, width); } return; } if avx2_available() && f16c_available() { // SAFETY: AVX2 + F16C verified available. - unsafe { arch::x86_avx2::gbrpf16_to_rgba_u16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf16_to_rgba_u16_row_f16c::(g, b, r, out, width); } return; } if sse41_available() && f16c_available() { // SAFETY: SSE4.1 + F16C verified available. - unsafe { arch::x86_sse41::gbrpf16_to_rgba_u16_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf16_to_rgba_u16_row_f16c::(g, b, r, out, width); } return; } }, @@ -1222,7 +1222,7 @@ pub(crate) fn gbrpf16_to_rgba_u16_row( bf[i] = b[offset + i].to_f32(); rf[i] = r[offset + i].to_f32(); } - scalar::gbrpf32_to_rgba_u16_row(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 4..], n); + scalar::gbrpf32_to_rgba_u16_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 4..], n); offset += n; } } @@ -1232,7 +1232,7 @@ pub(crate) fn gbrpf16_to_rgba_u16_row( /// Dispatch `gbrpf16_to_rgb_row`: NEON fp16 or SSE4.1+F16C widening when /// available, wasm-simd128 widen+SIMD on wasm32, else scalar fallback. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf16_to_rgb_row( +pub(crate) fn gbrpf16_to_rgb_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1251,7 +1251,7 @@ pub(crate) fn gbrpf16_to_rgb_row( if neon_available() { if fp16_available() { // SAFETY: NEON + fp16 verified available. - unsafe { arch::neon::gbrpf16_to_rgb_row_fp16(g, b, r, out, width); } + unsafe { arch::neon::gbrpf16_to_rgb_row_fp16::(g, b, r, out, width); } } else { // NEON available but no fp16 — widen scalar, then NEON f32→u8. const CHUNK: usize = 64; @@ -1268,7 +1268,7 @@ pub(crate) fn gbrpf16_to_rgb_row( } // SAFETY: NEON verified available. unsafe { - arch::neon::gbrpf32_to_rgb_row(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 3..], n); + arch::neon::gbrpf32_to_rgb_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 3..], n); } offset += n; } @@ -1279,7 +1279,7 @@ pub(crate) fn gbrpf16_to_rgb_row( target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf16_to_rgb_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf16_to_rgb_row::(g, b, r, out, width); } return; } }, @@ -1287,7 +1287,7 @@ pub(crate) fn gbrpf16_to_rgb_row( if avx512_available() { if f16c_available() { // SAFETY: AVX-512F + BW + F16C verified available. - unsafe { arch::x86_avx512::gbrpf16_to_rgb_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf16_to_rgb_row_f16c::(g, b, r, out, width); } } else { // AVX-512 available but no F16C — widen scalar, then AVX-512 f32→u8. const CHUNK: usize = 64; @@ -1304,7 +1304,7 @@ pub(crate) fn gbrpf16_to_rgb_row( } // SAFETY: AVX-512F + BW verified available. unsafe { - arch::x86_avx512::gbrpf32_to_rgb_row( + arch::x86_avx512::gbrpf32_to_rgb_row::( &gf[..n], &bf[..n], &rf[..n], @@ -1320,7 +1320,7 @@ pub(crate) fn gbrpf16_to_rgb_row( if avx2_available() { if f16c_available() { // SAFETY: AVX2 + F16C verified available. - unsafe { arch::x86_avx2::gbrpf16_to_rgb_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf16_to_rgb_row_f16c::(g, b, r, out, width); } } else { // AVX2 available but no F16C — widen scalar, then AVX2 f32→u8. const CHUNK: usize = 64; @@ -1337,7 +1337,7 @@ pub(crate) fn gbrpf16_to_rgb_row( } // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbrpf32_to_rgb_row( + arch::x86_avx2::gbrpf32_to_rgb_row::( &gf[..n], &bf[..n], &rf[..n], @@ -1353,7 +1353,7 @@ pub(crate) fn gbrpf16_to_rgb_row( if sse41_available() { if f16c_available() { // SAFETY: SSE4.1 + F16C verified available. - unsafe { arch::x86_sse41::gbrpf16_to_rgb_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf16_to_rgb_row_f16c::(g, b, r, out, width); } } else { // SSE4.1 available but no F16C — widen scalar, then SSE4.1 f32→u8. const CHUNK: usize = 64; @@ -1370,7 +1370,7 @@ pub(crate) fn gbrpf16_to_rgb_row( } // SAFETY: SSE4.1 verified available. 
unsafe { - arch::x86_sse41::gbrpf32_to_rgb_row( + arch::x86_sse41::gbrpf32_to_rgb_row::( &gf[..n], &bf[..n], &rf[..n], @@ -1400,7 +1400,7 @@ pub(crate) fn gbrpf16_to_rgb_row( bf[i] = b[offset + i].to_f32(); rf[i] = r[offset + i].to_f32(); } - scalar::gbrpf32_to_rgb_row(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 3..], n); + scalar::gbrpf32_to_rgb_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 3..], n); offset += n; } } @@ -1410,7 +1410,7 @@ pub(crate) fn gbrpf16_to_rgb_row( /// Dispatch `gbrpf16_to_rgba_row`: NEON fp16 or SSE4.1+F16C widening when /// available, wasm-simd128 widen+SIMD on wasm32, else scalar fallback. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf16_to_rgba_row( +pub(crate) fn gbrpf16_to_rgba_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -1429,7 +1429,7 @@ pub(crate) fn gbrpf16_to_rgba_row( if neon_available() { if fp16_available() { // SAFETY: NEON + fp16 verified available. - unsafe { arch::neon::gbrpf16_to_rgba_row_fp16(g, b, r, out, width); } + unsafe { arch::neon::gbrpf16_to_rgba_row_fp16::(g, b, r, out, width); } } else { // NEON available but no fp16 — widen scalar, then NEON f32→u8. const CHUNK: usize = 64; @@ -1446,7 +1446,7 @@ pub(crate) fn gbrpf16_to_rgba_row( } // SAFETY: NEON verified available. unsafe { - arch::neon::gbrpf32_to_rgba_row( + arch::neon::gbrpf32_to_rgba_row::( &gf[..n], &bf[..n], &rf[..n], @@ -1463,7 +1463,7 @@ pub(crate) fn gbrpf16_to_rgba_row( target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 verified available at compile time. - unsafe { arch::wasm_simd128::gbrpf16_to_rgba_row(g, b, r, out, width); } + unsafe { arch::wasm_simd128::gbrpf16_to_rgba_row::(g, b, r, out, width); } return; } }, @@ -1471,7 +1471,7 @@ pub(crate) fn gbrpf16_to_rgba_row( if avx512_available() { if f16c_available() { // SAFETY: AVX-512F + BW + F16C verified available. - unsafe { arch::x86_avx512::gbrpf16_to_rgba_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx512::gbrpf16_to_rgba_row_f16c::(g, b, r, out, width); } } else { // AVX-512 available but no F16C — widen scalar, then AVX-512 f32→u8. const CHUNK: usize = 64; @@ -1488,7 +1488,7 @@ pub(crate) fn gbrpf16_to_rgba_row( } // SAFETY: AVX-512F + BW verified available. unsafe { - arch::x86_avx512::gbrpf32_to_rgba_row( + arch::x86_avx512::gbrpf32_to_rgba_row::( &gf[..n], &bf[..n], &rf[..n], @@ -1504,7 +1504,7 @@ pub(crate) fn gbrpf16_to_rgba_row( if avx2_available() { if f16c_available() { // SAFETY: AVX2 + F16C verified available. - unsafe { arch::x86_avx2::gbrpf16_to_rgba_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_avx2::gbrpf16_to_rgba_row_f16c::(g, b, r, out, width); } } else { // AVX2 available but no F16C — widen scalar, then AVX2 f32→u8. const CHUNK: usize = 64; @@ -1521,7 +1521,7 @@ pub(crate) fn gbrpf16_to_rgba_row( } // SAFETY: AVX2 verified available. unsafe { - arch::x86_avx2::gbrpf32_to_rgba_row( + arch::x86_avx2::gbrpf32_to_rgba_row::( &gf[..n], &bf[..n], &rf[..n], @@ -1537,7 +1537,7 @@ pub(crate) fn gbrpf16_to_rgba_row( if sse41_available() { if f16c_available() { // SAFETY: SSE4.1 + F16C verified available. - unsafe { arch::x86_sse41::gbrpf16_to_rgba_row_f16c(g, b, r, out, width); } + unsafe { arch::x86_sse41::gbrpf16_to_rgba_row_f16c::(g, b, r, out, width); } } else { // SSE4.1 available but no F16C — widen scalar, then SSE4.1 f32→u8. const CHUNK: usize = 64; @@ -1554,7 +1554,7 @@ pub(crate) fn gbrpf16_to_rgba_row( } // SAFETY: SSE4.1 verified available. 
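When the base SIMD tier is present but the half-width conversion feature (NEON fp16 or x86 F16C) is not, the branches above widen f16 to f32 through fixed 64-element stack buffers and hand each chunk to the f32 SIMD kernel, so the fallback path allocates nothing. A compact standalone rendition of that loop; `f32_row` is a hypothetical stand-in for the arch kernel being chunk-fed:

    use half::f16; // assumes the `half` crate, as in the patch

    /// Chunked widen-then-convert fallback: fixed stack buffers, no heap.
    /// `f32_row` stands in for the arch-specific f32 SIMD kernel.
    fn widen_fallback(g: &[f16], b: &[f16], r: &[f16], out: &mut [u8], width: usize) {
        const CHUNK: usize = 64;
        let (mut gf, mut bf, mut rf) = ([0f32; CHUNK], [0f32; CHUNK], [0f32; CHUNK]);
        let mut offset = 0;
        while offset < width {
            let n = (width - offset).min(CHUNK);
            for i in 0..n {
                gf[i] = g[offset + i].to_f32(); // scalar f16 → f32 widen
                bf[i] = b[offset + i].to_f32();
                rf[i] = r[offset + i].to_f32();
            }
            f32_row(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 3..], n);
            offset += n;
        }
    }

    fn f32_row(g: &[f32], b: &[f32], r: &[f32], out: &mut [u8], n: usize) {
        for x in 0..n {
            out[x * 3] = (r[x].clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
            out[x * 3 + 1] = (g[x].clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
            out[x * 3 + 2] = (b[x].clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
        }
    }

    fn main() {
        let plane = vec![f16::from_f32(0.5); 100];
        let mut out = vec![0u8; 300];
        widen_fallback(&plane, &plane, &plane, &mut out, 100); // two chunks: 64 + 36
        assert!(out.iter().all(|&v| v == 128));
    }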
unsafe { - arch::x86_sse41::gbrpf32_to_rgba_row( + arch::x86_sse41::gbrpf32_to_rgba_row::( &gf[..n], &bf[..n], &rf[..n], @@ -1584,7 +1584,7 @@ pub(crate) fn gbrpf16_to_rgba_row( bf[i] = b[offset + i].to_f32(); rf[i] = r[offset + i].to_f32(); } - scalar::gbrpf32_to_rgba_row(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 4..], n); + scalar::gbrpf32_to_rgba_row::(&gf[..n], &bf[..n], &rf[..n], &mut out[offset * 4..], n); offset += n; } } diff --git a/src/row/scalar/planar_gbr_f16.rs b/src/row/scalar/planar_gbr_f16.rs index d3b76acb..fa74c199 100644 --- a/src/row/scalar/planar_gbr_f16.rs +++ b/src/row/scalar/planar_gbr_f16.rs @@ -7,6 +7,14 @@ //! from [`super::planar_gbr_float`]. No separate f16-source kernels are needed //! for those paths. //! +//! ## Endian support +//! +//! All `` kernels treat the source planes as opaque `u16` +//! bit-patterns (which they already are for the lossless f16 paths). +//! When `BE = true` each u16 element is byte-swapped before being written to +//! the interleaved output buffer — i.e. we load a big-endian f16 bit-pattern +//! and emit it as host-native f16. +//! //! ## Kernels in this file //! //! | Kernel | In | Out | Notes | @@ -23,6 +31,23 @@ // Kernels are not yet consumed by any sinker (Task 8 wires MixedSinker impls). #![cfg_attr(not(test), allow(dead_code))] +// ---- shared BE helper ------------------------------------------------------- + +/// Load a single `half::f16` sample with optional BE byte-swap. +/// +/// When `BE = true` the two bytes of the f16 bit-pattern are reversed (i.e. +/// we load a big-endian f16 from disk and convert to host-native). When +/// `BE = false` the value is returned as-is. The dead branch is eliminated +/// by the compiler when the caller is monomorphized. +#[inline(always)] +fn load_f16(plane: &[half::f16], i: usize) -> half::f16 { + if BE { + half::f16::from_bits(plane[i].to_bits().swap_bytes()) + } else { + plane[i] + } +} + // ---- Gbrpf16 → f16 RGB (lossless interleave) -------------------------------- /// Interleaves planar G/B/R `half::f16` rows into packed `R, G, B` @@ -30,8 +55,11 @@ /// /// Pure gather-scatter — no conversion. HDR values, NaN, and Inf are /// preserved bit-exact. Output order is **R, G, B** per pixel. +/// +/// `BE = true`: each f16 element is byte-swapped (BE → host-native) before +/// being written to the interleaved output. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf16_to_rgb_f16_row( +pub(crate) fn gbrpf16_to_rgb_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -44,9 +72,9 @@ pub(crate) fn gbrpf16_to_rgb_f16_row( debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let dst = x * 3; - rgb_out[dst] = r[x]; - rgb_out[dst + 1] = g[x]; - rgb_out[dst + 2] = b[x]; + rgb_out[dst] = load_f16::(r, x); + rgb_out[dst + 1] = load_f16::(g, x); + rgb_out[dst + 2] = load_f16::(b, x); } } @@ -56,8 +84,11 @@ pub(crate) fn gbrpf16_to_rgb_f16_row( /// **`half::f16`** with constant opaque α = `half::f16::from_f32(1.0)`. /// /// Used for `Gbrpf16` sources (no α plane) when `with_rgba_f16` is requested. +/// +/// `BE = true`: each f16 element is byte-swapped (BE → host-native) before +/// being written. α is always host-native f16(1.0) regardless of `BE`. 
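All the scalar f16 kernels in this file read their planes through `load_f16`, which reverses the two bytes of the stored bit-pattern when `BE = true`. Since `swap_bytes` is its own inverse, encoding a fixture by swapping once and decoding it with the BE kernel must reproduce the original, which is exactly the property the parity tests below lean on. A small self-contained check, assuming the `half` crate:

    use half::f16; // assumes the `half` crate, as in the patch

    /// BE load: reverse the two bytes of the stored bit-pattern, then
    /// reinterpret as host-native f16, mirroring the `load_f16` helper.
    fn load_be(v: f16) -> f16 {
        f16::from_bits(v.to_bits().swap_bytes())
    }

    fn main() {
        let x = f16::from_f32(0.25);
        let be_encoded = load_be(x);        // swap once: simulate a BE source
        assert_eq!(load_be(be_encoded), x); // swap twice: back to the original
    }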
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf16_to_rgba_f16_row( +pub(crate) fn gbrpf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -71,9 +102,9 @@ pub(crate) fn gbrpf16_to_rgba_f16_row( let one_f16 = half::f16::from_f32(1.0); for x in 0..width { let dst = x * 4; - rgba_out[dst] = r[x]; - rgba_out[dst + 1] = g[x]; - rgba_out[dst + 2] = b[x]; + rgba_out[dst] = load_f16::(r, x); + rgba_out[dst + 1] = load_f16::(g, x); + rgba_out[dst + 2] = load_f16::(b, x); rgba_out[dst + 3] = one_f16; } } @@ -85,8 +116,10 @@ pub(crate) fn gbrpf16_to_rgba_f16_row( /// /// Pure gather-scatter. All four channels including α are copied losslessly — /// HDR, NaN, and Inf preserved bit-exact. +/// +/// `BE = true`: each f16 element (including α) is byte-swapped before write. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf16_to_rgba_f16_row( +pub(crate) fn gbrapf16_to_rgba_f16_row( g: &[half::f16], b: &[half::f16], r: &[half::f16], @@ -101,10 +134,10 @@ pub(crate) fn gbrapf16_to_rgba_f16_row( debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let dst = x * 4; - rgba_out[dst] = r[x]; - rgba_out[dst + 1] = g[x]; - rgba_out[dst + 2] = b[x]; - rgba_out[dst + 3] = a[x]; + rgba_out[dst] = load_f16::(r, x); + rgba_out[dst + 1] = load_f16::(g, x); + rgba_out[dst + 2] = load_f16::(b, x); + rgba_out[dst + 3] = load_f16::(a, x); } } @@ -135,6 +168,12 @@ pub(crate) fn copy_alpha_plane_f16(alpha: &[half::f16], rgba_out: &mut [half::f1 mod tests { use super::*; + // ---- helper: byte-swap a slice of f16 to simulate BE source ---------------- + + fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec { + src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() + } + // ---- gbrpf16_to_rgb_f16_row ---------------------------------------------- #[test] @@ -148,7 +187,7 @@ mod tests { let b = [half::f16::from_f32(0.5)]; let r = [half::f16::from_f32(1.0)]; let mut out = vec![half::f16::ZERO; 3]; - gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut out, 1); + gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut out, 1); assert_eq!(out[0], half::f16::from_f32(1.0), "R"); assert_eq!(out[1], half::f16::from_f32(0.25), "G"); assert_eq!(out[2], half::f16::from_f32(0.5), "B"); @@ -166,10 +205,45 @@ mod tests { let b = [half::f16::from_f32(0.0)]; let r = [half::f16::from_f32(0.0)]; let mut out = vec![half::f16::ZERO; 3]; - gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut out, 1); + gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut out, 1); assert_eq!(out[1], hdr, "HDR G preserved bit-exact"); } + #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn gbrpf16_to_rgb_f16_be_parity() { + // BE-encoded source must decode to same output as LE source. 
+ let g = [ + half::f16::from_f32(0.0), + half::f16::from_f32(0.25), + half::f16::from_f32(0.5), + half::f16::from_f32(1.0), + ]; + let b = [ + half::f16::from_f32(0.1), + half::f16::from_f32(0.3), + half::f16::from_f32(0.7), + half::f16::from_f32(0.9), + ]; + let r = [ + half::f16::from_f32(0.5), + half::f16::from_f32(0.8), + half::f16::from_f32(0.2), + half::f16::from_f32(0.6), + ]; + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + let mut le_out = vec![half::f16::ZERO; 4 * 3]; + let mut be_out = vec![half::f16::ZERO; 4 * 3]; + gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, 4); + gbrpf16_to_rgb_f16_row::(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf16_to_rgb_f16_row must match LE"); + } + // ---- gbrpf16_to_rgba_f16_row --------------------------------------------- #[test] @@ -182,10 +256,44 @@ mod tests { let b = [half::f16::from_f32(0.5)]; let r = [half::f16::from_f32(0.5)]; let mut out = vec![half::f16::ZERO; 4]; - gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut out, 1); + gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut out, 1); assert_eq!(out[3], half::f16::from_f32(1.0), "alpha must be f16(1.0)"); } + #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn gbrpf16_to_rgba_f16_be_parity() { + let g = [ + half::f16::from_f32(0.0), + half::f16::from_f32(0.25), + half::f16::from_f32(0.5), + half::f16::from_f32(1.0), + ]; + let b = [ + half::f16::from_f32(0.1), + half::f16::from_f32(0.3), + half::f16::from_f32(0.7), + half::f16::from_f32(0.9), + ]; + let r = [ + half::f16::from_f32(0.5), + half::f16::from_f32(0.8), + half::f16::from_f32(0.2), + half::f16::from_f32(0.6), + ]; + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + let mut le_out = vec![half::f16::ZERO; 4 * 4]; + let mut be_out = vec![half::f16::ZERO; 4 * 4]; + gbrpf16_to_rgba_f16_row::(&g, &b, &r, &mut le_out, 4); + gbrpf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf16_to_rgba_f16_row must match LE"); + } + // ---- gbrapf16_to_rgba_f16_row -------------------------------------------- #[test] @@ -199,13 +307,54 @@ mod tests { let r = [half::f16::from_f32(0.75)]; let a = [half::f16::from_f32(0.9)]; let mut out = vec![half::f16::ZERO; 4]; - gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut out, 1); + gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[0], half::f16::from_f32(0.75), "R"); assert_eq!(out[1], half::f16::from_f32(0.25), "G"); assert_eq!(out[2], half::f16::from_f32(0.5), "B"); assert_eq!(out[3], half::f16::from_f32(0.9), "A from source"); } + #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn gbrapf16_to_rgba_f16_be_parity() { + let g = [ + half::f16::from_f32(0.0), + half::f16::from_f32(0.25), + half::f16::from_f32(0.5), + half::f16::from_f32(1.0), + ]; + let b = [ + half::f16::from_f32(0.1), + half::f16::from_f32(0.3), + half::f16::from_f32(0.7), + half::f16::from_f32(0.9), + ]; + let r = [ + half::f16::from_f32(0.5), + half::f16::from_f32(0.8), + half::f16::from_f32(0.2), + half::f16::from_f32(0.6), + ]; + let a = [ + half::f16::from_f32(0.2), + half::f16::from_f32(0.4), + half::f16::from_f32(0.6), + half::f16::from_f32(0.8), + ]; + let g_be = be_encode_f16(&g); + let b_be = be_encode_f16(&b); + let r_be = be_encode_f16(&r); + let a_be = be_encode_f16(&a); + let mut le_out = vec![half::f16::ZERO; 4 * 4]; + let mut 
be_out = vec![half::f16::ZERO; 4 * 4]; + gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut le_out, 4); + gbrapf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &a_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrapf16_to_rgba_f16_row must match LE"); + } + // ---- copy_alpha_plane_f16 ------------------------------------------------ #[test] diff --git a/src/row/scalar/planar_gbr_float.rs b/src/row/scalar/planar_gbr_float.rs index 963d008f..fe519eb9 100644 --- a/src/row/scalar/planar_gbr_float.rs +++ b/src/row/scalar/planar_gbr_float.rs @@ -17,6 +17,13 @@ //! documented caller-visible behaviour; callers needing full HDR range use //! the f32 pass-through accessors. //! +//! # Endian support +//! +//! All `` kernels take the source planes as raw byte slices +//! (`&[u8]` reinterpreted as `&[u32]` / `&[u16]` with byte-swap when +//! `BE = true`). The `BE = false` path is identical to the original LE kernels +//! — the compiler eliminates the dead branch. +//! //! # Rounding (float → integer) //! //! `(y.clamp(0.0, 1.0) * scale + 0.5) as T` @@ -63,14 +70,33 @@ fn f32_to_f16(y: f32) -> half::f16 { half::f16::from_f32(y) } +/// Load a single f32 sample from a `&[f32]` plane with optional BE byte-swap. +/// +/// When `BE = true` the four bytes of the f32 representation are reversed +/// (equivalent to loading a big-endian IEEE-754 single from disk). When +/// `BE = false` the value is returned as-is (host-native / LE). +#[inline(always)] +fn load_f32(plane: &[f32], i: usize) -> f32 { + if BE { + // SAFETY: reinterpret f32 bits as u32, swap bytes, reinterpret back. + let bits = plane[i].to_bits().swap_bytes(); + f32::from_bits(bits) + } else { + plane[i] + } +} + // ---- Gbrpf32 → u8 RGB ------------------------------------------------------ /// Interleaves planar G/B/R `f32` rows into packed `R, G, B` **bytes**. /// /// Each f32 sample is clamped to `[0.0, 1.0]` and scaled to `[0, 255]` /// with round-half-up. Output order is **R, G, B** per pixel. +/// +/// When `BE = true` each f32 element is loaded as a big-endian u32 bit +/// pattern (4-byte swap before reinterpret). `BE = false` is LE / host-native. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgb_row( +pub(crate) fn gbrpf32_to_rgb_row( g: &[f32], b: &[f32], r: &[f32], @@ -83,9 +109,9 @@ pub(crate) fn gbrpf32_to_rgb_row( debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let dst = x * 3; - rgb_out[dst] = f32_to_u8(r[x]); - rgb_out[dst + 1] = f32_to_u8(g[x]); - rgb_out[dst + 2] = f32_to_u8(b[x]); + rgb_out[dst] = f32_to_u8(load_f32::(r, x)); + rgb_out[dst + 1] = f32_to_u8(load_f32::(g, x)); + rgb_out[dst + 2] = f32_to_u8(load_f32::(b, x)); } } @@ -93,8 +119,10 @@ pub(crate) fn gbrpf32_to_rgb_row( /// Interleaves planar G/B/R `f32` rows into packed `R, G, B, A` **bytes** /// with constant opaque α = `0xFF`. Used for `Gbrpf32` sources (no α plane). +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). 
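The module doc above pins the float-to-integer contract to `(y.clamp(0.0, 1.0) * scale + 0.5) as T`, plain round-half-up after clamping. For the u8 path `scale` is 255.0, so an input of 0.5 becomes `0.5 * 255 + 0.5 = 128.0`, truncating to 128, the very value the interleave test below asserts for the B channel. Standalone versions of the two helpers for reference:

    /// Round-half-up float → integer conversion, as documented:
    /// clamp to [0, 1], scale, add 0.5, truncate.
    fn f32_to_u8(y: f32) -> u8 {
        (y.clamp(0.0, 1.0) * 255.0 + 0.5) as u8
    }

    fn f32_to_u16(y: f32) -> u16 {
        (y.clamp(0.0, 1.0) * 65535.0 + 0.5) as u16
    }

    fn main() {
        assert_eq!(f32_to_u8(0.5), 128);    // 128.0 after +0.5, truncates to 128
        assert_eq!(f32_to_u8(-1.0), 0);     // clamped below
        assert_eq!(f32_to_u8(2.0), 255);    // clamped above
        assert_eq!(f32_to_u16(1.0), 65535); // full-range u16
    }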
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgba_row( +pub(crate) fn gbrpf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -107,9 +135,9 @@ pub(crate) fn gbrpf32_to_rgba_row( debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let dst = x * 4; - rgba_out[dst] = f32_to_u8(r[x]); - rgba_out[dst + 1] = f32_to_u8(g[x]); - rgba_out[dst + 2] = f32_to_u8(b[x]); + rgba_out[dst] = f32_to_u8(load_f32::(r, x)); + rgba_out[dst + 1] = f32_to_u8(load_f32::(g, x)); + rgba_out[dst + 2] = f32_to_u8(load_f32::(b, x)); rgba_out[dst + 3] = 0xFF; } } @@ -120,8 +148,10 @@ pub(crate) fn gbrpf32_to_rgba_row( /// /// Each f32 sample is clamped to `[0.0, 1.0]` and scaled to `[0, 65535]` /// with round-half-up (full-range). +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgb_u16_row( +pub(crate) fn gbrpf32_to_rgb_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -134,9 +164,9 @@ pub(crate) fn gbrpf32_to_rgb_u16_row( debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let dst = x * 3; - rgb_out[dst] = f32_to_u16(r[x]); - rgb_out[dst + 1] = f32_to_u16(g[x]); - rgb_out[dst + 2] = f32_to_u16(b[x]); + rgb_out[dst] = f32_to_u16(load_f32::(r, x)); + rgb_out[dst + 1] = f32_to_u16(load_f32::(g, x)); + rgb_out[dst + 2] = f32_to_u16(load_f32::(b, x)); } } @@ -144,8 +174,10 @@ pub(crate) fn gbrpf32_to_rgb_u16_row( /// Interleaves planar G/B/R `f32` rows into packed `R, G, B, A` **`u16`** /// with constant opaque α = `0xFFFF`. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgba_u16_row( +pub(crate) fn gbrpf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -158,9 +190,9 @@ pub(crate) fn gbrpf32_to_rgba_u16_row( debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let dst = x * 4; - rgba_out[dst] = f32_to_u16(r[x]); - rgba_out[dst + 1] = f32_to_u16(g[x]); - rgba_out[dst + 2] = f32_to_u16(b[x]); + rgba_out[dst] = f32_to_u16(load_f32::(r, x)); + rgba_out[dst + 1] = f32_to_u16(load_f32::(g, x)); + rgba_out[dst + 2] = f32_to_u16(load_f32::(b, x)); rgba_out[dst + 3] = 0xFFFF; } } @@ -171,8 +203,11 @@ pub(crate) fn gbrpf32_to_rgba_u16_row( /// /// Lossless interleave — no clamping, no rounding. HDR values > 1.0, /// NaN, and Inf are preserved bit-exact. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap) +/// before being written to the output. The output is always host-native f32. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgb_f32_row( +pub(crate) fn gbrpf32_to_rgb_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -185,9 +220,9 @@ pub(crate) fn gbrpf32_to_rgb_f32_row( debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let dst = x * 3; - rgb_out[dst] = r[x]; - rgb_out[dst + 1] = g[x]; - rgb_out[dst + 2] = b[x]; + rgb_out[dst] = load_f32::(r, x); + rgb_out[dst + 1] = load_f32::(g, x); + rgb_out[dst + 2] = load_f32::(b, x); } } @@ -195,8 +230,10 @@ pub(crate) fn gbrpf32_to_rgb_f32_row( /// Interleaves planar G/B/R `f32` rows into packed `R, G, B, A` **`f32`** /// with α = `1.0` (opaque). Lossless — HDR, NaN, and Inf preserved. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgba_f32_row( +pub(crate) fn gbrpf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -209,9 +246,9 @@ pub(crate) fn gbrpf32_to_rgba_f32_row( debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let dst = x * 4; - rgba_out[dst] = r[x]; - rgba_out[dst + 1] = g[x]; - rgba_out[dst + 2] = b[x]; + rgba_out[dst] = load_f32::(r, x); + rgba_out[dst + 1] = load_f32::(g, x); + rgba_out[dst + 2] = load_f32::(b, x); rgba_out[dst + 3] = 1.0; } } @@ -224,8 +261,10 @@ pub(crate) fn gbrpf32_to_rgba_f32_row( /// interleave in a single pass. HDR values exceeding the f16 maximum (~65504) /// saturate to `half::f16::INFINITY`. Callers needing full HDR range use /// `gbrpf32_to_rgb_f32_row` instead. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgb_f16_row( +pub(crate) fn gbrpf32_to_rgb_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -238,9 +277,9 @@ pub(crate) fn gbrpf32_to_rgb_f16_row( debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let dst = x * 3; - rgb_out[dst] = f32_to_f16(r[x]); - rgb_out[dst + 1] = f32_to_f16(g[x]); - rgb_out[dst + 2] = f32_to_f16(b[x]); + rgb_out[dst] = f32_to_f16(load_f32::(r, x)); + rgb_out[dst + 1] = f32_to_f16(load_f32::(g, x)); + rgb_out[dst + 2] = f32_to_f16(load_f32::(b, x)); } } @@ -248,8 +287,10 @@ pub(crate) fn gbrpf32_to_rgb_f16_row( /// Interleaves planar G/B/R `f32` rows into packed `R, G, B, A` **`half::f16`** /// with α = `half::f16::from_f32(1.0)`. HDR > ~65504 saturates to f16 ±Inf. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_rgba_f16_row( +pub(crate) fn gbrpf32_to_rgba_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -263,9 +304,9 @@ pub(crate) fn gbrpf32_to_rgba_f16_row( let one_f16 = half::f16::from_f32(1.0); for x in 0..width { let dst = x * 4; - rgba_out[dst] = f32_to_f16(r[x]); - rgba_out[dst + 1] = f32_to_f16(g[x]); - rgba_out[dst + 2] = f32_to_f16(b[x]); + rgba_out[dst] = f32_to_f16(load_f32::(r, x)); + rgba_out[dst + 1] = f32_to_f16(load_f32::(g, x)); + rgba_out[dst + 2] = f32_to_f16(load_f32::(b, x)); rgba_out[dst + 3] = one_f16; } } @@ -277,9 +318,11 @@ pub(crate) fn gbrpf32_to_rgba_f16_row( /// /// The intermediate u8 RGB uses round-half-up clamping; luma is then computed /// by `rgb_to_luma_row`. `matrix` and `full_range` control the luma weighting. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn gbrpf32_to_luma_row( +pub(crate) fn gbrpf32_to_luma_row( g: &[f32], b: &[f32], r: &[f32], @@ -297,7 +340,7 @@ pub(crate) fn gbrpf32_to_luma_row( let mut offset = 0; while offset < width { let n = (width - offset).min(CHUNK); - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -323,9 +366,11 @@ pub(crate) fn gbrpf32_to_luma_row( /// The u16 luma value has the same dynamic range as the u8 path (0–255), zero- /// extended into the u16 carrier — matching the convention of packed-YUV /// `*_to_luma_u16_row` kernels for 8-bit-equivalent sources. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). 
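The fused-narrow doc comments above draw the key line between the f16 and f32 output paths: `half::f16::from_f32` uses IEEE-754 round-to-nearest-even and saturates anything beyond the f16 maximum of 65504 to ±Inf, whereas the f32 pass-through kernels keep HDR values bit-exact. A quick demonstration, assuming the `half` crate:

    use half::f16; // assumes the `half` crate, as in the patch

    fn main() {
        assert_eq!(f16::from_f32(0.5).to_f32(), 0.5);   // in range: exact here
        assert_eq!(f16::from_f32(65504.0), f16::MAX);   // largest finite f16
        assert!(f16::from_f32(70_000.0).is_infinite()); // HDR saturates to +Inf
        // The f32 pass-through kernels would carry 70 000.0 bit-exact instead.
    }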
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn gbrpf32_to_luma_u16_row( +pub(crate) fn gbrpf32_to_luma_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -343,7 +388,7 @@ pub(crate) fn gbrpf32_to_luma_u16_row( let mut offset = 0; while offset < width { let n = (width - offset).min(CHUNK); - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -368,8 +413,10 @@ pub(crate) fn gbrpf32_to_luma_u16_row( /// /// Matches OpenCV `cv2.COLOR_RGB2HSV` semantics: `H ∈ [0, 179]`, `S, V ∈ /// [0, 255]`. f32 values are clamped via `f32_to_u8` before the RGB→HSV step. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrpf32_to_hsv_row( +pub(crate) fn gbrpf32_to_hsv_row( g: &[f32], b: &[f32], r: &[f32], @@ -389,7 +436,7 @@ pub(crate) fn gbrpf32_to_hsv_row( let mut offset = 0; while offset < width { let n = (width - offset).min(CHUNK); - gbrpf32_to_rgb_row( + gbrpf32_to_rgb_row::( &g[offset..], &b[offset..], &r[offset..], @@ -413,8 +460,10 @@ pub(crate) fn gbrpf32_to_hsv_row( /// /// α is sourced from the `a` plane: clamped to `[0.0, 1.0]` and scaled by 255 /// with round-half-up. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf32_to_rgba_row( +pub(crate) fn gbrapf32_to_rgba_row( g: &[f32], b: &[f32], r: &[f32], @@ -429,10 +478,10 @@ pub(crate) fn gbrapf32_to_rgba_row( debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let dst = x * 4; - rgba_out[dst] = f32_to_u8(r[x]); - rgba_out[dst + 1] = f32_to_u8(g[x]); - rgba_out[dst + 2] = f32_to_u8(b[x]); - rgba_out[dst + 3] = f32_to_u8(a[x]); + rgba_out[dst] = f32_to_u8(load_f32::(r, x)); + rgba_out[dst + 1] = f32_to_u8(load_f32::(g, x)); + rgba_out[dst + 2] = f32_to_u8(load_f32::(b, x)); + rgba_out[dst + 3] = f32_to_u8(load_f32::(a, x)); } } @@ -442,8 +491,10 @@ pub(crate) fn gbrapf32_to_rgba_row( /// /// α is sourced from the `a` plane: clamped to `[0.0, 1.0]` and scaled by /// 65535 with round-half-up. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf32_to_rgba_u16_row( +pub(crate) fn gbrapf32_to_rgba_u16_row( g: &[f32], b: &[f32], r: &[f32], @@ -458,10 +509,10 @@ pub(crate) fn gbrapf32_to_rgba_u16_row( debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let dst = x * 4; - rgba_out[dst] = f32_to_u16(r[x]); - rgba_out[dst + 1] = f32_to_u16(g[x]); - rgba_out[dst + 2] = f32_to_u16(b[x]); - rgba_out[dst + 3] = f32_to_u16(a[x]); + rgba_out[dst] = f32_to_u16(load_f32::(r, x)); + rgba_out[dst + 1] = f32_to_u16(load_f32::(g, x)); + rgba_out[dst + 2] = f32_to_u16(load_f32::(b, x)); + rgba_out[dst + 3] = f32_to_u16(load_f32::(a, x)); } } @@ -471,8 +522,10 @@ pub(crate) fn gbrapf32_to_rgba_u16_row( /// /// Lossless — HDR, NaN, and Inf are preserved bit-exact in all four channels /// including α. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). 
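The HSV kernel documented above follows OpenCV's `cv2.COLOR_RGB2HSV` 8-bit convention: hue is halved into `[0, 179]` so it fits one byte, while S and V span the full `[0, 255]`. Under that convention pure red lands at H = 0 and pure green at H = 60 (120° halved). A reference sketch of the standard formula, not the crate's SIMD kernel:

    /// RGB → HSV under OpenCV's 8-bit convention (H halved into 0–179);
    /// a scalar reference only, not the crate's vectorised kernel.
    fn rgb_to_hsv(r: u8, g: u8, b: u8) -> (u8, u8, u8) {
        let (rf, gf, bf) = (r as f32, g as f32, b as f32);
        let v = rf.max(gf).max(bf);
        let min = rf.min(gf).min(bf);
        let delta = v - min;
        let s = if v == 0.0 { 0.0 } else { delta / v * 255.0 };
        let h_deg = if delta == 0.0 {
            0.0
        } else if v == rf {
            60.0 * (gf - bf) / delta
        } else if v == gf {
            120.0 + 60.0 * (bf - rf) / delta
        } else {
            240.0 + 60.0 * (rf - gf) / delta
        };
        let h = (if h_deg < 0.0 { h_deg + 360.0 } else { h_deg }) / 2.0;
        (h as u8, (s + 0.5) as u8, v as u8)
    }

    fn main() {
        assert_eq!(rgb_to_hsv(255, 0, 0), (0, 255, 255));  // pure red
        assert_eq!(rgb_to_hsv(0, 255, 0), (60, 255, 255)); // pure green: 120° / 2
    }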
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf32_to_rgba_f32_row( +pub(crate) fn gbrapf32_to_rgba_f32_row( g: &[f32], b: &[f32], r: &[f32], @@ -487,10 +540,10 @@ pub(crate) fn gbrapf32_to_rgba_f32_row( debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let dst = x * 4; - rgba_out[dst] = r[x]; - rgba_out[dst + 1] = g[x]; - rgba_out[dst + 2] = b[x]; - rgba_out[dst + 3] = a[x]; + rgba_out[dst] = load_f32::(r, x); + rgba_out[dst + 1] = load_f32::(g, x); + rgba_out[dst + 2] = load_f32::(b, x); + rgba_out[dst + 3] = load_f32::(a, x); } } @@ -501,8 +554,10 @@ pub(crate) fn gbrapf32_to_rgba_f32_row( /// /// Fused narrow: all four channels converted via IEEE-754 round-to-nearest-even /// in a single pass. HDR > ~65504 saturates to f16 ±Inf. +/// +/// `BE = true`: each f32 loaded as big-endian u32 bit pattern (4-byte swap). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn gbrapf32_to_rgba_f16_row( +pub(crate) fn gbrapf32_to_rgba_f16_row( g: &[f32], b: &[f32], r: &[f32], @@ -517,10 +572,10 @@ pub(crate) fn gbrapf32_to_rgba_f16_row( debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let dst = x * 4; - rgba_out[dst] = f32_to_f16(r[x]); - rgba_out[dst + 1] = f32_to_f16(g[x]); - rgba_out[dst + 2] = f32_to_f16(b[x]); - rgba_out[dst + 3] = f32_to_f16(a[x]); + rgba_out[dst] = f32_to_f16(load_f32::(r, x)); + rgba_out[dst + 1] = f32_to_f16(load_f32::(g, x)); + rgba_out[dst + 2] = f32_to_f16(load_f32::(b, x)); + rgba_out[dst + 3] = f32_to_f16(load_f32::(a, x)); } } @@ -531,6 +586,12 @@ mod tests { use super::*; use crate::ColorMatrix; + // ---- helper: byte-swap a slice of f32 to simulate BE source ---------------- + + fn be_encode(src: &[f32]) -> std::vec::Vec { + src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() + } + // ---- gbrpf32_to_rgb_row -------------------------------------------------- #[test] @@ -545,7 +606,7 @@ mod tests { let b = [*v; 1]; let r = [*v; 1]; let mut out = [0u8; 3]; - gbrpf32_to_rgb_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgb_row::(&g, &b, &r, &mut out, 1); assert_eq!(out[0], *e, "R: v={v}, expected={e}"); assert_eq!(out[1], *e, "G: v={v}, expected={e}"); assert_eq!(out[2], *e, "B: v={v}, expected={e}"); @@ -559,12 +620,28 @@ mod tests { let b = [0.5f32]; let r = [1.0f32]; let mut out = [0u8; 3]; - gbrpf32_to_rgb_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgb_row::(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 255, "R"); assert_eq!(out[1], 0, "G"); assert_eq!(out[2], 128, "B"); } + #[test] + fn gbrpf32_to_rgb_be_parity() { + // BE-encoded source must decode to same output as LE source. 
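The `be_encode` helper above manufactures a big-endian fixture by swapping each element's bytes, and every parity test then checks that the `BE = true` monomorphization over the swapped planes reproduces the `BE = false` output exactly. The same structure in a self-contained form, with `rgb_row` as a hypothetical stand-in for the const-generic kernel:

    /// Standalone rendition of the BE-parity test pattern used in this file;
    /// `rgb_row` is a hypothetical stand-in for the const-generic kernel.
    fn rgb_row<const BE: bool>(g: &[f32], b: &[f32], r: &[f32], out: &mut [u8]) {
        let load = |p: &[f32], i: usize| {
            if BE { f32::from_bits(p[i].to_bits().swap_bytes()) } else { p[i] }
        };
        for x in 0..g.len() {
            out[x * 3] = (load(r, x).clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
            out[x * 3 + 1] = (load(g, x).clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
            out[x * 3 + 2] = (load(b, x).clamp(0.0, 1.0) * 255.0 + 0.5) as u8;
        }
    }

    fn be_encode(src: &[f32]) -> Vec<f32> {
        src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect()
    }

    fn main() {
        let (g, b, r) = ([0.25f32, 1.0], [0.3f32, 0.9], [0.8f32, 0.6]);
        let (mut le, mut be) = ([0u8; 6], [0u8; 6]);
        rgb_row::<false>(&g, &b, &r, &mut le);
        rgb_row::<true>(&be_encode(&g), &be_encode(&b), &be_encode(&r), &mut be);
        assert_eq!(be, le); // BE kernel over BE data matches the LE baseline
    }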
+ let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = std::vec![0u8; 4 * 3]; + let mut be_out = std::vec![0u8; 4 * 3]; + gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut le_out, 4); + gbrpf32_to_rgb_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf32_to_rgb_row must match LE"); + } + // ---- gbrpf32_to_rgba_row ------------------------------------------------- #[test] @@ -573,7 +650,7 @@ mod tests { let b = [0.5f32]; let r = [0.5f32]; let mut out = [0u8; 4]; - gbrpf32_to_rgba_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut out, 1); assert_eq!(out[3], 0xFF, "alpha must be 0xFF"); } @@ -586,12 +663,27 @@ mod tests { let b = [*v; 1]; let r = [*v; 1]; let mut out = [0u8; 4]; - gbrpf32_to_rgba_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], *e, "R: v={v}"); assert_eq!(out[3], 0xFF, "alpha must remain 0xFF"); } } + #[test] + fn gbrpf32_to_rgba_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = std::vec![0u8; 4 * 4]; + let mut be_out = std::vec![0u8; 4 * 4]; + gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut le_out, 4); + gbrpf32_to_rgba_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf32_to_rgba_row must match LE"); + } + // ---- gbrpf32_to_rgb_u16_row ---------------------------------------------- #[test] @@ -605,13 +697,28 @@ mod tests { let b = [*v; 1]; let r = [*v; 1]; let mut out = [0u16; 3]; - gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], *e, "R u16: v={v}"); assert_eq!(out[1], *e, "G u16: v={v}"); assert_eq!(out[2], *e, "B u16: v={v}"); } } + #[test] + fn gbrpf32_to_rgb_u16_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = std::vec![0u16; 4 * 3]; + let mut be_out = std::vec![0u16; 4 * 3]; + gbrpf32_to_rgb_u16_row::<false>(&g, &b, &r, &mut le_out, 4); + gbrpf32_to_rgb_u16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf32_to_rgb_u16_row must match LE"); + } + // ---- gbrpf32_to_rgba_u16_row --------------------------------------------- #[test] @@ -620,10 +727,25 @@ mod tests { let b = [0.5f32]; let r = [0.5f32]; let mut out = [0u16; 4]; - gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut out, 1); assert_eq!(out[3], 0xFFFF, "alpha must be 0xFFFF"); } + #[test] + fn gbrpf32_to_rgba_u16_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = std::vec![0u16; 4 * 4]; + let mut be_out = std::vec![0u16; 4 * 4]; + gbrpf32_to_rgba_u16_row::<false>(&g, &b, &r, &mut le_out, 4); + gbrpf32_to_rgba_u16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf32_to_rgba_u16_row must match LE"); + } + // ---- gbrpf32_to_rgb_f32_row (lossless) ------------------------------------ #[test] @@ -633,7 +755,7 @@ mod tests { let b = [0.1f32, 0.2, 0.3, 0.4]; let r = [0.5f32, 0.6,
0.7, 0.8]; let mut out = [0.0f32; 12]; - gbrpf32_to_rgb_f32_row(&g, &b, &r, &mut out, 4); + gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut out, 4); // Check R channel (index 0, 3, 6, 9 in the RGB interleave) assert_eq!(out[0], r[0]); assert_eq!(out[3], r[1]); assert_eq!(out[6], r[2]); assert_eq!(out[9], r[3]); assert_eq!(out[10], g[3], "G negative preserved"); } + #[test] + fn gbrpf32_to_rgb_f32_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = std::vec![0.0f32; 4 * 3]; + let mut be_out = std::vec![0.0f32; 4 * 3]; + gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut le_out, 4); + gbrpf32_to_rgb_f32_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf32_to_rgb_f32_row must match LE"); + } + // ---- gbrpf32_to_rgba_f32_row (lossless, α = 1.0) ------------------------- #[test] @@ -654,7 +791,7 @@ mod tests { let b = [0.5f32]; let r = [0.5f32]; let mut out = [0.0f32; 4]; - gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut out, 1); assert_eq!(out[3], 1.0, "alpha must be 1.0"); } @@ -664,13 +801,28 @@ mod tests { let g = [f32::NAN]; let b = [f32::NEG_INFINITY]; let mut out = [0.0f32; 4]; - gbrpf32_to_rgba_f32_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut out, 1); assert_eq!(out[0], 2.5, "R HDR preserved"); assert!(out[1].is_nan(), "G NaN preserved"); assert!(out[2].is_infinite() && out[2] < 0.0, "B -Inf preserved"); assert_eq!(out[3], 1.0, "alpha = 1.0"); } + #[test] + fn gbrpf32_to_rgba_f32_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = std::vec![0.0f32; 4 * 4]; + let mut be_out = std::vec![0.0f32; 4 * 4]; + gbrpf32_to_rgba_f32_row::<false>(&g, &b, &r, &mut le_out, 4); + gbrpf32_to_rgba_f32_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf32_to_rgba_f32_row must match LE"); + } + // ---- gbrpf32_to_rgb_f16_row ---------------------------------------------- #[test] @@ -683,7 +835,7 @@ mod tests { let b = [0.25f32, 0.75, 0.0]; let r = [1.0f32, 0.0, 0.5]; let mut out = vec![half::f16::ZERO; 9]; - gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut out, 3); + gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut out, 3); assert_eq!(out[0], half::f16::from_f32(1.0), "R[0]"); assert_eq!(out[1], half::f16::from_f32(0.0), "G[0]"); assert_eq!(out[2], half::f16::from_f32(0.25), "B[0]"); @@ -700,13 +852,32 @@ mod tests { let b = [-70_000.0f32]; let r = [0.5f32]; let mut out = vec![half::f16::ZERO; 3]; - gbrpf32_to_rgb_f16_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut out, 1); // G maps to index 1 assert!(out[1].is_infinite() && out[1].to_f32() > 0.0, "G +Inf"); // B maps to index 2 assert!(out[2].is_infinite() && out[2].to_f32() < 0.0, "B -Inf"); } + #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn gbrpf32_to_rgb_f16_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = vec![half::f16::ZERO; 4 * 3]; + let mut be_out = vec![half::f16::ZERO; 4 * 3]; + gbrpf32_to_rgb_f16_row::<false>(&g, &b, &r, &mut le_out, 4); +
gbrpf32_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf32_to_rgb_f16_row must match LE"); + } + // ---- gbrpf32_to_rgba_f16_row --------------------------------------------- #[test] @@ -719,10 +890,29 @@ mod tests { let b = [0.5f32]; let r = [0.5f32]; let mut out = vec![half::f16::ZERO; 4]; - gbrpf32_to_rgba_f16_row(&g, &b, &r, &mut out, 1); + gbrpf32_to_rgba_f16_row::<false>(&g, &b, &r, &mut out, 1); assert_eq!(out[3], half::f16::from_f32(1.0), "alpha must be f16(1.0)"); } + #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn gbrpf32_to_rgba_f16_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = vec![half::f16::ZERO; 4 * 4]; + let mut be_out = vec![half::f16::ZERO; 4 * 4]; + gbrpf32_to_rgba_f16_row::<false>(&g, &b, &r, &mut le_out, 4); + gbrpf32_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrpf32_to_rgba_f16_row must match LE"); + } + // ---- gbrpf32_to_luma_row ------------------------------------------------- #[test] @@ -731,7 +921,7 @@ mod tests { let b = [0.0f32]; let r = [0.0f32]; let mut out = [0xFFu8; 1]; - gbrpf32_to_luma_row(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbrpf32_to_luma_row::<false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); assert_eq!(out[0], 0); } @@ -741,10 +931,25 @@ mod tests { let b = [1.0f32]; let r = [1.0f32]; let mut out = [0u8; 1]; - gbrpf32_to_luma_row(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbrpf32_to_luma_row::<false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); assert_eq!(out[0], 255); } + #[test] + fn gbrpf32_to_luma_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = std::vec![0u8; 4]; + let mut be_out = std::vec![0u8; 4]; + gbrpf32_to_luma_row::<false>(&g, &b, &r, &mut le_out, 4, ColorMatrix::Bt709, true); + gbrpf32_to_luma_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4, ColorMatrix::Bt709, true); + assert_eq!(be_out, le_out, "BE gbrpf32_to_luma_row must match LE"); + } + // ---- gbrpf32_to_luma_u16_row --------------------------------------------- #[test] @@ -753,7 +958,7 @@ mod tests { let b = [0.0f32]; let r = [0.0f32]; let mut out = [0xFFFFu16; 1]; - gbrpf32_to_luma_u16_row(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); assert_eq!(out[0], 0); } @@ -763,10 +968,25 @@ mod tests { let b = [1.0f32]; let r = [1.0f32]; let mut out = [0u16; 1]; - gbrpf32_to_luma_u16_row(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); + gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut out, 1, ColorMatrix::Bt709, true); assert_eq!(out[0], 255, "luma_u16 is zero-extended u8 luma"); } + #[test] + fn gbrpf32_to_luma_u16_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_out = std::vec![0u16; 4]; + let mut be_out = std::vec![0u16; 4]; + gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut le_out, 4, ColorMatrix::Bt709, true); + gbrpf32_to_luma_u16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, 4, ColorMatrix::Bt709, true); + assert_eq!(be_out, le_out, "BE
gbrpf32_to_luma_u16_row must match LE"); + } + // ---- gbrpf32_to_hsv_row -------------------------------------------------- #[test] @@ -777,7 +997,7 @@ mod tests { let mut h = [0xFFu8; 1]; let mut s = [0xFFu8; 1]; let mut v = [0xFFu8; 1]; - gbrpf32_to_hsv_row(&g, &b, &r, &mut h, &mut s, &mut v, 1); + gbrpf32_to_hsv_row::<false>(&g, &b, &r, &mut h, &mut s, &mut v, 1); assert_eq!(v[0], 0, "V must be 0 for black"); assert_eq!(s[0], 0, "S must be 0 for achromatic"); } @@ -790,11 +1010,32 @@ mod tests { let mut h = [0u8; 1]; let mut s = [0u8; 1]; let mut v = [0u8; 1]; - gbrpf32_to_hsv_row(&g, &b, &r, &mut h, &mut s, &mut v, 1); + gbrpf32_to_hsv_row::<false>(&g, &b, &r, &mut h, &mut s, &mut v, 1); assert_eq!(v[0], 255, "V must be 255 for white"); assert_eq!(s[0], 0, "S must be 0 for achromatic"); } + #[test] + fn gbrpf32_to_hsv_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let mut le_h = std::vec![0u8; 4]; + let mut le_s = std::vec![0u8; 4]; + let mut le_v = std::vec![0u8; 4]; + let mut be_h = std::vec![0u8; 4]; + let mut be_s = std::vec![0u8; 4]; + let mut be_v = std::vec![0u8; 4]; + gbrpf32_to_hsv_row::<false>(&g, &b, &r, &mut le_h, &mut le_s, &mut le_v, 4); + gbrpf32_to_hsv_row::<true>(&g_be, &b_be, &r_be, &mut be_h, &mut be_s, &mut be_v, 4); + assert_eq!(be_h, le_h, "BE hsv H must match LE"); + assert_eq!(be_s, le_s, "BE hsv S must match LE"); + assert_eq!(be_v, le_v, "BE hsv V must match LE"); + } + // ---- gbrapf32_to_rgba_row ------------------------------------------------ #[test] @@ -804,7 +1045,7 @@ mod tests { let r = [0.5f32]; let a = [0.5f32]; let mut out = [0u8; 4]; - gbrapf32_to_rgba_row(&g, &b, &r, &a, &mut out, 1); + gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut out, 1); // 0.5 → (0.5 * 255 + 0.5) as u8 = 128 assert_eq!(out[3], 128, "alpha from source plane"); } @@ -819,12 +1060,29 @@ mod tests { let a_low = [-0.1f32]; let mut out_high = [0u8; 4]; let mut out_low = [0u8; 4]; - gbrapf32_to_rgba_row(&g, &b, &r, &a_high, &mut out_high, 1); - gbrapf32_to_rgba_row(&g, &b, &r, &a_low, &mut out_low, 1); + gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a_high, &mut out_high, 1); + gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a_low, &mut out_low, 1); assert_eq!(out_high[3], 255, "alpha HDR clamps to 255"); assert_eq!(out_low[3], 0, "alpha negative clamps to 0"); } + #[test] + fn gbrapf32_to_rgba_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let a = [0.2f32, 0.4, 0.6, 0.8]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let a_be = be_encode(&a); + let mut le_out = std::vec![0u8; 4 * 4]; + let mut be_out = std::vec![0u8; 4 * 4]; + gbrapf32_to_rgba_row::<false>(&g, &b, &r, &a, &mut le_out, 4); + gbrapf32_to_rgba_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrapf32_to_rgba_row must match LE"); + } + // ---- gbrapf32_to_rgba_u16_row -------------------------------------------- #[test] @@ -834,7 +1092,7 @@ mod tests { let r = [0.5f32]; let a = [0.5f32]; let mut out = [0u16; 4]; - gbrapf32_to_rgba_u16_row(&g, &b, &r, &a, &mut out, 1); + gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut out, 1); // 0.5 → (0.5 * 65535 + 0.5) as u16 = 32768 assert_eq!(out[3], 32768, "u16 alpha from source plane"); } @@ -848,12 +1106,29 @@ mod tests { let a_low = [-0.1f32]; let mut out_high = [0u16; 4]; let mut out_low = [0u16; 4]; - gbrapf32_to_rgba_u16_row(&g,
&b, &r, &a_high, &mut out_high, 1); - gbrapf32_to_rgba_u16_row(&g, &b, &r, &a_low, &mut out_low, 1); + gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a_high, &mut out_high, 1); + gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a_low, &mut out_low, 1); assert_eq!(out_high[3], 65535, "u16 alpha HDR clamps to 65535"); assert_eq!(out_low[3], 0, "u16 alpha negative clamps to 0"); } + #[test] + fn gbrapf32_to_rgba_u16_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let a = [0.2f32, 0.4, 0.6, 0.8]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let a_be = be_encode(&a); + let mut le_out = std::vec![0u16; 4 * 4]; + let mut be_out = std::vec![0u16; 4 * 4]; + gbrapf32_to_rgba_u16_row::<false>(&g, &b, &r, &a, &mut le_out, 4); + gbrapf32_to_rgba_u16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrapf32_to_rgba_u16_row must match LE"); + } + // ---- gbrapf32_to_rgba_f32_row (lossless source α) ------------------------- #[test] @@ -864,7 +1139,7 @@ mod tests { let r = [0.5f32]; let a = [2.5f32]; let mut out = [0.0f32; 4]; - gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut out, 1); + gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[3], 2.5, "HDR alpha preserved bit-exact"); } @@ -875,10 +1150,27 @@ mod tests { let r = [0.5f32]; let a = [f32::NAN]; let mut out = [0.0f32; 4]; - gbrapf32_to_rgba_f32_row(&g, &b, &r, &a, &mut out, 1); + gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut out, 1); assert!(out[3].is_nan(), "NaN alpha preserved"); } + #[test] + fn gbrapf32_to_rgba_f32_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let a = [0.2f32, 0.4, 0.6, 0.8]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let a_be = be_encode(&a); + let mut le_out = std::vec![0.0f32; 4 * 4]; + let mut be_out = std::vec![0.0f32; 4 * 4]; + gbrapf32_to_rgba_f32_row::<false>(&g, &b, &r, &a, &mut le_out, 4); + gbrapf32_to_rgba_f32_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, 4); + assert_eq!(be_out, le_out, "BE gbrapf32_to_rgba_f32_row must match LE"); + } + // ---- gbrapf32_to_rgba_f16_row -------------------------------------------- #[test] @@ -892,7 +1184,7 @@ mod tests { let r = [0.5f32]; let a = [0.75f32]; let mut out = vec![half::f16::ZERO; 4]; - gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut out, 1); + gbrapf32_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut out, 1); assert_eq!(out[3], half::f16::from_f32(0.75), "f16 alpha from source"); } @@ -907,10 +1199,31 @@ mod tests { let r = [0.5f32]; let a = [70_000.0f32]; let mut out = vec![half::f16::ZERO; 4]; - gbrapf32_to_rgba_f16_row(&g, &b, &r, &a, &mut out, 1); + gbrapf32_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut out, 1); assert!( out[3].is_infinite() && out[3].to_f32() > 0.0, "HDR alpha saturates to +Inf" ); } + + #[test] + #[cfg_attr( + miri, + ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri" + )] + fn gbrapf32_to_rgba_f16_be_parity() { + let g = [0.0f32, 0.25, 0.5, 1.0]; + let b = [0.1f32, 0.3, 0.7, 0.9]; + let r = [0.5f32, 0.8, 0.2, 0.6]; + let a = [0.2f32, 0.4, 0.6, 0.8]; + let g_be = be_encode(&g); + let b_be = be_encode(&b); + let r_be = be_encode(&r); + let a_be = be_encode(&a); + let mut le_out = vec![half::f16::ZERO; 4 * 4]; + let mut be_out = vec![half::f16::ZERO; 4 * 4]; + gbrapf32_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, 4); + gbrapf32_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out,
4); + assert_eq!(be_out, le_out, "BE gbrapf32_to_rgba_f16_row must match LE"); + } } diff --git a/src/sinker/mixed/planar_gbr_f16.rs b/src/sinker/mixed/planar_gbr_f16.rs index 4e1106c5..1873b360 100644 --- a/src/sinker/mixed/planar_gbr_f16.rs +++ b/src/sinker/mixed/planar_gbr_f16.rs @@ -309,7 +309,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf16_to_rgb_f16_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgb_f16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -321,7 +321,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { height: h, channels: 4, })?; - gbrpf16_to_rgba_f16_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgba_f16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- Paths that require widening f16 → f32 --------------------------- @@ -358,29 +358,29 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::<false>(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_f32_row(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_f32_row::<false>(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::<false>(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrpf32_to_rgba_u16_row(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgba_u16_row::<false>(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.luma.as_deref_mut() { - gbrpf32_to_luma_row( + gbrpf32_to_luma_row::<false>( gf, bf, rf, @@ -393,7 +393,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row( + gbrpf32_to_luma_u16_row::<false>( gf, bf, rf, @@ -406,7 +406,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row( + gbrpf32_to_hsv_row::<false>( gf, bf, rf, @@ -431,7 +431,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf16_to_rgba_row(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf16_to_rgba_row::<false>(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -455,7 +455,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf16> { w, h, )?; - gbrpf16_to_rgb_row(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf16_to_rgb_row::<false>(g_in, b_in, r_in, rgb_row, w, use_simd); // Strategy A: expand RGB → RGBA (constant α = 0xFF). if let Some(buf) = rgba.as_deref_mut() { @@ -712,7 +712,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { // rgb_f16: no source α — use the no-α kernel (lossless scatter).
let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf16_to_rgb_f16_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf16_to_rgb_f16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -725,7 +725,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { height: h, channels: 4, })?; - gbrapf16_to_rgba_f16_row(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); + gbrapf16_to_rgba_f16_row::<false>(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- Paths that require widening f16 → f32 --------------------------- @@ -764,31 +764,31 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_f32_row(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_f32_row::<false>(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { // gbrapf32_to_rgba_f32_row with widened source α (lossless). let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_f32_row(gf, bf, rf, af, &mut buf[start..end], n, use_simd); + gbrapf32_to_rgba_f32_row::<false>(gf, bf, rf, af, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = chunk_plane_start * 3; let end = chunk_plane_end * 3; - gbrpf32_to_rgb_u16_row(gf, bf, rf, &mut buf[start..end], n, use_simd); + gbrpf32_to_rgb_u16_row::<false>(gf, bf, rf, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { // gbrapf32_to_rgba_u16_row with widened source α. let start = chunk_plane_start * 4; let end = chunk_plane_end * 4; - gbrapf32_to_rgba_u16_row(gf, bf, rf, af, &mut buf[start..end], n, use_simd); + gbrapf32_to_rgba_u16_row::<false>(gf, bf, rf, af, &mut buf[start..end], n, use_simd); } if let Some(buf) = self.luma.as_deref_mut() { - gbrpf32_to_luma_row( + gbrpf32_to_luma_row::<false>( gf, bf, rf, @@ -801,7 +801,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(buf) = self.luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row( + gbrpf32_to_luma_u16_row::<false>( gf, bf, rf, @@ -814,7 +814,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { } if let Some(hsv) = self.hsv.as_mut() { - gbrpf32_to_hsv_row( + gbrpf32_to_hsv_row::<false>( gf, bf, rf, @@ -845,7 +845,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; // Write opaque RGB → RGBA (α = 0xFF), then overwrite α from source. - gbrpf16_to_rgba_row(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf16_to_rgba_row::<false>(g_in, b_in, r_in, rgba_row, w, use_simd); // Scatter f16 α → u8 slot 3: widen + clamp + scale. widen_and_scatter_f16_alpha_to_u8(a_in, rgba_row, w); return Ok(()); @@ -871,7 +871,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf16> { w, h, )?; - gbrpf16_to_rgb_row(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf16_to_rgb_row::<false>(g_in, b_in, r_in, rgb_row, w, use_simd); // Strategy A+: expand RGB → RGBA (0xFF stub), then overwrite α from source.
if let Some(buf) = rgba.as_deref_mut() { diff --git a/src/sinker/mixed/planar_gbr_float.rs b/src/sinker/mixed/planar_gbr_float.rs index a40208e8..cb71de5f 100644 --- a/src/sinker/mixed/planar_gbr_float.rs +++ b/src/sinker/mixed/planar_gbr_float.rs @@ -291,7 +291,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -303,7 +303,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f32_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgba_f32_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -311,7 +311,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -323,7 +323,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { height: h, channels: 4, })?; - gbrpf32_to_rgba_f16_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgba_f16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB / RGBA path (direct float → u16, no staging) ----------- @@ -331,12 +331,12 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_u16_row(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf32_to_rgba_u16_row::<false>(g_in, b_in, r_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path (no RGB / luma / HSV needed) ------- @@ -351,7 +351,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrpf32_to_rgba_row(g_in, b_in, r_in, rgba_row, w, use_simd); + gbrpf32_to_rgba_row::<false>(g_in, b_in, r_in, rgba_row, w, use_simd); return Ok(()); } @@ -378,10 +378,10 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { w, h, )?; - gbrpf32_to_rgb_row(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::<false>(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row( + gbrpf32_to_luma_row::<false>( g_in, b_in, r_in, @@ -394,7 +394,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row( + gbrpf32_to_luma_u16_row::<false>( g_in, b_in, r_in, @@ -407,7 +407,7 @@ impl PixelSink for MixedSinker<'_, Gbrpf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row( + gbrpf32_to_hsv_row::<false>( g_in, b_in, r_in, @@ -677,7 +677,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f32.as_deref_mut() { let
start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f32_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f32_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f32.as_deref_mut() { @@ -689,7 +689,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f32_row(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); + gbrapf32_to_rgba_f32_row::<false>(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- f16 narrowing (independent of integer paths) -------------------- @@ -697,7 +697,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_f16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_f16_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_f16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } if let Some(buf) = self.rgba_f16.as_deref_mut() { @@ -709,7 +709,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { height: h, channels: 4, })?; - gbrapf32_to_rgba_f16_row(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); + gbrapf32_to_rgba_f16_row::<false>(g_in, b_in, r_in, a_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGB path (direct, no staging) ------------------------------ @@ -717,14 +717,14 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if let Some(buf) = self.rgb_u16.as_deref_mut() { let start = one_plane_start * 3; let end = one_plane_end * 3; - gbrpf32_to_rgb_u16_row(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); + gbrpf32_to_rgb_u16_row::<false>(g_in, b_in, r_in, &mut buf[start..end], w, use_simd); } // ---- u16 RGBA path (direct — source α clamped + scaled) ------------- if let Some(buf) = self.rgba_u16.as_deref_mut() { let rgba_row = rgba_u16_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_u16_row(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_u16_row::<false>(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); } // ---- u8 RGBA standalone fast path ------------------------------------ @@ -739,7 +739,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { if want_rgba && !need_u8_rgb { let rgba_buf = self.rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; - gbrapf32_to_rgba_row(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); + gbrapf32_to_rgba_row::<false>(g_in, b_in, r_in, a_in, rgba_row, w, use_simd); return Ok(()); } @@ -766,10 +766,10 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { w, h, )?; - gbrpf32_to_rgb_row(g_in, b_in, r_in, rgb_row, w, use_simd); + gbrpf32_to_rgb_row::<false>(g_in, b_in, r_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { - gbrpf32_to_luma_row( + gbrpf32_to_luma_row::<false>( g_in, b_in, r_in, @@ -782,7 +782,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(luma_u16) = luma_u16.as_deref_mut() { - gbrpf32_to_luma_u16_row( + gbrpf32_to_luma_u16_row::<false>( g_in, b_in, r_in, @@ -795,7 +795,7 @@ impl PixelSink for MixedSinker<'_, Gbrapf32> { } if let Some(hsv) = hsv.as_mut() { - gbrpf32_to_hsv_row( + gbrpf32_to_hsv_row::<false>( g_in, b_in, r_in, From 0e7b54ddef31f9214b9a1e80a446c3192a00c098 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 00:52:24 +1200 Subject: [PATCH 2/3] fix(be-tier10-float): cross-arch build/clippy/fmt compliance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply post-rebase
fixups to make the tier 10 float work pass CI on all targets (x86_64, wasm32, no-default-features) — the original branch was forked off pre-be-infra and verified only on aarch64. Compile fixes: - Add load_endian_u16x8 helper to x86_avx2/endian.rs (128-bit lane load for use with _mm256_cvtph_ps in 8-pixel f16 widening); planar_gbr_f16 AVX2 paths needed it. - Import endian module in x86_sse41/planar_gbr_float.rs (the import was missing; the SSE4.1 backend's f16 widening calls the endian::load_endian_u16x4 / load_endian_u32x4 helpers). - Add ::<BE> turbofish to recursive gbrpf16_to_rgb_row_f16c calls in AVX-512 / AVX2 / SSE4.1 luma/HSV staged paths (3 sites each). - Add ::<BE> turbofish to wasm_simd128 inner gbrpf32_to_rgba_row / gbrpf32_to_rgb_u16_row / gbrpf32_to_rgba_u16_row recursive calls in the f16-widen + f32-SIMD path (3 sites). Test-only fixes: - Add ::<false> turbofish to f16-row scalar/SIMD calls in wasm_simd128 tests/planar_gbr_float.rs (now that the kernels are generic over const BE: bool). - Add ::<false> turbofish to overflow-panic dispatch tests in dispatch/planar_gbr_float.rs and sinker/mixed/tests/planar_gbr_float.rs. Pre-existing wasm-only unused-imports cleanup (caught now under -Dwarnings on wasm32): - src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs: drop unused high_bit_plane_wasm, interleave_uv_wasm imports. - src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs: drop full helper-list import (none used in this file). - src/row/arch/wasm_simd128/tests/yuva.rs: drop unused p_n_packed_plane, p010_uv_interleave imports. cargo fmt rewraps long lines in the existing tier10-float kernels (line length over 100 — pre-existing in the original commit; it only surfaced once the rebase exposed the cross-arch build). Verified: - cargo test --target aarch64-apple-darwin: 2183 passed, 0 failed - RUSTFLAGS=-Dwarnings cargo build --target x86_64-apple-darwin --tests: ok - RUSTFLAGS="-C target-feature=+simd128 -Dwarnings" cargo build --target wasm32-unknown-unknown --tests: ok - RUSTFLAGS="-C target-feature=+simd128 -Dwarnings" cargo build --target wasm32-wasip1 --tests: ok - cargo build --no-default-features: ok - cargo fmt --check: clean - cargo clippy --all-targets --all-features -- -D warnings: ok Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/planar_gbr_float.rs | 160 ++++--- src/row/arch/neon/tests/planar_gbr_float.rs | 58 ++- src/row/arch/wasm_simd128/planar_gbr_float.rs | 147 +++++-- .../arch/wasm_simd128/tests/high_bit_4_2_0.rs | 5 +- .../wasm_simd128/tests/planar_8bit_and_nv.rs | 5 +- .../wasm_simd128/tests/planar_gbr_float.rs | 68 +++- src/row/arch/wasm_simd128/tests/yuva.rs | 5 +- src/row/arch/x86_avx2/endian.rs | 55 +++ src/row/arch/x86_avx2/planar_gbr_float.rs | 363 ++++++++++++++---- .../arch/x86_avx2/tests/planar_gbr_float.rs | 54 ++- src/row/arch/x86_avx512/endian.rs | 3 +- src/row/arch/x86_avx512/planar_gbr_float.rs | 363 ++++++++++++++---- .../arch/x86_avx512/tests/planar_gbr_float.rs | 54 ++- src/row/arch/x86_sse41/endian.rs | 4 +- src/row/arch/x86_sse41/planar_gbr_float.rs | 363 ++++++++++++++---- .../arch/x86_sse41/tests/planar_gbr_float.rs | 54 ++- src/row/dispatch/planar_gbr_float.rs | 8 +- src/row/scalar/planar_gbr_f16.rs | 5 +- src/row/scalar/planar_gbr_float.rs | 25 +- src/sinker/mixed/tests/planar_gbr_float.rs | 8 +- 20 files changed, 1444 insertions(+), 363 deletions(-) diff --git a/src/row/arch/neon/planar_gbr_float.rs b/src/row/arch/neon/planar_gbr_float.rs index 536f7c02..52af16b7 100644 --- a/src/row/arch/neon/planar_gbr_float.rs +++ b/src/row/arch/neon/planar_gbr_float.rs @@ -253,7
+253,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row<const BE: bool>( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_u16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::<BE>( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -337,7 +343,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f32_row<const BE: bool>( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_f32_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f32_row::<BE>( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -438,7 +450,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_fp16<const BE: bool>( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_f16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f16_row::<BE>( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -854,9 +872,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_fp16<const BE: bool>( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -924,9 +948,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_fp16<const BE: bool>( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -993,9 +1023,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_fp16<const BE: bool>( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1059,9 +1095,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_fp16<const BE: bool>( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv =
vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1119,9 +1161,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_fp16<const BE: bool>( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); vst3q_f32(out.as_mut_ptr().add(x * 3), float32x4x3_t(rv, gv, bv)); x += 4; } @@ -1174,9 +1222,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_fp16<const BE: bool>( let one_v = vdupq_n_f32(1.0); let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); vst4q_f32( out.as_mut_ptr().add(x * 4), float32x4x4_t(rv, gv, bv, one_v), @@ -1242,7 +1296,13 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row<const BE: bool>( x += 4; } if x < width { - scalar_f16::gbrpf16_to_rgb_f16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar_f16::gbrpf16_to_rgb_f16_row::<BE>( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 3..], + width - x, + ); } } } @@ -1285,7 +1345,13 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row<const BE: bool>( x += 4; } if x < width { - scalar_f16::gbrpf16_to_rgba_f16_row::<BE>(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar_f16::gbrpf16_to_rgba_f16_row::<BE>( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -1478,10 +1544,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_fp16<const BE: bool>( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); - let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(a.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv =
vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); + let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + a.as_ptr().add(x).cast::<u8>(), + ))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1555,10 +1629,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_fp16<const BE: bool>( let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); - let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(a.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); + let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + a.as_ptr().add(x).cast::<u8>(), + ))); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1623,10 +1705,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_fp16<const BE: bool>( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(g.as_ptr().add(x).cast::<u8>()))); - let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(b.as_ptr().add(x).cast::<u8>()))); - let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(r.as_ptr().add(x).cast::<u8>()))); - let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>(a.as_ptr().add(x).cast::<u8>()))); + let gv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + g.as_ptr().add(x).cast::<u8>(), + ))); + let bv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + b.as_ptr().add(x).cast::<u8>(), + ))); + let rv = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + r.as_ptr().add(x).cast::<u8>(), + ))); + let av = vcvt_f32_f16(vreinterpret_f16_u16(load_endian_u16x4::<BE>( + a.as_ptr().add(x).cast::<u8>(), + ))); vst4q_f32(out.as_mut_ptr().add(x * 4), float32x4x4_t(rv, gv, bv, av)); x += 4; } diff --git a/src/row/arch/neon/tests/planar_gbr_float.rs b/src/row/arch/neon/tests/planar_gbr_float.rs index 1f35cdbc..0fe86a07 100644 --- a/src/row/arch/neon/tests/planar_gbr_float.rs +++ b/src/row/arch/neon/tests/planar_gbr_float.rs @@ -852,11 +852,17 @@ fn neon_gbrapf16_to_rgba_f16_lossless_matches_scalar() { // ---- BE parity helpers ------------------------------------------------------ fn be_encode_f32(src: &[f32]) -> std::vec::Vec<f32> { - src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() } fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec<half::f16> { - src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() } // ---- BE parity: Gbrpf32 → u8 RGB ------------------------------------------- @@ -873,11 +879,15 @@ fn neon_gbrpf32_to_rgb_be_parity() { prng_f32(&mut r, 0xBE01_0003); let mut le_out = std::vec![0u8; w * 3]; let mut be_out = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let
b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgb_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgb_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgb BE parity width={w}"); } } @@ -896,11 +906,15 @@ fn neon_gbrpf32_to_rgba_be_parity() { prng_f32(&mut r, 0xBE02_0003); let mut le_out = std::vec![0u8; w * 4]; let mut be_out = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgba_row::<false>(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgba_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgba_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgba BE parity width={w}"); } } @@ -919,11 +933,15 @@ fn neon_gbrpf32_to_rgb_f32_be_parity() { prng_f32(&mut r, 0xBE05_0003); let mut le_out = std::vec![0.0f32; w * 3]; let mut be_out = std::vec![0.0f32; w * 3]; - unsafe { gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgb_f32_row::<false>(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgb_f32_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgb_f32_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgb_f32 BE parity width={w}"); } } @@ -942,11 +960,15 @@ fn neon_gbrpf16_to_rgb_f16_be_parity() { prng_f16(&mut r, 0xBE07_0003); let mut le_out = std::vec![half::f16::ZERO; w * 3]; let mut be_out = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); - unsafe { gbrpf16_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf16_to_rgb_f16 BE parity width={w}"); } } @@ -965,11 +987,15 @@ fn neon_gbrpf16_to_rgba_f16_be_parity() { prng_f16(&mut r, 0xBE08_0003); let mut le_out = std::vec![half::f16::ZERO; w * 4]; let mut be_out = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); - unsafe { gbrpf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf16_to_rgba_f16 BE parity width={w}"); } } @@ -990,12 +1016,16 @@ fn neon_gbrapf16_to_rgba_f16_be_parity() { prng_f16(&mut a, 0xBE0F_0004); let mut le_out = std::vec![half::f16::ZERO; w * 4]; let mut be_out = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); let a_be = be_encode_f16(&a); - unsafe { gbrapf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrapf16_to_rgba_f16 BE parity
width={w}"); } } diff --git a/src/row/arch/wasm_simd128/planar_gbr_float.rs b/src/row/arch/wasm_simd128/planar_gbr_float.rs index 2dc6a42f..e5fbf728 100644 --- a/src/row/arch/wasm_simd128/planar_gbr_float.rs +++ b/src/row/arch/wasm_simd128/planar_gbr_float.rs @@ -27,7 +27,10 @@ use core::arch::wasm32::*; use crate::{ ColorMatrix, - row::{arch::wasm_simd128::endian, scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}}, + row::{ + arch::wasm_simd128::endian, + scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}, + }, }; // ---- shared helpers ---------------------------------------------------------- @@ -77,9 +80,21 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); - let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); - let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); + let gv = clamp01( + endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), + zero, + one, + ); + let bv = clamp01( + endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), + zero, + one, + ); + let rv = clamp01( + endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), + zero, + one, + ); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -142,9 +157,21 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); - let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); - let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); + let gv = clamp01( + endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), + zero, + one, + ); + let bv = clamp01( + endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), + zero, + one, + ); + let rv = clamp01( + endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), + zero, + one, + ); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -206,9 +233,21 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); - let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); - let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); + let gv = clamp01( + endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), + zero, + one, + ); + let bv = clamp01( + endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), + zero, + one, + ); + let rv = clamp01( + endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), + zero, + one, + ); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -268,9 +307,21 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), zero, one); - let bv = clamp01(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::()), zero, one); - let rv = clamp01(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::()), zero, one); + let gv = clamp01( + endian::load_endian_u32x4::(g.as_ptr().add(x).cast::()), + zero, + one, + ); + let bv = clamp01( + 
endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); + let rv = clamp01( + endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -602,10 +653,26 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row<const BE: bool>( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(endian::load_endian_u32x4::<BE>(g.as_ptr().add(x).cast::<u8>()), zero, one); - let bv = clamp01(endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>()), zero, one); - let rv = clamp01(endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>()), zero, one); - let av = clamp01(endian::load_endian_u32x4::<BE>(a.as_ptr().add(x).cast::<u8>()), zero, one); + let gv = clamp01( + endian::load_endian_u32x4::<BE>(g.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); + let bv = clamp01( + endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); + let rv = clamp01( + endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); + let av = clamp01( + endian::load_endian_u32x4::<BE>(a.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -682,10 +749,26 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row<const BE: bool>( let mut x = 0usize; while x + 4 <= width { unsafe { - let gv = clamp01(endian::load_endian_u32x4::<BE>(g.as_ptr().add(x).cast::<u8>()), zero, one); - let bv = clamp01(endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>()), zero, one); - let rv = clamp01(endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>()), zero, one); - let av = clamp01(endian::load_endian_u32x4::<BE>(a.as_ptr().add(x).cast::<u8>()), zero, one); + let gv = clamp01( + endian::load_endian_u32x4::<BE>(g.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); + let bv = clamp01( + endian::load_endian_u32x4::<BE>(b.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); + let rv = clamp01( + endian::load_endian_u32x4::<BE>(r.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); + let av = clamp01( + endian::load_endian_u32x4::<BE>(a.as_ptr().add(x).cast::<u8>()), + zero, + one, + ); let gi = scale_round_i32(gv, scale, half); let bi = scale_round_i32(bv, scale, half); let ri = scale_round_i32(rv, scale, half); @@ -963,7 +1046,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row<const BE: bool>( widen_f16_plane(b, x, CHUNK, &mut bf); widen_f16_plane(r, x, CHUNK, &mut rf); unsafe { - gbrpf32_to_rgba_row(&gf, &bf, &rf, &mut out[x * 4..(x + CHUNK) * 4], CHUNK); + gbrpf32_to_rgba_row::<BE>(&gf, &bf, &rf, &mut out[x * 4..(x + CHUNK) * 4], CHUNK); } x += CHUNK; } @@ -1009,7 +1092,7 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row<const BE: bool>( widen_f16_plane(b, x, CHUNK, &mut bf); widen_f16_plane(r, x, CHUNK, &mut rf); unsafe { - gbrpf32_to_rgb_u16_row(&gf, &bf, &rf, &mut out[x * 3..(x + CHUNK) * 3], CHUNK); + gbrpf32_to_rgb_u16_row::<BE>(&gf, &bf, &rf, &mut out[x * 3..(x + CHUNK) * 3], CHUNK); } x += CHUNK; } @@ -1018,7 +1101,13 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row<const BE: bool>( widen_f16_plane(g, x, n, &mut gf); widen_f16_plane(b, x, n, &mut bf); widen_f16_plane(r, x, n, &mut rf); - scalar::gbrpf32_to_rgb_u16_row::<BE>(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 3..width * 3], n); + scalar::gbrpf32_to_rgb_u16_row::<BE>( + &gf[..n], + &bf[..n], + &rf[..n], + &mut out[x * 3..width * 3], + n, + ); } } @@ -1055,7 +1144,7 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row<const BE: bool>( widen_f16_plane(b, x, CHUNK, &mut bf); widen_f16_plane(r, x, CHUNK, &mut rf); unsafe { - gbrpf32_to_rgba_u16_row(&gf, &bf, &rf, &mut out[x *
4..(x + CHUNK) * 4], CHUNK); + gbrpf32_to_rgba_u16_row::<BE>(&gf, &bf, &rf, &mut out[x * 4..(x + CHUNK) * 4], CHUNK); } x += CHUNK; } @@ -1064,6 +1153,12 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row<const BE: bool>( widen_f16_plane(g, x, n, &mut gf); widen_f16_plane(b, x, n, &mut bf); widen_f16_plane(r, x, n, &mut rf); - scalar::gbrpf32_to_rgba_u16_row::<BE>(&gf[..n], &bf[..n], &rf[..n], &mut out[x * 4..width * 4], n); + scalar::gbrpf32_to_rgba_u16_row::<BE>( + &gf[..n], + &bf[..n], + &rf[..n], + &mut out[x * 4..width * 4], + n, + ); } } diff --git a/src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs b/src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs index 9a78695d..ad74796c 100644 --- a/src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs +++ b/src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs @@ -1,7 +1,4 @@ -use super::{ - super::*, high_bit_plane_wasm, interleave_uv_wasm, p_n_packed_plane, p010_uv_interleave, - p16_plane_wasm, planar_n_plane, -}; +use super::{super::*, p_n_packed_plane, p010_uv_interleave, p16_plane_wasm, planar_n_plane}; // ---- rgb_to_hsv_row equivalence -------------------------------------- diff --git a/src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs b/src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs index 42f5515a..87f3feb6 100644 --- a/src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs +++ b/src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs @@ -1,7 +1,4 @@ -use super::{ - super::*, high_bit_plane_wasm, interleave_uv_wasm, p_n_packed_plane, p010_uv_interleave, - p16_plane_wasm, planar_n_plane, -}; +use super::super::*; fn check_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); diff --git a/src/row/arch/wasm_simd128/tests/planar_gbr_float.rs b/src/row/arch/wasm_simd128/tests/planar_gbr_float.rs index 3a1e3fd1..f987cfcd 100644 --- a/src/row/arch/wasm_simd128/tests/planar_gbr_float.rs +++ b/src/row/arch/wasm_simd128/tests/planar_gbr_float.rs @@ -210,7 +210,15 @@ fn wasm_gbrpf32_to_luma_u16_matches_scalar() { let r = gbr_plane_f32(w, 0x7968_5748); let mut out_scalar = std::vec![0u16; w]; let mut out_simd = std::vec![0u16; w]; - scalar::gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut out_scalar, w, ColorMatrix::Bt709, true); + scalar::gbrpf32_to_luma_u16_row::<false>( + &g, + &b, + &r, + &mut out_scalar, + w, + ColorMatrix::Bt709, + true, + ); unsafe { gbrpf32_to_luma_u16_row::<false>(&g, &b, &r, &mut out_simd, w, ColorMatrix::Bt709, true); } @@ -328,7 +336,7 @@ fn wasm_gbrpf16_to_rgb_f16_matches_scalar() { let r = gbr_plane_f16(w, 0xFEDC_BA98); let mut out_scalar = std::vec![half::f16::ZERO; w * 3]; let mut out_simd = std::vec![half::f16::ZERO; w * 3]; - scalar_f16::gbrpf16_to_rgb_f16_row(&g, &b, &r, &mut out_scalar, w); + scalar_f16::gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut out_scalar, w); unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut out_simd, w); } @@ -346,7 +354,7 @@ fn wasm_gbrpf16_to_rgba_f16_matches_scalar() { let r = gbr_plane_f16(w, 0x6767_8989); let mut out_scalar = std::vec![half::f16::ZERO; w * 4]; let mut out_simd = std::vec![half::f16::ZERO; w * 4]; - scalar_f16::gbrpf16_to_rgba_f16_row(&g, &b, &r, &mut out_scalar, w); + scalar_f16::gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut out_scalar, w); unsafe { gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut out_simd, w); } @@ -365,7 +373,7 @@ fn wasm_gbrapf16_to_rgba_f16_matches_scalar() { let a = gbr_plane_f16(w, 0x7777_8888); let mut out_scalar = std::vec![half::f16::ZERO; w * 4]; let mut out_simd = std::vec![half::f16::ZERO; w * 4]; -
scalar_f16::gbrapf16_to_rgba_f16_row(&g, &b, &r, &a, &mut out_scalar, w); + scalar_f16::gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut out_scalar, w); unsafe { gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut out_simd, w); } @@ -391,7 +399,7 @@ fn wasm_gbrpf16_to_rgb_matches_scalar() { w, ); unsafe { - gbrpf16_to_rgb_row(&g, &b, &r, &mut out_simd, w); + gbrpf16_to_rgb_row::<false>(&g, &b, &r, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm gbrpf16_to_rgb width={w}"); } @@ -415,7 +423,7 @@ fn wasm_gbrpf16_to_rgba_matches_scalar() { w, ); unsafe { - gbrpf16_to_rgba_row(&g, &b, &r, &mut out_simd, w); + gbrpf16_to_rgba_row::<false>(&g, &b, &r, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm gbrpf16_to_rgba width={w}"); } @@ -439,7 +447,7 @@ fn wasm_gbrpf16_to_rgb_u16_matches_scalar() { w, ); unsafe { - gbrpf16_to_rgb_u16_row(&g, &b, &r, &mut out_simd, w); + gbrpf16_to_rgb_u16_row::<false>(&g, &b, &r, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm gbrpf16_to_rgb_u16 width={w}"); } @@ -463,7 +471,7 @@ fn wasm_gbrpf16_to_rgba_u16_matches_scalar() { w, ); unsafe { - gbrpf16_to_rgba_u16_row(&g, &b, &r, &mut out_simd, w); + gbrpf16_to_rgba_u16_row::<false>(&g, &b, &r, &mut out_simd, w); } assert_eq!(out_scalar, out_simd, "wasm gbrpf16_to_rgba_u16 width={w}"); } @@ -499,11 +507,17 @@ fn wasm_gbrpf32_to_rgb_round_half_up() { // ---- BE parity helpers ------------------------------------------------------- fn be_encode_f32(src: &[f32]) -> std::vec::Vec<f32> { - src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() } fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec<half::f16> { - src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() } // ---- BE parity: Gbrpf32 → u8 RGB ------------------------------------------- @@ -516,11 +530,15 @@ fn wasm_gbrpf32_to_rgb_be_parity() { let r = gbr_plane_f32(w, 0xBE01_0003); let mut le_out = std::vec![0u8; w * 3]; let mut be_out = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgb_row::<false>(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgb_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgb_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "wasm gbrpf32_to_rgb BE parity width={w}"); } } @@ -535,12 +553,19 @@ fn wasm_gbrpf16_to_rgb_f16_be_parity() { let r = gbr_plane_f16(w, 0xBE07_0003); let mut le_out = std::vec![half::f16::ZERO; w * 3]; let mut be_out = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); - unsafe { gbrpf16_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); } - assert_eq!(le_out, be_out, "wasm gbrpf16_to_rgb_f16 BE parity width={w}"); + unsafe { + gbrpf16_to_rgb_f16_row::<true>(&g_be, &b_be, &r_be, &mut be_out, w); + } + assert_eq!( + le_out, be_out, + "wasm gbrpf16_to_rgb_f16 BE parity width={w}" + ); } } @@ -555,12 +580,19 @@ fn wasm_gbrapf16_to_rgba_f16_be_parity() { let a = gbr_plane_f16(w, 0xBE0F_0004); let mut le_out = std::vec![half::f16::ZERO; w * 4]; let mut be_out = std::vec![half::f16::ZERO; w * 4]; - unsafe { 
gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); let a_be = be_encode_f16(&a); - unsafe { gbrapf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); } - assert_eq!(le_out, be_out, "wasm gbrapf16_to_rgba_f16 BE parity width={w}"); + unsafe { + gbrapf16_to_rgba_f16_row::<true>(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); + } + assert_eq!( + le_out, be_out, + "wasm gbrapf16_to_rgba_f16 BE parity width={w}" + ); } } diff --git a/src/row/arch/wasm_simd128/tests/yuva.rs b/src/row/arch/wasm_simd128/tests/yuva.rs index f35e2073..e9e25257 100644 --- a/src/row/arch/wasm_simd128/tests/yuva.rs +++ b/src/row/arch/wasm_simd128/tests/yuva.rs @@ -1,7 +1,4 @@ -use super::{ - super::*, high_bit_plane_wasm, interleave_uv_wasm, p_n_packed_plane, p010_uv_interleave, - p16_plane_wasm, planar_n_plane, -}; +use super::{super::*, high_bit_plane_wasm, interleave_uv_wasm, p16_plane_wasm, planar_n_plane}; // ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) -------------------- // diff --git a/src/row/arch/x86_avx2/endian.rs b/src/row/arch/x86_avx2/endian.rs index d2dd6995..1176f6df 100644 --- a/src/row/arch/x86_avx2/endian.rs +++ b/src/row/arch/x86_avx2/endian.rs @@ -87,6 +87,61 @@ pub(crate) unsafe fn load_endian_u16x16<const BE: bool>(ptr: *const u8) -> __m256i { } } +// ---- u16x8 loaders (via _mm_loadu_si128, for f16 widening) ---------------- +// +// AVX2 kernels widen 8 × f16 using `_mm256_cvtph_ps(__m128i)`, which requires +// a 128-bit lane load. The helpers below provide endian-aware loading of +// that 16-byte (8 × u16) block. + +/// SSSE3 `_mm_shuffle_epi8` mask that swaps bytes within every 2-byte (u16) +/// lane. +const BYTESWAP_MASK_U16X8: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + +/// Loads 8 × u16 (16 bytes) from `ptr` (LE-encoded) into a `__m128i`, +/// host-native order. +/// +/// # Safety +/// +/// `ptr` must point to at least 16 readable bytes. Caller must have AVX2 +/// (which implies SSSE3) enabled. +#[inline(always)] +pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> __m128i { + let v = unsafe { _mm_loadu_si128(ptr.cast()) }; + #[cfg(target_endian = "big")] + let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16X8) }; + v +} + +/// Loads 8 × u16 (16 bytes) from `ptr` (BE-encoded) into a `__m128i`, +/// host-native order. +/// +/// # Safety +/// +/// `ptr` must point to at least 16 readable bytes. Caller must have AVX2 +/// (which implies SSSE3) enabled. +#[inline(always)] +pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> __m128i { + let v = unsafe { _mm_loadu_si128(ptr.cast()) }; + #[cfg(target_endian = "little")] + let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16X8) }; + v +} + +/// Generic dispatcher: routes to `load_le_u16x8` or `load_be_u16x8`. +/// +/// # Safety +/// +/// Same as `load_le_u16x8` / `load_be_u16x8`. +#[inline(always)] +pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> __m128i { + if BE { + unsafe { load_be_u16x8(ptr) } + } else { + unsafe { load_le_u16x8(ptr) } + } +} + // ---- u32x8 loaders --------------------------------------------------------- /// Loads 8 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order. 
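The contract that `load_endian_u16x8::<BE>` (and its NEON and AVX-512 siblings) implements is the same as this scalar model. This is an illustrative sketch only, not part of the diff, and `load_endian_u16x8_model` is a made-up name:

fn load_endian_u16x8_model<const BE: bool>(bytes: &[u8; 16]) -> [u16; 8] {
    // Decode each 2-byte lane in its declared on-disk/wire order; the
    // result is host-native regardless of the target's endianness.
    core::array::from_fn(|i| {
        let pair = [bytes[2 * i], bytes[2 * i + 1]];
        if BE { u16::from_be_bytes(pair) } else { u16::from_le_bytes(pair) }
    })
}

On a little-endian host the matching direction (`load_le_u16x8`) compiles to a plain `_mm_loadu_si128`; only the mismatched direction pays for the `_mm_shuffle_epi8` byte swap, so the existing LE fast path is unchanged.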
diff --git a/src/row/arch/x86_avx2/planar_gbr_float.rs b/src/row/arch/x86_avx2/planar_gbr_float.rs index bec35099..99afdc73 100644 --- a/src/row/arch/x86_avx2/planar_gbr_float.rs +++ b/src/row/arch/x86_avx2/planar_gbr_float.rs @@ -49,7 +49,10 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, - row::{arch::x86_avx2::endian, scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}}, + row::{ + arch::x86_avx2::endian, + scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}, + }, }; // ---- shared helpers ---------------------------------------------------------- @@ -122,9 +125,27 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let g8 = narrow_i32x8_to_u8x8(scale_round_i32(gv, scale)); let b8 = narrow_i32x8_to_u8x8(scale_round_i32(bv, scale)); let r8 = narrow_i32x8_to_u8x8(scale_round_i32(rv, scale)); @@ -178,9 +199,27 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let g8 = narrow_i32x8_to_u8x8(scale_round_i32(gv, scale)); let b8 = narrow_i32x8_to_u8x8(scale_round_i32(bv, scale)); let r8 = narrow_i32x8_to_u8x8(scale_round_i32(rv, scale)); @@ -235,9 +274,27 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gw = narrow_i32x8_to_u16x8(scale_round_i32(gv, scale)); let bw = narrow_i32x8_to_u16x8(scale_round_i32(bv, scale)); let rw = 
narrow_i32x8_to_u16x8(scale_round_i32(rv, scale)); @@ -291,9 +348,27 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gw = narrow_i32x8_to_u16x8(scale_round_i32(gv, scale)); let bw = narrow_i32x8_to_u16x8(scale_round_i32(bv, scale)); let rw = narrow_i32x8_to_u16x8(scale_round_i32(rv, scale)); @@ -313,7 +388,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( x += 8; } if x < width { - scalar::gbrpf32_to_rgba_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -407,9 +488,15 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( unsafe { let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())); + let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )); // F16C narrow: IEEE-754 round-to-nearest-even (NOT round-half-up). 
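// (Example: the f32 value 1.0 + 2^-11 sits exactly between the f16
// neighbors 1.0 and 1.0 + 2^-10; RNE picks 1.0, whose mantissa is even,
// while round-half-up would pick the larger. The scalar tail narrows
// with the same RNE rule (`half::f16::from_f32` rounds to nearest-even),
// so SIMD lanes and the tail agree on exact ties.)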
let gh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); @@ -463,9 +550,15 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())); + let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )); let gh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -486,7 +579,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( x += 8; } if x < width { - scalar::gbrpf32_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -677,10 +776,34 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); - let av = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(a.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let av = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + a.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let g8 = narrow_i32x8_to_u8x8(scale_round_i32(gv, scale)); let b8 = narrow_i32x8_to_u8x8(scale_round_i32(bv, scale)); let r8 = narrow_i32x8_to_u8x8(scale_round_i32(rv, scale)); @@ -747,10 +870,34 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let gv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())), zero, one); - let av = clamp01(_mm256_castsi256_ps(endian::load_endian_u32x8::(a.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let av = clamp01( + _mm256_castsi256_ps(endian::load_endian_u32x8::( + 
a.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gw = narrow_i32x8_to_u16x8(scale_round_i32(gv, scale)); let bw = narrow_i32x8_to_u16x8(scale_round_i32(bv, scale)); let rw = narrow_i32x8_to_u16x8(scale_round_i32(rv, scale)); @@ -845,10 +992,18 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::(r.as_ptr().add(x).cast::())); - let av = _mm256_castsi256_ps(endian::load_endian_u32x8::(a.as_ptr().add(x).cast::())); + let gv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_castsi256_ps(endian::load_endian_u32x8::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm256_castsi256_ps(endian::load_endian_u32x8::( + a.as_ptr().add(x).cast::(), + )); let gh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm256_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -916,9 +1071,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( let mut x = 0usize; while x + 8 <= width { // Load 8 f16 lanes (16 bytes) per plane and widen to f32x8. - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -992,9 +1153,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1069,9 +1236,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = 
clamp01(rv, zero, one); @@ -1145,9 +1318,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1218,9 +1397,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( unsafe { let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); // No 3-channel interleave intrinsic in AVX2 — scatter via scalar loop. let mut gf = [0.0f32; 8]; let mut bf = [0.0f32; 8]; @@ -1285,9 +1470,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); let mut gf = [0.0f32; 8]; let mut bf = [0.0f32; 8]; let mut rf = [0.0f32; 8]; @@ -1374,7 +1565,13 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( x += 8; } if x < width { - scalar_f16::gbrpf16_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar_f16::gbrpf16_to_rgb_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 3..], + width - x, + ); } } } @@ -1428,7 +1625,13 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( x += 8; } if x < width { - scalar_f16::gbrpf16_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar_f16::gbrpf16_to_rgba_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -1466,7 +1669,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_f16c( + gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1518,7 +1721,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_f16c( + gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1571,7 +1774,7 @@ pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_f16c( + 
gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1624,10 +1827,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); - let av = _mm256_cvtph_ps(endian::load_endian_u16x8::(a.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm256_cvtph_ps(endian::load_endian_u16x8::( + a.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1711,10 +1922,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); - let av = _mm256_cvtph_ps(endian::load_endian_u16x8::(a.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm256_cvtph_ps(endian::load_endian_u16x8::( + a.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1794,10 +2013,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 8 <= width { - let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::(g.as_ptr().add(x).cast::())); - let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::(b.as_ptr().add(x).cast::())); - let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::(r.as_ptr().add(x).cast::())); - let av = _mm256_cvtph_ps(endian::load_endian_u16x8::(a.as_ptr().add(x).cast::())); + let gv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm256_cvtph_ps(endian::load_endian_u16x8::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm256_cvtph_ps(endian::load_endian_u16x8::( + a.as_ptr().add(x).cast::(), + )); let mut gf = [0.0f32; 8]; let mut bf = [0.0f32; 8]; let mut rf = [0.0f32; 8]; diff --git a/src/row/arch/x86_avx2/tests/planar_gbr_float.rs b/src/row/arch/x86_avx2/tests/planar_gbr_float.rs index 2423247b..07e39ddf 100644 --- a/src/row/arch/x86_avx2/tests/planar_gbr_float.rs +++ b/src/row/arch/x86_avx2/tests/planar_gbr_float.rs @@ -978,7 +978,9 @@ fn avx2_gbrpf16_to_luma_f16c_matches_scalar() { prng_f16(&mut r, 0xB009_0003); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { gbrpf16_to_luma_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { + gbrpf16_to_luma_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| 
v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -1013,7 +1015,9 @@ fn avx2_gbrpf16_to_luma_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xB00A_0003); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { gbrpf16_to_luma_u16_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { + gbrpf16_to_luma_u16_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -1051,7 +1055,9 @@ fn avx2_gbrpf16_to_hsv_f16c_matches_scalar() { let mut scal_h = std::vec![0u8; w]; let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; - unsafe { gbrpf16_to_hsv_row_f16c::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; + unsafe { + gbrpf16_to_hsv_row_f16c::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -1235,11 +1241,17 @@ fn avx2_gbrapf16_to_rgba_f16_lane_order() { // ---- BE parity helpers ------------------------------------------------------- fn be_encode_f32(src: &[f32]) -> std::vec::Vec { - src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() } fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec { - src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() } // ---- BE parity: Gbrpf32 → u8 RGB ------------------------------------------- @@ -1259,11 +1271,15 @@ fn avx2_gbrpf32_to_rgb_be_parity() { prng_f32(&mut r, 0xBE01_0003); let mut le_out = std::vec![0u8; w * 3]; let mut be_out = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgb_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgb_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgb_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgb BE parity width={w}"); } } @@ -1285,11 +1301,15 @@ fn avx2_gbrpf32_to_rgba_be_parity() { prng_f32(&mut r, 0xBE02_0003); let mut le_out = std::vec![0u8; w * 4]; let mut be_out = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgba_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgba_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgba_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgba BE parity width={w}"); } } @@ -1311,11 +1331,15 @@ fn avx2_gbrpf16_to_rgb_f16_be_parity() { prng_f16(&mut r, 0xBE07_0003); let mut le_out = std::vec![half::f16::ZERO; w * 3]; let mut be_out = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); - unsafe { 
gbrpf16_to_rgb_f16_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf16_to_rgb_f16 BE parity width={w}"); } } @@ -1339,12 +1363,16 @@ fn avx2_gbrapf16_to_rgba_f16_be_parity() { prng_f16(&mut a, 0xBE0F_0004); let mut le_out = std::vec![half::f16::ZERO; w * 4]; let mut be_out = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut le_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); let a_be = be_encode_f16(&a); - unsafe { gbrapf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrapf16_to_rgba_f16 BE parity width={w}"); } } diff --git a/src/row/arch/x86_avx512/endian.rs b/src/row/arch/x86_avx512/endian.rs index 99bcffaf..91cb644f 100644 --- a/src/row/arch/x86_avx512/endian.rs +++ b/src/row/arch/x86_avx512/endian.rs @@ -100,8 +100,7 @@ pub(crate) unsafe fn load_endian_u16x32(ptr: *const u8) -> __m51 const BYTESWAP_MASK_U16X16: __m256i = unsafe { core::mem::transmute([ // low 128-bit lane - 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - // high 128-bit lane + 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, // high 128-bit lane 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, ]) }; diff --git a/src/row/arch/x86_avx512/planar_gbr_float.rs b/src/row/arch/x86_avx512/planar_gbr_float.rs index f3bc8a2d..c12ffa81 100644 --- a/src/row/arch/x86_avx512/planar_gbr_float.rs +++ b/src/row/arch/x86_avx512/planar_gbr_float.rs @@ -50,7 +50,10 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, - row::{arch::x86_avx512::endian, scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}}, + row::{ + arch::x86_avx512::endian, + scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}, + }, }; // ---- shared helpers ---------------------------------------------------------- @@ -99,9 +102,27 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let g8 = _mm512_cvtusepi32_epi8(scale_round_i32(gv, scale)); let b8 = _mm512_cvtusepi32_epi8(scale_round_i32(bv, scale)); let r8 = _mm512_cvtusepi32_epi8(scale_round_i32(rv, scale)); @@ -155,9 +176,27 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); - let rv = 
clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let g8 = _mm512_cvtusepi32_epi8(scale_round_i32(gv, scale)); let b8 = _mm512_cvtusepi32_epi8(scale_round_i32(bv, scale)); let r8 = _mm512_cvtusepi32_epi8(scale_round_i32(rv, scale)); @@ -212,9 +251,27 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gw = _mm512_cvtusepi32_epi16(scale_round_i32(gv, scale)); let bw = _mm512_cvtusepi32_epi16(scale_round_i32(bv, scale)); let rw = _mm512_cvtusepi32_epi16(scale_round_i32(rv, scale)); @@ -268,9 +325,27 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gw = _mm512_cvtusepi32_epi16(scale_round_i32(gv, scale)); let bw = _mm512_cvtusepi32_epi16(scale_round_i32(bv, scale)); let rw = _mm512_cvtusepi32_epi16(scale_round_i32(rv, scale)); @@ -290,7 +365,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( x += 16; } if x < width { - scalar::gbrpf32_to_rgba_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -384,9 +465,15 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())); + let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + 
b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )); // F16C narrow: IEEE-754 round-to-nearest-even (NOT round-half-up). let gh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); @@ -440,9 +527,15 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())); + let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )); let gh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -463,7 +556,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( x += 16; } if x < width { - scalar::gbrpf32_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -655,10 +754,34 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); - let av = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(a.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let av = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + a.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let g8 = _mm512_cvtusepi32_epi8(scale_round_i32(gv, scale)); let b8 = _mm512_cvtusepi32_epi8(scale_round_i32(bv, scale)); let r8 = _mm512_cvtusepi32_epi8(scale_round_i32(rv, scale)); @@ -726,10 +849,34 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let gv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())), zero, one); - let av = clamp01(_mm512_castsi512_ps(endian::load_endian_u32x16::(a.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, 
+ ); + let rv = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let av = clamp01( + _mm512_castsi512_ps(endian::load_endian_u32x16::( + a.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gw = _mm512_cvtusepi32_epi16(scale_round_i32(gv, scale)); let bw = _mm512_cvtusepi32_epi16(scale_round_i32(bv, scale)); let rw = _mm512_cvtusepi32_epi16(scale_round_i32(rv, scale)); @@ -824,10 +971,18 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::(r.as_ptr().add(x).cast::())); - let av = _mm512_castsi512_ps(endian::load_endian_u32x16::(a.as_ptr().add(x).cast::())); + let gv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_castsi512_ps(endian::load_endian_u32x16::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm512_castsi512_ps(endian::load_endian_u32x16::( + a.as_ptr().add(x).cast::(), + )); let gh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm512_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -895,9 +1050,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( let mut x = 0usize; while x + 16 <= width { // Load 16 f16 lanes (32 bytes) per plane and widen to f32x16. - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -971,9 +1132,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1048,9 +1215,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = 
_mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1124,9 +1297,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1197,9 +1376,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); // No 3-channel interleave intrinsic in AVX-512 — scatter via scalar loop. let mut gf = [0.0f32; 16]; let mut bf = [0.0f32; 16]; @@ -1264,9 +1449,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); let mut gf = [0.0f32; 16]; let mut bf = [0.0f32; 16]; let mut rf = [0.0f32; 16]; @@ -1354,7 +1545,13 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( x += 16; } if x < width { - scalar_f16::gbrpf16_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar_f16::gbrpf16_to_rgb_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 3..], + width - x, + ); } } } @@ -1408,7 +1605,13 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( x += 16; } if x < width { - scalar_f16::gbrpf16_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar_f16::gbrpf16_to_rgba_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -1446,7 +1649,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_f16c( + gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1498,7 +1701,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( while offset < width { let n = (width - offset).min(CHUNK); 
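// Chunked strategy, as in the AVX2 version: convert up to CHUNK f16
// pixels to interleaved RGB via the endian-aware
// gbrpf16_to_rgb_row_f16c::<BE> kernel above, then feed that fixed-size
// RGB scratch buffer to the u16 luma step; the final iteration simply
// shrinks n to the remaining width.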
unsafe { - gbrpf16_to_rgb_row_f16c( + gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1551,7 +1754,7 @@ pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_f16c( + gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1604,10 +1807,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); - let av = _mm512_cvtph_ps(endian::load_endian_u16x16::(a.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm512_cvtph_ps(endian::load_endian_u16x16::( + a.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1691,10 +1902,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); - let av = _mm512_cvtph_ps(endian::load_endian_u16x16::(a.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm512_cvtph_ps(endian::load_endian_u16x16::( + a.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1774,10 +1993,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 16 <= width { - let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::(g.as_ptr().add(x).cast::())); - let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::(b.as_ptr().add(x).cast::())); - let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::(r.as_ptr().add(x).cast::())); - let av = _mm512_cvtph_ps(endian::load_endian_u16x16::(a.as_ptr().add(x).cast::())); + let gv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm512_cvtph_ps(endian::load_endian_u16x16::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm512_cvtph_ps(endian::load_endian_u16x16::( + a.as_ptr().add(x).cast::(), + )); let mut gf = [0.0f32; 16]; let mut bf = [0.0f32; 16]; let mut rf = [0.0f32; 16]; diff --git a/src/row/arch/x86_avx512/tests/planar_gbr_float.rs b/src/row/arch/x86_avx512/tests/planar_gbr_float.rs index 6650d80b..3cc7b244 100644 --- a/src/row/arch/x86_avx512/tests/planar_gbr_float.rs +++ b/src/row/arch/x86_avx512/tests/planar_gbr_float.rs @@ -1011,7 +1011,9 @@ fn avx512_gbrpf16_to_luma_f16c_matches_scalar() { prng_f16(&mut r, 0xD009_0003); let mut simd = std::vec![0u8; w]; let mut scal = 
std::vec![0u8; w]; - unsafe { gbrpf16_to_luma_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { + gbrpf16_to_luma_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -1048,7 +1050,9 @@ fn avx512_gbrpf16_to_luma_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xD00A_0003); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { gbrpf16_to_luma_u16_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { + gbrpf16_to_luma_u16_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -1088,7 +1092,9 @@ fn avx512_gbrpf16_to_hsv_f16c_matches_scalar() { let mut scal_h = std::vec![0u8; w]; let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; - unsafe { gbrpf16_to_hsv_row_f16c::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; + unsafe { + gbrpf16_to_hsv_row_f16c::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -1280,11 +1286,17 @@ fn avx512_gbrapf16_to_rgba_f16_lane_order() { // ---- BE parity helpers ------------------------------------------------------- fn be_encode_f32(src: &[f32]) -> std::vec::Vec { - src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() } fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec { - src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() } // ---- BE parity: Gbrpf32 → u8 RGB ------------------------------------------- @@ -1304,11 +1316,15 @@ fn avx512_gbrpf32_to_rgb_be_parity() { prng_f32(&mut r, 0xBE01_0003); let mut le_out = std::vec![0u8; w * 3]; let mut be_out = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgb_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgb_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgb_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgb BE parity width={w}"); } } @@ -1330,11 +1346,15 @@ fn avx512_gbrpf32_to_rgba_be_parity() { prng_f32(&mut r, 0xBE02_0003); let mut le_out = std::vec![0u8; w * 4]; let mut be_out = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgba_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgba_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgba_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgba BE parity width={w}"); } } @@ -1356,11 +1376,15 @@ fn avx512_gbrpf16_to_rgb_f16_be_parity() { prng_f16(&mut r, 0xBE07_0003); let mut le_out = 
std::vec![half::f16::ZERO; w * 3]; let mut be_out = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); - unsafe { gbrpf16_to_rgb_f16_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf16_to_rgb_f16 BE parity width={w}"); } } @@ -1384,12 +1408,16 @@ fn avx512_gbrapf16_to_rgba_f16_be_parity() { prng_f16(&mut a, 0xBE0F_0004); let mut le_out = std::vec![half::f16::ZERO; w * 4]; let mut be_out = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut le_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); let a_be = be_encode_f16(&a); - unsafe { gbrapf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrapf16_to_rgba_f16 BE parity width={w}"); } } diff --git a/src/row/arch/x86_sse41/endian.rs b/src/row/arch/x86_sse41/endian.rs index 292a44d8..b8920cfd 100644 --- a/src/row/arch/x86_sse41/endian.rs +++ b/src/row/arch/x86_sse41/endian.rs @@ -80,7 +80,9 @@ pub(crate) unsafe fn load_endian_u16x8(ptr: *const u8) -> __m128 /// SSSE3 `_mm_shuffle_epi8` mask that swaps bytes within every 2-byte (u16) /// lane in the LOW 8 bytes of a 128-bit register. Upper bytes are zeroed. const BYTESWAP_MASK_U16X4: __m128i = unsafe { - core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 0x80u8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80]) + core::mem::transmute([ + 1u8, 0, 3, 2, 5, 4, 7, 6, 0x80u8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + ]) }; /// Loads 4 × u16 (8 bytes) from `ptr` (LE-encoded) into the low 64 bits of diff --git a/src/row/arch/x86_sse41/planar_gbr_float.rs b/src/row/arch/x86_sse41/planar_gbr_float.rs index a4c5f9b3..d5834afd 100644 --- a/src/row/arch/x86_sse41/planar_gbr_float.rs +++ b/src/row/arch/x86_sse41/planar_gbr_float.rs @@ -36,7 +36,10 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, - row::scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}, + row::{ + arch::x86_sse41::endian, + scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar}, + }, }; // ---- shared helpers ---------------------------------------------------------- @@ -114,9 +117,27 @@ pub(crate) unsafe fn gbrpf32_to_rgb_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gi = i32x4_to_u8x4(scale_round_i32(gv, scale)); let bi = i32x4_to_u8x4(scale_round_i32(bv, scale)); let ri = i32x4_to_u8x4(scale_round_i32(rv, scale)); 
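Per lane, the quantization in these SSE4.1 hunks matches this scalar sketch (illustrative only; `quantize_u8` is not a name from the patch):

fn quantize_u8(v: f32) -> u8 {
    // clamp01, scale to the u8 range, then round half-up (add 0.5 and
    // truncate), mirroring clamp01 + scale_round_i32 + i32x4_to_u8x4.
    (v.clamp(0.0, 1.0) * 255.0 + 0.5) as u8
}

The u16 variants are the same shape with a 65535.0 scale and i32x4_to_u16x4 narrowing.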
@@ -164,9 +185,27 @@ pub(crate) unsafe fn gbrpf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gi = i32x4_to_u8x4(scale_round_i32(gv, scale)); let bi = i32x4_to_u8x4(scale_round_i32(bv, scale)); let ri = i32x4_to_u8x4(scale_round_i32(rv, scale)); @@ -215,9 +254,27 @@ pub(crate) unsafe fn gbrpf32_to_rgb_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gu = i32x4_to_u16x4(scale_round_i32(gv, scale)); let bu = i32x4_to_u16x4(scale_round_i32(bv, scale)); let ru = i32x4_to_u16x4(scale_round_i32(rv, scale)); @@ -265,9 +322,27 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gu = i32x4_to_u16x4(scale_round_i32(gv, scale)); let bu = i32x4_to_u16x4(scale_round_i32(bv, scale)); let ru = i32x4_to_u16x4(scale_round_i32(rv, scale)); @@ -281,7 +356,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_u16_row( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_u16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_u16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -371,9 +452,15 @@ pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())); + 
let gv = _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )); // F16C narrow: IEEE-754 round-to-nearest-even (NOT round-half-up). let gh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); @@ -428,9 +515,15 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())); + let gv = _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )); let gh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -451,7 +544,13 @@ pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c( x += 4; } if x < width { - scalar::gbrpf32_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar::gbrpf32_to_rgba_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -642,10 +741,34 @@ pub(crate) unsafe fn gbrapf32_to_rgba_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())), zero, one); - let av = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(a.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let rv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let av = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + a.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gi = i32x4_to_u8x4(scale_round_i32(gv, scale)); let bi = i32x4_to_u8x4(scale_round_i32(bv, scale)); let ri = i32x4_to_u8x4(scale_round_i32(rv, scale)); @@ -704,10 +827,34 @@ pub(crate) unsafe fn gbrapf32_to_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let gv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())), zero, one); - let bv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())), zero, one); - let rv = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())), zero, one); - let av = clamp01(_mm_castsi128_ps(endian::load_endian_u32x4::(a.as_ptr().add(x).cast::())), zero, one); + let gv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let bv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), 
+ )), + zero, + one, + ); + let rv = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )), + zero, + one, + ); + let av = clamp01( + _mm_castsi128_ps(endian::load_endian_u32x4::( + a.as_ptr().add(x).cast::(), + )), + zero, + one, + ); let gu = i32x4_to_u16x4(scale_round_i32(gv, scale)); let bu = i32x4_to_u16x4(scale_round_i32(bv, scale)); let ru = i32x4_to_u16x4(scale_round_i32(rv, scale)); @@ -793,10 +940,18 @@ pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_castsi128_ps(endian::load_endian_u32x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_castsi128_ps(endian::load_endian_u32x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_castsi128_ps(endian::load_endian_u32x4::(r.as_ptr().add(x).cast::())); - let av = _mm_castsi128_ps(endian::load_endian_u32x4::(a.as_ptr().add(x).cast::())); + let gv = _mm_castsi128_ps(endian::load_endian_u32x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_castsi128_ps(endian::load_endian_u32x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_castsi128_ps(endian::load_endian_u32x4::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm_castsi128_ps(endian::load_endian_u32x4::( + a.as_ptr().add(x).cast::(), + )); let gh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(gv); let bh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(bv); let rh = _mm_cvtps_ph::<{ _MM_FROUND_TO_NEAREST_INT }>(rv); @@ -864,9 +1019,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c( let mut x = 0usize; while x + 4 <= width { // _mm_loadl_epi64: 64-bit load into the low half of __m128i (4 × u16 = 4 × f16). - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -934,9 +1095,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1004,9 +1171,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = 
_mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1074,9 +1247,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1141,9 +1320,15 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); // No interleave intrinsic in SSE4.1 — scatter via scalar loop. let mut gf = [0.0f32; 4]; let mut bf = [0.0f32; 4]; @@ -1208,9 +1393,15 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); let mut gf = [0.0f32; 4]; let mut bf = [0.0f32; 4]; let mut rf = [0.0f32; 4]; @@ -1308,7 +1499,13 @@ pub(crate) unsafe fn gbrpf16_to_rgb_f16_row( x += 4; } if x < width { - scalar_f16::gbrpf16_to_rgb_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 3..], width - x); + scalar_f16::gbrpf16_to_rgb_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 3..], + width - x, + ); } } } @@ -1373,7 +1570,13 @@ pub(crate) unsafe fn gbrpf16_to_rgba_f16_row( x += 4; } if x < width { - scalar_f16::gbrpf16_to_rgba_f16_row::(&g[x..], &b[x..], &r[x..], &mut out[x * 4..], width - x); + scalar_f16::gbrpf16_to_rgba_f16_row::( + &g[x..], + &b[x..], + &r[x..], + &mut out[x * 4..], + width - x, + ); } } } @@ -1411,7 +1614,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_row_f16c( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_f16c( + gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1463,7 +1666,7 @@ pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c( while offset < width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_f16c( + gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1516,7 +1719,7 @@ pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c( while offset < 
width { let n = (width - offset).min(CHUNK); unsafe { - gbrpf16_to_rgb_row_f16c( + gbrpf16_to_rgb_row_f16c::( &g[offset..], &b[offset..], &r[offset..], @@ -1569,10 +1772,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); - let av = _mm_cvtph_ps(endian::load_endian_u16x4::(a.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm_cvtph_ps(endian::load_endian_u16x4::( + a.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1648,10 +1859,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c( let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); - let av = _mm_cvtph_ps(endian::load_endian_u16x4::(a.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm_cvtph_ps(endian::load_endian_u16x4::( + a.as_ptr().add(x).cast::(), + )); let gc = clamp01(gv, zero, one); let bc = clamp01(bv, zero, one); let rc = clamp01(rv, zero, one); @@ -1723,10 +1942,18 @@ pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c( unsafe { let mut x = 0usize; while x + 4 <= width { - let gv = _mm_cvtph_ps(endian::load_endian_u16x4::(g.as_ptr().add(x).cast::())); - let bv = _mm_cvtph_ps(endian::load_endian_u16x4::(b.as_ptr().add(x).cast::())); - let rv = _mm_cvtph_ps(endian::load_endian_u16x4::(r.as_ptr().add(x).cast::())); - let av = _mm_cvtph_ps(endian::load_endian_u16x4::(a.as_ptr().add(x).cast::())); + let gv = _mm_cvtph_ps(endian::load_endian_u16x4::( + g.as_ptr().add(x).cast::(), + )); + let bv = _mm_cvtph_ps(endian::load_endian_u16x4::( + b.as_ptr().add(x).cast::(), + )); + let rv = _mm_cvtph_ps(endian::load_endian_u16x4::( + r.as_ptr().add(x).cast::(), + )); + let av = _mm_cvtph_ps(endian::load_endian_u16x4::( + a.as_ptr().add(x).cast::(), + )); let mut gf = [0.0f32; 4]; let mut bf = [0.0f32; 4]; let mut rf = [0.0f32; 4]; diff --git a/src/row/arch/x86_sse41/tests/planar_gbr_float.rs b/src/row/arch/x86_sse41/tests/planar_gbr_float.rs index b5cbda18..56822455 100644 --- a/src/row/arch/x86_sse41/tests/planar_gbr_float.rs +++ b/src/row/arch/x86_sse41/tests/planar_gbr_float.rs @@ -643,7 +643,9 @@ fn sse41_gbrpf16_to_luma_f16c_matches_scalar() { prng_f16(&mut r, 0xE009_0003); let mut simd = std::vec![0u8; w]; let mut scal = std::vec![0u8; w]; - unsafe { gbrpf16_to_luma_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { + gbrpf16_to_luma_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = 
b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -679,7 +681,9 @@ fn sse41_gbrpf16_to_luma_u16_f16c_matches_scalar() { prng_f16(&mut r, 0xE00A_0003); let mut simd = std::vec![0u16; w]; let mut scal = std::vec![0u16; w]; - unsafe { gbrpf16_to_luma_u16_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) }; + unsafe { + gbrpf16_to_luma_u16_row_f16c::(&g, &b, &r, &mut simd, w, ColorMatrix::Bt709, true) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -718,7 +722,9 @@ fn sse41_gbrpf16_to_hsv_f16c_matches_scalar() { let mut scal_h = std::vec![0u8; w]; let mut scal_s = std::vec![0u8; w]; let mut scal_v = std::vec![0u8; w]; - unsafe { gbrpf16_to_hsv_row_f16c::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) }; + unsafe { + gbrpf16_to_hsv_row_f16c::(&g, &b, &r, &mut simd_h, &mut simd_s, &mut simd_v, w) + }; let gf: std::vec::Vec = g.iter().map(|v| v.to_f32()).collect(); let bf: std::vec::Vec = b.iter().map(|v| v.to_f32()).collect(); let rf: std::vec::Vec = r.iter().map(|v| v.to_f32()).collect(); @@ -855,11 +861,17 @@ fn sse41_gbrapf16_to_rgba_f16_lossless_matches_scalar() { // ---- BE parity helpers ------------------------------------------------------- fn be_encode_f32(src: &[f32]) -> std::vec::Vec { - src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() } fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec { - src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() } // ---- BE parity: Gbrpf32 → u8 RGB ------------------------------------------- @@ -879,11 +891,15 @@ fn sse41_gbrpf32_to_rgb_be_parity() { prng_f32(&mut r, 0xBE01_0003); let mut le_out = std::vec![0u8; w * 3]; let mut be_out = std::vec![0u8; w * 3]; - unsafe { gbrpf32_to_rgb_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgb_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgb_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgb_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgb BE parity width={w}"); } } @@ -905,11 +921,15 @@ fn sse41_gbrpf32_to_rgba_be_parity() { prng_f32(&mut r, 0xBE02_0003); let mut le_out = std::vec![0u8; w * 4]; let mut be_out = std::vec![0u8; w * 4]; - unsafe { gbrpf32_to_rgba_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf32_to_rgba_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f32(&g); let b_be = be_encode_f32(&b); let r_be = be_encode_f32(&r); - unsafe { gbrpf32_to_rgba_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf32_to_rgba_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf32_to_rgba BE parity width={w}"); } } @@ -931,11 +951,15 @@ fn sse41_gbrpf16_to_rgb_f16_be_parity() { prng_f16(&mut r, 0xBE07_0003); let mut le_out = std::vec![half::f16::ZERO; w * 3]; let mut be_out = std::vec![half::f16::ZERO; w * 3]; - unsafe { gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::(&g, &b, &r, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); - 
unsafe { gbrpf16_to_rgb_f16_row::(&g_be, &b_be, &r_be, &mut be_out, w); } + unsafe { + gbrpf16_to_rgb_f16_row::(&g_be, &b_be, &r_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrpf16_to_rgb_f16 BE parity width={w}"); } } @@ -959,12 +983,16 @@ fn sse41_gbrapf16_to_rgba_f16_be_parity() { prng_f16(&mut a, 0xBE0F_0004); let mut le_out = std::vec![half::f16::ZERO; w * 4]; let mut be_out = std::vec![half::f16::ZERO; w * 4]; - unsafe { gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut le_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::(&g, &b, &r, &a, &mut le_out, w); + } let g_be = be_encode_f16(&g); let b_be = be_encode_f16(&b); let r_be = be_encode_f16(&r); let a_be = be_encode_f16(&a); - unsafe { gbrapf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); } + unsafe { + gbrapf16_to_rgba_f16_row::(&g_be, &b_be, &r_be, &a_be, &mut be_out, w); + } assert_eq!(le_out, be_out, "gbrapf16_to_rgba_f16 BE parity width={w}"); } } diff --git a/src/row/dispatch/planar_gbr_float.rs b/src/row/dispatch/planar_gbr_float.rs index 4ac016e4..aa13bd62 100644 --- a/src/row/dispatch/planar_gbr_float.rs +++ b/src/row/dispatch/planar_gbr_float.rs @@ -1607,7 +1607,7 @@ mod tests { let r: &[f32] = &[]; let mut out: [u8; 0] = []; let w = usize::MAX / 2 + 1; - gbrpf32_to_rgb_row(g, b, r, &mut out, w, false); + gbrpf32_to_rgb_row::(g, b, r, &mut out, w, false); } #[test] @@ -1622,7 +1622,7 @@ mod tests { let r: &[f32] = &[]; let mut out: [u8; 0] = []; let w = usize::MAX / 2 + 1; - gbrpf32_to_rgba_row(g, b, r, &mut out, w, false); + gbrpf32_to_rgba_row::(g, b, r, &mut out, w, false); } #[test] @@ -1637,7 +1637,7 @@ mod tests { let r: &[f32] = &[]; let mut out: [u16; 0] = []; let w = usize::MAX / 2 + 1; - gbrpf32_to_rgb_u16_row(g, b, r, &mut out, w, false); + gbrpf32_to_rgb_u16_row::(g, b, r, &mut out, w, false); } #[test] @@ -1652,6 +1652,6 @@ mod tests { let r: &[f32] = &[]; let mut out: [u16; 0] = []; let w = usize::MAX / 2 + 1; - gbrpf32_to_rgba_u16_row(g, b, r, &mut out, w, false); + gbrpf32_to_rgba_u16_row::(g, b, r, &mut out, w, false); } } diff --git a/src/row/scalar/planar_gbr_f16.rs b/src/row/scalar/planar_gbr_f16.rs index fa74c199..d2d41c3f 100644 --- a/src/row/scalar/planar_gbr_f16.rs +++ b/src/row/scalar/planar_gbr_f16.rs @@ -171,7 +171,10 @@ mod tests { // ---- helper: byte-swap a slice of f16 to simulate BE source ---------------- fn be_encode_f16(src: &[half::f16]) -> std::vec::Vec { - src.iter().map(|v| half::f16::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| half::f16::from_bits(v.to_bits().swap_bytes())) + .collect() } // ---- gbrpf16_to_rgb_f16_row ---------------------------------------------- diff --git a/src/row/scalar/planar_gbr_float.rs b/src/row/scalar/planar_gbr_float.rs index fe519eb9..91952b75 100644 --- a/src/row/scalar/planar_gbr_float.rs +++ b/src/row/scalar/planar_gbr_float.rs @@ -589,7 +589,10 @@ mod tests { // ---- helper: byte-swap a slice of f32 to simulate BE source ---------------- fn be_encode(src: &[f32]) -> std::vec::Vec { - src.iter().map(|v| f32::from_bits(v.to_bits().swap_bytes())).collect() + src + .iter() + .map(|v| f32::from_bits(v.to_bits().swap_bytes())) + .collect() } // ---- gbrpf32_to_rgb_row -------------------------------------------------- @@ -946,7 +949,15 @@ mod tests { let mut le_out = std::vec![0u8; 4]; let mut be_out = std::vec![0u8; 4]; gbrpf32_to_luma_row::(&g, &b, &r, &mut le_out, 4, ColorMatrix::Bt709, true); - gbrpf32_to_luma_row::(&g_be, &b_be, &r_be, &mut be_out, 4, ColorMatrix::Bt709, true); + 
gbrpf32_to_luma_row::(
+            &g_be,
+            &b_be,
+            &r_be,
+            &mut be_out,
+            4,
+            ColorMatrix::Bt709,
+            true,
+        );
         assert_eq!(be_out, le_out, "BE gbrpf32_to_luma_row must match LE");
     }
 
@@ -983,7 +994,15 @@ mod tests {
         let mut le_out = std::vec![0u16; 4];
         let mut be_out = std::vec![0u16; 4];
         gbrpf32_to_luma_u16_row::(&g, &b, &r, &mut le_out, 4, ColorMatrix::Bt709, true);
-        gbrpf32_to_luma_u16_row::(&g_be, &b_be, &r_be, &mut be_out, 4, ColorMatrix::Bt709, true);
+        gbrpf32_to_luma_u16_row::(
+            &g_be,
+            &b_be,
+            &r_be,
+            &mut be_out,
+            4,
+            ColorMatrix::Bt709,
+            true,
+        );
         assert_eq!(be_out, le_out, "BE gbrpf32_to_luma_u16_row must match LE");
     }
 
diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs
index 3429deea..681764a9 100644
--- a/src/sinker/mixed/tests/planar_gbr_float.rs
+++ b/src/sinker/mixed/tests/planar_gbr_float.rs
@@ -880,7 +880,7 @@ fn gbr_float_dispatch_panics_on_width_overflow_gbrpf32_rgb() {
     let b = [0.0f32; 1];
     let r = [0.0f32; 1];
     let mut out = [0u8; 3];
-    crate::row::gbrpf32_to_rgb_row(&g, &b, &r, &mut out, bad_width, false);
+    crate::row::gbrpf32_to_rgb_row::(&g, &b, &r, &mut out, bad_width, false);
 }
 
 #[cfg(target_pointer_width = "32")]
@@ -892,7 +892,7 @@ fn gbr_float_dispatch_panics_on_width_overflow_gbrpf32_rgba() {
     let b = [0.0f32; 1];
     let r = [0.0f32; 1];
     let mut out = [0u8; 4];
-    crate::row::gbrpf32_to_rgba_row(&g, &b, &r, &mut out, bad_width, false);
+    crate::row::gbrpf32_to_rgba_row::(&g, &b, &r, &mut out, bad_width, false);
 }
 
 #[cfg(target_pointer_width = "32")]
@@ -904,7 +904,7 @@ fn gbr_float_dispatch_panics_on_width_overflow_gbrpf32_rgb_u16() {
     let b = [0.0f32; 1];
     let r = [0.0f32; 1];
     let mut out = [0u16; 3];
-    crate::row::gbrpf32_to_rgb_u16_row(&g, &b, &r, &mut out, bad_width, false);
+    crate::row::gbrpf32_to_rgb_u16_row::(&g, &b, &r, &mut out, bad_width, false);
 }
 
 #[cfg(target_pointer_width = "32")]
@@ -916,5 +916,5 @@ fn gbr_float_dispatch_panics_on_width_overflow_gbrpf32_rgba_u16() {
     let b = [0.0f32; 1];
     let r = [0.0f32; 1];
     let mut out = [0u16; 4];
-    crate::row::gbrpf32_to_rgba_u16_row(&g, &b, &r, &mut out, bad_width, false);
+    crate::row::gbrpf32_to_rgba_u16_row::(&g, &b, &r, &mut out, bad_width, false);
 }

From b86e038d5faed1c3f35b3d5455effe7d4f50c94a Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Fri, 8 May 2026 01:01:37 +1200
Subject: [PATCH 3/3] fix(be-tier10-float): make scalar BE conversion target-endian aware
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The scalar `load_f32::<BE>` / `load_f16::<BE>` helpers used an
unconditional `swap_bytes()` regardless of host endianness. The
corresponding SIMD `load_endian_u32x4::<BE>` / `load_endian_u16x8::<BE>`
helpers (added in PR #81 be-infra) are target-endian aware via
`cfg(target_endian = ...)`, so SIMD and scalar disagreed on big-endian
hosts. Tail loops dispatch to the scalar fallback, so on s390x any width
with a non-zero tail corrupted the row.

Why the old code corrupts rows on s390x: when reading a `&[f32]`
reinterpreted from raw bytes, the host CPU reads the four bytes in
host-native order. On LE hosts that matches LE-on-disk; on BE hosts it
matches BE-on-disk. An unconditional swap therefore:

- LE host + BE data: correct (swap turns BE bytes into native LE) — the
  case the original code targeted.
- BE host + LE data: correct (swap turns LE bytes into native BE).
- BE host + BE data: WRONG (host-native is already BE, swap inverts it).
- LE host + LE data: handled by `BE = false` no-op — fine.
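A minimal self-contained sketch of the two behaviours (`decode_old` and
`decode_new` are illustrative names, not helpers from this crate; the
actual fix is described below):

    // Old behaviour: swap only when the declared encoding is BE,
    // ignoring what the host itself is.
    fn decode_old<const BE: bool>(raw: f32) -> f32 {
        if BE { f32::from_bits(raw.to_bits().swap_bytes()) } else { raw }
    }

    // Fixed behaviour: normalise through from_be / from_le, which are
    // no-ops when the host already matches the declared encoding and
    // byte-swaps otherwise -- exactly the matrix above.
    fn decode_new<const BE: bool>(raw: f32) -> f32 {
        let bits = raw.to_bits(); // host-native read of the stored bytes
        f32::from_bits(if BE { u32::from_be(bits) } else { u32::from_le(bits) })
    }

    // On a BE host a BE-on-disk sample is already host-native after the
    // raw read, so decode_old::<true> swaps it into garbage while
    // decode_new::<true>'s from_be is a no-op. On an LE host the two
    // functions agree for both monomorphizations.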
The fix routes both branches through `u32::from_be` / `u32::from_le` (and
`u16::from_be` / `u16::from_le` for f16):

    BE branch: `f32::from_bits(u32::from_be(raw.to_bits()))`
    LE branch: `f32::from_bits(u32::from_le(raw.to_bits()))`

`u32::from_le` is a no-op on LE hosts and a byte-swap on BE hosts;
symmetrically for `from_be`. This makes both `BE` monomorphizations
(`BE = true` and `BE = false`) correct on every target endianness and
matches the contract the SIMD endian helpers already implement.

f32 / f16 paths use `from_bits(u{32,16}::from_{be,le}(raw.to_bits()))` so
the result is host-native f32 / `half::f16` regardless of the source
encoding.

The test helpers (`be_encode` in `planar_gbr_float.rs`, `be_encode_f16`
in `planar_gbr_f16.rs`) intentionally use unconditional `swap_bytes` to
synthesise BE-on-disk fixtures from LE input on an LE host. They are not
load helpers and remain unchanged.

No SIMD code paths needed changes — the per-arch `load_endian_*` helpers
already use `cfg(target_endian = ...)`. Tail loops still call the scalar
helpers, which are now correct.

Verified:
- `cargo test --target aarch64-apple-darwin --lib`: 2176 passed
- `cargo build --target x86_64-apple-darwin --tests`: 0 warnings
- `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests`: clean
- `cargo build --no-default-features`: clean
- `cargo fmt --check`: clean
- `cargo clippy --all-targets --all-features -- -D warnings`: clean

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/row/scalar/planar_gbr_f16.rs   | 24 +++++++++++++++++-------
 src/row/scalar/planar_gbr_float.rs | 25 +++++++++++++++++--------
 2 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/src/row/scalar/planar_gbr_f16.rs b/src/row/scalar/planar_gbr_f16.rs
index d2d41c3f..e981d88c 100644
--- a/src/row/scalar/planar_gbr_f16.rs
+++ b/src/row/scalar/planar_gbr_f16.rs
@@ -33,18 +33,28 @@
 
 // ---- shared BE helper -------------------------------------------------------
 
-/// Load a single `half::f16` sample with optional BE byte-swap.
+/// Load a single `half::f16` sample, target-endian aware.
 ///
-/// When `BE = true` the two bytes of the f16 bit-pattern are reversed (i.e.
-/// we load a big-endian f16 from disk and convert to host-native). When
-/// `BE = false` the value is returned as-is. The dead branch is eliminated
-/// by the compiler when the caller is monomorphized.
+/// The source plane is the raw on-disk / on-wire byte stream reinterpreted
+/// as `&[half::f16]`. Each f16 read picks up two bytes in **host-native**
+/// order. We then convert that host-native u16 to the value the encoded
+/// stream represents:
+///
+/// - `BE = true`: bytes on disk are big-endian → `u16::from_be` is a no-op
+///   on BE hosts and a byte-swap on LE hosts.
+/// - `BE = false`: bytes on disk are little-endian → `u16::from_le` is a
+///   no-op on LE hosts and a byte-swap on BE hosts.
+///
+/// **Both** branches go through `from_be` / `from_le` so the
+/// LE-data-on-BE-host case is handled correctly too. An unconditional
+/// `swap_bytes` would corrupt rows on big-endian hosts (e.g. s390x).
 #[inline(always)]
 fn load_f16<const BE: bool>(plane: &[half::f16], i: usize) -> half::f16 {
+    let raw = plane[i];
     if BE {
-        half::f16::from_bits(plane[i].to_bits().swap_bytes())
+        half::f16::from_bits(u16::from_be(raw.to_bits()))
     } else {
-        plane[i]
+        half::f16::from_bits(u16::from_le(raw.to_bits()))
     }
 }
 
diff --git a/src/row/scalar/planar_gbr_float.rs b/src/row/scalar/planar_gbr_float.rs
index 91952b75..d090e931 100644
--- a/src/row/scalar/planar_gbr_float.rs
+++ b/src/row/scalar/planar_gbr_float.rs
@@ -70,19 +70,28 @@ fn f32_to_f16(y: f32) -> half::f16 {
     half::f16::from_f32(y)
 }
 
-/// Load a single f32 sample from a `&[f32]` plane with optional BE byte-swap.
+/// Load a single f32 sample from a `&[f32]` plane, target-endian aware.
 ///
-/// When `BE = true` the four bytes of the f32 representation are reversed
-/// (equivalent to loading a big-endian IEEE-754 single from disk). When
-/// `BE = false` the value is returned as-is (host-native / LE).
+/// The source plane is the raw on-disk / on-wire byte stream reinterpreted
+/// as `&[f32]`. Each f32 read therefore picks up four bytes in **host-native**
+/// order. We then convert that host-native u32 to the value the encoded
+/// stream represents:
+///
+/// - `BE = true`: bytes on disk are big-endian → `u32::from_be` is a no-op
+///   on BE hosts and a byte-swap on LE hosts.
+/// - `BE = false`: bytes on disk are little-endian → `u32::from_le` is a
+///   no-op on LE hosts and a byte-swap on BE hosts.
+///
+/// **Both** branches go through `from_be` / `from_le` so the
+/// LE-data-on-BE-host case is handled correctly too. An unconditional
+/// `swap_bytes` would corrupt rows on big-endian hosts (e.g. s390x).
 #[inline(always)]
 fn load_f32<const BE: bool>(plane: &[f32], i: usize) -> f32 {
+    let raw = plane[i];
     if BE {
-        // SAFETY: reinterpret f32 bits as u32, swap bytes, reinterpret back.
-        let bits = plane[i].to_bits().swap_bytes();
-        f32::from_bits(bits)
+        f32::from_bits(u32::from_be(raw.to_bits()))
     } else {
-        plane[i]
+        f32::from_bits(u32::from_le(raw.to_bits()))
     }
 }
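A host-independent sanity check of the contract above; this is an
illustrative property test, not part of this patch (the in-tree
`be_encode` / `be_encode_f16` helpers deliberately keep their
unconditional swap, since they only synthesise BE fixtures from LE input
on LE hosts):

    #[test]
    fn be_loader_contract_roundtrips_on_any_host() {
        fn be_roundtrip(v: half::f16) -> bool {
            // `to_be()` yields the bit pattern a host-native u16 read
            // would observe for BE-serialized bytes: identity on BE
            // hosts, a byte-swap on LE hosts.
            let as_read = v.to_bits().to_be();
            // The fixed loader's BE branch must recover the original value.
            u16::from_be(as_read) == v.to_bits()
        }
        assert!(be_roundtrip(half::f16::from_f32(0.5)));
        assert!(be_roundtrip(half::f16::MAX));
    }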