Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions src/row/arch/neon/endian.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,53 @@ pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> uint32
unsafe { load_le_u32x4(ptr) }
}
}

// ---- u16x4 loaders (8-byte half-vector) ------------------------------------
//
// These load only 8 bytes (4 × u16) into a `uint16x4_t` in host-native order.
// Used by Rgbf16 widen kernels (`vcvt_f32_f16` reads 4 × f16 from a
// `uint16x4_t`) when the caller can only guarantee 8 readable bytes — using
// the 16-byte `load_endian_u16x8` would tail-overread.

/// Reads four little-endian-encoded `u16` values from `ptr` and returns them
/// as a `uint16x4_t` whose lanes are in host-native byte order.
///
/// # Safety
///
/// - `ptr` must be valid for reading at least 8 bytes; byte (1-byte)
///   alignment is sufficient.
/// - The caller must have NEON enabled.
#[inline(always)]
pub(crate) unsafe fn load_le_u16x4(ptr: *const u8) -> uint16x4_t {
    // SAFETY: the caller guarantees 8 readable bytes at `ptr` and NEON support.
    let raw = unsafe { vld1_u16(ptr.cast::<u16>()) };
    if cfg!(target_endian = "big") {
        // Big-endian host: swap the two bytes inside every 16-bit lane so the
        // little-endian wire data becomes host-native.
        // SAFETY: register-only operations; NEON is guaranteed by the caller.
        unsafe { vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(raw))) }
    } else {
        // Little-endian host: the loaded lanes are already host-native.
        raw
    }
}

/// Reads four big-endian-encoded `u16` values from `ptr` and returns them as
/// a `uint16x4_t` whose lanes are in host-native byte order.
///
/// # Safety
///
/// - `ptr` must be valid for reading at least 8 bytes; byte (1-byte)
///   alignment is sufficient.
/// - The caller must have NEON enabled.
#[inline(always)]
pub(crate) unsafe fn load_be_u16x4(ptr: *const u8) -> uint16x4_t {
    // SAFETY: the caller guarantees 8 readable bytes at `ptr` and NEON support.
    let raw = unsafe { vld1_u16(ptr.cast::<u16>()) };
    if cfg!(target_endian = "big") {
        // Big-endian host: the wire order already matches the host order.
        return raw;
    }
    // Little-endian host: swap the two bytes inside every 16-bit lane.
    // SAFETY: register-only operations; NEON is guaranteed by the caller.
    let as_bytes = unsafe { vreinterpret_u8_u16(raw) };
    let swapped = unsafe { vrev16_u8(as_bytes) };
    unsafe { vreinterpret_u16_u8(swapped) }
}

/// Compile-time endianness dispatcher for the 8-byte (4 × u16) loaders:
/// `BE = true` forwards to `load_be_u16x4`, `BE = false` forwards to
/// `load_le_u16x4`. Reads exactly 8 bytes.
///
/// # Safety
///
/// The requirements are those of `load_le_u16x4` / `load_be_u16x4`.
#[inline(always)]
pub(crate) unsafe fn load_endian_u16x4<const BE: bool>(ptr: *const u8) -> uint16x4_t {
    // SAFETY: the caller upholds the contract shared by both loaders, so
    // forwarding `ptr` unchanged to either one is sound.
    unsafe {
        match BE {
            true => load_be_u16x4(ptr),
            false => load_le_u16x4(ptr),
        }
    }
}
Loading
Loading