Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions rust/lance-arrow/src/bfloat16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ impl FromIterator<Option<bf16>> for BFloat16Array {
.len(len)
.add_buffer(buffer.into())
.null_bit_buffer(null_buffer);
// SAFETY: the value buffer contains exactly `2 * len` bytes (two bytes
// pushed per iteration of the loop above, including the zero-fill for
// null slots), which matches the `FixedSizeBinary(2)` storage layout.
// The null bit buffer, when present, has `len` bits appended above, so
// its length covers the array's logical range.
let array_data = unsafe { array_data.build_unchecked() };
Self {
inner: FixedSizeBinaryArray::from(array_data),
Expand All @@ -170,6 +175,10 @@ impl From<Vec<bf16>> for BFloat16Array {
let array_data = ArrayData::builder(DataType::FixedSizeBinary(2))
.len(data.len())
.add_buffer(buffer.into());
// SAFETY: the buffer contains exactly `2 * data.len()` bytes — each
// `bf16` writes its two little-endian bytes once — matching the
// `FixedSizeBinary(2)` storage layout. No null buffer is attached, so
// every element is logically valid.
let array_data = unsafe { array_data.build_unchecked() };
Self {
inner: FixedSizeBinaryArray::from(array_data),
Expand Down Expand Up @@ -268,12 +277,63 @@ mod from_arrow {
impl FloatArray<BFloat16Type> for FixedSizeBinaryArray {
type FloatType = BFloat16Type;

/// Returns the underlying `bf16` values as a borrowed slice.
///
/// # Preconditions
///
/// - `value_length()` must be 2 (the `FixedSizeBinary(2)` storage shape
/// used by [`BFloat16Array`]). Asserted at entry.
/// - The value buffer must be at least 2-byte aligned. Lance's in-tree
/// constructors always satisfy this (every value buffer goes through
/// `MutableBuffer`, which is aligned to arrow-buffer's `ALIGNMENT`
/// constant — ≥32 bytes on every supported target). Externally-built
/// `FixedSizeBinaryArray`s arriving via FFI, IPC, or
/// `Buffer::from_custom_allocation` are not required by arrow-rs to be
/// aligned beyond a single byte; passing one to this method violates the
/// precondition. A `debug_assert_eq!` below catches such inputs only when
/// debug assertions are enabled (the default for dev and test profiles).
///
/// # Endianness
///
/// `lance-arrow` is gated on `target_endian = "little"` at the crate root,
/// so this method always returns values in the same byte order Lance writes
/// (see [`BFloat16Array::value`] and the [`FromIterator`] impls).
fn as_slice(&self) -> &[bf16] {
assert_eq!(
self.value_length(),
2,
"BFloat16 arrays must use FixedSizeBinary(2) storage"
);
debug_assert_eq!(
(self.value_data().as_ptr() as usize) % std::mem::align_of::<bf16>(),
0,
"BFloat16 value buffer must be at least 2-byte aligned"
);
// SAFETY:
// - The assert above pins `value_length() == 2`, so `value_data().len() / 2`
// equals the array's logical element count.
// The `From<ArrayData>` impl for `FixedSizeBinaryArray` constructs its value buffer
// as `buffers[0].slice_with_length(offset * 2, len * 2)` (arrow-array
// `fixed_size_binary_array.rs`), so `value_data()` already returns
// the offset-adjusted slice. Do not replace `value_data()` with an
// accessor that returns the un-sliced backing buffer.
// - `bf16` is `#[repr(transparent)]` over `u16` (size 2, alignment 2);
// every `u16` bit pattern is a valid `bf16`, so any byte content
// yields a defined value — never UB.
// - Alignment is the caller's responsibility per the precondition
// documented above. The `debug_assert_eq!` immediately preceding this
// block catches violations in debug and test builds only — release
// builds rely on callers honoring the precondition. arrow-rs
// declares `FixedSizeBinary(n)`'s
// `BufferSpec::FixedWidth { alignment: align_of::<u8>() == 1 }`
// (arrow-data `data.rs`), so arrow-rs alone does not guarantee
// 2-byte alignment. Lance's in-tree construction paths build value
// buffers via `MutableBuffer` (arrow-buffer `ALIGNMENT` constant,
// ≥32 bytes on every supported target), which trivially satisfies
// `bf16`'s 2-byte requirement.
// - The returned slice borrows from `self`; the underlying ref-counted,
// immutable Arrow buffer cannot be mutated or freed for the slice's
// lifetime.
unsafe {
slice::from_raw_parts(
self.value_data().as_ptr() as *const bf16,
Expand Down
22 changes: 22 additions & 0 deletions rust/lance-arrow/src/deepcopy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ pub fn deep_copy_buffer(buffer: &Buffer) -> Buffer {
pub fn deep_copy_nulls(nulls: Option<&NullBuffer>) -> Option<NullBuffer> {
let nulls = nulls?;
let bit_buffer = deep_copy_buffer(nulls.inner().inner());
// SAFETY: `null_count` is taken from the source `NullBuffer`, which already
// upheld `NullBuffer::new_unchecked`'s invariant — the unset-bit count over
// the logical bit slice `[bit_offset, bit_offset + bit_len)`. `NullBuffer::slice`
// adjusts only `BooleanBuffer::bit_offset` / `bit_len` and never byte-advances
// the inner `Buffer`, so `deep_copy_buffer` (which copies the source `Buffer`'s
// `as_slice()` view from byte 0) reproduces the exact bit pattern at the same
// bit offsets; the unset-bit count is therefore preserved. `BooleanBuffer::new`
// panics (rather than invoking UB) if `bit_offset + bit_len > 8 * buffer.len()`, and the
// copy has the same length, so that check still passes.
Some(unsafe {
NullBuffer::new_unchecked(
BooleanBuffer::new(bit_buffer, nulls.offset(), nulls.len()),
Expand All @@ -37,6 +46,19 @@ pub fn deep_copy_array_data(data: &ArrayData) -> ArrayData {
.iter()
.map(deep_copy_array_data)
.collect::<Vec<_>>();
// SAFETY: `build_unchecked` inherits `ArrayData::new_unchecked`'s contract —
// `(data_type, len, offset, nulls, buffers, child_data)` must form a valid
// Arrow array. This call reproduces `data` structurally: `data_type`, `len`,
// and `offset` are forwarded unchanged; each buffer is replaced by a byte-
// identical copy of its offset-applied `as_slice()` view (the output buffer
// is `MutableBuffer`-allocated, at least as aligned as the source); `nulls`
// is deep-copied with the same bit offset/length and unset-bit count (see
// `deep_copy_nulls`); `child_data` is recursively cloned with the same
// guarantee. Every value-level invariant the source upheld — UTF-8 validity,
// monotonic offsets, in-bounds dictionary indices, run-end monotonicity,
// struct child-length matching — therefore transfers to the copy. If the
// source `ArrayData` was itself constructed via `new_unchecked` with an
// invalid payload, this function faithfully reproduces that invalidity.
unsafe {
ArrayDataBuilder::new(data_type)
.len(len)
Expand Down
16 changes: 16 additions & 0 deletions rust/lance-arrow/src/floats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,22 @@ pub trait FloatArray<T: ArrowFloatType + ?Sized>: Array + Clone + 'static {
type FloatType: ArrowFloatType;

/// Returns a reference to the underlying data as a slice.
///
/// # Panics
///
/// Implementations may panic if the array's storage shape does not match
/// the expected element layout. In particular, the `bf16` impl panics if
/// `value_length() != 2` (the `FixedSizeBinary(2)` shape required by
/// `BFloat16Array`).
///
/// # Preconditions
///
/// Implementations may impose additional invariants on the underlying
/// buffer. The `bf16` impl requires the value buffer to be at least
/// 2-byte aligned — satisfied automatically by every in-tree Lance
/// constructor, but external callers passing externally-built arrays
/// (FFI, IPC, `Buffer::from_custom_allocation`) must ensure alignment.
/// That impl checks alignment with a `debug_assert_eq!` only, so a
/// misaligned buffer is not rejected when debug assertions are disabled.
/// See the impl's docstring for details.
fn as_slice(&self) -> &[T::Native];

/// Construct an array from a vector of values.
Expand Down
10 changes: 10 additions & 0 deletions rust/lance-arrow/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@
//!
//! To improve Arrow-RS ergonomics

// Warn on any `unsafe` block that lacks a preceding `// SAFETY:` comment.
#![warn(clippy::undocumented_unsafe_blocks)]

// lance-arrow reinterprets value bytes as native numeric types in
// `FloatArray::as_slice` for `bf16` (rust/lance-arrow/src/bfloat16.rs), which
// requires the host byte order to match the on-disk byte order Lance writes.
// Lance writes little-endian; building on a big-endian target would silently
// produce wrong numeric values.
#[cfg(not(target_endian = "little"))]
compile_error!("lance-arrow only supports little-endian targets");

use std::sync::Arc;
use std::{collections::HashMap, ptr::NonNull};

Expand Down
Loading