From cfd691dfc6cd29a526c074c3f1030742de232fef Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Mon, 29 Jun 2026 19:44:28 +0800 Subject: [PATCH 1/3] docs(arrow): add SAFETY comments to lance-arrow unsafe blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the project's Rust security rule that every unsafe block must document its invariants, annotate five unsafe blocks in lance-arrow: - deepcopy.rs deep_copy_nulls (NullBuffer::new_unchecked): null_count is taken from the source NullBuffer, which already upheld the invariant over its bit slice. NullBuffer::slice only adjusts BooleanBuffer's bit_offset/bit_len and never byte-advances the inner Buffer, so the deep copy reproduces the same bit pattern at the same offsets and the unset-bit count is preserved. - deepcopy.rs deep_copy_array_data (ArrayDataBuilder::build_unchecked): reproduces data structurally — data_type/len/offset forwarded, each buffer byte-copied via deep_copy_buffer (MutableBuffer-allocated, at least as aligned as the source), nulls deep-copied with the same bit offset/length and unset-bit count, child_data recursively cloned. Every value-level invariant the source upheld (UTF-8, monotonic offsets, in-bounds dictionary indices, run-end monotonicity, struct child-length matching) transfers to the copy. - bfloat16.rs FromIterator> build_unchecked: the value buffer contains exactly 2 * len bytes (two bytes per iteration of the loop above, including null-slot zero-fill); the null bit buffer has len bits, satisfying FixedSizeBinary(2)'s layout. - bfloat16.rs From> build_unchecked: each bf16 writes its two little-endian bytes once, so the buffer is exactly 2 * data.len() bytes; no null buffer is attached, every element is logically valid. - bfloat16.rs FloatArray::as_slice slice::from_raw_parts: the assert pins value_size == 2; FixedSizeBinaryArray::From slices the value buffer at construction so value_data() is offset-adjusted; bf16 is #[repr(transparent)] over u16 with every bit pattern valid; alignment is the caller's responsibility per the documented precondition (a debug_assert above the unsafe block catches violations in debug/test builds); the slice borrows from self. Additional hardening (no semantic change for compliant callers): - lib.rs: add #[cfg(not(target_endian = "little"))] compile_error! so the byte-reinterpretation in as_slice's unsafe block is guaranteed by the crate itself rather than transitively through a dependent crate. - floats.rs: hoist # Panics and # Preconditions sections into the FloatArray::as_slice trait method docstring so polymorphic callers see the contract documented at the trait level. - bfloat16.rs: add a debug_assert checking 2-byte alignment with the exact modulo form ((ptr as usize) % align_of::()) to catch externally-built FixedSizeBinaryArrays (FFI, IPC, Buffer::from_custom_allocation) that arrow-rs alone does not require to be more than 1-byte aligned. - bfloat16.rs as_slice impl: add rustdoc # Preconditions and # Endianness sections so the contract is visible without opening the source. Verification: cargo fmt --all -- --check, cargo clippy -p lance-arrow --tests -- -D warnings, and cargo test -p lance-arrow (91 unit tests + 6 doc tests) all pass. --- rust/lance-arrow/src/bfloat16.rs | 60 ++++++++++++++++++++++++++++++++ rust/lance-arrow/src/deepcopy.rs | 22 ++++++++++++ rust/lance-arrow/src/floats.rs | 16 +++++++++ rust/lance-arrow/src/lib.rs | 8 +++++ 4 files changed, 106 insertions(+) diff --git a/rust/lance-arrow/src/bfloat16.rs b/rust/lance-arrow/src/bfloat16.rs index 3049be1117e..eb730852880 100644 --- a/rust/lance-arrow/src/bfloat16.rs +++ b/rust/lance-arrow/src/bfloat16.rs @@ -144,6 +144,11 @@ impl FromIterator> for BFloat16Array { .len(len) .add_buffer(buffer.into()) .null_bit_buffer(null_buffer); + // SAFETY: the value buffer contains exactly `2 * len` bytes (two bytes + // pushed per iteration of the loop above, including the zero-fill for + // null slots), which matches the `FixedSizeBinary(2)` storage layout. + // The null bit buffer, when present, has `len` bits appended above, so + // its length covers the array's logical range. let array_data = unsafe { array_data.build_unchecked() }; Self { inner: FixedSizeBinaryArray::from(array_data), @@ -170,6 +175,10 @@ impl From> for BFloat16Array { let array_data = ArrayData::builder(DataType::FixedSizeBinary(2)) .len(data.len()) .add_buffer(buffer.into()); + // SAFETY: the buffer contains exactly `2 * data.len()` bytes — each + // `bf16` writes its two little-endian bytes once — matching the + // `FixedSizeBinary(2)` storage layout. No null buffer is attached, so + // every element is logically valid. let array_data = unsafe { array_data.build_unchecked() }; Self { inner: FixedSizeBinaryArray::from(array_data), @@ -268,12 +277,63 @@ mod from_arrow { impl FloatArray for FixedSizeBinaryArray { type FloatType = BFloat16Type; + /// Returns the underlying `bf16` values as a borrowed slice. + /// + /// # Preconditions + /// + /// - `value_length()` must be 2 (the [`FixedSizeBinary(2)`] storage shape + /// used by [`BFloat16Array`]). Asserted at entry. + /// - The value buffer must be at least 2-byte aligned. Lance's in-tree + /// constructors always satisfy this (every value buffer goes through + /// `MutableBuffer`, which is aligned to arrow-buffer's `ALIGNMENT` + /// constant — ≥32 bytes on every supported target). Externally-built + /// `FixedSizeBinaryArray`s arriving via FFI, IPC, or + /// `Buffer::from_custom_allocation` are not required by arrow-rs to be + /// aligned beyond a single byte; passing one to this method violates the + /// precondition. A `debug_assert` below catches such inputs in debug and + /// test builds. + /// + /// # Endianness + /// + /// `lance-arrow` is gated on `target_endian = "little"` at the crate root, + /// so this method always returns values in the same byte order Lance writes + /// (see [`BFloat16Array::value`] and the [`FromIterator`] impls). fn as_slice(&self) -> &[bf16] { assert_eq!( self.value_length(), 2, "BFloat16 arrays must use FixedSizeBinary(2) storage" ); + debug_assert_eq!( + (self.value_data().as_ptr() as usize) % std::mem::align_of::(), + 0, + "BFloat16 value buffer must be at least 2-byte aligned" + ); + // SAFETY: + // - The assert above pins `value_size == 2`, so `value_data().len() / 2` + // equals the array's logical element count. + // `FixedSizeBinaryArray::From` constructs its value buffer + // as `buffers[0].slice_with_length(offset * 2, len * 2)` (arrow-array + // `fixed_size_binary_array.rs`), so `value_data()` already returns + // the offset-adjusted slice. Do not replace `value_data()` with an + // accessor that returns the un-sliced backing buffer. + // - `bf16` is `#[repr(transparent)]` over `u16` (size 2, alignment 2); + // every `u16` bit pattern is a valid `bf16`, so any byte content + // yields a defined value — never UB. + // - Alignment is the caller's responsibility per the precondition + // documented above. The `debug_assert_eq!` immediately preceding this + // block catches violations in debug and test builds only — release + // builds rely on callers honoring the precondition. arrow-rs + // declares `FixedSizeBinary(n)`'s + // `BufferSpec::FixedWidth { alignment: align_of::() == 1 }` + // (arrow-data `data.rs`), so arrow-rs alone does not guarantee + // 2-byte alignment. Lance's in-tree construction paths build value + // buffers via `MutableBuffer` (arrow-buffer `ALIGNMENT` constant, + // ≥32 bytes on every supported target), which trivially satisfies + // `bf16`'s 2-byte requirement. + // - The returned slice borrows from `self`; the underlying ref-counted, + // immutable Arrow buffer cannot be mutated or freed for the slice's + // lifetime. unsafe { slice::from_raw_parts( self.value_data().as_ptr() as *const bf16, diff --git a/rust/lance-arrow/src/deepcopy.rs b/rust/lance-arrow/src/deepcopy.rs index b747b24d466..afeb853350b 100644 --- a/rust/lance-arrow/src/deepcopy.rs +++ b/rust/lance-arrow/src/deepcopy.rs @@ -14,6 +14,15 @@ pub fn deep_copy_buffer(buffer: &Buffer) -> Buffer { pub fn deep_copy_nulls(nulls: Option<&NullBuffer>) -> Option { let nulls = nulls?; let bit_buffer = deep_copy_buffer(nulls.inner().inner()); + // SAFETY: `null_count` is taken from the source `NullBuffer`, which already + // upheld `NullBuffer::new_unchecked`'s invariant — the unset-bit count over + // the logical bit slice `[bit_offset, bit_offset + bit_len)`. `NullBuffer::slice` + // adjusts only `BooleanBuffer::bit_offset` / `bit_len` and never byte-advances + // the inner `Buffer`, so `deep_copy_buffer` (which copies the source `Buffer`'s + // `as_slice()` view from byte 0) reproduces the exact bit pattern at the same + // bit offsets; the unset-bit count is therefore preserved. `BooleanBuffer::new` + // panics (does not UB) if `bit_offset + bit_len > 8 * buffer.len()`, and the + // copy has the same length, so that check still passes. Some(unsafe { NullBuffer::new_unchecked( BooleanBuffer::new(bit_buffer, nulls.offset(), nulls.len()), @@ -37,6 +46,19 @@ pub fn deep_copy_array_data(data: &ArrayData) -> ArrayData { .iter() .map(deep_copy_array_data) .collect::>(); + // SAFETY: `build_unchecked` inherits `ArrayData::new_unchecked`'s contract — + // `(data_type, len, offset, nulls, buffers, child_data)` must form a valid + // Arrow array. This call reproduces `data` structurally: `data_type`, `len`, + // and `offset` are forwarded unchanged; each buffer is replaced by a byte- + // identical copy of its offset-applied `as_slice()` view (the output buffer + // is `MutableBuffer`-allocated, at least as aligned as the source); `nulls` + // is deep-copied with the same bit offset/length and unset-bit count (see + // `deep_copy_nulls`); `child_data` is recursively cloned with the same + // guarantee. Every value-level invariant the source upheld — UTF-8 validity, + // monotonic offsets, in-bounds dictionary indices, run-end monotonicity, + // struct child-length matching — therefore transfers to the copy. If the + // source `ArrayData` was itself constructed via `new_unchecked` with an + // invalid payload, this function faithfully reproduces that invalidity. unsafe { ArrayDataBuilder::new(data_type) .len(len) diff --git a/rust/lance-arrow/src/floats.rs b/rust/lance-arrow/src/floats.rs index 054f4418e0b..b6f6a9fff74 100644 --- a/rust/lance-arrow/src/floats.rs +++ b/rust/lance-arrow/src/floats.rs @@ -184,6 +184,22 @@ pub trait FloatArray: Array + Clone + 'static { type FloatType: ArrowFloatType; /// Returns a reference to the underlying data as a slice. + /// + /// # Panics + /// + /// Implementations may panic if the array's storage shape does not match + /// the expected element layout. In particular, the `bf16` impl panics if + /// `value_length() != 2` (the [`FixedSizeBinary(2)`] shape required by + /// `BFloat16Array`). + /// + /// # Preconditions + /// + /// Implementations may impose additional invariants on the underlying + /// buffer. The `bf16` impl requires the value buffer to be at least + /// 2-byte aligned — satisfied automatically by every in-tree Lance + /// constructor, but external callers passing externally-built arrays + /// (FFI, IPC, `Buffer::from_custom_allocation`) must ensure alignment. + /// See the impl's docstring for details. fn as_slice(&self) -> &[T::Native]; /// Construct an array from a vector of values. diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index 34a67600543..e6c5af2df34 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -5,6 +5,14 @@ //! //! To improve Arrow-RS ergonomic +// lance-arrow reinterprets value bytes as native numeric types in +// `FloatArray::as_slice` for `bf16` (rust/lance-arrow/src/bfloat16.rs), which +// requires the host byte order to match the on-disk byte order Lance writes. +// Lance writes little-endian; building on a big-endian target would silently +// produce wrong numeric values. +#[cfg(not(target_endian = "little"))] +compile_error!("lance-arrow only supports little-endian targets"); + use std::sync::Arc; use std::{collections::HashMap, ptr::NonNull}; From cdc25d7f4a7ab21bb14af330361820b8db53abde Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Mon, 29 Jun 2026 20:29:41 +0800 Subject: [PATCH 2/3] fix(arrow): drop broken intra-doc links to FixedSizeBinary(2) `FixedSizeBinary` is a variant of `arrow_schema::DataType`, not a standalone item, so `[`FixedSizeBinary(2)`]` had no resolvable target. With rustdoc's broken-intra-doc-links lint promoted to a deny-warning in CI (the rustdoc job runs with `-D warnings`), this failed the build. Convert the references to plain code-formatted prose. The lint also covers the new doc comments added in cfd691dfc. Verified locally: RUSTDOCFLAGS="-D warnings" cargo doc -p lance-arrow --no-deps now passes. cargo clippy -p lance-arrow --tests -- -D warnings and cargo test -p lance-arrow (91 unit + 6 doc tests) still pass. --- rust/lance-arrow/src/bfloat16.rs | 2 +- rust/lance-arrow/src/floats.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/lance-arrow/src/bfloat16.rs b/rust/lance-arrow/src/bfloat16.rs index eb730852880..20e94cf179d 100644 --- a/rust/lance-arrow/src/bfloat16.rs +++ b/rust/lance-arrow/src/bfloat16.rs @@ -281,7 +281,7 @@ impl FloatArray for FixedSizeBinaryArray { /// /// # Preconditions /// - /// - `value_length()` must be 2 (the [`FixedSizeBinary(2)`] storage shape + /// - `value_length()` must be 2 (the `FixedSizeBinary(2)` storage shape /// used by [`BFloat16Array`]). Asserted at entry. /// - The value buffer must be at least 2-byte aligned. Lance's in-tree /// constructors always satisfy this (every value buffer goes through diff --git a/rust/lance-arrow/src/floats.rs b/rust/lance-arrow/src/floats.rs index b6f6a9fff74..ad5f3768980 100644 --- a/rust/lance-arrow/src/floats.rs +++ b/rust/lance-arrow/src/floats.rs @@ -189,7 +189,7 @@ pub trait FloatArray: Array + Clone + 'static { /// /// Implementations may panic if the array's storage shape does not match /// the expected element layout. In particular, the `bf16` impl panics if - /// `value_length() != 2` (the [`FixedSizeBinary(2)`] shape required by + /// `value_length() != 2` (the `FixedSizeBinary(2)` shape required by /// `BFloat16Array`). /// /// # Preconditions From f2596e9b87e5b03223dd33878a4767237c78ec61 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Tue, 30 Jun 2026 15:59:33 +0800 Subject: [PATCH 3/3] chore(arrow): enable clippy::undocumented_unsafe_blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All unsafe blocks in lance-arrow now carry SAFETY comments (the previous commit on this branch added the missing ones). Enable the `clippy::undocumented_unsafe_blocks` lint at the crate root so future unsafe blocks must come with a SAFETY comment too — making the project's Rust security rule mechanically enforced rather than convention. Verified: cargo clippy -p lance-arrow --tests --benches --all-targets -- -D warnings passes; cargo test -p lance-arrow still 91 unit + 6 doc tests; RUSTDOCFLAGS="-D warnings" cargo doc -p lance-arrow --no-deps clean. Sanity-checked by temporarily removing two SAFETY comments — clippy correctly emitted "unsafe block missing a safety comment" warnings. Per wjones127's review suggestion on #7511. --- rust/lance-arrow/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index e6c5af2df34..b38dcf25e12 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -5,6 +5,8 @@ //! //! To improve Arrow-RS ergonomic +#![warn(clippy::undocumented_unsafe_blocks)] + // lance-arrow reinterprets value bytes as native numeric types in // `FloatArray::as_slice` for `bf16` (rust/lance-arrow/src/bfloat16.rs), which // requires the host byte order to match the on-disk byte order Lance writes.