Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/bitmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ impl Bitmap {
pub fn build_query_bitmap_fp32(&self, q: &[f32]) -> Vec<u64> {
assert_eq!(q.len(), self.dim);
assert_all_finite(q);
// Index the dim sorted by |q[j]| desc; alternative: by q[j] desc.
// Index the dim sorted by q[j] desc; alternative: by |q[j]| desc.
// We use raw value desc so the top bits flag where the query
// points positively, matching the doc-side semantics.
let mut idx: Vec<u16> = (0..self.dim as u16).collect();
Expand Down Expand Up @@ -178,7 +178,7 @@ impl Bitmap {
/// streaming top-k buffer (each replacement triggers a linear
/// recompute_min). Instead we scan once into a contiguous
/// `Vec<u32>` of all N scores and `select_nth_unstable` the
/// top-`m`: O(N + m log m). The 828 KiB temp at N=207k is
/// top-`m`: O(N + m log m). The ~808 KiB temp at N=207k is
/// cheap relative to the cost it saves at M ≥ 1000.
pub fn top_m_candidates(&self, q: &[f32], m: usize) -> Vec<u32> {
assert_all_finite(q);
Expand Down Expand Up @@ -257,7 +257,7 @@ impl Bitmap {
// One doc-scan pass writes `batch * n` u32 scores, layout
// scores[bi * n + di]. At B=8, N=207k that buffer is ~6.6 MB —
// L3-resident, not per-core L2. The parallel select_nth below
// streams one query's ~828 KiB score slice per worker; it backs
// streams one query's ~808 KiB score slice per worker; it is served
// from L3, but the selection is a single linear pass, so it stays
// bandwidth-bound rather than thrashing a small cache.
let scores_len = batch
Expand Down
2 changes: 1 addition & 1 deletion src/multi_bucket.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
//! just rearranges the same sum.
//!
//! Storage: `dim × 2^bits / 8` bytes per document
//! (b=2: 512 B/doc at D=1024 = matches RankQuant b=2;
//! (b=2: 512 B/doc at D=1024 = 2× RankQuant b=2;
//! b=4: 2048 B/doc at D=1024 = 4× RankQuant b=4).
//!
//! The full 16×16 (b=4) probe is *not* a faster scoring kernel — it
Expand Down
25 changes: 14 additions & 11 deletions src/quant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
//! dispatches AVX-512 → AVX2 → scalar via the kernels in
//! [`crate::quant_kernels`].
//!
//! The byte-LUT path ([`search_asymmetric_byte_lut`]) is exposed
//! publicly via `ordvec::search_asymmetric_byte_lut` so
//! `examples/bench_rank.rs` can compare it against the production
//! The byte-LUT path ([`search_asymmetric_byte_lut`]) is re-exported
//! `#[doc(hidden)]` (reachable as `ordvec::search_asymmetric_byte_lut`)
//! so `examples/bench_rank.rs` can compare it against the production
//! AVX path on the same data.

use rayon::prelude::*;
Expand Down Expand Up @@ -74,10 +74,12 @@ enum SimdTier {
/// `dim % (1 << bits) == 0` and `dim % (8 / bits) == 0`, which is
/// *weaker* than the SIMD invariants (e.g. dim 48 / 80 / 20 are valid
/// constructor dims that violate them). A kernel whose invariant is
/// unmet silently drops its trailing chunk in release builds and
/// returns the wrong top-k. This selector returns the highest tier
/// whose invariant holds — falling back to [`SimdTier::None`] (scalar
/// LUT, which handles any valid dim) when neither SIMD tier fits.
/// unmet hits a hard `assert!` and panics in release — the kernels
/// enforce their lane invariant in every build, by design. This
/// selector returns the highest tier whose invariant holds — falling
/// back to [`SimdTier::None`] (scalar LUT, which handles any valid dim)
/// when neither SIMD tier fits, so a constructor-valid-but-SIMD-invalid
/// dim never reaches a kernel that would reject it.
#[inline]
fn select_simd_tier(dim: usize, bits: u8) -> SimdTier {
// SIMD asymmetric kernels exist only for b ∈ {2, 4}. b=1 (and any
Expand Down Expand Up @@ -293,9 +295,10 @@ impl RankQuant {
// processes 8 codes/chunk → needs dim % 8). The constructor only
// guarantees `dim % (1 << bits) == 0` and `dim % (8 / bits) == 0`,
// so constructor-valid dims like 48 / 80 / 20 can violate the
// SIMD invariant. In release builds the kernels' `debug_assert`
// is compiled out and they silently drop the trailing chunk →
// wrong top-k. The dispatch below must therefore only select a
// SIMD invariant. Each kernel enforces its lane invariant with a
// real `assert!` (not a `debug_assert!`), so a mis-dispatch panics
// loudly in release rather than silently dropping a chunk. The
// dispatch below must therefore only select a
// tier whose invariant holds for (dim, bits); otherwise it falls
// back to the scalar LUT path which handles any valid dim.
#[cfg_attr(not(target_arch = "x86_64"), allow(unused_variables))]
Expand Down Expand Up @@ -651,7 +654,7 @@ impl RankQuant {
// B=2: 256 groups × 256 entries × 4 B = 256 KiB per query (fits L2)
// B=4: 512 groups × 256 entries × 4 B = 512 KiB per query (spills L2 a little)
//
// Exposed publicly for benchmarking. Production callers should reach
// Re-exported `#[doc(hidden)]` for benchmarking. Production callers should reach
// for [`RankQuant::search_asymmetric`] which dispatches to the
// fastest implementation for the current CPU.
// -------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion src/rank.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
//! the value, and rank-vector norms are analytical (a permutation of
//! `{0..D-1}` has fixed L2 norm after mean-centring).
//!
//! See `tests/rank.rs` for the round-trip and norm-invariant tests.
//! See the `tests` module below for the round-trip and norm-invariant tests.

use ordered_float::OrderedFloat;
use rayon::prelude::*;
Expand Down
7 changes: 4 additions & 3 deletions src/rank_io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@
//!
//! All loaders validate header fields *before* allocating the payload
//! buffer:
//! * `dim` and `n_vectors` are bounded by [`MAX_DIM`] and [`MAX_VECTORS`].
//! * `dim` and `n_vectors` are bounded by [`MAX_DIM`] (or
//! [`MAX_SIGN_BITMAP_DIM`] for sign bitmaps) and [`MAX_VECTORS`].
//! * `bits` is checked against `{1, 2, 4}` before any multiplication.
//! * Total payload size is computed via [`usize::checked_mul`] and
//! rejected if it overflows or exceeds the 128 GiB `MAX_PAYLOAD` cap.
//! (`MAX_DIM * MAX_VECTORS` alone is ~8 TiB, so `MAX_PAYLOAD` is the
//! binding byte ceiling, not the `dim` / `n_vectors` caps.)
//! (`MAX_DIM * MAX_VECTORS * 2` bytes alone is ~8 TiB, so `MAX_PAYLOAD`
//! is the binding byte ceiling, not the `dim` / `n_vectors` caps.)
//! * The declared payload must match the file's remaining bytes
//! *exactly* — a structurally-valid file with trailing bytes is
//! rejected (v1 formats have no footer or reserved trailing section).
Expand Down
6 changes: 5 additions & 1 deletion src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
//! path (full ranks, bucketed ranks, bitmap overlap).
//! - [`l2_normalise`] returns the unit-norm copy of a query vector for
//! the asymmetric scoring path.
//! - The checked-allocation guards (`result_buffer_len`, `checked_new_len`),
//! the finite-input assert (`assert_all_finite`), and the portable AND/XOR
//! popcount reductions (`and_popcount` / `xor_popcount`) round out the
//! shared helpers.
//!
//! Both items are `pub(crate)` so they are reachable from the sibling
//! These items are all `pub(crate)` so they are reachable from the sibling
//! index modules (`rank`, `quant`, `bitmap`, `multi_bucket`, `fastscan`)
//! but not from outside the crate.

Expand Down
10 changes: 5 additions & 5 deletions tests/index/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Integration tests for the rank-cosine index family.
//!
//! Three substrate types and their kernels:
//! Three kinds of check across the substrate types and their kernels:
//!
//! 1. Scalar correctness — each kernel agrees with a hand-written
//! reference implementation on the same inputs (top-k indices
Expand All @@ -11,8 +11,8 @@
//! 3. Loader robustness — malformed serialisation files surface as
//! `Err`, never panic.
//!
//! The file split mirrors `ordvec::index` (`rank.rs`,
//! `quant.rs`, `bitmap.rs`, `multi_bucket.rs`). Shared corpus +
//! The file split mirrors the crate's flat per-type modules
//! (`rank.rs`, `quant.rs`, `bitmap.rs`, `multi_bucket.rs`). Shared corpus +
//! reference helpers live here; loader fuzz lives here because it
//! crosses all four loader types (rank, rankquant, bitmap, sign
//! bitmap) in a single hermetic test.
Expand All @@ -26,10 +26,10 @@ use rand_chacha::ChaCha8Rng;

mod bitmap;
mod fastscan;
mod rank;
// `MultiBucketBitmap` is gated behind the `experimental` feature.
mod finite;
mod loader_validation;
mod rank;
// `MultiBucketBitmap` is gated behind the `experimental` feature.
#[cfg(feature = "experimental")]
mod multi_bucket;
mod quant;
Expand Down
3 changes: 2 additions & 1 deletion tests/redteam_alpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ fn rt1_subset_in_range_matches_reference_popcount() {
let q: Vec<f32> = (0..DIM).map(|i| ((i * 17) % 89) as f32 - 44.0).collect();
let qb = idx.build_query_bitmap_fp32(&q);

// Ascending subset (the public contract requires sorted ids).
// Ascending subset (the recommended order for cache locality; sorting
// is a performance preference, not a correctness requirement).
let doc_ids = [0u32, 2, 3, 5];
let mut out = vec![0u32; doc_ids.len()];
idx.body_overlap_scores_subset(&qb, &doc_ids, &mut out);
Expand Down
Loading