Project-Navi · Navi Bot (project-navi-bot) · May 26, 2026 · May 26, 2026 · May 26, 2026
@@ -103,7 +103,7 @@ impl Bitmap {
     pub fn build_query_bitmap_fp32(&self, q: &[f32]) -> Vec<u64> {
         assert_eq!(q.len(), self.dim);
         assert_all_finite(q);
-        // Index the dim sorted by |q[j]| desc; alternative: by q[j] desc.
+        // Sort the dim indices by q[j] desc; alternative: by |q[j]| desc.
         // We use raw value desc so the top bits flag where the query
         // points positively, matching the doc-side semantics.
         let mut idx: Vec<u16> = (0..self.dim as u16).collect();
@@ -178,7 +178,7 @@ impl Bitmap {
     /// streaming top-k buffer (each replacement triggers a linear
     /// recompute_min). Instead we scan once into a contiguous
     /// `Vec<u32>` of all N scores and `select_nth_unstable` the
-    /// top-`m`: O(N + m log m). The 828 KiB temp at N=207k is
+    /// top-`m`: O(N + m log m). The ~808 KiB temp at N=207k is
     /// cheap relative to the cost it saves at M ≥ 1000.
     pub fn top_m_candidates(&self, q: &[f32], m: usize) -> Vec<u32> {
         assert_all_finite(q);
@@ -257,7 +257,7 @@ impl Bitmap {
         // One doc-scan pass writes `batch * n` u32 scores, layout
         // scores[bi * n + di]. At B=8, N=207k that buffer is ~6.6 MB —
         // L3-resident, not per-core L2. The parallel select_nth below
-        // streams one query's ~828 KiB score slice per worker; it backs
+        // streams one query's ~808 KiB score slice per worker; it backs
         // from L3, but the selection is a single linear pass, so it stays
         // bandwidth-bound rather than thrashing a small cache.
         let scores_len = batch

@@ -16,7 +16,7 @@
 //! just rearranges the same sum.
 //!
 //! Storage: `dim × 2^bits / 8` bytes per document
-//! (b=2: 512 B/doc at D=1024 = matches RankQuant b=2;
+//! (b=2: 512 B/doc at D=1024 = 2× RankQuant b=2;
 //!  b=4: 2048 B/doc at D=1024 = 4× RankQuant b=4).
 //!
 //! The full 16×16 (b=4) probe is *not* a faster scoring kernel — it

@@ -5,9 +5,9 @@
 //! dispatches AVX-512 → AVX2 → scalar via the kernels in
 //! [`crate::quant_kernels`].
 //!
-//! The byte-LUT path ([`search_asymmetric_byte_lut`]) is exposed
-//! publicly via `ordvec::search_asymmetric_byte_lut` so
-//! `examples/bench_rank.rs` can compare it against the production
+//! The byte-LUT path ([`search_asymmetric_byte_lut`]) is re-exported with
+//! `#[doc(hidden)]` (reachable as `ordvec::search_asymmetric_byte_lut`)
+//! so `examples/bench_rank.rs` can compare it against the production
 //! AVX path on the same data.
 
 use rayon::prelude::*;
@@ -74,10 +74,12 @@ enum SimdTier {
 /// `dim % (1 << bits) == 0` and `dim % (8 / bits) == 0`, which is
 /// *weaker* than the SIMD invariants (e.g. dim 48 / 80 / 20 are valid
 /// constructor dims that violate them). A kernel whose invariant is
-/// unmet silently drops its trailing chunk in release builds and
-/// returns the wrong top-k. This selector returns the highest tier
-/// whose invariant holds — falling back to [`SimdTier::None`] (scalar
-/// LUT, which handles any valid dim) when neither SIMD tier fits.
+/// unmet hits a hard `assert!` and panics even in release — the kernels
+/// enforce their lane invariant in every build, by design. This
+/// selector returns the highest tier whose invariant holds — falling
+/// back to [`SimdTier::None`] (scalar LUT, which handles any valid dim)
+/// when neither SIMD tier fits, so a constructor-valid-but-SIMD-invalid
+/// dim never reaches a kernel that would reject it.
 #[inline]
 fn select_simd_tier(dim: usize, bits: u8) -> SimdTier {
     // SIMD asymmetric kernels exist only for b ∈ {2, 4}. b=1 (and any
@@ -293,9 +295,10 @@ impl RankQuant {
         // processes 8 codes/chunk → needs dim % 8). The constructor only
         // guarantees `dim % (1 << bits) == 0` and `dim % (8 / bits) == 0`,
         // so constructor-valid dims like 48 / 80 / 20 can violate the
-        // SIMD invariant. In release builds the kernels' `debug_assert`
-        // is compiled out and they silently drop the trailing chunk →
-        // wrong top-k. The dispatch below must therefore only select a
+        // SIMD invariant. Each kernel enforces its lane invariant with a
+        // real `assert!` (not a `debug_assert!`), so a mis-dispatch panics
+        // loudly in release rather than silently dropping a chunk. The
+        // dispatch below must therefore only select a
         // tier whose invariant holds for (dim, bits); otherwise it falls
         // back to the scalar LUT path which handles any valid dim.
         #[cfg_attr(not(target_arch = "x86_64"), allow(unused_variables))]
@@ -651,7 +654,7 @@ impl RankQuant {
 //   B=2: 256 groups × 256 entries × 4 B = 256 KiB per query (fits L2)
 //   B=4: 512 groups × 256 entries × 4 B = 512 KiB per query (spills L2 a little)
 //
-// Exposed publicly for benchmarking. Production callers should reach
+// Re-exported `#[doc(hidden)]` for benchmarking. Production callers should reach
 // for [`RankQuant::search_asymmetric`] which dispatches to the
 // fastest implementation for the current CPU.
 // -------------------------------------------------------------------

@@ -19,7 +19,7 @@
 //! the value, and rank-vector norms are analytical (a permutation of
 //! `{0..D-1}` has fixed L2 norm after mean-centring).
 //!
-//! See `tests/rank.rs` for the round-trip and norm-invariant tests.
+//! See the `tests` module below for the round-trip and norm-invariant tests.
 
 use ordered_float::OrderedFloat;
 use rayon::prelude::*;

@@ -20,12 +20,13 @@
 //!
 //! All loaders validate header fields *before* allocating the payload
 //! buffer:
-//! * `dim` and `n_vectors` are bounded by [`MAX_DIM`] and [`MAX_VECTORS`].
+//! * `dim` is bounded by [`MAX_DIM`] (or [`MAX_SIGN_BITMAP_DIM`] for sign
+//!   bitmaps) and `n_vectors` by [`MAX_VECTORS`].
 //! * `bits` is checked against `{1, 2, 4}` before any multiplication.
 //! * Total payload size is computed via [`usize::checked_mul`] and
 //!   rejected if it overflows or exceeds the 128 GiB `MAX_PAYLOAD` cap.
-//!   (`MAX_DIM * MAX_VECTORS` alone is ~8 TiB, so `MAX_PAYLOAD` is the
-//!   binding byte ceiling, not the `dim` / `n_vectors` caps.)
+//!   (`MAX_DIM * MAX_VECTORS * 2` bytes alone is ~8 TiB, so `MAX_PAYLOAD`
+//!   is the binding byte ceiling, not the `dim` / `n_vectors` caps.)
 //! * The declared payload must match the file's remaining bytes
 //!   *exactly* — a structurally-valid file with trailing bytes is
 //!   rejected (v1 formats have no footer or reserved trailing section).

@@ -4,8 +4,12 @@
 //!   path (full ranks, bucketed ranks, bitmap overlap).
 //! - [`l2_normalise`] returns the unit-norm copy of a query vector for
 //!   the asymmetric scoring path.
+//! - The checked-allocation guards (`result_buffer_len`, `checked_new_len`),
+//!   the finite-input assert (`assert_all_finite`), and the portable AND/XOR
+//!   popcount reductions (`and_popcount` / `xor_popcount`) round out the
+//!   shared helpers.
 //!
-//! Both items are `pub(crate)` so they are reachable from the sibling
+//! These items are all `pub(crate)` so they are reachable from the sibling
 //! index modules (`rank`, `quant`, `bitmap`, `multi_bucket`, `fastscan`)
 //! but not from outside the crate.
 

@@ -1,6 +1,6 @@
 //! Integration tests for the rank-cosine index family.
 //!
-//! Three substrate types and their kernels:
+//! Three kinds of check across the substrate types and their kernels:
 //!
 //! 1. Scalar correctness — each kernel agrees with a hand-written
 //!    reference implementation on the same inputs (top-k indices
@@ -11,8 +11,8 @@
 //! 3. Loader robustness — malformed serialisation files surface as
 //!    `Err`, never panic.
 //!
-//! The file split mirrors `ordvec::index` (`rank.rs`,
-//! `quant.rs`, `bitmap.rs`, `multi_bucket.rs`). Shared corpus +
+//! The file split mirrors the crate's flat per-type modules
+//! (`rank.rs`, `quant.rs`, `bitmap.rs`, `multi_bucket.rs`). Shared corpus +
 //! reference helpers live here; loader fuzz lives here because it
 //! crosses all four loader types (rank, rankquant, bitmap, sign
 //! bitmap) in a single hermetic test.
@@ -26,10 +26,10 @@ use rand_chacha::ChaCha8Rng;
 
 mod bitmap;
 mod fastscan;
-mod rank;
-// `MultiBucketBitmap` is gated behind the `experimental` feature.
 mod finite;
 mod loader_validation;
+mod rank;
+// `MultiBucketBitmap` is gated behind the `experimental` feature.
 #[cfg(feature = "experimental")]
 mod multi_bucket;
 mod quant;

@@ -89,7 +89,8 @@ fn rt1_subset_in_range_matches_reference_popcount() {
     let q: Vec<f32> = (0..DIM).map(|i| ((i * 17) % 89) as f32 - 44.0).collect();
     let qb = idx.build_query_bitmap_fp32(&q);
 
-    // Ascending subset (the public contract requires sorted ids).
+    // Ascending subset (the recommended order for cache locality; sorting
+    // is a performance preference, not a correctness requirement).
     let doc_ids = [0u32, 2, 3, 5];
     let mut out = vec![0u32; doc_ids.len()];
     idx.body_overlap_scores_subset(&qb, &doc_ids, &mut out);