diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 07cb825c..755f4ecb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -185,6 +185,8 @@ jobs: run: cargo test - name: cargo test (experimental) run: cargo test --features experimental + - name: cargo test (test-utils) + run: cargo test --features test-utils - name: cargo test (no default features) run: cargo test --no-default-features - name: cargo build --release --example bench_rank diff --git a/CHANGELOG.md b/CHANGELOG.md index f330e02f..bd5c1cc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 VPOPCNTDQ scan kernels are active on the current CPU. The scan dispatch reads only this predicate (no per-dimension gate). +### Changed + +- **Release-hardened the caller-owned serial two-stage primitives** (no API + change; added in 0.5.0). The trust model is now explicit and tested: + - Rejection-path regression tests for the full CSR/query/buffer validation set + on the rerank entry points — overlong row (the guard that bounds the unsafe + gather), non-monotonic / wrong-final / non-zero-first offsets, non-finite and + ragged queries, and wrong output-buffer length — so a malformed-but-accepted + input can never reach the SIMD scan. + - A counting-allocator test proving `search_asymmetric_subset_batched_serial_into` + performs **zero heap allocations** in steady state (warmed `SubsetScratch`, + reused caller buffers) **on the AVX-512/AVX2 rerank path** — the strong form of + the prior capacity-stability proxy. (The scalar fallback, e.g. aarch64, + allocates a per-query scoring LUT; the test skips the strict check there.) 
+ - A focused `two_stage_bench` example decomposing stage-1 candidate-gen / + single-query rerank loop / batched `_into` / full two-stage at the + Harrier-1024 shape, with a committed reference capture + (`benchmarks/two_stage_caller_owned_dim1024.txt`, SYNTHETIC corpus). + - User-facing docs for the caller-owned / no-rayon / allocation-free contract + (README + rustdoc examples on the `_into` hot path and the CSR candidate-gen). + ### Fixed - **`ordvec-manifest` crate and wheel now ship license text.** Both declared @@ -53,8 +74,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `RankQuant::search_asymmetric_subset_batched_serial(..) -> SearchResults` and `..._serial_into(.., &mut SubsetScratch, &mut out_scores, &mut out_indices)` — serial batched subset rerank; the `_into` form is allocation-free after - scratch warmup (the integration contract for runtimes that own their own - thread pool / GIL release). + scratch warmup on the AVX-512/AVX2 rerank path (the integration contract for + runtimes that own their own thread pool / GIL release). - New public types `CandidateBatch` (CSR candidate carrier) and `SubsetScratch` (reusable rerank scratch). - These primitives never enter rayon; the caller owns parallelism. No bundled diff --git a/Cargo.toml b/Cargo.toml index 3a936a00..3a63ad89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,6 +73,10 @@ rand_chacha = "0.10" # target takes the scalar fallback. `experimental` exposes MultiBucketBitmap # (research scaffold), kept off the stable surface. experimental = [] +# `test-utils` exposes internal dispatch probes used by the crate's own integration +# tests (e.g. the allocation-free guarantee check). Gated off the default surface +# because these helpers are not part of the public API and carry no semver guarantee. 
+test-utils = [] [profile.release] lto = true diff --git a/README.md b/README.md index 86b5be9e..f6d77306 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,49 @@ For the two-stage compressed-scan path (`Bitmap` / `SignBitmap` candidate generation → `RankQuant` rerank) and the full mode comparison, see [`docs/RANK_MODES.md`](docs/RANK_MODES.md). +### Caller-owned serial two-stage (DB / runtime integration) + +For runtimes that own their own parallelism — an embedded vector DB driving a +bounded thread pool, or a binding releasing the GIL — ordvec exposes a +**no-rayon** serial two-stage path so the *caller* schedules the work, with an +**allocation-free rerank step** (`_into`, on the AVX-512/AVX2 path) for the +steady-state hot loop: + +```rust +use ordvec::{RankQuant, SignBitmap, SubsetScratch}; +// Shape sketch (not standalone): `rq: RankQuant` and `sign: SignBitmap` are +// built and `add`-ed as in the Quickstart above; `queries` is your flat +// `dim * nq` f32 batch, `m` the shortlist size, `k` the top-k. +// Stage 1 — serial CSR candidate generation (never enters rayon): +let cb = sign.top_m_candidates_batched_serial_csr(&queries, m); // CandidateBatch { offsets, candidates } +// Stage 2 — rerank into CALLER-OWNED buffers with a reusable scratch: +let nq = queries.len() / dim; +let out_k = k.min(rq.len()); +let mut scratch = SubsetScratch::new(); // reuse across batches +let mut out_scores = vec![f32::NEG_INFINITY; nq * out_k]; +let mut out_indices = vec![-1i64; nq * out_k]; +rq.search_asymmetric_subset_batched_serial_into( + &queries, &cb.offsets, &cb.candidates, k, + &mut scratch, &mut out_scores, &mut out_indices, +); +``` + +Contract: candidates are **CSR** (`offsets.len() == nq + 1`; row `qi` is +`candidates[offsets[qi]..offsets[qi+1]]`; rows need **not** be sorted). Output is +**rectangular** `nq * out_k` and **sentinel-padded** (`-1` / `NEG_INFINITY`) for +underfull rows — size both buffers to `nq * k.min(index.len())`. 
Scores, row ids, +and the deterministic tie policy (`score desc, global row-id asc`) match the +single-query `search_asymmetric_subset`. **Only the `_into` rerank step is +allocation-free** — on the **AVX-512 / AVX2** SIMD path, and only on repeated +calls of the *same* batch shape — reusing the warmed `SubsetScratch` and your +output buffers (no per-row alloc, no whole-buffer preclear). The scalar fallback +(no AVX2, e.g. aarch64) allocates a per-query scoring LUT. Stage 1 +(`top_m_candidates_batched_serial_csr`) also allocates a fresh `CandidateBatch` +each call. Neither primitive enters rayon — +partition the query batch and call `_into` once per worker range from your own +pool. A focused decomposition benchmark lives in +[`examples/two_stage_bench.rs`](examples/two_stage_bench.rs). + ### Python The same `Rank` / `RankQuant` / `Bitmap` / `SignBitmap` API is available from diff --git a/benchmarks/two_stage_caller_owned_dim1024.txt b/benchmarks/two_stage_caller_owned_dim1024.txt new file mode 100644 index 00000000..765a3a65 --- /dev/null +++ b/benchmarks/two_stage_caller_owned_dim1024.txt @@ -0,0 +1,21 @@ +Caller-owned serial two-stage decomposition — Harrier-1024 shape (SYNTHETIC corpus) +Reproduce: + cargo run --release --example two_stage_bench -- --dim 1024 --n 50000 --queries 200 --m 256 --k 10 --reps 15 +Host: AMD Ryzen 9 9950X (Zen5), AVX-512 VPOPCNTDQ, single core (taskset -c 12), single-thread. + + dim=1024 n=50000 queries=200 m=256 k=10 bits=2 out_k=10 candidates=51200 reps=15 + 1. stage-1 candidate gen (CSR) 31.920 ms 6265.59 q/s 159.60 us/query + 2. single-query rerank loop 2.086 ms 95858.02 q/s 10.43 us/query + 3. batched rerank _into 2.031 ms 98463.67 q/s 10.16 us/query + 4. 
full two-stage (1+3) 34.485 ms 5799.70 q/s 172.42 us/query + rerank speedup (batched _into vs single-query loop): 1.03x + +Interpretation (no-fiction): at dim=1024 the rerank stage is a small slice +(~10 us/query) of an already-stage-1-dominated two-stage cost (~160 us/query); +the batched _into form is on par with the single-query loop SINGLE-THREADED +(~1.03x). The caller-owned serial primitives are NOT a single-thread speedup — +their value is (a) allocation-free steady state (tests/alloc_free.rs proves 0 +heap allocations on a warmed _into call) and (b) caller-owned parallelism: no +internal rayon, so a DB/runtime can drive the _into form across its own bounded +pool (GIL released) one query-range per worker. This dim=1024 result is its own +mechanism; it is NOT explained by the SignBitmap AVX-tail dim=768 result. diff --git a/examples/two_stage_bench.rs b/examples/two_stage_bench.rs new file mode 100644 index 00000000..23f24966 --- /dev/null +++ b/examples/two_stage_bench.rs @@ -0,0 +1,177 @@ +//! Focused benchmark + integration example for the **caller-owned serial** +//! two-stage path (the integration contract for DBs / runtimes that own their +//! own parallelism). SYNTHETIC corpus — these numbers are a +//! relative decomposition of the serial path on random data, NOT a retrieval- +//! quality or real-corpus claim, and the dim=1024 result is its own mechanism +//! (do not conflate it with the SignBitmap AVX-tail dim=768 result). +//! +//! It decomposes the cost into four separately-timed phases at the Harrier-1024 +//! shape and prints a headline "batched `_into` vs single-query loop" rerank +//! ratio — expect ~1.0x single-threaded; the API's value is alloc-free reuse: +//! 1. stage-1 candidate generation (top_m_candidates_batched_serial_csr) +//! 2. single-query subset rerank loop (search_asymmetric_subset, baseline) +//! 3. batched rerank `_into` (warmed SubsetScratch, caller-owned buffers) +//! 4. full two-stage serial (1 + 3 end to end) +//! +//! 
cargo run --release --example two_stage_bench -- [--dim N] [--n N] +//! [--queries N] [--m N] [--k N] [--bits {1,2,4}] [--reps N] + +use ordvec::{RankQuant, SignBitmap, SubsetScratch}; +use rand::{RngExt, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use std::time::Instant; + +fn median(mut v: Vec<f64>) -> f64 { + v.sort_by(|a, b| a.partial_cmp(b).unwrap()); + v[v.len() / 2] +} + +fn main() { + // Harrier-1024 defaults; all overridable. + let mut dim = 1024usize; + let mut n = 50_000usize; + let mut nq = 200usize; + let mut m = 256usize; + let mut k = 10usize; + let mut bits = 2u8; + let mut reps = 20usize; + let mut args = std::env::args().skip(1); + while let Some(flag) = args.next() { + let mut val = || args.next().expect("flag needs a value").parse().unwrap(); + match flag.as_str() { + "--dim" => dim = val(), + "--n" => n = val(), + "--queries" => nq = val(), + "--m" => m = val(), + "--k" => k = val(), + "--bits" => bits = args.next().unwrap().parse().unwrap(), + "--reps" => reps = val(), + other => { + eprintln!("unknown arg: {other}"); + std::process::exit(2); + } + } + } + assert!(nq > 0 && n > 0 && reps > 0, "n, queries, reps must be > 0"); + + let mut rng = ChaCha8Rng::seed_from_u64(7); + let corpus: Vec<f32> = (0..n * dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + let mut sign = SignBitmap::new(dim); + sign.add(&corpus); + let mut rq = RankQuant::new(dim, bits); + rq.add(&corpus); + let queries: Vec<f32> = (0..nq * dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + drop(corpus); + + let out_k = k.min(rq.len()); + // Caller-owned output buffers, allocated ONCE and reused across batches — + // rectangular nq*out_k, sentinel-padded for underfull rows. + let mut out_scores = vec![f32::NEG_INFINITY; nq * out_k]; + let mut out_indices = vec![-1i64; nq * out_k]; + let mut scratch = SubsetScratch::new(); + + // Warm: build the candidate batch once and warm the scratch to this shape. 
+ let cb = sign.top_m_candidates_batched_serial_csr(&queries, m); + rq.search_asymmetric_subset_batched_serial_into( + &queries, + &cb.offsets, + &cb.candidates, + k, + &mut scratch, + &mut out_scores, + &mut out_indices, + ); + let total_candidates = cb.candidates.len(); + + // Phase 1 — stage-1 candidate generation (serial CSR). + let p1 = median( + (0..reps) + .map(|_| { + let t = Instant::now(); + let c = sign.top_m_candidates_batched_serial_csr(&queries, m); + std::hint::black_box(&c); + t.elapsed().as_secs_f64() + }) + .collect(), + ); + + // Phase 2 — single-query subset rerank loop (the per-query baseline). + let p2 = median( + (0..reps) + .map(|_| { + let t = Instant::now(); + for qi in 0..nq { + let row = &cb.candidates[cb.offsets[qi]..cb.offsets[qi + 1]]; + let r = rq.search_asymmetric_subset(&queries[qi * dim..(qi + 1) * dim], row, k); + std::hint::black_box(&r); + } + t.elapsed().as_secs_f64() + }) + .collect(), + ); + + // Phase 3 — batched `_into` (warmed scratch + reused caller buffers). + let p3 = median( + (0..reps) + .map(|_| { + let t = Instant::now(); + rq.search_asymmetric_subset_batched_serial_into( + &queries, + &cb.offsets, + &cb.candidates, + k, + &mut scratch, + &mut out_scores, + &mut out_indices, + ); + t.elapsed().as_secs_f64() + }) + .collect(), + ); + + // Phase 4 — full two-stage serial (stage-1 gen + batched rerank). 
+ let p4 = median( + (0..reps) + .map(|_| { + let t = Instant::now(); + let c = sign.top_m_candidates_batched_serial_csr(&queries, m); + rq.search_asymmetric_subset_batched_serial_into( + &queries, + &c.offsets, + &c.candidates, + k, + &mut scratch, + &mut out_scores, + &mut out_indices, + ); + t.elapsed().as_secs_f64() + }) + .collect(), + ); + + let row = |label: &str, secs: f64| { + println!( + " {label:<34} {:>9.3} ms {:>10.2} q/s {:>9.2} us/query", + secs * 1e3, + nq as f64 / secs, + secs / nq as f64 * 1e6, + ); + }; + println!("caller-owned serial two-stage (SYNTHETIC corpus)"); + println!( + " dim={dim} n={n} queries={nq} m={m} k={k} bits={bits} out_k={out_k} \ + candidates={total_candidates} reps={reps}" + ); + println!( + " (dim % 64 == {}: AVX-512 tier eligible when supported)", + dim % 64 + ); + row("1. stage-1 candidate gen (CSR)", p1); + row("2. single-query rerank loop", p2); + row("3. batched rerank _into", p3); + row("4. full two-stage (1+3)", p4); + println!( + " rerank speedup (batched _into vs single-query loop): {:.2}x", + p2 / p3 + ); +} diff --git a/src/lib.rs b/src/lib.rs index a3725de0..7dbf76a2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,6 +87,13 @@ pub use sign_bitmap::SignBitmap; #[doc(hidden)] pub use quant::search_asymmetric_byte_lut; +// `subset_rerank_uses_simd` is a test-only dispatch probe used by the crate's +// own SIMD-parity tests. Gated behind the non-default `test-utils` feature and +// excluded from semver guarantees — not a supported downstream API. +#[cfg(feature = "test-utils")] +#[doc(hidden)] +pub use quant::subset_rerank_uses_simd; + // `MultiBucketBitmap` underwrites the bilinear bucket-overlap // decomposition but is not the constant-weight top-bucket theorem surface and // is not stable public API. 
It is reachable only with the `experimental` diff --git a/src/quant.rs b/src/quant.rs index 5c3b2ca8..1d022aea 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -312,6 +312,33 @@ fn select_simd_tier(dim: usize, bits: u8) -> SimdTier { } } +/// Test-only dispatch probe used by the crate's own SIMD-parity tests. Not a +/// supported downstream API; gated behind the non-default `test-utils` feature +/// and excluded from semver guarantees. +/// +/// Returns `true` when the asymmetric subset rerank takes a SIMD kernel (vs the +/// scalar LUT fallback) for a **constructor-valid** `(dim, bits)` on this CPU. +/// The scalar fallback allocates a per-query LUT, so the allocation-free +/// steady-state guarantee of +/// [`RankQuant::search_asymmetric_subset_batched_serial_into`] holds exactly +/// when this is `true`. +/// +/// Returns `false` for any `(dim, bits)` that [`RankQuant::new`] would reject, +/// so it answers "the rerank will take a SIMD kernel" rather than acting as a +/// raw tier probe: a SIMD tier can be selected for a `(dim, bits)` that is not +/// constructor-valid (e.g. `bits = 4` with `dim` a multiple of 8 but not of +/// `2^bits = 16`). +/// +/// It reads the same [`select_simd_tier`] the rerank dispatch reads, so it +/// cannot drift from the actual dispatch. +#[cfg(feature = "test-utils")] +#[doc(hidden)] +#[must_use] +pub fn subset_rerank_uses_simd(dim: usize, bits: u8) -> bool { + RankQuant::validate_params(dim, bits).is_ok() + && !matches!(select_simd_tier(dim, bits), SimdTier::None) +} + impl RankQuant { /// Validate `(dim, bits)` for **code validity** — the precondition for /// generating bucket codes, projections, and asymmetric scores. @@ -1187,9 +1214,11 @@ impl RankQuant { } /// Serial (NO rayon) batched subset rerank into caller-owned buffers. - /// Allocation-free after `scratch` warmup. 
The integration contract for - /// runtimes that own their own parallelism (call this from a bounded pool, - /// with the GIL released, one row range per worker is the caller's choice). + /// Allocation-free after `scratch` warmup **on the SIMD rerank path + /// (AVX-512 / AVX2)**; the scalar fallback allocates a per-query scoring LUT. + /// The integration contract for runtimes that own their own parallelism + /// (call this from a bounded pool, with the GIL released, one row range per + /// worker is the caller's choice). /// /// `queries` is `nq * dim`. Candidates are CSR: `candidate_offsets.len() /// == nq + 1`, row `qi` is `candidates[candidate_offsets[qi]..candidate_offsets[qi+1]]`. @@ -1206,6 +1235,36 @@ impl RankQuant { /// `nq + 1` long, not starting at `0`, non-monotonic, or not ending at /// `candidates.len()`), a row longer than `self.len()`, a candidate id /// `>= self.len()`, a non-finite query value, or a wrong output-buffer length. + /// + /// Buffer sizing differs from the single-query [`Self::search_asymmetric_subset`] + /// (which returns a short `Vec` of `min(k, row_len)`): here the output is a + /// rectangular `nq * out_k` grid, sentinel-padded — size both buffers to + /// `nq * k.min(self.len())`. A too-short buffer trips the fail-loud length + /// assert rather than under-writing; this is a common porting pitfall. + /// + /// # Example + /// ```no_run + /// use ordvec::{RankQuant, SignBitmap, SubsetScratch}; + /// # let (dim, k, m) = (1024usize, 10usize, 256usize); + /// let sign = SignBitmap::new(dim); + /// let rq = RankQuant::new(dim, 2); + /// # let queries = vec![0.0f32; dim * 64]; + /// let nq = queries.len() / dim; + /// let out_k = k.min(rq.len()); + /// // Allocate scratch + output buffers ONCE; reuse across batches. 
+ /// let mut scratch = SubsetScratch::new(); + /// let mut out_scores = vec![f32::NEG_INFINITY; nq * out_k]; + /// let mut out_indices = vec![-1i64; nq * out_k]; + /// let cb = sign.top_m_candidates_batched_serial_csr(&queries, m); + /// rq.search_asymmetric_subset_batched_serial_into( + /// &queries, &cb.offsets, &cb.candidates, k, + /// &mut scratch, &mut out_scores, &mut out_indices, + /// ); + /// // Query qi's top-k is out_indices[qi*out_k..(qi+1)*out_k] (sentinel-padded). + /// // Reuse scratch + buffers for the next batch — no further allocation once + /// // scratch has warmed to this shape (NO internal rayon: drive this from + /// // your own pool, one query-range per worker). + /// ``` #[allow(clippy::too_many_arguments)] // arity is intrinsic to the caller-owned buffered contract (CSR inputs + scratch + two output buffers) pub fn search_asymmetric_subset_batched_serial_into( &self, diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index 04081ab3..4f1ce09e 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -307,6 +307,20 @@ impl SignBitmap { /// [`Self::top_m_candidates`] (which materialises a per-query `n` Hamming /// row). A future release may replace the internals with streaming top-m /// behind this frozen signature; the CSR output contract will not change. + /// + /// # Example + /// ```no_run + /// use ordvec::SignBitmap; + /// # let (dim, m) = (1024usize, 256usize); + /// let sign = SignBitmap::new(dim); + /// # let queries = vec![0.0f32; dim * 64]; + /// let cb = sign.top_m_candidates_batched_serial_csr(&queries, m); + /// // CSR: query qi's candidate row is + /// // `cb.candidates[cb.offsets[qi]..cb.offsets[qi + 1]]`. Pass `cb.offsets` + /// // and `cb.candidates` straight into + /// // `RankQuant::search_asymmetric_subset_batched_serial_into`. 
+ /// let _row0 = &cb.candidates[cb.offsets[0]..cb.offsets[1]]; + /// ``` #[must_use = "this scans the corpus per query to generate candidates; dropping the result discards that work"] pub fn top_m_candidates_batched_serial_csr(&self, queries: &[f32], m: usize) -> CandidateBatch { let dim = self.dim; diff --git a/tests/alloc_free.rs b/tests/alloc_free.rs new file mode 100644 index 00000000..66d13a84 --- /dev/null +++ b/tests/alloc_free.rs @@ -0,0 +1,121 @@ +//! Counting-allocator proof that the caller-owned batched rerank `_into` form +//! performs ZERO heap allocations in steady state — i.e. after the +//! `SubsetScratch` has warmed to the batch shape. This is the strong form of +//! the capacity-stability proxy in `tests/index/two_stage.rs` +//! (`batched_into_is_allocation_free_after_warmup`): a capacity check can miss +//! an alloc-then-free-to-same-capacity, an allocation counter cannot. +//! +//! Lives in its own test binary so the `#[global_allocator]` only governs this +//! file's measurement and never perturbs the rest of the suite. + +use std::alloc::{GlobalAlloc, Layout, System}; +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[cfg(feature = "test-utils")] +use ordvec::{RankQuant, SignBitmap, SubsetScratch}; +#[cfg(feature = "test-utils")] +use rand::{RngExt, SeedableRng}; +#[cfg(feature = "test-utils")] +use rand_chacha::ChaCha8Rng; + +static ALLOCS: AtomicUsize = AtomicUsize::new(0); + +/// System allocator that counts allocating operations (alloc / zeroed / +/// realloc). Dealloc is not counted — we assert on *allocations* in a window. +struct Counting; + +// SAFETY: every method forwards to the System allocator with the identical +// pointer/layout, only incrementing a relaxed counter first; this preserves +// all of System's safety contract. 
+unsafe impl GlobalAlloc for Counting { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + ALLOCS.fetch_add(1, Ordering::Relaxed); + unsafe { System.alloc(layout) } + } + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + unsafe { System.dealloc(ptr, layout) } + } + unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 { + ALLOCS.fetch_add(1, Ordering::Relaxed); + unsafe { System.realloc(ptr, layout, new_size) } + } + unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 { + ALLOCS.fetch_add(1, Ordering::Relaxed); + unsafe { System.alloc_zeroed(layout) } + } +} + +#[global_allocator] +static GLOBAL: Counting = Counting; + +#[cfg(feature = "test-utils")] +#[test] +fn batched_into_is_truly_allocation_free_after_warmup() { + let dim = 128usize; + let n = 2_000usize; + let nq = 8usize; + let m = 64usize; + let k = 10usize; + let bits = 2u8; + + // The zero-allocation guarantee holds only when the rerank takes a SIMD + // kernel: the scalar LUT fallback (`scan_via_lut_scalar`) allocates a + // per-query LUT. Gate on the SAME dispatch decision the rerank reads — via + // `subset_rerank_uses_simd`, so the gate cannot drift from the actual + // dispatch — and skip the strict check on hosts that fall to scalar + // (aarch64, or x86 without AVX2+FMA / AVX-512). 
+ if !ordvec::subset_rerank_uses_simd(dim, bits) { + eprintln!( + "alloc_free: rerank uses the scalar LUT fallback for \ + (dim={dim}, bits={bits}) — it allocates a per-query LUT; \ + skipping strict zero-alloc check" + ); + return; + } + + let mut rng = ChaCha8Rng::seed_from_u64(2024); + let corpus: Vec<f32> = (0..n * dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + let mut sign = SignBitmap::new(dim); + sign.add(&corpus); + let mut rq = RankQuant::new(dim, bits); + rq.add(&corpus); + let queries: Vec<f32> = (0..nq * dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + + let cb = sign.top_m_candidates_batched_serial_csr(&queries, m); + let out_k = k.min(rq.len()); + let mut out_scores = vec![f32::NEG_INFINITY; nq * out_k]; + let mut out_indices = vec![-1i64; nq * out_k]; + let mut scratch = SubsetScratch::new(); + + // Warm the scratch to this exact batch shape. + rq.search_asymmetric_subset_batched_serial_into( + &queries, + &cb.offsets, + &cb.candidates, + k, + &mut scratch, + &mut out_scores, + &mut out_indices, + ); + + // Steady state: an identical second call (same shape, warmed scratch, + // caller-owned output buffers reused) must allocate nothing. + let before = ALLOCS.load(Ordering::Relaxed); + rq.search_asymmetric_subset_batched_serial_into( + &queries, + &cb.offsets, + &cb.candidates, + k, + &mut scratch, + &mut out_scores, + &mut out_indices, + ); + let after = ALLOCS.load(Ordering::Relaxed); + + assert_eq!( + after - before, + 0, + "steady-state _into allocated {} time(s) (expected 0)", + after - before + ); +} diff --git a/tests/index/two_stage.rs b/tests/index/two_stage.rs index b638da66..664c44c6 100644 --- a/tests/index/two_stage.rs +++ b/tests/index/two_stage.rs @@ -746,3 +746,289 @@ fn batched_subset_rerank_matches_scalar_reference_across_tiers() { } } } + +// --------------------------------------------------------------------------- +// Rejection-path regression net for the caller-owned batched rerank. 
+// +// validate_csr_batch + the _into query/buffer asserts run BEFORE any unsafe +// gather (see the memory-safety contract on +// search_asymmetric_subset_batched_serial_into). These pin that every guard +// fires, so a malformed-but-accepted CSR/query can never reach the SIMD scan. +// (batched_into_rejects_bad_offsets_len + batched_into_rejects_oob_candidate +// above already cover the offsets-length and candidate-id guards.) +// --------------------------------------------------------------------------- + +#[test] +#[should_panic(expected = "exceeds n_vectors")] +fn batched_into_rejects_overlong_row() { + // A row longer than the corpus would over-size the gather scratch; this is + // the guard that bounds the unsafe gather. One row of N+1 in-range ids. + let (_sign, rq, _corpus) = build_two_stage(2); + let q = make_corpus(8_101)[..D].to_vec(); + let candidates = vec![0u32; N + 1]; + let offsets = vec![0usize, N + 1]; + let mut s = vec![0.0f32; 3]; + let mut i = vec![0i64; 3]; + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &q, + &offsets, + &candidates, + 3, + &mut scratch, + &mut s, + &mut i, + ); +} + +#[test] +#[should_panic(expected = "monotonic")] +fn batched_into_rejects_nonmonotonic_offsets() { + let (_sign, rq, _corpus) = build_two_stage(2); + let q = make_corpus(8_102)[..2 * D].to_vec(); + let candidates = vec![0u32; 3]; // last offset (3) == candidates.len() + let offsets = vec![0usize, 5, 3]; // window [5,3] is non-monotonic + let mut s = vec![0.0f32; 2 * 3]; + let mut i = vec![0i64; 2 * 3]; + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &q, + &offsets, + &candidates, + 3, + &mut scratch, + &mut s, + &mut i, + ); +} + +#[test] +#[should_panic(expected = "must equal candidates.len()")] +fn batched_into_rejects_wrong_final_offset() { + let (_sign, rq, _corpus) = build_two_stage(2); + let q = make_corpus(8_103)[..D].to_vec(); + let candidates 
= vec![0u32; 2]; // len 2, but final offset says 3 + let offsets = vec![0usize, 3]; + let mut s = vec![0.0f32; 3]; + let mut i = vec![0i64; 3]; + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &q, + &offsets, + &candidates, + 3, + &mut scratch, + &mut s, + &mut i, + ); +} + +#[test] +#[should_panic(expected = "must be 0")] +fn batched_into_rejects_nonzero_first_offset() { + let (_sign, rq, _corpus) = build_two_stage(2); + let q = make_corpus(8_104)[..D].to_vec(); + let candidates: Vec<u32> = Vec::new(); + let offsets = vec![1usize, 1]; // offsets[0] != 0 + let mut s = vec![0.0f32; 3]; + let mut i = vec![0i64; 3]; + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &q, + &offsets, + &candidates, + 3, + &mut scratch, + &mut s, + &mut i, + ); +} + +#[test] +#[should_panic] +fn batched_into_rejects_nonfinite_query() { + // A non-finite query value is rejected before the SIMD scan it would feed. 
+ let (_sign, rq, _corpus) = build_two_stage(2); + let mut q = make_corpus(8_105)[..D].to_vec(); + q[0] = f32::NAN; + let mut s = vec![0.0f32; 3]; + let mut i = vec![0i64; 3]; + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &q, + &[0, 1], + &[0u32], + 3, + &mut scratch, + &mut s, + &mut i, + ); +} + +#[test] +#[should_panic] +fn serial_csr_rejects_nonfinite_query() { + let (sign, _rq, _corpus) = build_two_stage(2); + let mut q = make_corpus(8_106)[..D].to_vec(); + q[1] = f32::INFINITY; + let _ = sign.top_m_candidates_batched_serial_csr(&q, 16); +} + +#[test] +#[should_panic(expected = "must be a multiple of dim")] +fn batched_into_rejects_ragged_query() { + let (_sign, rq, _corpus) = build_two_stage(2); + let q = make_corpus(8_107)[..D + 1].to_vec(); // not a multiple of dim + let mut s = vec![0.0f32; 3]; + let mut i = vec![0i64; 3]; + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &q, + &[0, 0], + &[], + 3, + &mut scratch, + &mut s, + &mut i, + ); +} + +#[test] +#[should_panic(expected = "must be a multiple of dim")] +fn batched_serial_wrapper_rejects_ragged_query() { + let (_sign, rq, _corpus) = build_two_stage(2); + let q = make_corpus(8_108)[..D + 1].to_vec(); + let _ = rq.search_asymmetric_subset_batched_serial(&q, &[0, 0], &[], 3); +} + +#[test] +#[should_panic(expected = "out_scores length")] +fn batched_into_rejects_short_out_scores() { + let (_sign, rq, _corpus) = build_two_stage(2); + let q = make_corpus(8_109)[..D].to_vec(); + let out_k = 3usize.min(N); + let mut s = vec![0.0f32; out_k - 1]; // one short + let mut i = vec![0i64; out_k]; + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &q, + &[0, 1], + &[0u32], + 3, + &mut scratch, + &mut s, + &mut i, + ); +} + +#[test] +#[should_panic(expected = "out_indices length")] +fn batched_into_rejects_short_out_indices() { + let (_sign, rq, _corpus) = 
build_two_stage(2); + let q = make_corpus(8_110)[..D].to_vec(); + let out_k = 3usize.min(N); + let mut s = vec![0.0f32; out_k]; + let mut i = vec![0i64; out_k - 1]; // one short + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &q, + &[0, 1], + &[0u32], + 3, + &mut scratch, + &mut s, + &mut i, + ); +} + +#[test] +fn batched_into_pads_mixed_full_and_underfull_rows() { + // One batch mixing a full row (len > k), an underfull row (0 < len < k), + // and an empty row. Each row's filled slots must equal the single-query + // reference; trailing slots are sentinel-padded (-1 / NEG_INFINITY). + let (sign, rq, _corpus) = build_two_stage(2); + let k = 5usize; + let out_k = k.min(N); + let queries = make_corpus(8_200)[..3 * D].to_vec(); + let q0 = &queries[0..D]; + let q1 = &queries[D..2 * D]; + let full_row = sign.top_m_candidates(q0, k + 4); // > k + let underfull_row: Vec<u32> = sign.top_m_candidates(q1, 2); // 0 < len < k + let empty_row: Vec<u32> = Vec::new(); + let rows = vec![full_row, underfull_row, empty_row]; + let (cand, off) = flatten_to_csr(&rows); + + let mut scores = vec![0.0f32; 3 * out_k]; + let mut indices = vec![0i64; 3 * out_k]; + let mut scratch = ordvec::SubsetScratch::new(); + rq.search_asymmetric_subset_batched_serial_into( + &queries, + &off, + &cand, + k, + &mut scratch, + &mut scores, + &mut indices, + ); + + for qi in 0..3 { + let (es, ei) = rq.search_asymmetric_subset(&queries[qi * D..(qi + 1) * D], &rows[qi], k); + for slot in 0..out_k { + if slot < ei.len() { + assert_eq!( + indices[qi * out_k + slot], + ei[slot], + "row{qi} slot{slot} id" + ); + assert_eq!( + scores[qi * out_k + slot], + es[slot], + "row{qi} slot{slot} score" + ); + } else { + assert_eq!( + indices[qi * out_k + slot], + -1, + "row{qi} slot{slot} sentinel id" + ); + assert_eq!( + scores[qi * out_k + slot], + f32::NEG_INFINITY, + "row{qi} slot{slot} sentinel score" + ); + } + } + } +} + +#[cfg(feature = "test-utils")] +#[test] +fn 
subset_rerank_uses_simd_is_false_for_constructor_invalid_params() { + // The dispatch probe must answer "the rerank will take a SIMD kernel" only + // for `(dim, bits)` that `RankQuant::new` accepts. A SIMD tier can be + // selected for params the constructor rejects — `bits = 4` needs + // `dim % 2^bits == 0`, but the AVX2 b=4 lane invariant is only + // `dim % 8 == 0` — so the probe must not claim SIMD for an index you + // cannot even build. + for &(dim, bits) in &[(8usize, 4u8), (24, 4), (3, 1), (6, 4)] { + assert!( + RankQuant::validate_params(dim, bits).is_err(), + "fixture ({dim},{bits}) must be constructor-invalid" + ); + assert!( + !ordvec::subset_rerank_uses_simd(dim, bits), + "probe must be false for constructor-invalid ({dim},{bits})" + ); + } + // Invariant: wherever the probe claims SIMD, the params must be buildable. + for dim in 2..=130usize { + for bits in [1u8, 2, 4] { + if ordvec::subset_rerank_uses_simd(dim, bits) { + assert!( + RankQuant::validate_params(dim, bits).is_ok(), + "probe claimed SIMD for constructor-invalid ({dim},{bits})" + ); + } + } + } +}