diff --git a/examples/bench_contingency.rs b/examples/bench_contingency.rs new file mode 100644 index 00000000..d2f2a8ab --- /dev/null +++ b/examples/bench_contingency.rs @@ -0,0 +1,497 @@ +//! Profiling harness for the contingency / projection surface (issue #219), +//! on a SYNTHETIC seeded corpus — NOT real data, fully reproducible. +//! +//! Two APIs, three regimes: +//! +//! * API 1 — stateless dense [`ordvec::Contingency`]: the `nb × nb` +//! bucket-overlap table for two `&[u8]` code slices, plus its named +//! projections. +//! * API 2 — the indexed [`ordvec::MultiBucketBitmap`] twin: +//! `contingency_row` / `diagonal_overlap_row` / `project_all_batched` +//! accumulate the same table straight from the stored bucket bitmaps. +//! +//! Regimes (issue #219 acceptance): +//! +//! (a) ONE PAIR / MANY PROJECTIONS — build the dense `Contingency` table +//! once, then apply all ~7 projections off the cached integer table, +//! vs naively recomputing each projection from the raw `&[u8]` codes +//! every time. Shows the build-once-project-many win. +//! +//! (b) ONE QUERY / MANY DOCS / ONE PROJECTION — the indexed diagonal fast +//! path over the whole corpus (dispatched = SIMD-when-available, and a +//! forced-scalar twin), vs the dense pairwise loop that rebuilds the +//! full `nb × nb` table per doc just to read its diagonal. +//! +//! (c) ONE QUERY / MANY DOCS / MANY PROJECTIONS — the indexed batched +//! `project_all_batched` (one table per doc, K projections off it), vs +//! calling the single-projection indexed path K times (one corpus +//! rescan per projection). Shows the no-rescan win, dispatched vs +//! forced-scalar. +//! +//! Timing is median-of-N wall clock (same discipline as +//! `examples/bench_retrieval.rs`). Every number is SYNTHETIC. +//! +//! NOTE on SIMD: the indexed kernels dispatch to an AVX-512 VPOPCNTDQ +//! popcount-AND path only when the host advertises `avx512f` AND +//! `avx512vpopcntdq` (the dense API-1 kernel additionally needs `avx512bw`). +//! On a host without VPOPCNTDQ the "dispatched" column runs the SAME scalar +//! kernel as the forced-scalar column, so the SIMD/scalar ratio is ~1.0 — +//! that is the honest result, not a bug. The header prints the host features +//! so the ratio can be read in context. +//! +//! Run: cargo run --release --features experimental --example bench_contingency + +#[cfg(not(feature = "experimental"))] +fn main() { + eprintln!( + "bench_contingency requires --features experimental (Contingency / \ + MultiBucketBitmap). Re-run: cargo run --release --features experimental \ + --example bench_contingency" + ); +} + +#[cfg(feature = "experimental")] +fn main() { + bench::run(); +} + +#[cfg(feature = "experimental")] +mod bench { + use std::time::Instant; + + use ordvec::{Contingency, MultiBucketBitmap, Projection}; + use rand::{RngExt, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + // ---- timing ----------------------------------------------------------- + + /// Median wall-clock seconds over `reps` timed runs (one warmup first), + /// each run doing `iters` repetitions of `f`. Returns seconds PER `f` + /// call (total / iters), so callers report cost of a single operation. + fn median_secs(reps: usize, iters: usize, mut f: impl FnMut()) -> f64 { + // warmup + for _ in 0..iters { + f(); + } + let mut lats = Vec::with_capacity(reps); + for _ in 0..reps { + let t = Instant::now(); + for _ in 0..iters { + f(); + } + lats.push(t.elapsed().as_secs_f64() / iters as f64); + } + lats.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + lats[lats.len() / 2] + } + + // ---- corpus ----------------------------------------------------------- + + fn gauss(rng: &mut ChaCha8Rng) -> f32 { + let u1: f32 = rng.random_range(1e-9..1.0); + let u2: f32 = rng.random_range(0.0..1.0); + (-2.0 * u1.ln()).sqrt() * (std::f32::consts::TAU * u2).cos() + } + + /// Clustered gaussian corpus + a single query, seeded for reproducibility. + /// Each doc/query draws a cluster prototype plus noise (queries quieter), + /// so bucket assignments correlate the way real ordinal codes would — + /// pure iid noise would make every contingency table look the same. + fn make_data(n: usize, dim: usize, n_clusters: usize, seed: u64) -> (Vec, Vec) { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let mut protos = vec![0.0f32; n_clusters * dim]; + for x in protos.iter_mut() { + *x = gauss(&mut rng); + } + let emit = |count: usize, noise: f32, rng: &mut ChaCha8Rng| -> Vec { + let mut out = vec![0.0f32; count * dim]; + for i in 0..count { + let c = rng.random_range(0..n_clusters as u32) as usize; + for j in 0..dim { + out[i * dim + j] = protos[c * dim + j] + noise * gauss(rng); + } + } + out + }; + let corpus = emit(n, 1.0, &mut rng); + let query = emit(1, 0.5, &mut rng); + (corpus, query) + } + + /// Reconstruct the per-coordinate bucket codes (`0..nb`) a doc was encoded + /// with by reading back the index's stored bitmaps via the indexed + /// contingency: feeding a one-hot "query" for bucket `b` recovers, per + /// coordinate, whether the doc sits in bucket `b`. Simpler here: rebuild + /// the codes from `query_bitmaps_from_ranks` of the raw float vector, which + /// is exactly the encoding `add` applies. + fn codes_from_bitmaps(bitmaps: &[u64], nb: usize, qpb: usize, dim: usize) -> Vec { + let mut codes = vec![0u8; dim]; + for b in 0..nb { + let off = b * qpb; + for (j, code) in codes.iter_mut().enumerate() { + if (bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 { + *code = b as u8; + } + } + } + codes + } + + // ---- the ~7 projections, as both named ops (API 1) and weight matrices -- + + /// The named-projection battery used in regime (a): the full surface a + /// caller might evaluate off one table (top-overlap, top-group, diagonal, + /// two bands, L1 transport, centred outer product). + fn named_projections(nb: usize) -> Vec { + let mut p = vec![ + Projection::TopOverlap, + Projection::TopGroupOverlap { width: 2 }, + Projection::DiagonalAgreement, + Projection::BandAgreement { radius: 1 }, + Projection::BucketL1Distance, + Projection::RankQuantSymmetric, + ]; + if nb > 4 { + p.push(Projection::BandAgreement { radius: 2 }); + } + p + } + + /// The same battery expressed as raw `nb × nb` weight matrices, for the + /// indexed `project_all_batched` path in regime (c). (TopOverlap, + /// TopGroup, diagonal, band1, band2, outer product — six dense matrices; + /// L1 distance is also expressible as a weight matrix `|a-b|`.) + fn weight_matrices(nb: usize) -> Vec> { + let c = (nb as f32 - 1.0) / 2.0; + let mut out: Vec> = Vec::new(); + + // top-overlap: single cell (nb-1, nb-1) + let mut w = vec![0.0f32; nb * nb]; + w[(nb - 1) * nb + (nb - 1)] = 1.0; + out.push(w); + + // top-group width 2 + let mut w = vec![0.0f32; nb * nb]; + let start = nb - 2; + for a in start..nb { + for b in start..nb { + w[a * nb + b] = 1.0; + } + } + out.push(w); + + // diagonal + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + w[a * nb + a] = 1.0; + } + out.push(w); + + // band radius 1 (unit weights) + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + if a.abs_diff(b) <= 1 { + w[a * nb + b] = 1.0; + } + } + } + out.push(w); + + // bucket-L1 transport cost |a-b| + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + w[a * nb + b] = a.abs_diff(b) as f32; + } + } + out.push(w); + + // centred outer product (a-c)(b-c) + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + w[a * nb + b] = (a as f32 - c) * (b as f32 - c); + } + } + out.push(w); + + if nb > 4 { + // band radius 2 + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + if a.abs_diff(b) <= 2 { + w[a * nb + b] = 1.0; + } + } + } + out.push(w); + } + out + } + + pub fn run() { + #[cfg(target_arch = "x86_64")] + { + println!( + "# host SIMD: avx2={} avx512f={} avx512bw={} avx512vpopcntdq={}", + is_x86_feature_detected!("avx2"), + is_x86_feature_detected!("avx512f"), + is_x86_feature_detected!("avx512bw"), + is_x86_feature_detected!("avx512vpopcntdq"), + ); + if !is_x86_feature_detected!("avx512vpopcntdq") { + println!( + "# NOTE: no avx512vpopcntdq on this host — the 'dispatched' \ + and 'scalar' columns run the SAME kernel; ratio ~1.0 is honest." + ); + } + } + #[cfg(not(target_arch = "x86_64"))] + println!("# host SIMD: non-x86_64 target — scalar kernels only"); + + println!("# bench_contingency — SYNTHETIC clustered gaussian corpus (not real data)"); + + for &bits in &[2u8, 4u8] { + run_bits(bits); + println!(); + } + } + + fn run_bits(bits: u8) { + let dim = 1024usize; + let n = 20_000usize; + let n_clusters = 64usize; + let seed = 219u64; + let nb = 1usize << bits; + let qpb = dim / 64; + + println!("# ===== bits={bits} (nb={nb}) dim={dim} n={n} clusters={n_clusters} seed={seed} ====="); + + let (corpus, query) = make_data(n, dim, n_clusters, seed); + + // Build the indexed surface (API 2). + let mut idx = MultiBucketBitmap::new(dim, bits); + idx.add(&corpus); + let q_bitmaps = idx.query_bitmaps_from_ranks(&query); + + // Per-coord codes for API 1 (dense). `query_bitmaps_from_ranks` applies + // the exact same rank→bucket encoding `add` uses, so re-encoding doc 0's + // float vector and the query recovers the real codes the index holds. + let q_codes = codes_from_bitmaps(&q_bitmaps, nb, qpb, dim); + let doc0 = &corpus[0..dim]; + let d0_bitmaps = idx.query_bitmaps_from_ranks(doc0); + let d_codes = codes_from_bitmaps(&d0_bitmaps, nb, qpb, dim); + + // ---- regime (a): ONE PAIR / MANY PROJECTIONS (dense API 1) -------- + regime_a(&q_codes, &d_codes, nb); + + // ---- regime (b): ONE QUERY / MANY DOCS / ONE PROJECTION ---------- + regime_b(&idx, &q_bitmaps, n, nb); + + // ---- regime (c): ONE QUERY / MANY DOCS / MANY PROJECTIONS -------- + regime_c(&idx, &q_bitmaps, nb); + } + + /// (a) Build the dense `Contingency` once, apply all projections off the + /// cached table — vs recomputing each projection from the raw codes (build + /// a fresh table per projection). Single pair, repeated for timing. + fn regime_a(q_codes: &[u8], d_codes: &[u8], nb: usize) { + let projs = named_projections(nb); + let n_proj = projs.len(); + + // Build-once-project-many: one table, then all projections. + let build_once = median_secs(25, 200, || { + let c = Contingency::new(q_codes, d_codes, nb).unwrap(); + let mut acc = 0.0f32; + for p in &projs { + acc += p.score(&c); + } + std::hint::black_box(acc); + }); + + // Naive: rebuild the table from raw codes for EVERY projection. + let rebuild_each = median_secs(25, 200, || { + let mut acc = 0.0f32; + for p in &projs { + let c = Contingency::new(q_codes, d_codes, nb).unwrap(); + acc += p.score(&c); + } + std::hint::black_box(acc); + }); + + let speedup = rebuild_each / build_once; + println!("# (a) ONE PAIR / {n_proj} PROJECTIONS — dense Contingency (API 1)"); + println!("{:<34} {:>14} {:>14}", "approach", "us/pair", "speedup"); + println!( + "{:<34} {:>14.3} {:>14}", + "build-once, project-many", + build_once * 1e6, + "1.00x (ref)" + ); + println!( + "{:<34} {:>14.3} {:>13.2}x", + "rebuild-per-projection", + rebuild_each * 1e6, + speedup + ); + println!( + "DATA\ta\tbits={}\tbuild_once_us={:.3}\trebuild_each_us={:.3}\tn_proj={}\tspeedup={:.2}", + (nb as f32).log2() as u32, + build_once * 1e6, + rebuild_each * 1e6, + n_proj, + speedup + ); + } + + /// (b) ONE QUERY / MANY DOCS / ONE PROJECTION — indexed diagonal fast path + /// (dispatched + forced-scalar) over the corpus, vs the dense pairwise + /// loop (full table per doc, read its diagonal). + fn regime_b(idx: &MultiBucketBitmap, q_bitmaps: &[u64], n: usize, nb: usize) { + // dispatched diagonal fast path over every doc + let diag_dispatched = median_secs(7, 3, || { + let mut acc = 0u64; + for di in 0..n { + let d = idx.diagonal_overlap_row(q_bitmaps, di); + acc += d.iter().map(|&x| x as u64).sum::(); + } + std::hint::black_box(acc); + }); + + // forced-scalar diagonal fast path + let diag_scalar = median_secs(7, 3, || { + let mut acc = 0u64; + for di in 0..n { + let d = idx.diagonal_overlap_row_scalar(q_bitmaps, di); + acc += d.iter().map(|&x| x as u64).sum::(); + } + std::hint::black_box(acc); + }); + + // dense pairwise loop: build the FULL nb*nb table per doc (dispatched + // contingency_row), then take its diagonal trace. This is the work the + // diagonal fast path avoids (nb passes vs nb*nb). + let dense_full = median_secs(7, 3, || { + let mut acc = 0u64; + for di in 0..n { + let table = idx.contingency_row(q_bitmaps, di); + for a in 0..nb { + acc += table[a * nb + a] as u64; + } + } + std::hint::black_box(acc); + }); + + let simd_speedup = diag_scalar / diag_dispatched; + let fastpath_speedup = dense_full / diag_dispatched; + + let per = |s: f64| s / n as f64 * 1e9; // ns per doc + println!("# (b) ONE QUERY / {n} DOCS / ONE PROJECTION (diagonal) — indexed (API 2)"); + println!( + "{:<34} {:>14} {:>16}", + "approach", "ns/doc", "vs dispatched" + ); + println!( + "{:<34} {:>14.2} {:>16}", + "diagonal fast path (dispatched)", + per(diag_dispatched), + "1.00x (ref)" + ); + println!( + "{:<34} {:>14.2} {:>15.2}x", + "diagonal fast path (scalar)", + per(diag_scalar), + simd_speedup + ); + println!( + "{:<34} {:>14.2} {:>15.2}x", + "dense full-table diagonal", + per(dense_full), + fastpath_speedup + ); + println!( + "DATA\tb\tbits={}\tdiag_disp_ns={:.2}\tdiag_scalar_ns={:.2}\tdense_full_ns={:.2}\tsimd_speedup={:.2}\tfastpath_speedup={:.2}", + (nb as f32).log2() as u32, + per(diag_dispatched), + per(diag_scalar), + per(dense_full), + simd_speedup, + fastpath_speedup + ); + } + + /// (c) ONE QUERY / MANY DOCS / MANY PROJECTIONS — batched + /// `project_all_batched` (table once, K projections off it) vs calling the + /// single-projection path K times (one corpus rescan per projection). + /// Both dispatched and forced-scalar. + fn regime_c(idx: &MultiBucketBitmap, q_bitmaps: &[u64], nb: usize) { + let mats = weight_matrices(nb); + let k = mats.len(); + let weights: Vec<&[f32]> = mats.iter().map(|w| w.as_slice()).collect(); + let n = idx.len(); + + // batched dispatched: one table per doc, all K projections off it. + let batched_disp = median_secs(7, 3, || { + let out = idx.project_all_batched(q_bitmaps, &weights); + std::hint::black_box(out.len()); + }); + + // batched forced-scalar. + let batched_scalar = median_secs(7, 3, || { + let out = idx.project_all_batched_scalar(q_bitmaps, &weights); + std::hint::black_box(out.len()); + }); + + // rescan: call the single-projection batched path K times (each call + // rescans the whole corpus building a fresh table per doc). Dispatched. + let rescan_disp = median_secs(7, 3, || { + let mut acc = 0usize; + for w in &weights { + let single: [&[f32]; 1] = [w]; + let out = idx.project_all_batched(q_bitmaps, &single); + acc += out.len(); + } + std::hint::black_box(acc); + }); + + let norescan_speedup = rescan_disp / batched_disp; + let simd_speedup = batched_scalar / batched_disp; + + let per = |s: f64| s / n as f64 * 1e9; // ns per doc (over all K) + println!("# (c) ONE QUERY / {n} DOCS / {k} PROJECTIONS — indexed batched (API 2)"); + println!( + "{:<38} {:>14} {:>16}", + "approach", "ns/doc(allK)", "vs batched" + ); + println!( + "{:<38} {:>14.2} {:>16}", + "project_all_batched (dispatched)", + per(batched_disp), + "1.00x (ref)" + ); + println!( + "{:<38} {:>14.2} {:>15.2}x", + "project_all_batched (scalar)", + per(batched_scalar), + simd_speedup + ); + println!( + "{:<38} {:>14.2} {:>15.2}x", + "rescan: single-proj x K (dispatched)", + per(rescan_disp), + norescan_speedup + ); + println!( + "DATA\tc\tbits={}\tbatched_disp_ns={:.2}\tbatched_scalar_ns={:.2}\trescan_disp_ns={:.2}\tk={}\tnorescan_speedup={:.2}\tsimd_speedup={:.2}", + (nb as f32).log2() as u32, + per(batched_disp), + per(batched_scalar), + per(rescan_disp), + k, + norescan_speedup, + simd_speedup + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index b912a100..1daa7f39 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,14 @@ //! coordinate, set when the coordinate is positive) for sign-cosine //! candidate generation. //! +//! These four families are the retrieval surface. The `experimental` +//! `MultiBucketBitmap` indexed contingency / projection API is a niche +//! research/analysis substrate for the bilinear bucket-overlap decomposition — +//! it is **not** a default single-score retrieval path and was never +//! kernel-optimized for that role. For primary nearest-neighbour retrieval use +//! [`RankQuant`], [`Bitmap`], or the two-stage candidate-generation → rerank +//! flow instead. +//! //! The [`Bitmap`] candidate score is the implementation surface with the //! strongest formal story: in the companion Lean formalization, literal //! constant-weight bitmap overlap is the query-preserving quotient statistic, diff --git a/src/multi_bucket.rs b/src/multi_bucket.rs index 0b4a2d9c..a375a363 100644 --- a/src/multi_bucket.rs +++ b/src/multi_bucket.rs @@ -1,5 +1,25 @@ //! MultiBucketBitmap: `2^bits` bitmaps per document, one per bucket. //! +//! **Stability — not a default surface.** [`MultiBucketBitmap`] and the indexed +//! contingency / projection kernels built on it are gated behind the +//! **non-default `experimental` Cargo feature**. They are a research scaffold, +//! **not** part of the stable public API, and are **excluded from semver +//! guarantees** — they may change or be removed without a major-version bump. +//! The stable, semver-covered bucket-overlap surface is the stateless dense +//! `Contingency` / `Projection` API; reach for this indexed path only when you +//! have explicitly opted into `experimental`. +//! +//! **Positioning — not a primary retrieval path.** `MultiBucketBitmap` is a +//! niche research/analysis substrate for the bilinear bucket-overlap +//! decomposition; it is **not** the default single-score retrieval path and was +//! never kernel-optimized for that role. For primary nearest-neighbour +//! retrieval, use the headline paths instead — [`crate::RankQuant`] (symmetric +//! and asymmetric float-query scoring), [`crate::Bitmap`] (top-bucket +//! `popcount(Q AND D)` candidate scoring), and the two-stage +//! candidate-generation → rerank flow. Reach for this indexed contingency / +//! projection surface only to *analyze* bucket-overlap structure, never as a +//! primary retrieval index. +//! //! Represents the constant-composition bucket assignment of each //! document explicitly as a set of `2^bits` disjoint bitmaps over the //! `dim` coordinates. The bilinear bucket-overlap score @@ -131,6 +151,42 @@ impl MultiBucketBitmap { w } + /// Unit-diagonal weight matrix: `W[a, a] = 1`, off-diagonal `0`. The + /// bilinear score reduces to `Σ_a |Q_a ∩ D_a|` — the same-bucket + /// agreement count. This is the cheapest truncation: `nb` popcount-AND + /// passes per doc instead of the full `nb²`. It is a pure overlap signal + /// (no bucket-magnitude weighting), so it is closest in spirit to a + /// multi-level [`crate::Bitmap`]. + pub fn diagonal_weights(&self) -> Vec { + let nb = self.n_buckets; + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + w[a * nb + a] = 1.0; + } + w + } + + /// Outer-product weights `(a − c)(b − c)` restricted to the band + /// `|a − b| <= half_width` (off-band entries zeroed). `half_width = 0` + /// keeps only the magnitude-weighted diagonal; `half_width >= nb − 1` + /// recovers the full [`Self::outer_product_weights`] matrix. Sweeping + /// `half_width` interpolates candidate-gen cost (non-zero band entries ⇒ + /// popcount-AND passes per doc) between the diagonal and the exact + /// bilinear probe, tracing the recall/latency frontier. + pub fn banded_weights(&self, half_width: usize) -> Vec { + let nb = self.n_buckets; + let c = (nb as f32 - 1.0) / 2.0; + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + if a.abs_diff(b) <= half_width { + w[a * nb + b] = (a as f32 - c) * (b as f32 - c); + } + } + } + w + } + /// Compute the bilinear bucket-overlap score /// `Σ_{a, b} W[a, b] · |Q_a ∩ D_b|` /// for a single (query, doc) pair. Scales nothing — caller @@ -194,6 +250,250 @@ impl MultiBucketBitmap { head } + // ----------------------------------------------------------------- + // Indexed contingency / projection surface (issue #219, API 2 of 2). + // + // **Feature gate:** this entire surface is behind the non-default + // `experimental` Cargo feature and is excluded from semver guarantees. + // It is *not* the default single-score retrieval path — for standard + // nearest-neighbour scoring use [`crate::RankQuant`] (asymmetric LUT + // or symmetric dot-product) or [`crate::Bitmap`] (top-bucket overlap). + // `MultiBucketBitmap` targets niche workloads where the full bilinear + // decomposition, multi-projection evidence, or the diagonal fast path + // are needed (one query vs. many docs with many weight projections). + // + // `bilinear_score` / `top_m_bilinear` recompute one weighted `Σ W·C` + // per call and rescan the doc's bitmaps once per *projection*. The + // methods below decouple the accumulation from the projection: the + // `nb × nb` contingency table `C[a, b] = |Q_a ∩ D_b|` is built **once** + // per (query, doc) over a single pass of the doc's bitmaps, then any + // number of weight matrices are applied as cheap `nb²` dot products + // over the cached integer table. This is the in-index analogue of the + // stateless [`crate::Contingency`] (API 1) — same `nb × nb` object, + // but accumulated straight from the bitmap rows rather than from raw + // `&[u8]` codes. + // + // The hot inner accumulation (the `nb × nb` popcount-AND over `qpb` + // u64 words) dispatches at runtime to an AVX-512 VPOPCNTDQ kernel, + // mirroring [`crate::Bitmap`]'s masked-tail popcount-AND scan, with a + // portable scalar fallback on every other target / unsupported CPU. + // ----------------------------------------------------------------- + + /// Full `nb × nb` contingency table `C[a, b] = |Q_a ∩ D_b|` for one + /// document, accumulated in a single pass over the doc's bitmaps. + /// + /// Returns a row-major `nb × nb` `Vec` where `out[a * nb + b]` is + /// the count of coordinates the query placed in bucket `a` and the doc + /// placed in bucket `b`. This is the indexed twin of + /// [`crate::Contingency::counts`]: the same table, built from the + /// bitmap rows instead of `&[u8]` codes. A caller can then apply any + /// projection (e.g. via [`crate::Contingency`]'s projections, or a raw + /// `nb²` weighted sum) without rescanning the bitmaps. + /// + /// # Panics + /// Panics if `doc_idx >= len()`. `q_bitmaps` must be `nb * qpb` words + /// (as produced by [`Self::query_bitmaps_from_ranks`]). + pub fn contingency_row(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert!( + doc_idx < self.n_vectors, + "contingency_row: doc_idx {doc_idx} out of range (n_vectors {})", + self.n_vectors, + ); + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "contingency_row: q_bitmaps must be nb * qpb words", + ); + let doc_base = doc_idx * nb * qpb; + let doc = &self.bitmaps[doc_base..doc_base + nb * qpb]; + let mut table = vec![0u32; nb * nb]; + contingency_accumulate(q_bitmaps, doc, nb, qpb, &mut table); + table + } + + /// Diagonal-only fast path: the `nb` cells `C[a, a] = |Q_a ∩ D_a|` for + /// one document, in a single pass over the doc's bitmaps. + /// + /// This is the common cheap projection — same-bucket agreement per + /// bucket — and costs `nb` popcount-AND passes per doc instead of the + /// full `nb²`. `out[a]` is `|Q_a ∩ D_a|`. Summing the result is the + /// [`Self::diagonal_weights`] bilinear score. + /// + /// # Panics + /// Panics if `doc_idx >= len()` or `q_bitmaps.len() != nb * qpb`. + pub fn diagonal_overlap_row(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert!( + doc_idx < self.n_vectors, + "diagonal_overlap_row: doc_idx {doc_idx} out of range (n_vectors {})", + self.n_vectors, + ); + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "diagonal_overlap_row: q_bitmaps must be nb * qpb words", + ); + let doc_base = doc_idx * nb * qpb; + let doc = &self.bitmaps[doc_base..doc_base + nb * qpb]; + let mut diag = vec![0u32; nb]; + diagonal_accumulate(q_bitmaps, doc, nb, qpb, &mut diag); + diag + } + + /// Batched indexed projection: for every document, build its `nb × nb` + /// contingency table **once**, then apply *all* of `weights` to that + /// single cached table, returning `docs × projections` scores. + /// + /// `weights[p]` is the row-major `nb × nb` weight matrix for projection + /// `p`. The returned `out[di][p]` is `Σ_{a, b} weights[p][a*nb+b] · + /// C_di[a, b]`. Each document's bitmaps are streamed once regardless of + /// how many projections are requested — the projections share the + /// accumulated table rather than each rescanning the corpus (which is + /// what `bilinear_score` would do per projection). + /// + /// Runs across documents in parallel via rayon. The per-doc table is a + /// small `nb²` integer buffer; the projection dot products are cheap + /// relative to the popcount-AND accumulation that dominates. + /// + /// # Panics + /// Panics if `q_bitmaps.len() != nb * qpb` or if any `weights[p].len() + /// != nb * nb`. + #[must_use = "this scans every doc's bitmaps to build contingency tables; dropping the result discards that work"] + pub fn project_all_batched(&self, q_bitmaps: &[u64], weights: &[&[f32]]) -> Vec> { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "project_all_batched: q_bitmaps must be nb * qpb words", + ); + for (p, w) in weights.iter().enumerate() { + assert_eq!( + w.len(), + nb * nb, + "project_all_batched: weights[{p}] must be an nb * nb matrix", + ); + } + let n = self.n_vectors; + let n_proj = weights.len(); + if n == 0 || n_proj == 0 { + return vec![Vec::new(); n]; + } + let per_doc = nb * qpb; + let bitmaps = &self.bitmaps; + (0..n) + .into_par_iter() + .map(|di| { + let doc = &bitmaps[di * per_doc..(di + 1) * per_doc]; + // One accumulation pass over this doc's bitmaps. nb <= 16 ⇒ + // nb*nb <= 256, so a stack table avoids a per-doc heap + // allocation inside the parallel map (allocator contention). + assert!( + nb * nb <= 256, + "project_all_batched: nb={nb} exceeds stack table capacity \ + (nb*nb={} > 256); bits must be <=4 so nb<=16", + nb * nb, + ); + let mut table = [0u32; 256]; + let table = &mut table[..nb * nb]; + contingency_accumulate(q_bitmaps, doc, nb, qpb, table); + project_table(table, weights) + }) + .collect() + } + + /// `#[doc(hidden)]` bench-only twin of [`Self::diagonal_overlap_row`] that + /// forces the **portable scalar** diagonal kernel, bypassing the runtime + /// AVX-512 dispatch. It exists so `examples/bench_contingency` can time the + /// scalar and SIMD diagonal paths against each other on the same index + /// (mirroring the `#[doc(hidden)]` `search_asymmetric_byte_lut` bench + /// reference at the crate root). Not part of the stable API — production + /// callers use [`Self::diagonal_overlap_row`], which dispatches to the + /// fastest available kernel. + /// + /// # Panics + /// Panics if `doc_idx >= len()` or `q_bitmaps.len() != nb * qpb`. + #[doc(hidden)] + pub fn diagonal_overlap_row_scalar(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert!( + doc_idx < self.n_vectors, + "diagonal_overlap_row_scalar: doc_idx {doc_idx} out of range (n_vectors {})", + self.n_vectors, + ); + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "diagonal_overlap_row_scalar: q_bitmaps must be nb * qpb words", + ); + let doc_base = doc_idx * nb * qpb; + let doc = &self.bitmaps[doc_base..doc_base + nb * qpb]; + let mut diag = vec![0u32; nb]; + diagonal_accumulate_scalar(q_bitmaps, doc, nb, qpb, &mut diag); + diag + } + + /// `#[doc(hidden)]` bench-only twin of [`Self::project_all_batched`] that + /// forces the **portable scalar** contingency-accumulation kernel, + /// bypassing the runtime AVX-512 dispatch. Lets + /// `examples/bench_contingency` time the scalar and SIMD batched-projection + /// paths on the same index. Not part of the stable API. + /// + /// # Panics + /// Panics if `q_bitmaps.len() != nb * qpb` or if any `weights[p].len() + /// != nb * nb`. + #[doc(hidden)] + #[must_use = "this scans every doc's bitmaps to build contingency tables; dropping the result discards that work"] + pub fn project_all_batched_scalar( + &self, + q_bitmaps: &[u64], + weights: &[&[f32]], + ) -> Vec> { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "project_all_batched_scalar: q_bitmaps must be nb * qpb words", + ); + for (p, w) in weights.iter().enumerate() { + assert_eq!( + w.len(), + nb * nb, + "project_all_batched_scalar: weights[{p}] must be an nb * nb matrix", + ); + } + let n = self.n_vectors; + let n_proj = weights.len(); + if n == 0 || n_proj == 0 { + return vec![Vec::new(); n]; + } + let per_doc = nb * qpb; + let bitmaps = &self.bitmaps; + (0..n) + .into_par_iter() + .map(|di| { + let doc = &bitmaps[di * per_doc..(di + 1) * per_doc]; + // Stack table (nb*nb <= 256): no per-doc heap alloc in the + // parallel map. + assert!( + nb * nb <= 256, + "project_all_batched_scalar: nb={nb} exceeds stack table capacity \ + (nb*nb={} > 256); bits must be <=4 so nb<=16", + nb * nb, + ); + let mut table = [0u32; 256]; + let table = &mut table[..nb * nb]; + contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, table); + project_table(table, weights) + }) + .collect() + } + pub fn len(&self) -> usize { self.n_vectors } @@ -216,3 +516,497 @@ impl MultiBucketBitmap { self.bitmaps.len() * std::mem::size_of::() } } + +// ===================================================================== +// Contingency-accumulation kernels. +// +// `q_bitmaps` and `doc` are both row-major `nb * qpb` u64 layouts: bucket +// `a`'s bitmap occupies words `[a * qpb, (a + 1) * qpb)`. The full table +// cell is +// C[a, b] = Σ_{w < qpb} popcount(Q_a[w] & D_b[w]), +// the popcount over the bitwise-AND of bucket `a`'s query bitmap and +// bucket `b`'s doc bitmap — the exact popcount-AND shape of +// [`crate::Bitmap`]'s overlap kernel, here run for every (a, b) pair. +// +// Each path dispatches to an AVX-512 VPOPCNTDQ kernel when the host +// supports it (mirroring the masked-tail discipline in `index/bitmap.rs`), +// otherwise the portable scalar reference. Counts are exact integers, so +// the SIMD and scalar paths are required to agree bit-for-bit (the parity +// test enforces this against API 1's dense `Contingency`). +// ===================================================================== + +/// Build the full row-major `nb × nb` contingency table for one +/// (query, doc) pair into `table` (length `nb * nb`). +/// +/// Dispatches to the AVX-512 VPOPCNTDQ kernel when available, else the +/// scalar reference. Both paths require `q_bitmaps.len() == doc.len() == +/// nb * qpb` and `table.len() == nb * nb`. +fn contingency_accumulate( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + table: &mut [u32], +) { + debug_assert_eq!(q_bitmaps.len(), nb * qpb); + debug_assert_eq!(doc.len(), nb * qpb); + debug_assert_eq!(table.len(), nb * nb); + + #[cfg(target_arch = "x86_64")] + let use_avx512vpop = + is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vpopcntdq"); + #[cfg(not(target_arch = "x86_64"))] + let use_avx512vpop = false; + + if use_avx512vpop { + #[cfg(target_arch = "x86_64")] + unsafe { + contingency_accumulate_avx512vpop(q_bitmaps, doc, nb, qpb, table); + return; + } + } + contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, table); +} + +/// Apply every weight matrix in `weights` to a single cached `nb × nb` +/// contingency `table`, returning one `f32` projection score per matrix. +/// Each score is `Σ_{cell} weight[cell] · table[cell]` — the cheap dot +/// product the batched path shares across projections after one accumulation +/// pass. Caller guarantees every `weights[p].len() == table.len()`. +fn project_table(table: &[u32], weights: &[&[f32]]) -> Vec { + weights + .iter() + .map(|w| { + w.iter() + .zip(table.iter()) + .map(|(&weight, &c)| weight * c as f32) + .sum() + }) + .collect() +} + +/// Portable scalar reference for the full `nb × nb` table. One +/// popcount-AND reduction per (a, b) cell over the `qpb` u64 words. +fn contingency_accumulate_scalar( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + table: &mut [u32], +) { + for a in 0..nb { + let q_off = a * qpb; + let row = a * nb; + for b in 0..nb { + let d_off = b * qpb; + let mut overlap: u32 = 0; + for w in 0..qpb { + overlap += (q_bitmaps[q_off + w] & doc[d_off + w]).count_ones(); + } + table[row + b] = overlap; + } + } +} + +/// Build only the `nb` diagonal cells `C[a, a] = |Q_a ∩ D_a|` into `diag` +/// (length `nb`). The common cheap projection. +/// +/// Dispatches to the AVX-512 VPOPCNTDQ diagonal kernel when available, +/// else the scalar reference. +fn diagonal_accumulate(q_bitmaps: &[u64], doc: &[u64], nb: usize, qpb: usize, diag: &mut [u32]) { + debug_assert_eq!(q_bitmaps.len(), nb * qpb); + debug_assert_eq!(doc.len(), nb * qpb); + debug_assert_eq!(diag.len(), nb); + + #[cfg(target_arch = "x86_64")] + let use_avx512vpop = + is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vpopcntdq"); + #[cfg(not(target_arch = "x86_64"))] + let use_avx512vpop = false; + + if use_avx512vpop { + #[cfg(target_arch = "x86_64")] + unsafe { + diagonal_accumulate_avx512vpop(q_bitmaps, doc, nb, qpb, diag); + return; + } + } + diagonal_accumulate_scalar(q_bitmaps, doc, nb, qpb, diag); +} + +/// Portable scalar reference for the `nb` diagonal cells. +fn diagonal_accumulate_scalar( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + diag: &mut [u32], +) { + // `a` is load-bearing: it indexes the bucket bitmap offset (`a * qpb`) + // as well as `diag[a]`. + #[allow(clippy::needless_range_loop)] + for a in 0..nb { + let off = a * qpb; + let mut overlap: u32 = 0; + for w in 0..qpb { + overlap += (q_bitmaps[off + w] & doc[off + w]).count_ones(); + } + diag[a] = overlap; + } +} + +/// AVX-512 VPOPCNTDQ full-table contingency accumulation. +/// +/// Tiles the `qpb`-word bucket bitmaps into 512-bit (8×u64) ZMM blocks +/// with a `bzhi`-style masked tail for the final `qpb % 8` words, exactly +/// as [`crate::Bitmap`]'s scan kernel does. For each block, every query +/// bucket `a`'s lane and every doc bucket `b`'s lane is AND-ed, +/// `_mm512_popcnt_epi64`-counted, and reduced into the `(a, b)` cell — the +/// masked-tail popcount-AND, run over all `nb²` bucket pairs. +/// +/// # Safety +/// Requires the `avx512f` and `avx512vpopcntdq` target features, confirmed +/// by the `#[target_feature]` gate plus the caller's runtime +/// `is_x86_feature_detected!`. `q_bitmaps.len() == doc.len() == nb * qpb` +/// and `table.len() == nb * nb` are caller contracts (asserted at the +/// public entry points); the tail load uses a `(1 << rem) - 1` word mask +/// so no lane past `qpb` is read for either operand. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f,avx512vpopcntdq")] +unsafe fn contingency_accumulate_avx512vpop( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + table: &mut [u32], +) { + use std::arch::x86_64::*; + // SAFETY: `q_bitmaps.len() == doc.len() == nb * qpb` and `table.len() + // == nb * nb` (caller contract). Each ZMM load reads 8 u64 words at a + // word offset `< qpb` within bucket `a`/`b`'s `qpb`-word region; the + // final partial block uses `_mm512_maskz_loadu_epi64` with a + // `(1 << rem) - 1` mask so lanes past `qpb` read as zero and are never + // dereferenced. AVX-512 F/VPOPCNTDQ are confirmed by `#[target_feature]` + // plus the runtime detection in the caller. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + let full_blocks = qpb / 8; + let rem = qpb % 8; + let tail_mask: __mmask8 = if rem == 0 { 0 } else { (1u8 << rem) - 1 }; + let q_ptr = q_bitmaps.as_ptr(); + let d_ptr = doc.as_ptr(); + + for a in 0..nb { + let q_base = a * qpb; + let row = a * nb; + for b in 0..nb { + let d_base = b * qpb; + let mut acc = _mm512_setzero_si512(); + // Whole 512-bit blocks. + for blk in 0..full_blocks { + let w = blk * 8; + let q_zmm = _mm512_loadu_si512(q_ptr.add(q_base + w) as *const __m512i); + let d_zmm = _mm512_loadu_si512(d_ptr.add(d_base + w) as *const __m512i); + let and_zmm = _mm512_and_si512(q_zmm, d_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc = _mm512_add_epi64(acc, pop_zmm); + } + // Masked tail: only the low `rem` words are loaded; lanes + // past `qpb` read as zero and contribute no popcount. + if rem != 0 { + let w = full_blocks * 8; + let q_zmm = + _mm512_maskz_loadu_epi64(tail_mask, q_ptr.add(q_base + w) as *const i64); + let d_zmm = + _mm512_maskz_loadu_epi64(tail_mask, d_ptr.add(d_base + w) as *const i64); + let and_zmm = _mm512_and_si512(q_zmm, d_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc = _mm512_add_epi64(acc, pop_zmm); + } + let sum: i64 = _mm512_reduce_add_epi64(acc); + table[row + b] = sum as u32; + } + } + } +} + +/// AVX-512 VPOPCNTDQ diagonal-only accumulation: the `nb` cells +/// `C[a, a]`, each a masked-tail popcount-AND over bucket `a`'s query and +/// doc bitmaps. Same masked-tail discipline as the full kernel, restricted +/// to the diagonal. +/// +/// # Safety +/// Same contract as [`contingency_accumulate_avx512vpop`]: +/// `q_bitmaps.len() == doc.len() == nb * qpb`, `diag.len() == nb`, the +/// target features confirmed by the gate plus runtime detection, and the +/// masked tail bounding every load to the live `qpb` words. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f,avx512vpopcntdq")] +unsafe fn diagonal_accumulate_avx512vpop( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + diag: &mut [u32], +) { + use std::arch::x86_64::*; + // SAFETY: identical bounding to `contingency_accumulate_avx512vpop`, + // restricted to `b == a`: every load is at a word offset `< qpb` within + // bucket `a`'s `qpb`-word region, with the final partial block masked to + // the low `rem` words. `diag.len() == nb` bounds every `diag[a]` write. + // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + let full_blocks = qpb / 8; + let rem = qpb % 8; + let tail_mask: __mmask8 = if rem == 0 { 0 } else { (1u8 << rem) - 1 }; + let q_ptr = q_bitmaps.as_ptr(); + let d_ptr = doc.as_ptr(); + + // `a` indexes the bucket bitmap offset (`a * qpb`) and `diag[a]`. + #[allow(clippy::needless_range_loop)] + for a in 0..nb { + let base = a * qpb; + let mut acc = _mm512_setzero_si512(); + for blk in 0..full_blocks { + let w = blk * 8; + let q_zmm = _mm512_loadu_si512(q_ptr.add(base + w) as *const __m512i); + let d_zmm = _mm512_loadu_si512(d_ptr.add(base + w) as *const __m512i); + let and_zmm = _mm512_and_si512(q_zmm, d_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc = _mm512_add_epi64(acc, pop_zmm); + } + if rem != 0 { + let w = full_blocks * 8; + let q_zmm = _mm512_maskz_loadu_epi64(tail_mask, q_ptr.add(base + w) as *const i64); + let d_zmm = _mm512_maskz_loadu_epi64(tail_mask, d_ptr.add(base + w) as *const i64); + let and_zmm = _mm512_and_si512(q_zmm, d_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc = _mm512_add_epi64(acc, pop_zmm); + } + let sum: i64 = _mm512_reduce_add_epi64(acc); + diag[a] = sum as u32; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Contingency; + use rand::{RngExt, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + /// Reconstruct the per-coordinate bucket codes a doc was encoded with + /// by reading back the index's stored bitmaps. Used to feed API 1's + /// dense `Contingency` the *same* bucket assignment the index holds. + fn doc_codes(idx: &MultiBucketBitmap, doc_idx: usize) -> Vec { + let nb = idx.n_buckets(); + let qpb = idx.qwords_per_bitmap; + let dim = idx.dim(); + let base = doc_idx * nb * qpb; + let mut codes = vec![0u8; dim]; + for b in 0..nb { + let off = base + b * qpb; + // `j` is load-bearing: it indexes the bit position (`j/64`, + // `j%64`) as well as `codes[j]`. + #[allow(clippy::needless_range_loop)] + for j in 0..dim { + if (idx.bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 { + codes[j] = b as u8; + } + } + } + codes + } + + /// Reconstruct the query bucket codes from the query bitmaps. + fn query_codes(q_bitmaps: &[u64], nb: usize, qpb: usize, dim: usize) -> Vec { + let mut codes = vec![0u8; dim]; + for b in 0..nb { + let off = b * qpb; + // `j` is load-bearing: bit position and `codes[j]` index. + #[allow(clippy::needless_range_loop)] + for j in 0..dim { + if (q_bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 { + codes[j] = b as u8; + } + } + } + codes + } + + /// Independent pure-scalar reference for the full table, not routed + /// through the dispatch (so the dispatched path can't mask its own bug). + fn reference_table(q_bitmaps: &[u64], doc: &[u64], nb: usize, qpb: usize) -> Vec { + let mut table = vec![0u32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + let mut overlap = 0u32; + for w in 0..qpb { + overlap += (q_bitmaps[a * qpb + w] & doc[b * qpb + w]).count_ones(); + } + table[a * nb + b] = overlap; + } + } + table + } + + fn make_corpus(seed: u64, n: usize, dim: usize) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..n * dim).map(|_| rng.random_range(-1.0..1.0)).collect() + } + + /// CORRECTNESS GATE (issue #219): the dispatched contingency path + /// (AVX-512 on this host) and the diagonal fast path must produce + /// BIT-IDENTICAL results to (a) an independent scalar reference, the + /// forced scalar kernel, and (c) a freshly-built dense `Contingency` + /// (API 1) over the same query/doc bucket codes. Counts are exact + /// integers — equality is exact, not tolerance-based. + /// + /// Dims chosen to exercise the masked tail: 384 (qpb=6, all-tail, no + /// full ZMM), 768 (qpb=12, one full ZMM + 4-word tail), 1024 (qpb=16, + /// two full ZMMs, no tail). + #[test] + fn parity_scalar_simd_diagonal_dense() { + for &dim in &[384usize, 768, 1024] { + for bits in [2u8, 4u8] { + let nb = 1usize << bits; + let qpb = dim / 64; + let n = 24; + let corpus = make_corpus(0xC0FFEE ^ dim as u64 ^ (bits as u64) << 32, n, dim); + let mut idx = MultiBucketBitmap::new(dim, bits); + idx.add(&corpus); + + let mut rng = ChaCha8Rng::seed_from_u64(0x5EED ^ dim as u64); + let query: Vec = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + let q_bitmaps = idx.query_bitmaps_from_ranks(&query); + let q_codes = query_codes(&q_bitmaps, nb, qpb, dim); + + for di in 0..n { + let doc_base = di * nb * qpb; + let doc = &idx.bitmaps[doc_base..doc_base + nb * qpb]; + + // (a) independent scalar reference table. + let want = reference_table(&q_bitmaps, doc, nb, qpb); + + // forced scalar kernel. + let mut scalar = vec![0u32; nb * nb]; + contingency_accumulate_scalar(&q_bitmaps, doc, nb, qpb, &mut scalar); + assert_eq!( + scalar, want, + "scalar kernel != reference (dim={dim}, bits={bits}, doc={di})", + ); + + // dispatched path (AVX-512 on this host). + let dispatched = idx.contingency_row(&q_bitmaps, di); + assert_eq!( + dispatched, want, + "dispatched (SIMD) != reference (dim={dim}, bits={bits}, doc={di})", + ); + + // diagonal fast path == diagonal of the full table. + let diag = idx.diagonal_overlap_row(&q_bitmaps, di); + let want_diag: Vec = (0..nb).map(|a| want[a * nb + a]).collect(); + assert_eq!( + diag, want_diag, + "diagonal fast path != table diagonal (dim={dim}, bits={bits}, doc={di})", + ); + + // dense `Contingency` (API 1) over the same codes. + let d_codes = doc_codes(&idx, di); + let dense = Contingency::new(&q_codes, &d_codes, nb).unwrap(); + assert_eq!( + dispatched, + dense.counts(), + "dispatched (SIMD) != dense Contingency (dim={dim}, bits={bits}, doc={di})", + ); + // And the dense diagonal trace matches the fast-path sum. + assert_eq!( + diag.iter().sum::(), + dense.diagonal_agreement(), + "diagonal sum != dense diagonal_agreement (dim={dim}, bits={bits}, doc={di})", + ); + } + } + } + } + + /// `project_all_batched` applies every projection to each doc's table + /// built once, and agrees with per-doc `bilinear_score` (which rescans + /// per projection) for the same weight matrices. + #[test] + fn project_all_batched_matches_bilinear_score() { + let dim = 768; + let bits = 4u8; + let n = 40; + let corpus = make_corpus(0xBA7C4, n, dim); + let mut idx = MultiBucketBitmap::new(dim, bits); + idx.add(&corpus); + + let mut rng = ChaCha8Rng::seed_from_u64(0xD0C5); + let query: Vec = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + let q_bitmaps = idx.query_bitmaps_from_ranks(&query); + + let outer = idx.outer_product_weights(); + let diagonal = idx.diagonal_weights(); + let banded = idx.banded_weights(1); + let weights: Vec<&[f32]> = vec![&outer, &diagonal, &banded]; + + let batched = idx.project_all_batched(&q_bitmaps, &weights); + assert_eq!(batched.len(), n); + for (di, row) in batched.iter().enumerate() { + assert_eq!(row.len(), 3); + // bilinear_score rescans the doc per weight matrix — the + // batched path must reproduce it exactly (same integer table, + // same weighted sum, same f32 accumulation order per matrix). + for (p, w) in weights.iter().enumerate() { + let want = idx.bilinear_score(&q_bitmaps, w, di); + assert_eq!( + row[p], want, + "project_all_batched[{di}][{p}] != bilinear_score", + ); + } + } + } + + /// Diagonal fast path sums to the `diagonal_weights` bilinear score. + #[test] + fn diagonal_row_sums_to_diagonal_bilinear_score() { + let dim = 384; + let bits = 2u8; + let n = 16; + let corpus = make_corpus(0xD1A6, n, dim); + let mut idx = MultiBucketBitmap::new(dim, bits); + idx.add(&corpus); + let mut rng = ChaCha8Rng::seed_from_u64(0xF00D); + let query: Vec = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + let q_bitmaps = idx.query_bitmaps_from_ranks(&query); + let w = idx.diagonal_weights(); + for di in 0..n { + let diag_sum: u32 = idx.diagonal_overlap_row(&q_bitmaps, di).iter().sum(); + assert_eq!(diag_sum as f32, idx.bilinear_score(&q_bitmaps, &w, di)); + } + } + + #[test] + fn empty_index_project_all_batched_is_empty() { + let idx = MultiBucketBitmap::new(256, 2); + let q_bitmaps = idx.query_bitmaps_from_ranks(&vec![0.0f32; 256]); + let diag = idx.diagonal_weights(); + let weights: Vec<&[f32]> = vec![&diag]; + assert!(idx.project_all_batched(&q_bitmaps, &weights).is_empty()); + } + + #[test] + fn project_all_batched_no_weights_yields_empty_rows() { + let dim = 256; + let mut idx = MultiBucketBitmap::new(dim, 2); + idx.add(&make_corpus(1, 5, dim)); + let q_bitmaps = idx.query_bitmaps_from_ranks(&vec![0.5f32; dim]); + let weights: Vec<&[f32]> = Vec::new(); + let out = idx.project_all_batched(&q_bitmaps, &weights); + assert_eq!(out.len(), 5); + assert!(out.iter().all(|row| row.is_empty())); + } +}