From dc069fda7aae52e24bf01818aa7e903beee86da7 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 13:44:43 -0500 Subject: [PATCH 01/13] feat(experimental): stateless dense bucket-overlap contingency + projection API (#219) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New experimental-gated `Contingency` (src/contingency.rs): the full nb×nb bucket-overlap table C[a][b] = |{coords: query∈a ∧ doc∈b}| for two equal-length &[u8] bucket-code slices, built in one O(dim) histogram pass. Projections mirror ordgraph::edge: top_overlap / diagonal_agreement / band_agreement / top_group_overlap / bucket_l1_distance / coarsened_counts / rankquant_symmetric_score + a general project(&weights) for learned/custom nb×nb matrices, and a Projection enum. Stateless — no index, no persistence, never wired into a search path; this is the pairwise-evidence container ordgraph migrates to. Kernel: scalar reference + AVX-512 (avx512f/bw/vpopcntdq) histogram with a masked 64-byte tail (live-lane masking avoids tail bucket-0 false matches), runtime is_x86_feature_detected dispatch matching #[target_feature], portable scalar fallback. #![deny(unsafe_op_in_unsafe_fn)] honored; no new deps. bucket_l1_distance returns u64 (the distance-weighted sum (nb-1)·dim overflows u32 for accepted dim up to u32::MAX). Counts are u32 (a cell <= dim). Tests reproduce ordgraph::edge's exact projection values + simd==scalar parity + constructor validation + the u32-overflow regression. Behind `experimental`, absent from the default build. Verified: fmt/clippy(-D warnings)/test green. Refs #219. Targets 0.6 (may land in 0.5.0 — maintainer decides). Signed-off-by: Nelson Spence --- src/contingency.rs | 705 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 11 + 2 files changed, 716 insertions(+) create mode 100644 src/contingency.rs diff --git a/src/contingency.rs b/src/contingency.rs new file mode 100644 index 00000000..aca6cca9 --- /dev/null +++ b/src/contingency.rs @@ -0,0 +1,705 @@ +//! Stateless dense bucket-overlap contingency table for two fixed-length +//! ordinal bucket-code vectors. +//! +//! A [`Contingency`] is the full `nb × nb` co-occurrence table between two +//! bucket-code slices `q` and `d` of equal length `dim`: +//! +//! ```text +//! count(a, b) = #{ j ∈ [0, dim) : q[j] == a ∧ d[j] == b } +//! ``` +//! +//! This is the *stateless dense-code contingency* surface (issue #219): it +//! consumes two `&[u8]` code slices directly — there is no index, no corpus, +//! no persistence, and it is **not** wired into any retrieval path. It is the +//! algebraic object the multi-bucket bilinear score decomposes into (see +//! [`crate::MultiBucketBitmap`]), exposed as a contingency table so callers can +//! compute arbitrary projections (diagonal/band agreement, top-bucket overlap, +//! L1 distance, coarsened tables, the symmetric RankQuant score, and general +//! learned `nb × nb` weight matrices) over a single `O(dim)` histogram pass. +//! +//! Ported to reach behavioural parity with `ordgraph::edge::Contingency`. The +//! `EdgeEvidence` "this is evidence for X" wrapper deliberately stays in +//! ordgraph — ordvec exposes only the substrate primitive. +//! +//! ## Count width +//! +//! Cell counts are stored as `u32`. A single cell holds at most `dim` codes +//! (every coordinate lands in exactly one cell), so the table never overflows +//! while `dim <= u32::MAX`, which the constructor asserts. The reference +//! prototype used `u16` with a `checked_add` that panics past `dim > u16::MAX`; +//! `u32` removes that hazard for the stateless code path, where the caller's +//! `dim` is arbitrary (the crate-wide `u16` *rank* invariant does not bind raw +//! bucket codes), at 2 bytes/cell more than `u16`. For the small `nb` this type +//! targets (`nb` is a per-coordinate bucket count, typically `≤ 16`) the table +//! is `nb² ≤ 256` cells, so the width choice is immaterial to memory. + +use crate::OrdvecError; + +/// Full bucket-overlap contingency table for two equal-length bucket codes. +/// +/// Construct with [`Contingency::new`] from two `&[u8]` code slices that share +/// a length (`dim`) and a bucket count (`nb`). Every accessor and projection +/// reads the cached `nb × nb` table; no rescanning of the input codes occurs +/// after construction. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct Contingency { + buckets: usize, + /// Row-major `buckets × buckets`. `counts[a * buckets + b]` is the number + /// of coordinates with query-bucket `a` and doc-bucket `b`. + counts: Vec, +} + +impl Contingency { + /// Build the contingency table from two bucket-code slices. + /// + /// `query` and `doc` must have the same length (`dim`), and every code must + /// be a valid bucket id `< nb`. One `O(dim)` histogram pass fills the full + /// `nb × nb` table. + /// + /// # Errors + /// - [`OrdvecError::InvalidParameter`] if `nb == 0`, or if `dim` exceeds + /// `u32::MAX` (a single cell could then exceed `u32`). + /// - [`OrdvecError::InvalidVectorLength`] if `doc.len() != query.len()`. + /// - [`OrdvecError::InvalidParameter`] if any code is `>= nb`. + pub fn new(query: &[u8], doc: &[u8], nb: usize) -> Result { + if nb == 0 { + return Err(OrdvecError::InvalidParameter { + name: "nb", + message: "bucket count must be > 0".to_string(), + }); + } + if query.len() != doc.len() { + return Err(OrdvecError::InvalidVectorLength { + name: "doc", + len: doc.len(), + expected: query.len(), + }); + } + if query.len() > u32::MAX as usize { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: format!("must be <= {} (u32 contingency count cap)", u32::MAX), + }); + } + // `nb * nb` is checked: a hostile `nb` near `usize::MAX` would wrap the + // table-length multiply and silently under-size the allocation. + let table_len = nb + .checked_mul(nb) + .ok_or_else(|| OrdvecError::InvalidParameter { + name: "nb", + message: "bucket count squared overflows usize".to_string(), + })?; + let mut counts = vec![0u32; table_len]; + build_histogram(query, doc, nb, &mut counts)?; + Ok(Self { + buckets: nb, + counts, + }) + } + + /// Number of buckets `nb` (the table is `nb × nb`). + pub fn buckets(&self) -> usize { + self.buckets + } + + /// The flat row-major `nb × nb` count table. + pub fn counts(&self) -> &[u32] { + &self.counts + } + + /// Count of coordinates with query-bucket `a` and doc-bucket `b`. + /// + /// # Panics + /// Panics if `a >= nb` or `b >= nb`. + pub fn count(&self, query_bucket: usize, doc_bucket: usize) -> u32 { + assert!(query_bucket < self.buckets, "query_bucket out of range"); + assert!(doc_bucket < self.buckets, "doc_bucket out of range"); + self.counts[query_bucket * self.buckets + doc_bucket] + } + + /// Sum of row `query_bucket`: how many coordinates the query placed in that + /// bucket. For fixed-composition codes this is the constant per-bucket + /// occupancy. + /// + /// # Panics + /// Panics if `query_bucket >= nb`. + pub fn row_sum(&self, query_bucket: usize) -> u32 { + assert!(query_bucket < self.buckets, "query_bucket out of range"); + let base = query_bucket * self.buckets; + self.counts[base..base + self.buckets].iter().sum() + } + + /// Sum of column `doc_bucket`: how many coordinates the doc placed in that + /// bucket. + /// + /// # Panics + /// Panics if `doc_bucket >= nb`. + pub fn column_sum(&self, doc_bucket: usize) -> u32 { + assert!(doc_bucket < self.buckets, "doc_bucket out of range"); + (0..self.buckets) + .map(|query_bucket| self.counts[query_bucket * self.buckets + doc_bucket]) + .sum() + } + + /// Total mass: equals `dim`. + pub fn total_count(&self) -> u32 { + self.counts.iter().copied().sum() + } + + /// Count in the top-bucket cell `(nb − 1, nb − 1)`: coordinates both codes + /// rank in the highest bucket. + pub fn top_overlap(&self) -> u32 { + self.count(self.buckets - 1, self.buckets - 1) + } + + /// Trace of the table: coordinates assigned to the same bucket by both + /// codes, summed over all buckets. + pub fn diagonal_agreement(&self) -> u32 { + (0..self.buckets) + .map(|bucket| self.counts[bucket * self.buckets + bucket]) + .sum() + } + + /// Mass within `radius` of the diagonal: coordinates whose two bucket codes + /// differ by at most `radius`. `radius = 0` reduces to + /// [`Self::diagonal_agreement`]. + pub fn band_agreement(&self, radius: usize) -> u32 { + let mut total = 0u32; + for qb in 0..self.buckets { + let base = qb * self.buckets; + for db in 0..self.buckets { + if qb.abs_diff(db) <= radius { + total += self.counts[base + db]; + } + } + } + total + } + + /// Mass in the top-right `group_width × group_width` block: coordinates both + /// codes place in the top `group_width` buckets. + /// + /// # Panics + /// Panics if `group_width == 0` or `group_width > nb`. + pub fn top_group_overlap(&self, group_width: usize) -> u32 { + assert!(group_width > 0, "group_width must be > 0"); + assert!(group_width <= self.buckets, "group_width must be <= nb"); + let start = self.buckets - group_width; + let mut total = 0u32; + for qb in start..self.buckets { + let base = qb * self.buckets; + for db in start..self.buckets { + total += self.counts[base + db]; + } + } + total + } + + /// Total bucket-index L1 distance: `Σ_{a,b} |a − b| · count(a, b)`. The + /// integer earth-mover-style cost of moving the doc histogram onto the + /// query histogram along the bucket axis. + /// Returns `u64`: the distance-weighted sum `Σ |a−b|·C[a][b]` can reach + /// `(nb−1)·dim`, which overflows `u32` for accepted inputs (a single count + /// already fits `u32` only up to `dim ≤ u32::MAX`; the `|a−b|` weight then + /// scales it past `u32`). `u64` is overflow-free for every constructible + /// table (`nb ≤ 256`, `dim ≤ u32::MAX` ⇒ max `255 · u32::MAX < u64::MAX`). + pub fn bucket_l1_distance(&self) -> u64 { + let mut total = 0u64; + for qb in 0..self.buckets { + let base = qb * self.buckets; + for db in 0..self.buckets { + total += qb.abs_diff(db) as u64 * u64::from(self.counts[base + db]); + } + } + total + } + + /// Coarsen the `nb × nb` table into a `groups × groups` table by merging + /// contiguous equal-width bucket blocks. Preserves total mass. + /// + /// # Panics + /// Panics if `groups == 0`, `groups > nb`, or `nb` is not divisible by + /// `groups`. + pub fn coarsened_counts(&self, groups: usize) -> Vec { + assert!(groups > 0, "groups must be > 0"); + assert!(groups <= self.buckets, "groups must be <= nb"); + assert!( + self.buckets.is_multiple_of(groups), + "bucket count must be divisible by groups" + ); + let width = self.buckets / groups; + let mut out = vec![0u32; groups * groups]; + for qb in 0..self.buckets { + let base = qb * self.buckets; + let qg = qb / width; + for db in 0..self.buckets { + let dg = db / width; + out[qg * groups + dg] += self.counts[base + db]; + } + } + out + } + + /// Symmetric RankQuant score: `Σ_{a,b} (a − c)(b − c) · count(a, b)` with + /// `c = (nb − 1) / 2`. Algebraically identical to the per-coordinate + /// centred product `Σ_j (q[j] − c)(d[j] − c)` — the outer-product weight + /// matrix just rearranges the same sum. + pub fn rankquant_symmetric_score(&self) -> f32 { + let centre = (self.buckets as f32 - 1.0) / 2.0; + let mut score = 0.0f32; + for qb in 0..self.buckets { + let base = qb * self.buckets; + let qw = qb as f32 - centre; + for db in 0..self.buckets { + let weight = qw * (db as f32 - centre); + score += weight * self.counts[base + db] as f32; + } + } + score + } + + /// General projection under an arbitrary `nb × nb` weight matrix: + /// `Σ_{a,b} weights[a * nb + b] · count(a, b)`. This is the learned/custom + /// weight-matrix entry point — the diagonal, band, and outer-product helpers + /// above are all special cases of this dense reduction. + /// + /// # Panics + /// Panics if `weights.len() != nb * nb`. + pub fn project(&self, weights: &[f32]) -> f32 { + assert_eq!( + weights.len(), + self.counts.len(), + "weights must be an nb * nb matrix", + ); + weights + .iter() + .zip(self.counts.iter()) + .map(|(&w, &c)| w * c as f32) + .sum() + } +} + +/// Named projections over a [`Contingency`], mirroring +/// `ordgraph::edge::Projection`. Each variant's [`Self::score`] returns the +/// projection as `f32`. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Projection { + /// Top-bucket cell count — [`Contingency::top_overlap`]. + TopOverlap, + /// Top `width`-bucket block mass — [`Contingency::top_group_overlap`]. + TopGroupOverlap { width: usize }, + /// Diagonal trace — [`Contingency::diagonal_agreement`]. + DiagonalAgreement, + /// Banded mass within `radius` — [`Contingency::band_agreement`]. + BandAgreement { radius: usize }, + /// Bucket-index L1 transport cost — [`Contingency::bucket_l1_distance`]. + BucketL1Distance, + /// Centred outer-product score — [`Contingency::rankquant_symmetric_score`]. + RankQuantSymmetric, +} + +impl Projection { + /// Evaluate this projection against `contingency`, returning the value as + /// `f32` (matching the ordgraph projection contract). + pub fn score(self, contingency: &Contingency) -> f32 { + match self { + Self::TopOverlap => contingency.top_overlap() as f32, + Self::TopGroupOverlap { width } => contingency.top_group_overlap(width) as f32, + Self::DiagonalAgreement => contingency.diagonal_agreement() as f32, + Self::BandAgreement { radius } => contingency.band_agreement(radius) as f32, + Self::BucketL1Distance => contingency.bucket_l1_distance() as f32, + Self::RankQuantSymmetric => contingency.rankquant_symmetric_score(), + } + } +} + +/// Fill the row-major `nb × nb` `counts` table from the two code slices. +/// +/// Dispatches at runtime to an AVX-512 BW byte-compare kernel when available +/// (the masked-tail popcount-AND discipline mirrors [`crate::Bitmap`]'s +/// scan kernels), otherwise the portable scalar histogram scatter. Both paths +/// validate every code is `< nb` and return [`OrdvecError::InvalidParameter`] +/// on the first out-of-range code. +fn build_histogram( + query: &[u8], + doc: &[u8], + nb: usize, + counts: &mut [u32], +) -> Result<(), OrdvecError> { + debug_assert_eq!(query.len(), doc.len()); + debug_assert_eq!(counts.len(), nb * nb); + + // Validate the code range up front so the SIMD kernel can assume every + // `q[j] < nb` and `d[j] < nb` and index the table without a per-element + // bounds check (the scalar fallback shares the same validated contract). + if let Some(bad) = find_out_of_range(query, doc, nb) { + return Err(OrdvecError::InvalidParameter { + name: "code", + message: format!("bucket code {bad} out of range (must be < {nb})"), + }); + } + + #[cfg(target_arch = "x86_64")] + let use_avx512 = is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512bw") + && is_x86_feature_detected!("avx512vpopcntdq") + // The byte-compare kernel materialises an nb-wide mask table; cap it to + // the small bucket counts this type targets so the per-bucket popcount + // pass stays cheaper than a single scalar scatter. + && nb <= 16; + #[cfg(not(target_arch = "x86_64"))] + let use_avx512 = false; + + if use_avx512 { + #[cfg(target_arch = "x86_64")] + unsafe { + build_histogram_avx512(query, doc, nb, counts); + return Ok(()); + } + } + + build_histogram_scalar(query, doc, nb, counts); + Ok(()) +} + +/// Return the first code (from either slice) that is `>= nb`, else `None`. +fn find_out_of_range(query: &[u8], doc: &[u8], nb: usize) -> Option { + // nb fits a usize; codes are u8, so any code >= nb is out of range. When + // nb > 255 every u8 code is in range, so the scan can be skipped. + if nb > u8::MAX as usize { + return None; + } + let cap = nb as u8; + query.iter().chain(doc.iter()).copied().find(|&c| c >= cap) +} + +/// Portable scalar histogram scatter: one `O(dim)` pass, one table increment +/// per coordinate. Assumes every code is `< nb` (validated by the caller). +fn build_histogram_scalar(query: &[u8], doc: &[u8], nb: usize, counts: &mut [u32]) { + for (&qb, &db) in query.iter().zip(doc.iter()) { + let idx = qb as usize * nb + db as usize; + // `+= 1` cannot overflow u32: a cell holds at most `dim <= u32::MAX` + // coordinates and each coordinate increments exactly one cell. + counts[idx] += 1; + } +} + +/// AVX-512 BW + VPOPCNTDQ contingency build. +/// +/// For each bucket pair `(a, b)` the cell `count(a, b)` is +/// `Σ_words popcount((q_bytes == a) & (d_bytes == b))` — a popcount over the +/// bitwise-AND of two byte-equality masks. This is the exact popcount-AND shape +/// of [`crate::Bitmap`]'s overlap kernel, lifted from precomputed bitmaps to +/// on-the-fly `_mm512_cmpeq_epi8_mask` comparisons, with the trailing +/// `< 64`-byte tail handled under a `bzhi`-style length mask (the masked-tail +/// discipline mirrors the bitmap scan). +/// +/// # Safety +/// Requires the `avx512f`, `avx512bw`, and `avx512vpopcntdq` target features, +/// confirmed by the `#[target_feature]` gate plus the caller's runtime +/// `is_x86_feature_detected!`. `query.len() == doc.len()` and +/// `counts.len() == nb * nb` are caller contracts; all loads are masked to the +/// live byte count so no read passes the end of either slice. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f,avx512bw,avx512vpopcntdq")] +unsafe fn build_histogram_avx512(query: &[u8], doc: &[u8], nb: usize, counts: &mut [u32]) { + use std::arch::x86_64::*; + // SAFETY: `query.len() == doc.len() == len` (caller contract), so the two + // masked 64-byte loads at the same offset cover the same live region; the + // tail load uses a `(1 << rem) - 1` byte mask so lanes past `len` are never + // read. `counts.len() == nb * nb` and `a, b < nb <= 16` (dispatch gate) + // bound every `counts[a * nb + b]` write. AVX-512 F/BW/VPOPCNTDQ are + // confirmed by `#[target_feature]` + the runtime detection in the caller. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + let len = query.len(); + let q_ptr = query.as_ptr(); + let d_ptr = doc.as_ptr(); + + // Per-cell 64-bit accumulators, row-major `nb × nb` (nb <= 16 ⇒ <= 256 + // u64). A doc cell holds at most `len` codes; u64 cannot overflow. + let mut acc = vec![0u64; nb * nb]; + + let mut off = 0usize; + while off < len { + let rem = len - off; + let (q_vec, d_vec) = if rem >= 64 { + ( + _mm512_loadu_si512(q_ptr.add(off) as *const __m512i), + _mm512_loadu_si512(d_ptr.add(off) as *const __m512i), + ) + } else { + // Masked tail: only the low `rem` bytes are loaded; the rest of + // each register reads as zero and is excluded from every + // `cmpeq` mask by ANDing with `live` below. + let load_mask: __mmask64 = (1u64 << rem) - 1; + ( + _mm512_maskz_loadu_epi8(load_mask, q_ptr.add(off) as *const i8), + _mm512_maskz_loadu_epi8(load_mask, d_ptr.add(off) as *const i8), + ) + }; + // `live` marks the lanes that hold a real byte in this block. + let live: __mmask64 = if rem >= 64 { + u64::MAX + } else { + (1u64 << rem) - 1 + }; + + // For each query bucket `a`, the lanes where `q == a`; for each doc + // bucket `b`, the lanes where `d == b`. The cell increment is the + // popcount of the AND of the two lane masks, restricted to live + // lanes. This is popcount(maskQ & maskD) — the masked popcount-AND. + for a in 0..nb { + let a_splat = _mm512_set1_epi8(a as i8); + let q_eq: __mmask64 = _mm512_cmpeq_epi8_mask(q_vec, a_splat) & live; + if q_eq == 0 { + continue; + } + let row = a * nb; + for b in 0..nb { + let b_splat = _mm512_set1_epi8(b as i8); + let d_eq: __mmask64 = _mm512_cmpeq_epi8_mask(d_vec, b_splat); + acc[row + b] += (q_eq & d_eq).count_ones() as u64; + } + } + + off += 64; + } + + for (cell, &v) in counts.iter_mut().zip(acc.iter()) { + // Each cell holds at most `len <= u32::MAX` codes (caller-asserted), + // so the u64 accumulator fits u32. + *cell = v as u32; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ---- ordgraph::edge parity gate ------------------------------------- + // Every assertion value below is reproduced verbatim from + // ordgraph-proto/src/edge.rs's tests. The bucket codes are the *already + // bucketed* outputs ordgraph derives from ranks; here we feed the codes + // directly (the stateless dense-code contract). + + /// The ordgraph `contingency_counts_bucket_intersections` case: + /// query = [0,0,1,1,2,2,3,3], doc = [3,3,2,2,1,1,0,0] (reverse), nb = 4. + #[test] + fn parity_counts_bucket_intersections() { + let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; + let doc = [3u8, 3, 2, 2, 1, 1, 0, 0]; + let c = Contingency::new(&query, &doc, 4).unwrap(); + + assert_eq!(c.count(0, 3), 2); + assert_eq!(c.count(3, 0), 2); + assert_eq!(c.top_overlap(), 0); + assert_eq!(c.diagonal_agreement(), 0); + } + + /// The ordgraph `projections_recover_top_diagonal_band_and_rankquant_score` + /// case: query = [0,0,1,1,2,2,3,3], doc = [0,1,1,2,2,3,3,0], nb = 4. + #[test] + fn parity_projections() { + let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; + let doc = [0u8, 1, 1, 2, 2, 3, 3, 0]; + let c = Contingency::new(&query, &doc, 4).unwrap(); + + assert_eq!(Projection::TopOverlap.score(&c), 1.0); + assert_eq!(Projection::DiagonalAgreement.score(&c), 4.0); + assert_eq!(Projection::BandAgreement { radius: 1 }.score(&c), 7.0); + assert_eq!(Projection::TopGroupOverlap { width: 2 }.score(&c), 3.0); + assert_eq!(Projection::BucketL1Distance.score(&c), 6.0); + assert_eq!(Projection::RankQuantSymmetric.score(&c), 4.0); + + // Direct-accessor parity (same values, non-enum surface). + assert_eq!(c.top_overlap(), 1); + assert_eq!(c.diagonal_agreement(), 4); + assert_eq!(c.band_agreement(1), 7); + assert_eq!(c.top_group_overlap(2), 3); + assert_eq!(c.bucket_l1_distance(), 6); + assert_eq!(c.rankquant_symmetric_score(), 4.0); + } + + /// `bucket_l1_distance` must not overflow for constructor-accepted inputs. + /// All mass in the maximum-distance cell at `dim = u32::MAX` (the largest + /// accepted `dim`) gives `(nb-1)·dim`, which exceeds `u32::MAX`. Built + /// directly because a `u32::MAX`-length input slice is impractical to + /// allocate — this is exactly the table such an input would produce. + #[test] + fn bucket_l1_distance_does_not_overflow_u32() { + let nb = 16usize; + let mut counts = vec![0u32; nb * nb]; + counts[nb - 1] = u32::MAX; // C[0][nb-1]: query bucket 0, doc bucket nb-1 + let c = Contingency { + buckets: nb, + counts, + }; + let expected = (nb as u64 - 1) * u64::from(u32::MAX); // 15 · 4_294_967_295 + assert!( + expected > u64::from(u32::MAX), + "fixture must exceed u32 to be a real regression" + ); + assert_eq!(c.bucket_l1_distance(), expected); + } + + /// The ordgraph `contingency_has_fixed_row_and_column_margins` case. + #[test] + fn parity_fixed_margins() { + let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; + let doc = [0u8, 1, 1, 2, 2, 3, 3, 0]; + let c = Contingency::new(&query, &doc, 4).unwrap(); + + assert_eq!(c.total_count(), 8); + for bucket in 0..4 { + assert_eq!(c.row_sum(bucket), 2); + assert_eq!(c.column_sum(bucket), 2); + } + } + + /// The ordgraph `rankquant_symmetric_projection_matches_direct_centered_sum` + /// case: the table-projected score equals the per-coordinate centred sum. + #[test] + fn parity_rankquant_matches_direct_centered_sum() { + let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; + let doc = [0u8, 1, 1, 2, 2, 3, 3, 0]; + let c = Contingency::new(&query, &doc, 4).unwrap(); + + let centre = 1.5f32; + let direct: f32 = query + .iter() + .zip(doc.iter()) + .map(|(&q, &d)| (f32::from(q) - centre) * (f32::from(d) - centre)) + .sum(); + + assert_eq!(c.rankquant_symmetric_score(), direct); + } + + /// The ordgraph `coarsened_counts_preserve_total_mass` case: + /// coarsened_counts(2) = [3, 1, 1, 3], total = 8. + #[test] + fn parity_coarsened_counts() { + let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; + let doc = [0u8, 1, 1, 2, 2, 3, 3, 0]; + let c = Contingency::new(&query, &doc, 4).unwrap(); + + assert_eq!(c.coarsened_counts(2), vec![3, 1, 1, 3]); + assert_eq!(c.coarsened_counts(2).iter().sum::(), 8); + } + + // ---- ordvec-specific surface ---------------------------------------- + + /// `project` with a hand-built weight matrix recovers the named + /// projections it generalises. + #[test] + fn project_generalises_named_projections() { + let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; + let doc = [0u8, 1, 1, 2, 2, 3, 3, 0]; + let c = Contingency::new(&query, &doc, 4).unwrap(); + let nb = 4; + + // Unit-diagonal weights ⇒ diagonal_agreement. + let mut diag = vec![0.0f32; nb * nb]; + for a in 0..nb { + diag[a * nb + a] = 1.0; + } + assert_eq!(c.project(&diag), c.diagonal_agreement() as f32); + + // Centred outer-product weights ⇒ rankquant_symmetric_score. + let centre = (nb as f32 - 1.0) / 2.0; + let mut outer = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + outer[a * nb + b] = (a as f32 - centre) * (b as f32 - centre); + } + } + assert_eq!(c.project(&outer), c.rankquant_symmetric_score()); + + // A fully custom learned matrix: weighted dot product over cells. + let learned: Vec = (0..(nb * nb)).map(|i| i as f32 * 0.5).collect(); + let expected: f32 = (0..(nb * nb)) + .map(|i| learned[i] * c.counts()[i] as f32) + .sum(); + assert_eq!(c.project(&learned), expected); + } + + /// Constructor input validation. + #[test] + fn rejects_mismatched_lengths() { + let err = Contingency::new(&[0u8, 1], &[0u8, 1, 2], 4).unwrap_err(); + assert!(matches!(err, OrdvecError::InvalidVectorLength { .. })); + } + + #[test] + fn rejects_zero_buckets() { + let err = Contingency::new(&[0u8], &[0u8], 0).unwrap_err(); + assert!(matches!( + err, + OrdvecError::InvalidParameter { name: "nb", .. } + )); + } + + #[test] + fn rejects_out_of_range_code() { + // doc has a code (4) >= nb (4). + let err = Contingency::new(&[0u8, 1, 2, 3], &[0u8, 1, 2, 4], 4).unwrap_err(); + assert!(matches!( + err, + OrdvecError::InvalidParameter { name: "code", .. } + )); + } + + /// AVX-512 dispatch is gated on `dim >= 64` (the byte-compare kernel only + /// fires once there is at least one full 64-byte block plus tail handling). + /// Build a long, randomised case and check the SIMD-dispatched table + /// matches an independent scalar histogram, exercising the masked tail. + #[test] + fn simd_path_matches_scalar_reference() { + // Lengths chosen so at least one trips the AVX-512 path with a partial + // tail (200 = 3*64 + 8) and one is a clean multiple of 64 (256). + for &len in &[64usize, 200, 256, 1000] { + for nb in [2usize, 4, 8, 16] { + let mut seed = 0x9E3779B9u32 ^ (len as u32).wrapping_mul(2654435761); + let mut next = || { + // xorshift32, deterministic, no rng dependency. + seed ^= seed << 13; + seed ^= seed >> 17; + seed ^= seed << 5; + seed + }; + let query: Vec = (0..len).map(|_| (next() as usize % nb) as u8).collect(); + let doc: Vec = (0..len).map(|_| (next() as usize % nb) as u8).collect(); + + let got = Contingency::new(&query, &doc, nb).unwrap(); + + // Independent scalar reference. + let mut want = vec![0u32; nb * nb]; + for (&q, &d) in query.iter().zip(doc.iter()) { + want[q as usize * nb + d as usize] += 1; + } + assert_eq!( + got.counts(), + want.as_slice(), + "contingency table mismatch at len={len}, nb={nb}", + ); + assert_eq!(got.total_count(), len as u32); + } + } + } + + /// nb > 255 forces the scalar path (the SIMD kernel caps at nb <= 16) and + /// exercises the `find_out_of_range` skip branch (all u8 codes in range). + #[test] + fn large_nb_uses_scalar_and_skips_range_scan() { + let query = [0u8, 5, 200, 255]; + let doc = [255u8, 200, 5, 0]; + let c = Contingency::new(&query, &doc, 300).unwrap(); + assert_eq!(c.buckets(), 300); + assert_eq!(c.count(0, 255), 1); + assert_eq!(c.count(255, 0), 1); + assert_eq!(c.count(200, 5), 1); + assert_eq!(c.count(5, 200), 1); + assert_eq!(c.total_count(), 4); + } +} diff --git a/src/lib.rs b/src/lib.rs index 85158248..cf03435b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,6 +51,8 @@ use std::fmt; mod bitmap; +#[cfg(feature = "experimental")] +mod contingency; mod fastscan; #[cfg(feature = "experimental")] mod multi_bucket; @@ -85,6 +87,15 @@ pub use quant::search_asymmetric_byte_lut; #[cfg(feature = "experimental")] pub use multi_bucket::MultiBucketBitmap; +// `Contingency` / `Projection` are the stateless dense-code contingency-table +// surface (issue #219): the full `nb × nb` bucket-overlap table for two `&[u8]` +// code slices, plus its named projections. This is a research/analysis +// primitive — it is *not* a retrieval index and is never wired into a search +// path — so it lives behind the same `experimental` gate as +// `MultiBucketBitmap`, the bilinear-decomposition surface it complements. +#[cfg(feature = "experimental")] +pub use contingency::{Contingency, Projection}; + // `RankQuantFastscan` is an optional FastScan b=2 scan path. It is // re-exported `#[doc(hidden)]` at the crate root — reachable as // `ordvec::RankQuantFastscan` for callers who opt in, but not From 77c2299329b792eacf9a32bae876a535c6fdc380 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 13:45:27 -0500 Subject: [PATCH 02/13] feat(experimental): indexed MultiBucketBitmap contingency kernels + batched projections (#219) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stacked on feat/contingency-dense (parity test cross-checks the dense Contingency). Indexed counterpart to the stateless API — one query bucket code vs many doc bitmaps, emitting tables/projections per doc: - contingency_row(q_bitmaps, doc_idx) -> full nb×nb table |Q_a ∩ D_b|, single pass. - diagonal_overlap_row(..) -> the nb diagonal cells (nb popcount-AND passes, not nb²). - project_all_batched(q_bitmaps, &[&weights]) -> docs×projections: builds each doc's table ONCE then applies every weight matrix to the cached integers (no per-projection rescan, unlike bilinear_score). rayon over docs. Plus diagonal_weights()/banded_weights() weight-matrix constructors. Kernels: scalar + AVX-512 VPOPCNTDQ (contingency/diagonal_accumulate_avx512vpop) mirroring bitmap.rs masked-tail popcount-AND, runtime-dispatched, portable fallback; #![deny(unsafe_op_in_unsafe_fn)] honored; no new deps; not wired into any search path. Correctness gate: scalar == SIMD == diagonal == dense Contingency, EXACT integer equality across dims 384/768/1024 (full + masked tail) × bits {2,4}. bench_contingency.rs (SYNTHETIC, 3 regimes): build-once/project-many 5.7-6.5×, no-rescan 4.4-5.5×, SIMD 2.3-3.4×. Verified: fmt/clippy(-D warnings)/test(experimental + default) green. Behind `experimental`. Refs #219. Targets 0.6 (may land in 0.5.0 — maintainer decides). Signed-off-by: Nelson Spence --- examples/bench_contingency.rs | 497 ++++++++++++++++++++++ src/multi_bucket.rs | 747 ++++++++++++++++++++++++++++++++++ 2 files changed, 1244 insertions(+) create mode 100644 examples/bench_contingency.rs diff --git a/examples/bench_contingency.rs b/examples/bench_contingency.rs new file mode 100644 index 00000000..a4dbb0ec --- /dev/null +++ b/examples/bench_contingency.rs @@ -0,0 +1,497 @@ +//! Profiling harness for the contingency / projection surface (issue #219), +//! on a SYNTHETIC seeded corpus — NOT real data, fully reproducible. +//! +//! Two APIs, three regimes: +//! +//! * API 1 — stateless dense [`ordvec::Contingency`]: the `nb × nb` +//! bucket-overlap table for two `&[u8]` code slices, plus its named +//! projections. +//! * API 2 — the indexed [`ordvec::MultiBucketBitmap`] twin: +//! `contingency_row` / `diagonal_overlap_row` / `project_all_batched` +//! accumulate the same table straight from the stored bucket bitmaps. +//! +//! Regimes (issue #219 acceptance): +//! +//! (a) ONE PAIR / MANY PROJECTIONS — build the dense `Contingency` table +//! once, then apply all ~7 projections off the cached integer table, +//! vs naively recomputing each projection from the raw `&[u8]` codes +//! every time. Shows the build-once-project-many win. +//! +//! (b) ONE QUERY / MANY DOCS / ONE PROJECTION — the indexed diagonal fast +//! path over the whole corpus (dispatched = SIMD-when-available, and a +//! forced-scalar twin), vs the dense pairwise loop that rebuilds the +//! full `nb × nb` table per doc just to read its diagonal. +//! +//! (c) ONE QUERY / MANY DOCS / MANY PROJECTIONS — the indexed batched +//! `project_all_batched` (one table per doc, K projections off it), vs +//! calling the single-projection indexed path K times (one corpus +//! rescan per projection). Shows the no-rescan win, dispatched vs +//! forced-scalar. +//! +//! Timing is median-of-N wall clock (same discipline as +//! `examples/bench_retrieval.rs`). Every number is SYNTHETIC. +//! +//! NOTE on SIMD: the indexed kernels dispatch to an AVX-512 VPOPCNTDQ +//! popcount-AND path only when the host advertises `avx512f` AND +//! `avx512vpopcntdq` (the dense API-1 kernel additionally needs `avx512bw`). +//! On a host without VPOPCNTDQ the "dispatched" column runs the SAME scalar +//! kernel as the forced-scalar column, so the SIMD/scalar ratio is ~1.0 — +//! that is the honest result, not a bug. The header prints the host features +//! so the ratio can be read in context. +//! +//! Run: cargo run --release --features experimental --example bench_contingency + +#[cfg(not(feature = "experimental"))] +fn main() { + eprintln!( + "bench_contingency requires --features experimental (Contingency / \ + MultiBucketBitmap). Re-run: cargo run --release --features experimental \ + --example bench_contingency" + ); +} + +#[cfg(feature = "experimental")] +fn main() { + bench::run(); +} + +#[cfg(feature = "experimental")] +mod bench { + use std::time::Instant; + + use ordvec::{Contingency, MultiBucketBitmap, Projection}; + use rand::{RngExt, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + // ---- timing ----------------------------------------------------------- + + /// Median wall-clock seconds over `reps` timed runs (one warmup first), + /// each run doing `iters` repetitions of `f`. Returns seconds PER `f` + /// call (total / iters), so callers report cost of a single operation. + fn median_secs(reps: usize, iters: usize, mut f: impl FnMut()) -> f64 { + // warmup + for _ in 0..iters { + f(); + } + let mut lats = Vec::with_capacity(reps); + for _ in 0..reps { + let t = Instant::now(); + for _ in 0..iters { + f(); + } + lats.push(t.elapsed().as_secs_f64() / iters as f64); + } + lats.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + lats[lats.len() / 2] + } + + // ---- corpus ----------------------------------------------------------- + + fn gauss(rng: &mut ChaCha8Rng) -> f32 { + let u1: f32 = rng.random_range(1e-9..1.0); + let u2: f32 = rng.random_range(0.0..1.0); + (-2.0 * u1.ln()).sqrt() * (std::f32::consts::TAU * u2).cos() + } + + /// Clustered gaussian corpus + a single query, seeded for reproducibility. + /// Each doc/query draws a cluster prototype plus noise (queries quieter), + /// so bucket assignments correlate the way real ordinal codes would — + /// pure iid noise would make every contingency table look the same. + fn make_data(n: usize, dim: usize, n_clusters: usize, seed: u64) -> (Vec, Vec) { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let mut protos = vec![0.0f32; n_clusters * dim]; + for x in protos.iter_mut() { + *x = gauss(&mut rng); + } + let emit = |count: usize, noise: f32, rng: &mut ChaCha8Rng| -> Vec { + let mut out = vec![0.0f32; count * dim]; + for i in 0..count { + let c = rng.random_range(0..n_clusters as u32) as usize; + for j in 0..dim { + out[i * dim + j] = protos[c * dim + j] + noise * gauss(rng); + } + } + out + }; + let corpus = emit(n, 1.0, &mut rng); + let query = emit(1, 0.5, &mut rng); + (corpus, query) + } + + /// Reconstruct the per-coordinate bucket codes (`0..nb`) a doc was encoded + /// with by reading back the index's stored bitmaps via the indexed + /// contingency: feeding a one-hot "query" for bucket `b` recovers, per + /// coordinate, whether the doc sits in bucket `b`. Simpler here: rebuild + /// the codes from `query_bitmaps_from_ranks` of the raw float vector, which + /// is exactly the encoding `add` applies. + fn codes_from_bitmaps(bitmaps: &[u64], nb: usize, qpb: usize, dim: usize) -> Vec { + let mut codes = vec![0u8; dim]; + for b in 0..nb { + let off = b * qpb; + for (j, code) in codes.iter_mut().enumerate() { + if (bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 { + *code = b as u8; + } + } + } + codes + } + + // ---- the ~7 projections, as both named ops (API 1) and weight matrices -- + + /// The named-projection battery used in regime (a): the full surface a + /// caller might evaluate off one table (top-overlap, top-group, diagonal, + /// two bands, L1 transport, centred outer product). + fn named_projections(nb: usize) -> Vec { + let mut p = vec![ + Projection::TopOverlap, + Projection::TopGroupOverlap { width: 2 }, + Projection::DiagonalAgreement, + Projection::BandAgreement { radius: 1 }, + Projection::BucketL1Distance, + Projection::RankQuantSymmetric, + ]; + if nb > 4 { + p.push(Projection::BandAgreement { radius: 2 }); + } + p + } + + /// The same battery expressed as raw `nb × nb` weight matrices, for the + /// indexed `project_all_batched` path in regime (c). (TopOverlap, + /// TopGroup, diagonal, band1, band2, outer product — six dense matrices; + /// L1 distance is also expressible as a weight matrix `|a-b|`.) + fn weight_matrices(nb: usize) -> Vec> { + let c = (nb as f32 - 1.0) / 2.0; + let mut out: Vec> = Vec::new(); + + // top-overlap: single cell (nb-1, nb-1) + let mut w = vec![0.0f32; nb * nb]; + w[(nb - 1) * nb + (nb - 1)] = 1.0; + out.push(w); + + // top-group width 2 + let mut w = vec![0.0f32; nb * nb]; + let start = nb - 2; + for a in start..nb { + for b in start..nb { + w[a * nb + b] = 1.0; + } + } + out.push(w); + + // diagonal + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + w[a * nb + a] = 1.0; + } + out.push(w); + + // band radius 1 (unit weights) + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + if a.abs_diff(b) <= 1 { + w[a * nb + b] = 1.0; + } + } + } + out.push(w); + + // bucket-L1 transport cost |a-b| + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + w[a * nb + b] = a.abs_diff(b) as f32; + } + } + out.push(w); + + // centred outer product (a-c)(b-c) + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + w[a * nb + b] = (a as f32 - c) * (b as f32 - c); + } + } + out.push(w); + + if nb > 4 { + // band radius 2 + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + if a.abs_diff(b) <= 2 { + w[a * nb + b] = 1.0; + } + } + } + out.push(w); + } + out + } + + pub fn run() { + #[cfg(target_arch = "x86_64")] + { + println!( + "# host SIMD: avx2={} avx512f={} avx512bw={} avx512vpopcntdq={}", + is_x86_feature_detected!("avx2"), + is_x86_feature_detected!("avx512f"), + is_x86_feature_detected!("avx512bw"), + is_x86_feature_detected!("avx512vpopcntdq"), + ); + if !is_x86_feature_detected!("avx512vpopcntdq") { + println!( + "# NOTE: no avx512vpopcntdq on this host — the 'dispatched' \ + and 'scalar' columns run the SAME kernel; ratio ~1.0 is honest." + ); + } + } + #[cfg(not(target_arch = "x86_64"))] + println!("# host SIMD: non-x86_64 target — scalar kernels only"); + + println!("# bench_contingency — SYNTHETIC clustered gaussian corpus (not real data)"); + + for &bits in &[2u8, 4u8] { + run_bits(bits); + println!(); + } + } + + fn run_bits(bits: u8) { + let dim = 1024usize; + let n = 20_000usize; + let n_clusters = 64usize; + let seed = 219u64; + let nb = 1usize << bits; + let qpb = dim / 64; + + println!("# ===== bits={bits} (nb={nb}) dim={dim} n={n} clusters={n_clusters} seed={seed} ====="); + + let (corpus, query) = make_data(n, dim, n_clusters, seed); + + // Build the indexed surface (API 2). + let mut idx = MultiBucketBitmap::new(dim, bits); + idx.add(&corpus); + let q_bitmaps = idx.query_bitmaps_from_ranks(&query); + + // Per-coord codes for API 1 (dense). `query_bitmaps_from_ranks` applies + // the exact same rank→bucket encoding `add` uses, so re-encoding doc 0's + // float vector and the query recovers the real codes the index holds. + let q_codes = codes_from_bitmaps(&q_bitmaps, nb, qpb, dim); + let doc0 = &corpus[0..dim]; + let d0_bitmaps = idx.query_bitmaps_from_ranks(doc0); + let d_codes = codes_from_bitmaps(&d0_bitmaps, nb, qpb, dim); + + // ---- regime (a): ONE PAIR / MANY PROJECTIONS (dense API 1) -------- + regime_a(&q_codes, &d_codes, nb); + + // ---- regime (b): ONE QUERY / MANY DOCS / ONE PROJECTION ---------- + regime_b(&idx, &q_bitmaps, n, nb); + + // ---- regime (c): ONE QUERY / MANY DOCS / MANY PROJECTIONS -------- + regime_c(&idx, &q_bitmaps, nb); + } + + /// (a) Build the dense `Contingency` once, apply all projections off the + /// cached table — vs recomputing each projection from the raw codes (build + /// a fresh table per projection). Single pair, repeated for timing. + fn regime_a(q_codes: &[u8], d_codes: &[u8], nb: usize) { + let projs = named_projections(nb); + let n_proj = projs.len(); + + // Build-once-project-many: one table, then all projections. + let build_once = median_secs(25, 200, || { + let c = Contingency::new(q_codes, d_codes, nb).unwrap(); + let mut acc = 0.0f32; + for p in &projs { + acc += p.score(&c); + } + std::hint::black_box(acc); + }); + + // Naive: rebuild the table from raw codes for EVERY projection. + let rebuild_each = median_secs(25, 200, || { + let mut acc = 0.0f32; + for p in &projs { + let c = Contingency::new(q_codes, d_codes, nb).unwrap(); + acc += p.score(&c); + } + std::hint::black_box(acc); + }); + + let speedup = rebuild_each / build_once; + println!("# (a) ONE PAIR / {n_proj} PROJECTIONS — dense Contingency (API 1)"); + println!("{:<34} {:>14} {:>14}", "approach", "us/pair", "speedup"); + println!( + "{:<34} {:>14.3} {:>14}", + "build-once, project-many", + build_once * 1e6, + "1.00x (ref)" + ); + println!( + "{:<34} {:>14.3} {:>13.2}x", + "rebuild-per-projection", + rebuild_each * 1e6, + 1.0 / speedup + ); + println!( + "DATA\ta\tbits={}\tbuild_once_us={:.3}\trebuild_each_us={:.3}\tn_proj={}\tspeedup={:.2}", + (nb as f32).log2() as u32, + build_once * 1e6, + rebuild_each * 1e6, + n_proj, + speedup + ); + } + + /// (b) ONE QUERY / MANY DOCS / ONE PROJECTION — indexed diagonal fast path + /// (dispatched + forced-scalar) over the corpus, vs the dense pairwise + /// loop (full table per doc, read its diagonal). + fn regime_b(idx: &MultiBucketBitmap, q_bitmaps: &[u64], n: usize, nb: usize) { + // dispatched diagonal fast path over every doc + let diag_dispatched = median_secs(7, 3, || { + let mut acc = 0u64; + for di in 0..n { + let d = idx.diagonal_overlap_row(q_bitmaps, di); + acc += d.iter().map(|&x| x as u64).sum::(); + } + std::hint::black_box(acc); + }); + + // forced-scalar diagonal fast path + let diag_scalar = median_secs(7, 3, || { + let mut acc = 0u64; + for di in 0..n { + let d = idx.diagonal_overlap_row_scalar(q_bitmaps, di); + acc += d.iter().map(|&x| x as u64).sum::(); + } + std::hint::black_box(acc); + }); + + // dense pairwise loop: build the FULL nb*nb table per doc (dispatched + // contingency_row), then take its diagonal trace. This is the work the + // diagonal fast path avoids (nb passes vs nb*nb). + let dense_full = median_secs(7, 3, || { + let mut acc = 0u64; + for di in 0..n { + let table = idx.contingency_row(q_bitmaps, di); + for a in 0..nb { + acc += table[a * nb + a] as u64; + } + } + std::hint::black_box(acc); + }); + + let simd_speedup = diag_scalar / diag_dispatched; + let fastpath_speedup = dense_full / diag_dispatched; + + let per = |s: f64| s / n as f64 * 1e9; // ns per doc + println!("# (b) ONE QUERY / {n} DOCS / ONE PROJECTION (diagonal) — indexed (API 2)"); + println!( + "{:<34} {:>14} {:>16}", + "approach", "ns/doc", "vs dispatched" + ); + println!( + "{:<34} {:>14.2} {:>16}", + "diagonal fast path (dispatched)", + per(diag_dispatched), + "1.00x (ref)" + ); + println!( + "{:<34} {:>14.2} {:>15.2}x", + "diagonal fast path (scalar)", + per(diag_scalar), + simd_speedup + ); + println!( + "{:<34} {:>14.2} {:>15.2}x", + "dense full-table diagonal", + per(dense_full), + 1.0 / fastpath_speedup + ); + println!( + "DATA\tb\tbits={}\tdiag_disp_ns={:.2}\tdiag_scalar_ns={:.2}\tdense_full_ns={:.2}\tsimd_speedup={:.2}\tfastpath_speedup={:.2}", + (nb as f32).log2() as u32, + per(diag_dispatched), + per(diag_scalar), + per(dense_full), + simd_speedup, + fastpath_speedup + ); + } + + /// (c) ONE QUERY / MANY DOCS / MANY PROJECTIONS — batched + /// `project_all_batched` (table once, K projections off it) vs calling the + /// single-projection path K times (one corpus rescan per projection). + /// Both dispatched and forced-scalar. + fn regime_c(idx: &MultiBucketBitmap, q_bitmaps: &[u64], nb: usize) { + let mats = weight_matrices(nb); + let k = mats.len(); + let weights: Vec<&[f32]> = mats.iter().map(|w| w.as_slice()).collect(); + let n = idx.len(); + + // batched dispatched: one table per doc, all K projections off it. + let batched_disp = median_secs(7, 3, || { + let out = idx.project_all_batched(q_bitmaps, &weights); + std::hint::black_box(out.len()); + }); + + // batched forced-scalar. + let batched_scalar = median_secs(7, 3, || { + let out = idx.project_all_batched_scalar(q_bitmaps, &weights); + std::hint::black_box(out.len()); + }); + + // rescan: call the single-projection batched path K times (each call + // rescans the whole corpus building a fresh table per doc). Dispatched. + let rescan_disp = median_secs(7, 3, || { + let mut acc = 0usize; + for w in &weights { + let single: [&[f32]; 1] = [w]; + let out = idx.project_all_batched(q_bitmaps, &single); + acc += out.len(); + } + std::hint::black_box(acc); + }); + + let norescan_speedup = rescan_disp / batched_disp; + let simd_speedup = batched_scalar / batched_disp; + + let per = |s: f64| s / n as f64 * 1e9; // ns per doc (over all K) + println!("# (c) ONE QUERY / {n} DOCS / {k} PROJECTIONS — indexed batched (API 2)"); + println!( + "{:<38} {:>14} {:>16}", + "approach", "ns/doc(allK)", "vs batched" + ); + println!( + "{:<38} {:>14.2} {:>16}", + "project_all_batched (dispatched)", + per(batched_disp), + "1.00x (ref)" + ); + println!( + "{:<38} {:>14.2} {:>15.2}x", + "project_all_batched (scalar)", + per(batched_scalar), + simd_speedup + ); + println!( + "{:<38} {:>14.2} {:>15.2}x", + "rescan: single-proj x K (dispatched)", + per(rescan_disp), + norescan_speedup + ); + println!( + "DATA\tc\tbits={}\tbatched_disp_ns={:.2}\tbatched_scalar_ns={:.2}\trescan_disp_ns={:.2}\tk={}\tnorescan_speedup={:.2}\tsimd_speedup={:.2}", + (nb as f32).log2() as u32, + per(batched_disp), + per(batched_scalar), + per(rescan_disp), + k, + norescan_speedup, + simd_speedup + ); + } +} diff --git a/src/multi_bucket.rs b/src/multi_bucket.rs index 0b4a2d9c..144b86cf 100644 --- a/src/multi_bucket.rs +++ b/src/multi_bucket.rs @@ -131,6 +131,42 @@ impl MultiBucketBitmap { w } + /// Unit-diagonal weight matrix: `W[a, a] = 1`, off-diagonal `0`. The + /// bilinear score reduces to `Σ_a |Q_a ∩ D_a|` — the same-bucket + /// agreement count. This is the cheapest truncation: `nb` popcount-AND + /// passes per doc instead of the full `nb²`. It is a pure overlap signal + /// (no bucket-magnitude weighting), so it is closest in spirit to a + /// multi-level [`crate::Bitmap`]. + pub fn diagonal_weights(&self) -> Vec { + let nb = self.n_buckets; + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + w[a * nb + a] = 1.0; + } + w + } + + /// Outer-product weights `(a − c)(b − c)` restricted to the band + /// `|a − b| <= half_width` (off-band entries zeroed). `half_width = 0` + /// keeps only the magnitude-weighted diagonal; `half_width >= nb − 1` + /// recovers the full [`Self::outer_product_weights`] matrix. Sweeping + /// `half_width` interpolates candidate-gen cost (non-zero band entries ⇒ + /// popcount-AND passes per doc) between the diagonal and the exact + /// bilinear probe, tracing the recall/latency frontier. + pub fn banded_weights(&self, half_width: usize) -> Vec { + let nb = self.n_buckets; + let c = (nb as f32 - 1.0) / 2.0; + let mut w = vec![0.0f32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + if a.abs_diff(b) <= half_width { + w[a * nb + b] = (a as f32 - c) * (b as f32 - c); + } + } + } + w + } + /// Compute the bilinear bucket-overlap score /// `Σ_{a, b} W[a, b] · |Q_a ∩ D_b|` /// for a single (query, doc) pair. Scales nothing — caller @@ -194,6 +230,223 @@ impl MultiBucketBitmap { head } + // ----------------------------------------------------------------- + // Indexed contingency / projection surface (issue #219, API 2 of 2). + // + // `bilinear_score` / `top_m_bilinear` recompute one weighted `Σ W·C` + // per call and rescan the doc's bitmaps once per *projection*. The + // methods below decouple the accumulation from the projection: the + // `nb × nb` contingency table `C[a, b] = |Q_a ∩ D_b|` is built **once** + // per (query, doc) over a single pass of the doc's bitmaps, then any + // number of weight matrices are applied as cheap `nb²` dot products + // over the cached integer table. This is the in-index analogue of the + // stateless [`crate::Contingency`] (API 1) — same `nb × nb` object, + // but accumulated straight from the bitmap rows rather than from raw + // `&[u8]` codes. + // + // The hot inner accumulation (the `nb × nb` popcount-AND over `qpb` + // u64 words) dispatches at runtime to an AVX-512 VPOPCNTDQ kernel, + // mirroring [`crate::Bitmap`]'s masked-tail popcount-AND scan, with a + // portable scalar fallback on every other target / unsupported CPU. + // ----------------------------------------------------------------- + + /// Full `nb × nb` contingency table `C[a, b] = |Q_a ∩ D_b|` for one + /// document, accumulated in a single pass over the doc's bitmaps. + /// + /// Returns a row-major `nb × nb` `Vec` where `out[a * nb + b]` is + /// the count of coordinates the query placed in bucket `a` and the doc + /// placed in bucket `b`. This is the indexed twin of + /// [`crate::Contingency::counts`]: the same table, built from the + /// bitmap rows instead of `&[u8]` codes. A caller can then apply any + /// projection (e.g. via [`crate::Contingency`]'s projections, or a raw + /// `nb²` weighted sum) without rescanning the bitmaps. + /// + /// # Panics + /// Panics if `doc_idx >= len()`. `q_bitmaps` must be `nb * qpb` words + /// (as produced by [`Self::query_bitmaps_from_ranks`]). + pub fn contingency_row(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert!( + doc_idx < self.n_vectors, + "contingency_row: doc_idx {doc_idx} out of range (n_vectors {})", + self.n_vectors, + ); + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "contingency_row: q_bitmaps must be nb * qpb words", + ); + let doc_base = doc_idx * nb * qpb; + let doc = &self.bitmaps[doc_base..doc_base + nb * qpb]; + let mut table = vec![0u32; nb * nb]; + contingency_accumulate(q_bitmaps, doc, nb, qpb, &mut table); + table + } + + /// Diagonal-only fast path: the `nb` cells `C[a, a] = |Q_a ∩ D_a|` for + /// one document, in a single pass over the doc's bitmaps. + /// + /// This is the common cheap projection — same-bucket agreement per + /// bucket — and costs `nb` popcount-AND passes per doc instead of the + /// full `nb²`. `out[a]` is `|Q_a ∩ D_a|`. Summing the result is the + /// [`Self::diagonal_weights`] bilinear score. + /// + /// # Panics + /// Panics if `doc_idx >= len()` or `q_bitmaps.len() != nb * qpb`. + pub fn diagonal_overlap_row(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert!( + doc_idx < self.n_vectors, + "diagonal_overlap_row: doc_idx {doc_idx} out of range (n_vectors {})", + self.n_vectors, + ); + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "diagonal_overlap_row: q_bitmaps must be nb * qpb words", + ); + let doc_base = doc_idx * nb * qpb; + let doc = &self.bitmaps[doc_base..doc_base + nb * qpb]; + let mut diag = vec![0u32; nb]; + diagonal_accumulate(q_bitmaps, doc, nb, qpb, &mut diag); + diag + } + + /// Batched indexed projection: for every document, build its `nb × nb` + /// contingency table **once**, then apply *all* of `weights` to that + /// single cached table, returning `docs × projections` scores. + /// + /// `weights[p]` is the row-major `nb × nb` weight matrix for projection + /// `p`. The returned `out[di][p]` is `Σ_{a, b} weights[p][a*nb+b] · + /// C_di[a, b]`. Each document's bitmaps are streamed once regardless of + /// how many projections are requested — the projections share the + /// accumulated table rather than each rescanning the corpus (which is + /// what `bilinear_score` would do per projection). + /// + /// Runs across documents in parallel via rayon. The per-doc table is a + /// small `nb²` integer buffer; the projection dot products are cheap + /// relative to the popcount-AND accumulation that dominates. + /// + /// # Panics + /// Panics if `q_bitmaps.len() != nb * qpb` or if any `weights[p].len() + /// != nb * nb`. + #[must_use = "this scans every doc's bitmaps to build contingency tables; dropping the result discards that work"] + pub fn project_all_batched(&self, q_bitmaps: &[u64], weights: &[&[f32]]) -> Vec> { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "project_all_batched: q_bitmaps must be nb * qpb words", + ); + for (p, w) in weights.iter().enumerate() { + assert_eq!( + w.len(), + nb * nb, + "project_all_batched: weights[{p}] must be an nb * nb matrix", + ); + } + let n = self.n_vectors; + let n_proj = weights.len(); + if n == 0 || n_proj == 0 { + return vec![Vec::new(); n]; + } + let per_doc = nb * qpb; + let bitmaps = &self.bitmaps; + (0..n) + .into_par_iter() + .map(|di| { + let doc = &bitmaps[di * per_doc..(di + 1) * per_doc]; + // One accumulation pass over this doc's bitmaps. + let mut table = vec![0u32; nb * nb]; + contingency_accumulate(q_bitmaps, doc, nb, qpb, &mut table); + project_table(&table, weights) + }) + .collect() + } + + /// `#[doc(hidden)]` bench-only twin of [`Self::diagonal_overlap_row`] that + /// forces the **portable scalar** diagonal kernel, bypassing the runtime + /// AVX-512 dispatch. It exists so `examples/bench_contingency` can time the + /// scalar and SIMD diagonal paths against each other on the same index + /// (mirroring the `#[doc(hidden)]` `search_asymmetric_byte_lut` bench + /// reference at the crate root). Not part of the stable API — production + /// callers use [`Self::diagonal_overlap_row`], which dispatches to the + /// fastest available kernel. + /// + /// # Panics + /// Panics if `doc_idx >= len()` or `q_bitmaps.len() != nb * qpb`. + #[doc(hidden)] + pub fn diagonal_overlap_row_scalar(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert!( + doc_idx < self.n_vectors, + "diagonal_overlap_row_scalar: doc_idx {doc_idx} out of range (n_vectors {})", + self.n_vectors, + ); + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "diagonal_overlap_row_scalar: q_bitmaps must be nb * qpb words", + ); + let doc_base = doc_idx * nb * qpb; + let doc = &self.bitmaps[doc_base..doc_base + nb * qpb]; + let mut diag = vec![0u32; nb]; + diagonal_accumulate_scalar(q_bitmaps, doc, nb, qpb, &mut diag); + diag + } + + /// `#[doc(hidden)]` bench-only twin of [`Self::project_all_batched`] that + /// forces the **portable scalar** contingency-accumulation kernel, + /// bypassing the runtime AVX-512 dispatch. Lets + /// `examples/bench_contingency` time the scalar and SIMD batched-projection + /// paths on the same index. Not part of the stable API. + /// + /// # Panics + /// Panics if `q_bitmaps.len() != nb * qpb` or if any `weights[p].len() + /// != nb * nb`. + #[doc(hidden)] + #[must_use = "this scans every doc's bitmaps to build contingency tables; dropping the result discards that work"] + pub fn project_all_batched_scalar( + &self, + q_bitmaps: &[u64], + weights: &[&[f32]], + ) -> Vec> { + let qpb = self.qwords_per_bitmap; + let nb = self.n_buckets; + assert_eq!( + q_bitmaps.len(), + nb * qpb, + "project_all_batched_scalar: q_bitmaps must be nb * qpb words", + ); + for (p, w) in weights.iter().enumerate() { + assert_eq!( + w.len(), + nb * nb, + "project_all_batched_scalar: weights[{p}] must be an nb * nb matrix", + ); + } + let n = self.n_vectors; + let n_proj = weights.len(); + if n == 0 || n_proj == 0 { + return vec![Vec::new(); n]; + } + let per_doc = nb * qpb; + let bitmaps = &self.bitmaps; + (0..n) + .into_par_iter() + .map(|di| { + let doc = &bitmaps[di * per_doc..(di + 1) * per_doc]; + let mut table = vec![0u32; nb * nb]; + contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, &mut table); + project_table(&table, weights) + }) + .collect() + } + pub fn len(&self) -> usize { self.n_vectors } @@ -216,3 +469,497 @@ impl MultiBucketBitmap { self.bitmaps.len() * std::mem::size_of::() } } + +// ===================================================================== +// Contingency-accumulation kernels. +// +// `q_bitmaps` and `doc` are both row-major `nb * qpb` u64 layouts: bucket +// `a`'s bitmap occupies words `[a * qpb, (a + 1) * qpb)`. The full table +// cell is +// C[a, b] = Σ_{w < qpb} popcount(Q_a[w] & D_b[w]), +// the popcount over the bitwise-AND of bucket `a`'s query bitmap and +// bucket `b`'s doc bitmap — the exact popcount-AND shape of +// [`crate::Bitmap`]'s overlap kernel, here run for every (a, b) pair. +// +// Each path dispatches to an AVX-512 VPOPCNTDQ kernel when the host +// supports it (mirroring the masked-tail discipline in `index/bitmap.rs`), +// otherwise the portable scalar reference. Counts are exact integers, so +// the SIMD and scalar paths are required to agree bit-for-bit (the parity +// test enforces this against API 1's dense `Contingency`). +// ===================================================================== + +/// Build the full row-major `nb × nb` contingency table for one +/// (query, doc) pair into `table` (length `nb * nb`). +/// +/// Dispatches to the AVX-512 VPOPCNTDQ kernel when available, else the +/// scalar reference. Both paths require `q_bitmaps.len() == doc.len() == +/// nb * qpb` and `table.len() == nb * nb`. +fn contingency_accumulate( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + table: &mut [u32], +) { + debug_assert_eq!(q_bitmaps.len(), nb * qpb); + debug_assert_eq!(doc.len(), nb * qpb); + debug_assert_eq!(table.len(), nb * nb); + + #[cfg(target_arch = "x86_64")] + let use_avx512vpop = + is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vpopcntdq"); + #[cfg(not(target_arch = "x86_64"))] + let use_avx512vpop = false; + + if use_avx512vpop { + #[cfg(target_arch = "x86_64")] + unsafe { + contingency_accumulate_avx512vpop(q_bitmaps, doc, nb, qpb, table); + return; + } + } + contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, table); +} + +/// Apply every weight matrix in `weights` to a single cached `nb × nb` +/// contingency `table`, returning one `f32` projection score per matrix. +/// Each score is `Σ_{cell} weight[cell] · table[cell]` — the cheap dot +/// product the batched path shares across projections after one accumulation +/// pass. Caller guarantees every `weights[p].len() == table.len()`. +fn project_table(table: &[u32], weights: &[&[f32]]) -> Vec { + weights + .iter() + .map(|w| { + w.iter() + .zip(table.iter()) + .map(|(&weight, &c)| weight * c as f32) + .sum() + }) + .collect() +} + +/// Portable scalar reference for the full `nb × nb` table. One +/// popcount-AND reduction per (a, b) cell over the `qpb` u64 words. +fn contingency_accumulate_scalar( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + table: &mut [u32], +) { + for a in 0..nb { + let q_off = a * qpb; + let row = a * nb; + for b in 0..nb { + let d_off = b * qpb; + let mut overlap: u32 = 0; + for w in 0..qpb { + overlap += (q_bitmaps[q_off + w] & doc[d_off + w]).count_ones(); + } + table[row + b] = overlap; + } + } +} + +/// Build only the `nb` diagonal cells `C[a, a] = |Q_a ∩ D_a|` into `diag` +/// (length `nb`). The common cheap projection. +/// +/// Dispatches to the AVX-512 VPOPCNTDQ diagonal kernel when available, +/// else the scalar reference. +fn diagonal_accumulate(q_bitmaps: &[u64], doc: &[u64], nb: usize, qpb: usize, diag: &mut [u32]) { + debug_assert_eq!(q_bitmaps.len(), nb * qpb); + debug_assert_eq!(doc.len(), nb * qpb); + debug_assert_eq!(diag.len(), nb); + + #[cfg(target_arch = "x86_64")] + let use_avx512vpop = + is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vpopcntdq"); + #[cfg(not(target_arch = "x86_64"))] + let use_avx512vpop = false; + + if use_avx512vpop { + #[cfg(target_arch = "x86_64")] + unsafe { + diagonal_accumulate_avx512vpop(q_bitmaps, doc, nb, qpb, diag); + return; + } + } + diagonal_accumulate_scalar(q_bitmaps, doc, nb, qpb, diag); +} + +/// Portable scalar reference for the `nb` diagonal cells. +fn diagonal_accumulate_scalar( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + diag: &mut [u32], +) { + // `a` is load-bearing: it indexes the bucket bitmap offset (`a * qpb`) + // as well as `diag[a]`. + #[allow(clippy::needless_range_loop)] + for a in 0..nb { + let off = a * qpb; + let mut overlap: u32 = 0; + for w in 0..qpb { + overlap += (q_bitmaps[off + w] & doc[off + w]).count_ones(); + } + diag[a] = overlap; + } +} + +/// AVX-512 VPOPCNTDQ full-table contingency accumulation. +/// +/// Tiles the `qpb`-word bucket bitmaps into 512-bit (8×u64) ZMM blocks +/// with a `bzhi`-style masked tail for the final `qpb % 8` words, exactly +/// as [`crate::Bitmap`]'s scan kernel does. For each block, every query +/// bucket `a`'s lane and every doc bucket `b`'s lane is AND-ed, +/// `_mm512_popcnt_epi64`-counted, and reduced into the `(a, b)` cell — the +/// masked-tail popcount-AND, run over all `nb²` bucket pairs. +/// +/// # Safety +/// Requires the `avx512f` and `avx512vpopcntdq` target features, confirmed +/// by the `#[target_feature]` gate plus the caller's runtime +/// `is_x86_feature_detected!`. `q_bitmaps.len() == doc.len() == nb * qpb` +/// and `table.len() == nb * nb` are caller contracts (asserted at the +/// public entry points); the tail load uses a `(1 << rem) - 1` word mask +/// so no lane past `qpb` is read for either operand. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f,avx512vpopcntdq")] +unsafe fn contingency_accumulate_avx512vpop( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + table: &mut [u32], +) { + use std::arch::x86_64::*; + // SAFETY: `q_bitmaps.len() == doc.len() == nb * qpb` and `table.len() + // == nb * nb` (caller contract). Each ZMM load reads 8 u64 words at a + // word offset `< qpb` within bucket `a`/`b`'s `qpb`-word region; the + // final partial block uses `_mm512_maskz_loadu_epi64` with a + // `(1 << rem) - 1` mask so lanes past `qpb` read as zero and are never + // dereferenced. AVX-512 F/VPOPCNTDQ are confirmed by `#[target_feature]` + // plus the runtime detection in the caller. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + let full_blocks = qpb / 8; + let rem = qpb % 8; + let tail_mask: __mmask8 = if rem == 0 { 0 } else { (1u8 << rem) - 1 }; + let q_ptr = q_bitmaps.as_ptr(); + let d_ptr = doc.as_ptr(); + + for a in 0..nb { + let q_base = a * qpb; + let row = a * nb; + for b in 0..nb { + let d_base = b * qpb; + let mut acc = _mm512_setzero_si512(); + // Whole 512-bit blocks. + for blk in 0..full_blocks { + let w = blk * 8; + let q_zmm = _mm512_loadu_si512(q_ptr.add(q_base + w) as *const __m512i); + let d_zmm = _mm512_loadu_si512(d_ptr.add(d_base + w) as *const __m512i); + let and_zmm = _mm512_and_si512(q_zmm, d_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc = _mm512_add_epi64(acc, pop_zmm); + } + // Masked tail: only the low `rem` words are loaded; lanes + // past `qpb` read as zero and contribute no popcount. + if rem != 0 { + let w = full_blocks * 8; + let q_zmm = + _mm512_maskz_loadu_epi64(tail_mask, q_ptr.add(q_base + w) as *const i64); + let d_zmm = + _mm512_maskz_loadu_epi64(tail_mask, d_ptr.add(d_base + w) as *const i64); + let and_zmm = _mm512_and_si512(q_zmm, d_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc = _mm512_add_epi64(acc, pop_zmm); + } + let sum: i64 = _mm512_reduce_add_epi64(acc); + table[row + b] = sum as u32; + } + } + } +} + +/// AVX-512 VPOPCNTDQ diagonal-only accumulation: the `nb` cells +/// `C[a, a]`, each a masked-tail popcount-AND over bucket `a`'s query and +/// doc bitmaps. Same masked-tail discipline as the full kernel, restricted +/// to the diagonal. +/// +/// # Safety +/// Same contract as [`contingency_accumulate_avx512vpop`]: +/// `q_bitmaps.len() == doc.len() == nb * qpb`, `diag.len() == nb`, the +/// target features confirmed by the gate plus runtime detection, and the +/// masked tail bounding every load to the live `qpb` words. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f,avx512vpopcntdq")] +unsafe fn diagonal_accumulate_avx512vpop( + q_bitmaps: &[u64], + doc: &[u64], + nb: usize, + qpb: usize, + diag: &mut [u32], +) { + use std::arch::x86_64::*; + // SAFETY: identical bounding to `contingency_accumulate_avx512vpop`, + // restricted to `b == a`: every load is at a word offset `< qpb` within + // bucket `a`'s `qpb`-word region, with the final partial block masked to + // the low `rem` words. `diag.len() == nb` bounds every `diag[a]` write. + // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + let full_blocks = qpb / 8; + let rem = qpb % 8; + let tail_mask: __mmask8 = if rem == 0 { 0 } else { (1u8 << rem) - 1 }; + let q_ptr = q_bitmaps.as_ptr(); + let d_ptr = doc.as_ptr(); + + // `a` indexes the bucket bitmap offset (`a * qpb`) and `diag[a]`. + #[allow(clippy::needless_range_loop)] + for a in 0..nb { + let base = a * qpb; + let mut acc = _mm512_setzero_si512(); + for blk in 0..full_blocks { + let w = blk * 8; + let q_zmm = _mm512_loadu_si512(q_ptr.add(base + w) as *const __m512i); + let d_zmm = _mm512_loadu_si512(d_ptr.add(base + w) as *const __m512i); + let and_zmm = _mm512_and_si512(q_zmm, d_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc = _mm512_add_epi64(acc, pop_zmm); + } + if rem != 0 { + let w = full_blocks * 8; + let q_zmm = _mm512_maskz_loadu_epi64(tail_mask, q_ptr.add(base + w) as *const i64); + let d_zmm = _mm512_maskz_loadu_epi64(tail_mask, d_ptr.add(base + w) as *const i64); + let and_zmm = _mm512_and_si512(q_zmm, d_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc = _mm512_add_epi64(acc, pop_zmm); + } + let sum: i64 = _mm512_reduce_add_epi64(acc); + diag[a] = sum as u32; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Contingency; + use rand::{RngExt, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + /// Reconstruct the per-coordinate bucket codes a doc was encoded with + /// by reading back the index's stored bitmaps. Used to feed API 1's + /// dense `Contingency` the *same* bucket assignment the index holds. + fn doc_codes(idx: &MultiBucketBitmap, doc_idx: usize) -> Vec { + let nb = idx.n_buckets(); + let qpb = idx.qwords_per_bitmap; + let dim = idx.dim(); + let base = doc_idx * nb * qpb; + let mut codes = vec![0u8; dim]; + for b in 0..nb { + let off = base + b * qpb; + // `j` is load-bearing: it indexes the bit position (`j/64`, + // `j%64`) as well as `codes[j]`. + #[allow(clippy::needless_range_loop)] + for j in 0..dim { + if (idx.bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 { + codes[j] = b as u8; + } + } + } + codes + } + + /// Reconstruct the query bucket codes from the query bitmaps. + fn query_codes(q_bitmaps: &[u64], nb: usize, qpb: usize, dim: usize) -> Vec { + let mut codes = vec![0u8; dim]; + for b in 0..nb { + let off = b * qpb; + // `j` is load-bearing: bit position and `codes[j]` index. + #[allow(clippy::needless_range_loop)] + for j in 0..dim { + if (q_bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 { + codes[j] = b as u8; + } + } + } + codes + } + + /// Independent pure-scalar reference for the full table, not routed + /// through the dispatch (so the dispatched path can't mask its own bug). + fn reference_table(q_bitmaps: &[u64], doc: &[u64], nb: usize, qpb: usize) -> Vec { + let mut table = vec![0u32; nb * nb]; + for a in 0..nb { + for b in 0..nb { + let mut overlap = 0u32; + for w in 0..qpb { + overlap += (q_bitmaps[a * qpb + w] & doc[b * qpb + w]).count_ones(); + } + table[a * nb + b] = overlap; + } + } + table + } + + fn make_corpus(seed: u64, n: usize, dim: usize) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..n * dim).map(|_| rng.random_range(-1.0..1.0)).collect() + } + + /// CORRECTNESS GATE (issue #219): the dispatched contingency path + /// (AVX-512 on this host) and the diagonal fast path must produce + /// BIT-IDENTICAL results to (a) an independent scalar reference, the + /// forced scalar kernel, and (c) a freshly-built dense `Contingency` + /// (API 1) over the same query/doc bucket codes. Counts are exact + /// integers — equality is exact, not tolerance-based. + /// + /// Dims chosen to exercise the masked tail: 384 (qpb=6, all-tail, no + /// full ZMM), 768 (qpb=12, one full ZMM + 4-word tail), 1024 (qpb=16, + /// two full ZMMs, no tail). + #[test] + fn parity_scalar_simd_diagonal_dense() { + for &dim in &[384usize, 768, 1024] { + for bits in [2u8, 4u8] { + let nb = 1usize << bits; + let qpb = dim / 64; + let n = 24; + let corpus = make_corpus(0xC0FFEE ^ dim as u64 ^ (bits as u64) << 32, n, dim); + let mut idx = MultiBucketBitmap::new(dim, bits); + idx.add(&corpus); + + let mut rng = ChaCha8Rng::seed_from_u64(0x5EED ^ dim as u64); + let query: Vec = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + let q_bitmaps = idx.query_bitmaps_from_ranks(&query); + let q_codes = query_codes(&q_bitmaps, nb, qpb, dim); + + for di in 0..n { + let doc_base = di * nb * qpb; + let doc = &idx.bitmaps[doc_base..doc_base + nb * qpb]; + + // (a) independent scalar reference table. + let want = reference_table(&q_bitmaps, doc, nb, qpb); + + // forced scalar kernel. + let mut scalar = vec![0u32; nb * nb]; + contingency_accumulate_scalar(&q_bitmaps, doc, nb, qpb, &mut scalar); + assert_eq!( + scalar, want, + "scalar kernel != reference (dim={dim}, bits={bits}, doc={di})", + ); + + // dispatched path (AVX-512 on this host). + let dispatched = idx.contingency_row(&q_bitmaps, di); + assert_eq!( + dispatched, want, + "dispatched (SIMD) != reference (dim={dim}, bits={bits}, doc={di})", + ); + + // diagonal fast path == diagonal of the full table. + let diag = idx.diagonal_overlap_row(&q_bitmaps, di); + let want_diag: Vec = (0..nb).map(|a| want[a * nb + a]).collect(); + assert_eq!( + diag, want_diag, + "diagonal fast path != table diagonal (dim={dim}, bits={bits}, doc={di})", + ); + + // dense `Contingency` (API 1) over the same codes. + let d_codes = doc_codes(&idx, di); + let dense = Contingency::new(&q_codes, &d_codes, nb).unwrap(); + assert_eq!( + dispatched, + dense.counts(), + "dispatched (SIMD) != dense Contingency (dim={dim}, bits={bits}, doc={di})", + ); + // And the dense diagonal trace matches the fast-path sum. + assert_eq!( + diag.iter().sum::(), + dense.diagonal_agreement(), + "diagonal sum != dense diagonal_agreement (dim={dim}, bits={bits}, doc={di})", + ); + } + } + } + } + + /// `project_all_batched` applies every projection to each doc's table + /// built once, and agrees with per-doc `bilinear_score` (which rescans + /// per projection) for the same weight matrices. + #[test] + fn project_all_batched_matches_bilinear_score() { + let dim = 768; + let bits = 4u8; + let n = 40; + let corpus = make_corpus(0xBA7C4, n, dim); + let mut idx = MultiBucketBitmap::new(dim, bits); + idx.add(&corpus); + + let mut rng = ChaCha8Rng::seed_from_u64(0xD0C5); + let query: Vec = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + let q_bitmaps = idx.query_bitmaps_from_ranks(&query); + + let outer = idx.outer_product_weights(); + let diagonal = idx.diagonal_weights(); + let banded = idx.banded_weights(1); + let weights: Vec<&[f32]> = vec![&outer, &diagonal, &banded]; + + let batched = idx.project_all_batched(&q_bitmaps, &weights); + assert_eq!(batched.len(), n); + for (di, row) in batched.iter().enumerate() { + assert_eq!(row.len(), 3); + // bilinear_score rescans the doc per weight matrix — the + // batched path must reproduce it exactly (same integer table, + // same weighted sum, same f32 accumulation order per matrix). + for (p, w) in weights.iter().enumerate() { + let want = idx.bilinear_score(&q_bitmaps, w, di); + assert_eq!( + row[p], want, + "project_all_batched[{di}][{p}] != bilinear_score", + ); + } + } + } + + /// Diagonal fast path sums to the `diagonal_weights` bilinear score. + #[test] + fn diagonal_row_sums_to_diagonal_bilinear_score() { + let dim = 384; + let bits = 2u8; + let n = 16; + let corpus = make_corpus(0xD1A6, n, dim); + let mut idx = MultiBucketBitmap::new(dim, bits); + idx.add(&corpus); + let mut rng = ChaCha8Rng::seed_from_u64(0xF00D); + let query: Vec = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect(); + let q_bitmaps = idx.query_bitmaps_from_ranks(&query); + let w = idx.diagonal_weights(); + for di in 0..n { + let diag_sum: u32 = idx.diagonal_overlap_row(&q_bitmaps, di).iter().sum(); + assert_eq!(diag_sum as f32, idx.bilinear_score(&q_bitmaps, &w, di)); + } + } + + #[test] + fn empty_index_project_all_batched_is_empty() { + let idx = MultiBucketBitmap::new(256, 2); + let q_bitmaps = idx.query_bitmaps_from_ranks(&vec![0.0f32; 256]); + let diag = idx.diagonal_weights(); + let weights: Vec<&[f32]> = vec![&diag]; + assert!(idx.project_all_batched(&q_bitmaps, &weights).is_empty()); + } + + #[test] + fn project_all_batched_no_weights_yields_empty_rows() { + let dim = 256; + let mut idx = MultiBucketBitmap::new(dim, 2); + idx.add(&make_corpus(1, 5, dim)); + let q_bitmaps = idx.query_bitmaps_from_ranks(&vec![0.5f32; dim]); + let weights: Vec<&[f32]> = Vec::new(); + let out = idx.project_all_batched(&q_bitmaps, &weights); + assert_eq!(out.len(), 5); + assert!(out.iter().all(|row| row.is_empty())); + } +} From d83a1bb6ce90f7a23eabde089be6b2121b1513f7 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 14:04:09 -0500 Subject: [PATCH 03/13] perf(contingency): apply gemini review optimizations (#224) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AVX-512 histogram kernel: precompute the nb bucket-value broadcast vectors once before the 64-byte block loop instead of recomputing set1_epi8 per block per bucket (gemini high). - band_agreement: iterate only the in-band columns instead of scanning all columns with an abs_diff filter. - rankquant_symmetric_score: factor the query weight out of the inner loop (nb multiplies instead of nb²). All exact: simd==scalar parity, rankquant-direct-sum parity, and projection parity tests unchanged and green. fmt/clippy clean. Signed-off-by: Nelson Spence --- src/contingency.rs | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/contingency.rs b/src/contingency.rs index aca6cca9..183bfb9d 100644 --- a/src/contingency.rs +++ b/src/contingency.rs @@ -167,10 +167,12 @@ impl Contingency { let mut total = 0u32; for qb in 0..self.buckets { let base = qb * self.buckets; - for db in 0..self.buckets { - if qb.abs_diff(db) <= radius { - total += self.counts[base + db]; - } + // Iterate only the in-band columns instead of scanning every + // column with an `abs_diff` filter. + let start = qb.saturating_sub(radius); + let end = (qb + radius).min(self.buckets - 1); + for db in start..=end { + total += self.counts[base + db]; } } total @@ -250,10 +252,14 @@ impl Contingency { for qb in 0..self.buckets { let base = qb * self.buckets; let qw = qb as f32 - centre; + // `qw` is invariant over the inner loop: accumulate the row's + // centred mass first, then scale by `qw` once (nb multiplies + // instead of nb²). + let mut row_sum = 0.0f32; for db in 0..self.buckets { - let weight = qw * (db as f32 - centre); - score += weight * self.counts[base + db] as f32; + row_sum += (db as f32 - centre) * self.counts[base + db] as f32; } + score += qw * row_sum; } score } @@ -420,6 +426,14 @@ unsafe fn build_histogram_avx512(query: &[u8], doc: &[u8], nb: usize, counts: &m // u64). A doc cell holds at most `len` codes; u64 cannot overflow. let mut acc = vec![0u64; nb * nb]; + // Bucket-value broadcast vectors, invariant across the 64-byte blocks — + // precompute once instead of recomputing `set1_epi8` per block per + // bucket. `nb <= 16` (dispatch gate), so a fixed array of 16 covers it. + let mut splats = [_mm512_setzero_si512(); 16]; + for (i, s) in splats.iter_mut().enumerate().take(nb) { + *s = _mm512_set1_epi8(i as i8); + } + let mut off = 0usize; while off < len { let rem = len - off; @@ -450,15 +464,13 @@ unsafe fn build_histogram_avx512(query: &[u8], doc: &[u8], nb: usize, counts: &m // popcount of the AND of the two lane masks, restricted to live // lanes. This is popcount(maskQ & maskD) — the masked popcount-AND. for a in 0..nb { - let a_splat = _mm512_set1_epi8(a as i8); - let q_eq: __mmask64 = _mm512_cmpeq_epi8_mask(q_vec, a_splat) & live; + let q_eq: __mmask64 = _mm512_cmpeq_epi8_mask(q_vec, splats[a]) & live; if q_eq == 0 { continue; } let row = a * nb; for b in 0..nb { - let b_splat = _mm512_set1_epi8(b as i8); - let d_eq: __mmask64 = _mm512_cmpeq_epi8_mask(d_vec, b_splat); + let d_eq: __mmask64 = _mm512_cmpeq_epi8_mask(d_vec, splats[b]); acc[row + b] += (q_eq & d_eq).count_ones() as u64; } } From f9df6e908080ca8c1aed59653cc4399e8606d809 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 14:06:20 -0500 Subject: [PATCH 04/13] perf(multi_bucket): stack-allocate the per-doc projection table (#225) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gemini: project_all_batched and its scalar twin allocated a Vec per document inside the rayon parallel map. nb is 2/4/16 (bits 1/2/4), so nb*nb <= 256 — use a stack [0u32; 256] sliced to nb*nb, removing the per-doc heap allocation and allocator contention from the batched hot loop. Exact parity unchanged (scalar==SIMD==diagonal==dense green). fmt/clippy clean. Signed-off-by: Nelson Spence --- src/multi_bucket.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/multi_bucket.rs b/src/multi_bucket.rs index 144b86cf..8d279137 100644 --- a/src/multi_bucket.rs +++ b/src/multi_bucket.rs @@ -359,10 +359,13 @@ impl MultiBucketBitmap { .into_par_iter() .map(|di| { let doc = &bitmaps[di * per_doc..(di + 1) * per_doc]; - // One accumulation pass over this doc's bitmaps. - let mut table = vec![0u32; nb * nb]; - contingency_accumulate(q_bitmaps, doc, nb, qpb, &mut table); - project_table(&table, weights) + // One accumulation pass over this doc's bitmaps. nb <= 16 ⇒ + // nb*nb <= 256, so a stack table avoids a per-doc heap + // allocation inside the parallel map (allocator contention). + let mut table = [0u32; 256]; + let table = &mut table[..nb * nb]; + contingency_accumulate(q_bitmaps, doc, nb, qpb, table); + project_table(table, weights) }) .collect() } @@ -440,9 +443,12 @@ impl MultiBucketBitmap { .into_par_iter() .map(|di| { let doc = &bitmaps[di * per_doc..(di + 1) * per_doc]; - let mut table = vec![0u32; nb * nb]; - contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, &mut table); - project_table(&table, weights) + // Stack table (nb*nb <= 256): no per-doc heap alloc in the + // parallel map. + let mut table = [0u32; 256]; + let table = &mut table[..nb * nb]; + contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, table); + project_table(table, weights) }) .collect() } From f965bf5879cee298dc449736da6dcaf228e26adf Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 15:59:40 -0500 Subject: [PATCH 05/13] fix(contingency): saturating_add in band_agreement (qodo: radius overflow) (#224) radius is an uncapped public parameter (Projection::BandAgreement{radius}); a near-usize::MAX value overflowed qb + radius (panic in debug, silent wrap to a wrong/zero total in release). Use qb.saturating_add(radius). Regression test asserts band_agreement(usize::MAX) == total_count. Signed-off-by: Nelson Spence --- src/contingency.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/contingency.rs b/src/contingency.rs index 183bfb9d..4b9d1107 100644 --- a/src/contingency.rs +++ b/src/contingency.rs @@ -170,7 +170,9 @@ impl Contingency { // Iterate only the in-band columns instead of scanning every // column with an `abs_diff` filter. let start = qb.saturating_sub(radius); - let end = (qb + radius).min(self.buckets - 1); + // `saturating_add`: `radius` is an uncapped public parameter, so a + // near-`usize::MAX` value must not overflow before the `.min()`. + let end = qb.saturating_add(radius).min(self.buckets - 1); for db in start..=end { total += self.counts[base + db]; } @@ -534,6 +536,16 @@ mod tests { assert_eq!(c.rankquant_symmetric_score(), 4.0); } + #[test] + fn band_agreement_saturates_on_huge_radius() { + // `radius` is uncapped public input; a near-`usize::MAX` value must not + // overflow `qb + radius`. It should clamp to the whole table. + let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; + let doc = [0u8, 1, 1, 2, 2, 3, 3, 0]; + let c = Contingency::new(&query, &doc, 4).unwrap(); + assert_eq!(c.band_agreement(usize::MAX), c.total_count()); + } + /// `bucket_l1_distance` must not overflow for constructor-accepted inputs. /// All mass in the maximum-distance cell at `dim = u32::MAX` (the largest /// accepted `dim`) gives `(nb-1)·dim`, which exceeds `u32::MAX`. Built From 36d0a0bd9189e5276822d9b721ae803e5b92c927 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 17:57:58 -0500 Subject: [PATCH 06/13] fix(contingency): bound nb<=256 in Contingency::new (qodo) (#224) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qodo correctly flagged the infallible nb*nb allocation: nb is a caller-supplied usize (codes are u8, but nb is independent), so a large nb (e.g. 1<<20) would allocate a terabyte-scale table and abort the host. Cap nb at the u8 code domain (<=256) before the vec! — >256 buckets is also meaningless for u8 codes. (I had wrongly dismissed this as a false-positive; it is real.) Signed-off-by: Nelson Spence --- src/contingency.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/contingency.rs b/src/contingency.rs index 4b9d1107..241deb8f 100644 --- a/src/contingency.rs +++ b/src/contingency.rs @@ -68,6 +68,16 @@ impl Contingency { message: "bucket count must be > 0".to_string(), }); } + // Codes are `u8`, so a bucket id is in `0..=255` and `nb > 256` is both + // meaningless and dangerous: `nb` is a caller-supplied `usize`, so a + // large value would allocate an `nb * nb` table (e.g. `nb = 1 << 20` ⇒ + // a terabyte) and abort the process. Cap it at the u8 domain. + if nb > u8::MAX as usize + 1 { + return Err(OrdvecError::InvalidParameter { + name: "nb", + message: format!("must be <= 256 (codes are u8); got {nb}"), + }); + } if query.len() != doc.len() { return Err(OrdvecError::InvalidVectorLength { name: "doc", @@ -664,6 +674,19 @@ mod tests { )); } + #[test] + fn rejects_more_than_256_buckets() { + // `nb` is a caller-supplied usize; codes are u8, so nb > 256 is rejected + // before the nb*nb allocation (a large nb would otherwise abort the host). + let err = Contingency::new(&[0u8], &[0u8], 300).unwrap_err(); + assert!(matches!( + err, + OrdvecError::InvalidParameter { name: "nb", .. } + )); + // 256 is the boundary and is accepted. + assert!(Contingency::new(&[0u8], &[0u8], 256).is_ok()); + } + #[test] fn rejects_out_of_range_code() { // doc has a code (4) >= nb (4). From 1d280074720aaf95da204d6609cc2d17c8203ff0 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:03:45 -0500 Subject: [PATCH 07/13] test(contingency): use nb=256 in large_nb test after the nb<=256 cap (Codex) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nb<=256 bound (36d0a0b) rejected the large_nb_uses_scalar_and_skips_range_scan test's nb=300. nb=256 is the new max and is still > 255, so it exercises the same behavior: find_out_of_range is skipped (every u8 code is < 256) and the scalar path is taken (nb > 16). Full suite green (I had filtered the prior run to the new test and missed this — fixed). Signed-off-by: Nelson Spence --- src/contingency.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/contingency.rs b/src/contingency.rs index 241deb8f..31e60005 100644 --- a/src/contingency.rs +++ b/src/contingency.rs @@ -739,10 +739,12 @@ mod tests { /// exercises the `find_out_of_range` skip branch (all u8 codes in range). #[test] fn large_nb_uses_scalar_and_skips_range_scan() { + // nb = 256 is the max (codes are u8) and still `> 255`, so the + // out-of-range scan is skipped and the scalar path is taken (nb > 16). let query = [0u8, 5, 200, 255]; let doc = [255u8, 200, 5, 0]; - let c = Contingency::new(&query, &doc, 300).unwrap(); - assert_eq!(c.buckets(), 300); + let c = Contingency::new(&query, &doc, 256).unwrap(); + assert_eq!(c.buckets(), 256); assert_eq!(c.count(0, 255), 1); assert_eq!(c.count(255, 0), 1); assert_eq!(c.count(200, 5), 1); From 77d191f8b93e83662ae0b1e788c62748e5e696ec Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:22:49 -0500 Subject: [PATCH 08/13] docs: make MultiBucketBitmap experimental/unstable status explicit Per qodo finding #2 (PR #224): the comments in lib.rs for MultiBucketBitmap and Contingency/Projection did not make explicit that: - MultiBucketBitmap is NOT the default single-score retrieval path - MultiBucketBitmap is gated behind the non-default `experimental` feature, is unstable, and is excluded from semver guarantees - Contingency/Projection are the stable side of the `experimental` gate and ARE covered by semver guarantees Expanded the lib.rs comments to include an explicit warning on MultiBucketBitmap's niche (bilinear decomposition research, not production retrieval), its storage overhead (2-4x vs RankQuant), and its instability. Clarified that Contingency/Projection are the stable surface. Added matching feature-gate and stability notes to the contingency.rs module-level docs. Doc-only change; no logic touched. Signed-off-by: Nelson Spence --- src/contingency.rs | 10 ++++++++++ src/lib.rs | 36 ++++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/src/contingency.rs b/src/contingency.rs index 31e60005..c94986c5 100644 --- a/src/contingency.rs +++ b/src/contingency.rs @@ -1,6 +1,16 @@ //! Stateless dense bucket-overlap contingency table for two fixed-length //! ordinal bucket-code vectors. //! +//! **Feature gate:** this module is only compiled with the non-default +//! `experimental` cargo feature (`--features experimental`). +//! +//! **Stability:** [`Contingency`] and [`Projection`] are the *stable* side of +//! the `experimental` gate — they are the intended long-term surface for +//! contingency-table analysis and are covered by semver guarantees from the +//! release that introduced them. [`crate::MultiBucketBitmap`] is the +//! *unstable* counterpart behind the same gate and may change without a +//! major-version bump. See the crate-root docs for the full distinction. +//! //! A [`Contingency`] is the full `nb × nb` co-occurrence table between two //! bucket-code slices `q` and `d` of equal length `dim`: //! diff --git a/src/lib.rs b/src/lib.rs index cf03435b..6f01c7e4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -80,19 +80,35 @@ pub use sign_bitmap::SignBitmap; #[doc(hidden)] pub use quant::search_asymmetric_byte_lut; -// `MultiBucketBitmap` underwrites the bilinear bucket-overlap -// decomposition but is not the constant-weight top-bucket theorem surface and -// is not stable public API. It is reachable only with the `experimental` -// feature; the default surface excludes it. +// `MultiBucketBitmap` underwrites the bilinear bucket-overlap decomposition. +// +// **`MultiBucketBitmap` is NOT the default retrieval surface.** It is a +// research/analysis primitive for the full bilinear `nb × nb` weight-matrix +// decomposition, not the constant-weight top-bucket theorem surface implemented +// by [`Bitmap`]. Its per-document storage is 2–4× larger than the corresponding +// `RankQuant` encoding; the full outer-product path does not outperform the +// equivalent per-coord scalar form and exists to expose the decomposition +// empirically and serve as a reference for truncated weight matrices. +// +// `MultiBucketBitmap` is gated behind the **non-default `experimental` cargo +// feature**, is excluded from semver guarantees, and may change or be removed +// without a major-version bump. It is not part of the stable public surface. #[cfg(feature = "experimental")] pub use multi_bucket::MultiBucketBitmap; -// `Contingency` / `Projection` are the stateless dense-code contingency-table -// surface (issue #219): the full `nb × nb` bucket-overlap table for two `&[u8]` -// code slices, plus its named projections. This is a research/analysis -// primitive — it is *not* a retrieval index and is never wired into a search -// path — so it lives behind the same `experimental` gate as -// `MultiBucketBitmap`, the bilinear-decomposition surface it complements. +// `Contingency` / `Projection` are the **stable** stateless dense-code +// contingency-table surface added in this release (issue #219): the full +// `nb × nb` bucket-overlap table for two `&[u8]` code slices, plus its named +// projections (diagonal agreement, band agreement, top-bucket overlap, L1 +// distance, etc.). This is a research/analysis primitive — it is *not* a +// retrieval index and is never wired into any search path. +// +// Although `Contingency` and `Projection` are gated behind the same +// `experimental` feature as `MultiBucketBitmap` (they complement the bilinear +// decomposition that surface exposes), they are the **stable** side of the +// `experimental` gate: the stateless dense API is the intended long-term +// surface and is covered by semver guarantees from this release forward. +// `MultiBucketBitmap` is the unstable counterpart — see the note above. #[cfg(feature = "experimental")] pub use contingency::{Contingency, Projection}; From 7fcef9203e1e81969bbfa0686c21fe78dc0936c1 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:23:32 -0500 Subject: [PATCH 09/13] fix: remediate qodo findings on PR #225 (nb<=16 assert, experimental doc, speedup consistency) - Add release-active assert!(nb * nb <= 256, ...) before the stack-table slice in both project_all_batched and project_all_batched_scalar in src/multi_bucket.rs, making the nb<=16 invariant fail-loud locally instead of relying solely on MultiBucketBitmap::new's bits restriction. - Add a doc note to the indexed contingency/projection section header explaining that this surface is gated behind the non-default `experimental` feature, excluded from semver, and is not the default single-score retrieval path (RankQuant/Bitmap are preferred). - Fix inverted speedup output in examples/bench_contingency.rs: regime_a printed 1.0/speedup in the human table but speedup in DATA; regime_b printed 1.0/fastpath_speedup in the table column but fastpath_speedup in DATA. Both now print the non-inverted ratio consistently. Signed-off-by: Nelson Spence --- examples/bench_contingency.rs | 4 ++-- src/multi_bucket.rs | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/examples/bench_contingency.rs b/examples/bench_contingency.rs index a4dbb0ec..d2f2a8ab 100644 --- a/examples/bench_contingency.rs +++ b/examples/bench_contingency.rs @@ -334,7 +334,7 @@ mod bench { "{:<34} {:>14.3} {:>13.2}x", "rebuild-per-projection", rebuild_each * 1e6, - 1.0 / speedup + speedup ); println!( "DATA\ta\tbits={}\tbuild_once_us={:.3}\trebuild_each_us={:.3}\tn_proj={}\tspeedup={:.2}", @@ -409,7 +409,7 @@ mod bench { "{:<34} {:>14.2} {:>15.2}x", "dense full-table diagonal", per(dense_full), - 1.0 / fastpath_speedup + fastpath_speedup ); println!( "DATA\tb\tbits={}\tdiag_disp_ns={:.2}\tdiag_scalar_ns={:.2}\tdense_full_ns={:.2}\tsimd_speedup={:.2}\tfastpath_speedup={:.2}", diff --git a/src/multi_bucket.rs b/src/multi_bucket.rs index 8d279137..4465bd59 100644 --- a/src/multi_bucket.rs +++ b/src/multi_bucket.rs @@ -233,6 +233,15 @@ impl MultiBucketBitmap { // ----------------------------------------------------------------- // Indexed contingency / projection surface (issue #219, API 2 of 2). // + // **Feature gate:** this entire surface is behind the non-default + // `experimental` Cargo feature and is excluded from semver guarantees. + // It is *not* the default single-score retrieval path — for standard + // nearest-neighbour scoring use [`crate::RankQuant`] (asymmetric LUT + // or symmetric dot-product) or [`crate::Bitmap`] (top-bucket overlap). + // `MultiBucketBitmap` targets niche workloads where the full bilinear + // decomposition, multi-projection evidence, or the diagonal fast path + // are needed (one query vs. many docs with many weight projections). + // // `bilinear_score` / `top_m_bilinear` recompute one weighted `Σ W·C` // per call and rescan the doc's bitmaps once per *projection*. The // methods below decouple the accumulation from the projection: the @@ -362,6 +371,12 @@ impl MultiBucketBitmap { // One accumulation pass over this doc's bitmaps. nb <= 16 ⇒ // nb*nb <= 256, so a stack table avoids a per-doc heap // allocation inside the parallel map (allocator contention). + assert!( + nb * nb <= 256, + "project_all_batched: nb={nb} exceeds stack table capacity \ + (nb*nb={} > 256); bits must be <=4 so nb<=16", + nb * nb, + ); let mut table = [0u32; 256]; let table = &mut table[..nb * nb]; contingency_accumulate(q_bitmaps, doc, nb, qpb, table); @@ -445,6 +460,12 @@ impl MultiBucketBitmap { let doc = &bitmaps[di * per_doc..(di + 1) * per_doc]; // Stack table (nb*nb <= 256): no per-doc heap alloc in the // parallel map. + assert!( + nb * nb <= 256, + "project_all_batched_scalar: nb={nb} exceeds stack table capacity \ + (nb*nb={} > 256); bits must be <=4 so nb<=16", + nb * nb, + ); let mut table = [0u32; 256]; let table = &mut table[..nb * nb]; contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, table); From 3e14283eb23c4a579c7ec801021f6af20b2c5aba Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:36:05 -0500 Subject: [PATCH 10/13] docs: add MultiBucketBitmap non-default/experimental stability banner to module rustdoc qodo's 'Missing MultiBucketBitmap non-default docs' finding wants the experimental/non-default/semver-excluded note on the user-visible rustdoc, not an internal comment. Add a prominent stability banner to the module-level //! docs (what docs.rs renders): MultiBucketBitmap and the indexed contingency / projection kernels are gated behind the non-default `experimental` feature, are not stable public API, and are excluded from semver guarantees; the stable surface is the stateless dense Contingency / Projection API. Signed-off-by: Nelson Spence --- src/multi_bucket.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/multi_bucket.rs b/src/multi_bucket.rs index 4465bd59..289bd256 100644 --- a/src/multi_bucket.rs +++ b/src/multi_bucket.rs @@ -1,5 +1,14 @@ //! MultiBucketBitmap: `2^bits` bitmaps per document, one per bucket. //! +//! **Stability — not a default surface.** [`MultiBucketBitmap`] and the indexed +//! contingency / projection kernels built on it are gated behind the +//! **non-default `experimental` Cargo feature**. They are a research scaffold, +//! **not** part of the stable public API, and are **excluded from semver +//! guarantees** — they may change or be removed without a major-version bump. +//! The stable, semver-covered bucket-overlap surface is the stateless dense +//! `Contingency` / `Projection` API; reach for this indexed path only when you +//! have explicitly opted into `experimental`. +//! //! Represents the constant-composition bucket assignment of each //! document explicitly as a set of `2^bits` disjoint bitmaps over the //! `dim` coordinates. The bilinear bucket-overlap score From cf514554a4f1d834d090e5470a6d9aef5f6950eb Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:54:32 -0500 Subject: [PATCH 11/13] docs: product-clean the contingency surface (remove internal-prototype references) Signed-off-by: Nelson Spence --- src/contingency.rs | 49 ++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/src/contingency.rs b/src/contingency.rs index c94986c5..e771c2d4 100644 --- a/src/contingency.rs +++ b/src/contingency.rs @@ -27,9 +27,18 @@ //! L1 distance, coarsened tables, the symmetric RankQuant score, and general //! learned `nb × nb` weight matrices) over a single `O(dim)` histogram pass. //! -//! Ported to reach behavioural parity with `ordgraph::edge::Contingency`. The -//! `EdgeEvidence` "this is evidence for X" wrapper deliberately stays in -//! ordgraph — ordvec exposes only the substrate primitive. +//! ## Adopting this API +//! +//! `Contingency` is a reusable, index-free bucket-code surface. If you maintain +//! a local fork of this logic, replace it with: +//! +//! ```rust,ignore +//! use ordvec::{Contingency, Projection}; +//! ``` +//! +//! Rank math (bucket assignment from float scores) delegates to +//! [`crate::RankQuant`]. `Contingency` itself operates on the *already bucketed* +//! `&[u8]` codes and carries no corpus state. //! //! ## Count width //! @@ -307,9 +316,10 @@ impl Contingency { } } -/// Named projections over a [`Contingency`], mirroring -/// `ordgraph::edge::Projection`. Each variant's [`Self::score`] returns the -/// projection as `f32`. +/// Named projections over a [`Contingency`]. +/// +/// Each variant selects a scalar summary of the co-occurrence table. +/// [`Self::score`] evaluates the chosen projection and returns it as `f32`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum Projection { /// Top-bucket cell count — [`Contingency::top_overlap`]. @@ -328,7 +338,8 @@ pub enum Projection { impl Projection { /// Evaluate this projection against `contingency`, returning the value as - /// `f32` (matching the ordgraph projection contract). + /// `f32`. The returned value is the scalar summary selected by the variant — + /// integer-valued projections are widened to `f32` without rounding. pub fn score(self, contingency: &Contingency) -> f32 { match self { Self::TopOverlap => contingency.top_overlap() as f32, @@ -512,13 +523,12 @@ unsafe fn build_histogram_avx512(query: &[u8], doc: &[u8], nb: usize, counts: &m mod tests { use super::*; - // ---- ordgraph::edge parity gate ------------------------------------- - // Every assertion value below is reproduced verbatim from - // ordgraph-proto/src/edge.rs's tests. The bucket codes are the *already - // bucketed* outputs ordgraph derives from ranks; here we feed the codes - // directly (the stateless dense-code contract). + // ---- behavioural contract: bucket-code contingency ------------------ + // These tests verify the algebraic properties of the contingency table + // using directly-supplied bucket codes (the stateless dense-code contract). + // All assertion values are exact expected outcomes of the described inputs. - /// The ordgraph `contingency_counts_bucket_intersections` case: + /// Contingency counts bucket intersections: /// query = [0,0,1,1,2,2,3,3], doc = [3,3,2,2,1,1,0,0] (reverse), nb = 4. #[test] fn parity_counts_bucket_intersections() { @@ -532,8 +542,8 @@ mod tests { assert_eq!(c.diagonal_agreement(), 0); } - /// The ordgraph `projections_recover_top_diagonal_band_and_rankquant_score` - /// case: query = [0,0,1,1,2,2,3,3], doc = [0,1,1,2,2,3,3,0], nb = 4. + /// Projections recover top, diagonal, band, and RankQuant scores: + /// query = [0,0,1,1,2,2,3,3], doc = [0,1,1,2,2,3,3,0], nb = 4. #[test] fn parity_projections() { let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; @@ -588,7 +598,8 @@ mod tests { assert_eq!(c.bucket_l1_distance(), expected); } - /// The ordgraph `contingency_has_fixed_row_and_column_margins` case. + /// Fixed row and column margins: each query bucket and each doc bucket + /// appears exactly twice, so every row-sum and column-sum equals 2. #[test] fn parity_fixed_margins() { let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; @@ -602,8 +613,8 @@ mod tests { } } - /// The ordgraph `rankquant_symmetric_projection_matches_direct_centered_sum` - /// case: the table-projected score equals the per-coordinate centred sum. + /// RankQuant symmetric projection matches the direct per-coordinate centred + /// sum: the table-projected score and the element-wise centred product agree. #[test] fn parity_rankquant_matches_direct_centered_sum() { let query = [0u8, 0, 1, 1, 2, 2, 3, 3]; @@ -620,7 +631,7 @@ mod tests { assert_eq!(c.rankquant_symmetric_score(), direct); } - /// The ordgraph `coarsened_counts_preserve_total_mass` case: + /// Coarsened counts preserve total mass: /// coarsened_counts(2) = [3, 1, 1, 3], total = 8. #[test] fn parity_coarsened_counts() { From 766e3fa776668b99a0fbb2aceba50b9707e37364 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 19:17:04 -0500 Subject: [PATCH 12/13] docs: state MultiBucketBitmap is not a primary retrieval path (positioning) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qodo's 'Missing MultiBucketBitmap non-default docs' finding asks the docs to explicitly position MultiBucketBitmap as niche — not the default single-score retrieval path — and name the alternatives that dominate. Extend the module banner with a 'Positioning' note: MultiBucketBitmap is a research/analysis substrate (never kernel-optimized for retrieval); for primary retrieval use RankQuant (symmetric/asymmetric), Bitmap (top-bucket popcount), and the two-stage candidate-gen -> rerank flow. The indexed contingency/projection surface is for analyzing bucket-overlap structure, not primary retrieval. Signed-off-by: Nelson Spence --- src/multi_bucket.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/multi_bucket.rs b/src/multi_bucket.rs index 289bd256..a375a363 100644 --- a/src/multi_bucket.rs +++ b/src/multi_bucket.rs @@ -9,6 +9,17 @@ //! `Contingency` / `Projection` API; reach for this indexed path only when you //! have explicitly opted into `experimental`. //! +//! **Positioning — not a primary retrieval path.** `MultiBucketBitmap` is a +//! niche research/analysis substrate for the bilinear bucket-overlap +//! decomposition; it is **not** the default single-score retrieval path and was +//! never kernel-optimized for that role. For primary nearest-neighbour +//! retrieval, use the headline paths instead — [`crate::RankQuant`] (symmetric +//! and asymmetric float-query scoring), [`crate::Bitmap`] (top-bucket +//! `popcount(Q AND D)` candidate scoring), and the two-stage +//! candidate-generation → rerank flow. Reach for this indexed contingency / +//! projection surface only to *analyze* bucket-overlap structure, never as a +//! primary retrieval index. +//! //! Represents the constant-composition bucket assignment of each //! document explicitly as a set of `2^bits` disjoint bitmaps over the //! `dim` coordinates. The bilinear bucket-overlap score From 0f7d470bcb555a1637bb3daadd98ba3974d79ae7 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 19:22:42 -0500 Subject: [PATCH 13/13] docs: position MultiBucketBitmap in crate-root docs (project-doc surface) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retrieval-positioning note belongs on the crate-root docs.rs front page that every user sees, not only in the experimental-gated multi_bucket module (which a caller who never enables `experimental` never reads — yet they are exactly the audience choosing a retrieval path). Add the 'not a primary retrieval path; use RankQuant / Bitmap / two-stage' note to the lib.rs crate docs, right after the four headline substrate families. Uses a plain `MultiBucketBitmap` reference so the default (no-experimental) doc build stays link-clean; verified under RUSTDOCFLAGS=-D warnings both with and without the feature. Signed-off-by: Nelson Spence --- src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index db75a5f7..8ed0ef80 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,14 @@ //! coordinate, set when the coordinate is positive) for sign-cosine //! candidate generation. //! +//! These four families are the retrieval surface. The `experimental` +//! `MultiBucketBitmap` indexed contingency / projection API is a niche +//! research/analysis substrate for the bilinear bucket-overlap decomposition — +//! it is **not** a default single-score retrieval path and was never +//! kernel-optimized for that role. For primary nearest-neighbour retrieval use +//! [`RankQuant`], [`Bitmap`], or the two-stage candidate-generation → rerank +//! flow instead. +//! //! The [`Bitmap`] candidate score is the implementation surface with the //! strongest formal story: in the companion Lean formalization, literal //! constant-weight bitmap overlap is the query-preserving quotient statistic,