From 5af84118b79b961ff0cdf5199445a1ccd6a77c99 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 14:21:09 -0500 Subject: [PATCH 01/10] feat(experimental): reusable RankQuant bucket-code API (#220) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port ordgraph-proto/src/code.rs into ordvec as an index-free bucket-code surface so downstream evidence systems consume declared codes instead of forking kernel math. experimental-gated. API (ordvec::{CompositionSpec, RankQuantSpec, BucketCode, CompositionViolation}): - CompositionSpec: fixed-composition validation (dim/buckets, expected_per_bucket, histogram, validate_codes). - RankQuantSpec: dim+bits in {1,2,4}. - BucketCode: new(spec, codes) validated; from_ranks; from_vector(dim, bits, &[f32]) — the new ordvec-primitive path (rank_transform + rank_to_bucket), output feeds Contingency::new (#219); top_bitmap. - CompositionViolation: structured error enum (Display + Error). Rank math not duplicated (delegates to crate::rank). Parity tests reproduce the proto's exact assertion values; +validation + from_vector<->contingency cross-check. Verified: fmt/clippy(-D warnings, experimental+default)/test (69 lib + 2 integration, default-build gating confirmed) green. No new deps. Refs #220. Signed-off-by: Nelson Spence --- src/bucket_code.rs | 713 +++++++++++++++++++++++++++++++ src/lib.rs | 11 + tests/bucket_code_contingency.rs | 82 ++++ 3 files changed, 806 insertions(+) create mode 100644 src/bucket_code.rs create mode 100644 tests/bucket_code_contingency.rs diff --git a/src/bucket_code.rs b/src/bucket_code.rs new file mode 100644 index 00000000..916d7879 --- /dev/null +++ b/src/bucket_code.rs @@ -0,0 +1,713 @@ +//! Index-free, fixed-composition ordinal **bucket codes** (issue #220). +//! +//! This module exposes the reusable bucket-code surface that underpins the +//! RankQuant family, lifted out of any retrieval index. It lets a caller +//! 
derive and validate the per-coordinate bucket codes of a vector (or of an +//! already-computed rank permutation) **without building a corpus, a packed +//! payload, or a search structure**. The output is a plain `Vec<u8>` of bucket +//! ids in `[0, buckets)`. +//! +//! Three types model the contract: +//! +//! - [`CompositionSpec`] — a *fixed-composition* parameterisation +//! (`dim`, `buckets`) with `dim % buckets == 0`, so every bucket receives +//! exactly `dim / buckets` coordinates. It owns the code-validation rules: +//! length, range, and per-bucket occupancy. +//! - [`RankQuantSpec`] — the RankQuant-shaped specialisation: `buckets` +//! derived as `1 << bits` for `bits ∈ {1, 2, 4}`, matching the crate's +//! [`crate::RankQuant`] bit-width domain. +//! - [`BucketCode`] — a single validated code vector against a +//! [`CompositionSpec`], built from raw codes, from a rank permutation +//! ([`BucketCode::from_ranks`]), or directly from a float vector +//! ([`BucketCode::from_vector`]). +//! +//! The codes [`BucketCode::from_vector`] produces are exactly the bucket ids +//! the crate's rank primitives ([`crate::rank::rank_transform`] + +//! [`crate::rank::rank_to_bucket`]) assign, so they can be fed straight into +//! the stateless dense-code contingency surface (`Contingency::new`, issue +//! #219) without any further transform. +//! +//! Ported to reach behavioural parity with the `ordgraph` bucket-code +//! prototype; the rank math is *not* re-implemented here — it delegates to the +//! crate's shared [`crate::rank`] primitives. + +use std::error::Error; +use std::fmt; + +use crate::rank::{rank_to_bucket, rank_transform}; + +/// Fixed-composition bucket-code parameters. +/// +/// A spec fixes the code length (`dim`) and the bucket count (`buckets`) and +/// requires `dim % buckets == 0`, so a well-formed code places exactly +/// `dim / buckets` coordinates in every bucket. 
This *constant-composition* +/// invariant is what makes the codes interchangeable across documents and is +/// the property [`Self::validate_codes`] enforces. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct CompositionSpec { + dim: usize, + buckets: usize, + expected_per_bucket: usize, +} + +impl CompositionSpec { + /// Build a fixed-composition spec for `dim` coordinates over `buckets` + /// buckets. + /// + /// # Errors + /// - [`CompositionViolation::InvalidSpec`] if `dim == 0` or `buckets < 2`. + /// - [`CompositionViolation::NonUniformSpec`] if `dim` is not divisible by + /// `buckets` (the constant-composition invariant cannot hold). + pub fn new(dim: usize, buckets: usize) -> Result { + if dim == 0 { + return Err(CompositionViolation::InvalidSpec("dim must be > 0")); + } + if buckets < 2 { + return Err(CompositionViolation::InvalidSpec("buckets must be >= 2")); + } + if !dim.is_multiple_of(buckets) { + return Err(CompositionViolation::NonUniformSpec { dim, buckets }); + } + Ok(Self { + dim, + buckets, + expected_per_bucket: dim / buckets, + }) + } + + /// Build the spec implied by a RankQuant `(dim, bits)` pairing, where + /// `buckets == 1 << bits`. A convenience wrapper over + /// [`RankQuantSpec::new`] for callers that only need the composition. + pub fn rank_quant(dim: usize, bits: u8) -> Result { + RankQuantSpec::new(dim, bits).map(|spec| spec.composition) + } + + /// Code length the spec validates against. + pub fn dim(&self) -> usize { + self.dim + } + + /// Number of buckets `buckets`. + pub fn buckets(&self) -> usize { + self.buckets + } + + /// Coordinates a well-formed code places in each bucket: `dim / buckets`. + pub fn expected_per_bucket(&self) -> usize { + self.expected_per_bucket + } + + /// Per-bucket occupancy histogram of `codes`. + /// + /// One `O(dim)` pass tallying how many coordinates land in each bucket. + /// + /// # Errors + /// - [`CompositionViolation::WrongLength`] if `codes.len() != dim`. 
+ /// - [`CompositionViolation::BucketOutOfRange`] on the first code `>= buckets`. + pub fn histogram(&self, codes: &[u8]) -> Result, CompositionViolation> { + if codes.len() != self.dim { + return Err(CompositionViolation::WrongLength { + expected: self.dim, + actual: codes.len(), + }); + } + let mut hist = vec![0usize; self.buckets]; + for (coordinate, &bucket) in codes.iter().enumerate() { + let bucket = bucket as usize; + if bucket >= self.buckets { + return Err(CompositionViolation::BucketOutOfRange { + coordinate, + bucket, + buckets: self.buckets, + }); + } + hist[bucket] += 1; + } + Ok(hist) + } + + /// Validate that `codes` is a well-formed fixed-composition code: correct + /// length, every code in range, and every bucket holding exactly + /// `expected_per_bucket` coordinates. + /// + /// # Errors + /// - the [`Self::histogram`] errors (wrong length, out-of-range code), plus + /// - [`CompositionViolation::WrongBucketCount`] on the first bucket whose + /// occupancy differs from `expected_per_bucket`. + pub fn validate_codes(&self, codes: &[u8]) -> Result<(), CompositionViolation> { + let hist = self.histogram(codes)?; + for (bucket, &count) in hist.iter().enumerate() { + if count != self.expected_per_bucket { + return Err(CompositionViolation::WrongBucketCount { + bucket, + expected: self.expected_per_bucket, + actual: count, + }); + } + } + Ok(()) + } +} + +/// RankQuant-shaped fixed-composition code parameters. +/// +/// Specialises [`CompositionSpec`] to the crate's RankQuant bit-width domain: +/// the bucket count is `1 << bits` for `bits ∈ {1, 2, 4}`, and `dim` is capped +/// at `u16::MAX` to mirror the crate-wide rank invariant (a rank vector is a +/// permutation of `[0, dim)` stored as `u16`). +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct RankQuantSpec { + bits: u8, + composition: CompositionSpec, +} + +impl RankQuantSpec { + /// Build a RankQuant spec for `dim` coordinates at `bits` bits/coordinate. 
+ /// + /// # Errors + /// - [`CompositionViolation::InvalidBits`] if `bits ∉ {1, 2, 4}`. This is + /// the crate's [`crate::RankQuant`] bit-width domain — the reference + /// prototype also accepted `8`, but ordvec's packed format and analytical + /// norm are defined only for `{1, 2, 4}`, so 8-bit is rejected here. + /// - [`CompositionViolation::DimTooLarge`] if `dim > u16::MAX`. + /// - the [`CompositionSpec::new`] errors (non-divisible `dim`). + pub fn new(dim: usize, bits: u8) -> Result { + if !matches!(bits, 1 | 2 | 4) { + return Err(CompositionViolation::InvalidBits { bits }); + } + if dim > u16::MAX as usize { + return Err(CompositionViolation::DimTooLarge { + dim, + max: u16::MAX as usize, + }); + } + let buckets = 1usize << bits; + Ok(Self { + bits, + composition: CompositionSpec::new(dim, buckets)?, + }) + } + + /// Bits per coordinate (`1`, `2`, or `4`). + pub fn bits(&self) -> u8 { + self.bits + } + + /// The underlying fixed-composition spec (`buckets == 1 << bits`). + pub fn composition(&self) -> &CompositionSpec { + &self.composition + } + + /// Consume the spec, yielding the owned [`CompositionSpec`]. + pub fn into_composition(self) -> CompositionSpec { + self.composition + } +} + +/// A single validated, fixed-composition ordinal bucket code. +/// +/// Wraps a `Vec` of bucket ids together with the [`CompositionSpec`] it +/// satisfies. Every constructor validates the composition invariant up front, +/// so a constructed `BucketCode` is always well-formed: its [`Self::codes`] +/// are in range and balanced across buckets, and can be handed directly to the +/// dense-code contingency surface (`Contingency::new`, issue #219). +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct BucketCode { + spec: CompositionSpec, + codes: Vec, +} + +impl BucketCode { + /// Wrap pre-computed `codes` against `spec`, validating the composition. 
+ /// + /// # Errors + /// The [`CompositionSpec::validate_codes`] errors (wrong length, + /// out-of-range code, wrong per-bucket occupancy). + pub fn new(spec: CompositionSpec, codes: Vec) -> Result { + spec.validate_codes(&codes)?; + Ok(Self { spec, codes }) + } + + /// Derive a bucket code from an explicit rank permutation. + /// + /// `ranks` must be a permutation of `[0, dim)` (every value distinct and + /// `< dim`); each rank maps to bucket `rank * buckets / dim`. This is the + /// rank-vector entry point: the caller already holds ranks (e.g. from + /// [`crate::rank::rank_transform`]) and wants the bucketed codes. + /// + /// # Errors + /// - the [`CompositionSpec::new`] errors (bad `dim`/`buckets`). + /// - [`CompositionViolation::WrongLength`] if `ranks.len() != dim`. + /// - [`CompositionViolation::RankOutOfRange`] on the first `rank >= dim`. + /// - [`CompositionViolation::DuplicateRank`] on the first repeated rank + /// (ranks must be a permutation). + pub fn from_ranks( + dim: usize, + buckets: usize, + ranks: &[usize], + ) -> Result { + let spec = CompositionSpec::new(dim, buckets)?; + if ranks.len() != dim { + return Err(CompositionViolation::WrongLength { + expected: dim, + actual: ranks.len(), + }); + } + + let mut seen = vec![false; dim]; + let mut codes = Vec::with_capacity(dim); + for (coordinate, &rank) in ranks.iter().enumerate() { + if rank >= dim { + return Err(CompositionViolation::RankOutOfRange { + coordinate, + rank, + dim, + }); + } + if seen[rank] { + return Err(CompositionViolation::DuplicateRank { rank }); + } + seen[rank] = true; + // `rank < dim`, so `rank * buckets / dim` lands in `[0, buckets)`. + // NOTE(review): `buckets > 256` is not rejected, so `as u8` can + // truncate; `Self::new` below rejects those codes. (RankQuant: <= 16.) + codes.push((rank * buckets / dim) as u8); + } + Self::new(spec, codes) + } + + /// Derive a bucket code directly from a float vector. 
+ /// + /// Computes the dimension-wise rank transform of `vector` + /// ([`crate::rank::rank_transform`]) and buckets each rank via the crate's + /// shared [`crate::rank::rank_to_bucket`] against the RankQuant spec for + /// `(dim, bits)`. The resulting codes are bit-identical to what + /// [`crate::RankQuant`] would pack for the same vector, so they feed the + /// dense-code contingency surface (`Contingency::new`, #219) unchanged. + /// + /// `vector` must have length `dim` and contain only finite values; both are + /// validated here so a malformed vector returns an error rather than + /// panicking inside the rank primitives. + /// + /// # Errors + /// - the [`RankQuantSpec::new`] errors (`bits ∉ {1, 2, 4}`, `dim` too large + /// or non-divisible). + /// - [`CompositionViolation::WrongLength`] if `vector.len() != dim`. + /// - [`CompositionViolation::NonFiniteValue`] on the first non-finite + /// coordinate. + pub fn from_vector(dim: usize, bits: u8, vector: &[f32]) -> Result { + let spec = RankQuantSpec::new(dim, bits)?; + if vector.len() != dim { + return Err(CompositionViolation::WrongLength { + expected: dim, + actual: vector.len(), + }); + } + // Validate finiteness up front: `rank_transform` *asserts* finiteness + // and panics otherwise. Returning a clean error keeps the bucket-code + // surface fail-soft on malformed input (its whole contract is + // validation), matching the rest of this module. + if let Some(coordinate) = vector.iter().position(|x| !x.is_finite()) { + return Err(CompositionViolation::NonFiniteValue { coordinate }); + } + let ranks = rank_transform(vector); + let codes: Vec = ranks + .iter() + .map(|&rank| rank_to_bucket(rank, dim, bits)) + .collect(); + // The codes come straight from `rank_to_bucket` over a permutation, so + // they already satisfy the composition invariant; route through the + // validating constructor anyway so the guarantee is enforced in one + // place (and any future drift in the primitives is caught). 
+ Self::new(spec.into_composition(), codes) + } + + /// The composition spec these codes satisfy. + pub fn spec(&self) -> &CompositionSpec { + &self.spec + } + + /// The validated bucket ids, each in `[0, buckets)`. + pub fn codes(&self) -> &[u8] { + &self.codes + } + + /// Top-bucket membership bitmap: `true` where the code is the highest + /// bucket (`buckets - 1`). This is the constant-weight top-bucket indicator + /// the [`crate::Bitmap`] candidate score is built on. + pub fn top_bitmap(&self) -> Vec { + let top = self.spec.buckets - 1; + self.codes + .iter() + .map(|&bucket| bucket as usize == top) + .collect() + } +} + +/// A violation of the fixed-composition bucket-code contract. +/// +/// A stable, structured error type for the bucket-code surface. Distinct from +/// the crate's [`crate::OrdvecError`] (which models index/search parameter and +/// candidate errors): this enum carries the composition-specific detail +/// (duplicate ranks, per-bucket occupancy mismatches) the reference prototype's +/// tests assert on, which the flat `OrdvecError` variants cannot express +/// without losing those values. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum CompositionViolation { + /// A structural spec parameter was invalid (`dim == 0`, `buckets < 2`). + InvalidSpec(&'static str), + /// `bits` was outside the supported RankQuant set `{1, 2, 4}`. + InvalidBits { + /// The rejected bit width. + bits: u8, + }, + /// `dim` exceeded the `u16` rank-domain cap. + DimTooLarge { + /// The rejected dimension. + dim: usize, + /// The maximum supported dimension (`u16::MAX`). + max: usize, + }, + /// `dim` was not divisible by `buckets`, so no constant composition exists. + NonUniformSpec { + /// The dimension. + dim: usize, + /// The bucket count. + buckets: usize, + }, + /// A code or rank slice had the wrong length. + WrongLength { + /// The expected length (`dim`). + expected: usize, + /// The actual length supplied. 
+ actual: usize, + }, + /// A code was `>= buckets`. + BucketOutOfRange { + /// The offending coordinate index. + coordinate: usize, + /// The out-of-range bucket id. + bucket: usize, + /// The bucket count (codes must be `< buckets`). + buckets: usize, + }, + /// A bucket's occupancy differed from `expected_per_bucket`. + WrongBucketCount { + /// The bucket whose occupancy was wrong. + bucket: usize, + /// The required per-bucket occupancy. + expected: usize, + /// The observed occupancy. + actual: usize, + }, + /// A rank was `>= dim`. + RankOutOfRange { + /// The offending coordinate index. + coordinate: usize, + /// The out-of-range rank. + rank: usize, + /// The dimension (ranks must be `< dim`). + dim: usize, + }, + /// A rank appeared more than once (ranks must be a permutation). + DuplicateRank { + /// The repeated rank. + rank: usize, + }, + /// A vector coordinate was non-finite (`NaN` or `±Inf`). + NonFiniteValue { + /// The offending coordinate index. + coordinate: usize, + }, +} + +impl fmt::Display for CompositionViolation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidSpec(message) => write!(f, "{message}"), + Self::InvalidBits { bits } => { + write!(f, "bits {bits} is invalid; expected one of 1, 2, 4") + } + Self::DimTooLarge { dim, max } => write!(f, "dim {dim} exceeds maximum {max}"), + Self::NonUniformSpec { dim, buckets } => { + write!(f, "dim {dim} is not divisible by buckets {buckets}") + } + Self::WrongLength { expected, actual } => { + write!(f, "code length {actual} does not match dim {expected}") + } + Self::BucketOutOfRange { + coordinate, + bucket, + buckets, + } => write!( + f, + "coordinate {coordinate} has bucket {bucket}, expected < {buckets}" + ), + Self::WrongBucketCount { + bucket, + expected, + actual, + } => write!( + f, + "bucket {bucket} has {actual} coordinates, expected {expected}" + ), + Self::RankOutOfRange { + coordinate, + rank, + dim, + } => write!( + f, + "coordinate 
{coordinate} has rank {rank}, expected < {dim}" + ), + Self::DuplicateRank { rank } => write!(f, "rank {rank} appears more than once"), + Self::NonFiniteValue { coordinate } => { + write!(f, "coordinate {coordinate} is non-finite (NaN or ±Inf)") + } + } + } +} + +impl Error for CompositionViolation {} + +#[cfg(test)] +mod tests { + use super::*; + + // ---- ordgraph bucket-code parity gate ------------------------------- + // Every assertion value below is reproduced verbatim from the reference + // `code.rs` #[cfg(test)] module. + + #[test] + fn from_ranks_builds_uniform_bucket_code() { + let code = BucketCode::from_ranks(8, 4, &[0, 1, 2, 3, 4, 5, 6, 7]).unwrap(); + + assert_eq!(code.spec().expected_per_bucket(), 2); + assert_eq!(code.codes(), &[0, 0, 1, 1, 2, 2, 3, 3]); + } + + #[test] + fn rejects_non_uniform_bucket_counts() { + let spec = CompositionSpec::new(8, 4).unwrap(); + let err = BucketCode::new(spec, vec![0, 0, 0, 1, 2, 2, 3, 3]).unwrap_err(); + + assert_eq!( + err, + CompositionViolation::WrongBucketCount { + bucket: 0, + expected: 2, + actual: 3, + } + ); + } + + #[test] + fn rejects_duplicate_ranks() { + let err = BucketCode::from_ranks(4, 2, &[0, 1, 1, 3]).unwrap_err(); + + assert_eq!(err, CompositionViolation::DuplicateRank { rank: 1 }); + } + + #[test] + fn rankquant_spec_rejects_unsupported_bits_and_large_dims() { + assert_eq!( + RankQuantSpec::new(8, 3).unwrap_err(), + CompositionViolation::InvalidBits { bits: 3 } + ); + assert_eq!( + RankQuantSpec::new(u16::MAX as usize + 1, 2).unwrap_err(), + CompositionViolation::DimTooLarge { + dim: u16::MAX as usize + 1, + max: u16::MAX as usize, + } + ); + } + + #[test] + fn rankquant_spec_rejects_non_divisible_dims() { + assert_eq!( + RankQuantSpec::new(10, 2).unwrap_err(), + CompositionViolation::NonUniformSpec { + dim: 10, + buckets: 4, + } + ); + } + + // ---- ordvec-specific validation surface ----------------------------- + + #[test] + fn validate_codes_rejects_wrong_length() { + let spec = 
CompositionSpec::new(8, 4).unwrap(); + assert_eq!( + spec.validate_codes(&[0, 0, 1, 1, 2, 2, 3]).unwrap_err(), + CompositionViolation::WrongLength { + expected: 8, + actual: 7, + } + ); + } + + #[test] + fn validate_codes_rejects_out_of_range_code() { + let spec = CompositionSpec::new(8, 4).unwrap(); + // coordinate 7 holds bucket 4, which is >= buckets (4). + assert_eq!( + spec.validate_codes(&[0, 0, 1, 1, 2, 2, 3, 4]).unwrap_err(), + CompositionViolation::BucketOutOfRange { + coordinate: 7, + bucket: 4, + buckets: 4, + } + ); + } + + #[test] + fn composition_spec_rejects_zero_dim_and_small_buckets() { + assert_eq!( + CompositionSpec::new(0, 4).unwrap_err(), + CompositionViolation::InvalidSpec("dim must be > 0") + ); + assert_eq!( + CompositionSpec::new(8, 1).unwrap_err(), + CompositionViolation::InvalidSpec("buckets must be >= 2") + ); + } + + #[test] + fn rank_quant_helper_matches_rankquant_spec_composition() { + let from_helper = CompositionSpec::rank_quant(16, 2).unwrap(); + let from_spec = RankQuantSpec::new(16, 2).unwrap().into_composition(); + assert_eq!(from_helper, from_spec); + assert_eq!(from_helper.buckets(), 4); + assert_eq!(from_helper.expected_per_bucket(), 4); + } + + #[test] + fn from_ranks_rejects_rank_out_of_range() { + // rank 4 at coordinate 0 is >= dim (4). 
+ assert_eq!( + BucketCode::from_ranks(4, 2, &[4, 1, 2, 3]).unwrap_err(), + CompositionViolation::RankOutOfRange { + coordinate: 0, + rank: 4, + dim: 4, + } + ); + } + + #[test] + fn histogram_counts_each_bucket() { + let spec = CompositionSpec::new(8, 4).unwrap(); + assert_eq!( + spec.histogram(&[0, 0, 1, 1, 2, 2, 3, 3]).unwrap(), + vec![2, 2, 2, 2] + ); + } + + #[test] + fn top_bitmap_marks_only_the_top_bucket() { + let code = BucketCode::from_ranks(8, 4, &[0, 1, 2, 3, 4, 5, 6, 7]).unwrap(); + assert_eq!( + code.top_bitmap(), + vec![false, false, false, false, false, false, true, true] + ); + } + + // ---- from_vector: ordvec primitive integration ---------------------- + + #[test] + fn from_vector_matches_from_ranks_for_sorted_input() { + // A strictly increasing vector has ranks [0, 1, ..., dim-1], so the + // codes must match the from_ranks path on that identity permutation. + let v: Vec = (0..8).map(|i| i as f32).collect(); + let code = BucketCode::from_vector(8, 2, &v).unwrap(); + assert_eq!(code.codes(), &[0, 0, 1, 1, 2, 2, 3, 3]); + + let via_ranks = BucketCode::from_ranks(8, 4, &[0, 1, 2, 3, 4, 5, 6, 7]).unwrap(); + assert_eq!(code.codes(), via_ranks.codes()); + } + + #[test] + fn from_vector_buckets_are_balanced() { + // Arbitrary finite vector: the codes must still satisfy the + // constant-composition invariant (dim / buckets per bucket). 
+ let v = [3.0f32, 1.0, 4.0, 1.5, 5.0, 9.0, 2.0, 6.0]; + let code = BucketCode::from_vector(8, 2, &v).unwrap(); + assert_eq!(code.spec().validate_codes(code.codes()), Ok(())); + assert_eq!( + code.spec().histogram(code.codes()).unwrap(), + vec![2, 2, 2, 2] + ); + } + + #[test] + fn from_vector_rejects_wrong_length() { + assert_eq!( + BucketCode::from_vector(8, 2, &[0.0, 1.0, 2.0]).unwrap_err(), + CompositionViolation::WrongLength { + expected: 8, + actual: 3, + } + ); + } + + #[test] + fn from_vector_rejects_non_finite() { + let v = [0.0f32, 1.0, f32::NAN, 3.0, 4.0, 5.0, 6.0, 7.0]; + assert_eq!( + BucketCode::from_vector(8, 2, &v).unwrap_err(), + CompositionViolation::NonFiniteValue { coordinate: 2 } + ); + } + + #[test] + fn from_vector_rejects_invalid_bits() { + let v: Vec = (0..8).map(|i| i as f32).collect(); + assert_eq!( + BucketCode::from_vector(8, 3, &v).unwrap_err(), + CompositionViolation::InvalidBits { bits: 3 } + ); + } + + #[test] + fn display_is_stable_for_each_variant() { + // The error type is part of the public surface; spot-check the + // human-readable rendering does not panic and carries the detail. 
+ let cases = [ + CompositionViolation::InvalidSpec("dim must be > 0"), + CompositionViolation::InvalidBits { bits: 3 }, + CompositionViolation::DimTooLarge { + dim: 70000, + max: 65535, + }, + CompositionViolation::NonUniformSpec { + dim: 10, + buckets: 4, + }, + CompositionViolation::WrongLength { + expected: 8, + actual: 7, + }, + CompositionViolation::BucketOutOfRange { + coordinate: 7, + bucket: 4, + buckets: 4, + }, + CompositionViolation::WrongBucketCount { + bucket: 0, + expected: 2, + actual: 3, + }, + CompositionViolation::RankOutOfRange { + coordinate: 0, + rank: 4, + dim: 4, + }, + CompositionViolation::DuplicateRank { rank: 1 }, + CompositionViolation::NonFiniteValue { coordinate: 2 }, + ]; + for case in cases { + assert!(!case.to_string().is_empty()); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 85158248..14097eb9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,6 +51,9 @@ use std::fmt; mod bitmap; +/// Index-free, fixed-composition ordinal bucket codes (issue #220). +#[cfg(feature = "experimental")] +pub mod bucket_code; mod fastscan; #[cfg(feature = "experimental")] mod multi_bucket; @@ -85,6 +88,14 @@ pub use quant::search_asymmetric_byte_lut; #[cfg(feature = "experimental")] pub use multi_bucket::MultiBucketBitmap; +// Index-free, fixed-composition ordinal bucket codes (issue #220). The reusable +// bucket-code surface — derive/validate per-coordinate bucket codes from a +// vector or a rank permutation with no retrieval index — behind the +// `experimental` feature. Whether it graduates to the stable surface is a +// deliberate later decision. +#[cfg(feature = "experimental")] +pub use bucket_code::{BucketCode, CompositionSpec, CompositionViolation, RankQuantSpec}; + // `RankQuantFastscan` is an optional FastScan b=2 scan path. 
It is // re-exported `#[doc(hidden)]` at the crate root — reachable as // `ordvec::RankQuantFastscan` for callers who opt in, but not diff --git a/tests/bucket_code_contingency.rs b/tests/bucket_code_contingency.rs new file mode 100644 index 00000000..c43f2c6f --- /dev/null +++ b/tests/bucket_code_contingency.rs @@ -0,0 +1,82 @@ +//! Cross-API integration: the codes [`BucketCode::from_vector`] produces are +//! exactly what the stateless dense-code contingency surface (`Contingency::new`, +//! issue #219) consumes. +//! +//! `Contingency` lands in ordvec via the sibling #219 PR (it is not yet in this +//! branch's tree). Rather than depend on an unmerged module, this test pins the +//! *contract* `Contingency::new` enforces — every code is a valid bucket id +//! `< nb`, and `Contingency::new(codes, codes, nb)` over a self-pair has full +//! diagonal agreement (`diagonal_agreement() == dim`, off-diagonal cells empty). +//! It reproduces the exact `O(dim)` histogram `Contingency::new` builds, so when +//! #219 merges, swapping this reference for the real `Contingency::new` is a +//! mechanical change. The acceptance property (#220 ⇄ #219) is the same: +//! `from_vector` output is a valid, balanced contingency input. + +#![cfg(feature = "experimental")] + +use ordvec::bucket_code::BucketCode; + +/// Reference re-implementation of `Contingency::new`'s histogram pass and the +/// `diagonal_agreement` projection (verbatim algebra from #219's +/// `contingency.rs`). Returns `Err(bad_code)` on the first out-of-range code — +/// exactly the rejection `Contingency::new` performs — else the trace of the +/// `nb × nb` table. 
+fn contingency_diagonal_agreement(q: &[u8], d: &[u8], nb: usize) -> Result { + assert_eq!(q.len(), d.len(), "query and doc must share dim"); + assert!(nb > 0, "nb must be > 0"); + let cap = nb as u32; + let mut counts = vec![0u32; nb * nb]; + for (&qb, &db) in q.iter().zip(d.iter()) { + if qb as u32 >= cap { + return Err(qb); + } + if db as u32 >= cap { + return Err(db); + } + counts[qb as usize * nb + db as usize] += 1; + } + Ok((0..nb).map(|b| counts[b * nb + b]).sum()) +} + +#[test] +fn from_vector_codes_feed_contingency_self_pair_full_diagonal() { + let dim = 1024usize; + let bits = 2u8; + let nb = 1usize << bits; // 4 + + // An arbitrary finite embedding. + let v: Vec = (0..dim).map(|i| (i as f32 * 7.0).sin()).collect(); + let code = BucketCode::from_vector(dim, bits, &v).unwrap(); + let codes = code.codes(); + + // Every code is a valid bucket id < nb — the range invariant + // `Contingency::new` requires before it indexes its nb × nb table. + assert!(codes.iter().all(|&c| (c as usize) < nb)); + assert_eq!(codes.len(), dim); + + // Self-pair: Contingency::new(codes, codes, nb) puts every coordinate on the + // diagonal, so diagonal_agreement == dim and nothing falls off-diagonal. + let diag = contingency_diagonal_agreement(codes, codes, nb).unwrap(); + assert_eq!( + diag as usize, dim, + "self-pair must agree on every coordinate" + ); +} + +#[test] +fn from_vector_codes_are_in_range_for_all_supported_bits() { + let dim = 256usize; + let v: Vec = (0..dim).map(|i| ((i * 31 + 7) % 97) as f32).collect(); + for bits in [1u8, 2, 4] { + let nb = 1usize << bits; + let code = BucketCode::from_vector(dim, bits, &v).unwrap(); + // Cross-pair against a reversed copy: still a valid Contingency input + // (every code < nb), so the histogram build never hits the range guard. 
+ let mut rev: Vec = code.codes().to_vec(); + rev.reverse(); + let diag = contingency_diagonal_agreement(code.codes(), &rev, nb) + .expect("from_vector codes must be valid contingency input"); + // Diagonal agreement is bounded by dim and well-defined (sanity). + assert!((diag as usize) <= dim); + } +} From 4a98bdd6d2453f149e7e34ec51e989e655d0fe70 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 14:33:49 -0500 Subject: [PATCH 02/10] feat(experimental): constant-weight bitmap overlap + finite null-tail helpers (#222) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port ordgraph-proto/src/bitmap.rs into ordvec (stacked on #220 BucketCode), experimental-gated. The ordinal-bitmap overlap + finite-null surface downstream evidence systems consume. ordvec::{ConstantWeightBitmap, PackedConstantWeightBitmap, BitmapNull, top_group_overlap_vector, choose}: - ConstantWeightBitmap::from_top_bucket(&BucketCode) + overlap (reference path). - PackedConstantWeightBitmap::from_bucket_range/from_top_group; overlap routes through crate::util::and_popcount (the production popcount-AND primitive, not a hand-rolled loop). - BitmapNull: space_size=C(dim,w), fiber_count(o)=C(w,o)*C(dim-w,w-o), tail_count(threshold)=Σ_{o>=threshold} fiber_count(o); upper-tail = tail/space. - choose: u128 binomial. Parity: reproduces proto assertion values; fiber-partition identity (Σ fiber_count == space_size) verified for 7 (dim,weight); overlap parity (bool == packed == naive); choose vs known binomials + symmetry. No new deps; zero unsafe. Verified: fmt/clippy(-D warnings, exp+default)/test(exp+default gating)/MSRV 1.89 green. Refs #222. 
Signed-off-by: Nelson Spence --- src/const_weight_bitmap.rs | 545 +++++++++++++++++++++++++++++++++++++ src/lib.rs | 15 + 2 files changed, 560 insertions(+) create mode 100644 src/const_weight_bitmap.rs diff --git a/src/const_weight_bitmap.rs b/src/const_weight_bitmap.rs new file mode 100644 index 00000000..4c655892 --- /dev/null +++ b/src/const_weight_bitmap.rs @@ -0,0 +1,545 @@ +//! Constant-weight bitmap overlap + the finite constant-weight null (issue #222). +//! +//! This is the *ordinal-kernel* evidence surface built on top of the +//! fixed-composition bucket codes ([`crate::bucket_code::BucketCode`], issue +//! #220). It exposes two literal constant-weight bitmaps derived from a bucket +//! code, their popcount overlap, and the idealized uniform constant-weight +//! *null* that turns an observed overlap into an exact finite tail probability. +//! It carries **no retrieval, graph, or serving concepts** — only the bitmap +//! overlap statistic and its finite combinatorial null. +//! +//! Three pieces model the contract: +//! +//! - [`ConstantWeightBitmap`] — the top-bucket membership bitmap of a bucket +//! code as a `Vec<bool>`. Bit `j` is set iff coordinate `j` sits in the top +//! bucket (`buckets - 1`). Its [`ConstantWeightBitmap::overlap`] is the count +//! of shared set bits — the reference (naive) overlap statistic. +//! - [`PackedConstantWeightBitmap`] — the same membership packed into `u64` +//! words, with [`PackedConstantWeightBitmap::overlap`] computed by word-level +//! AND-popcount. The packed overlap routes through the crate's shared +//! `crate::util::and_popcount` primitive (the same reduction the production +//! [`crate::Bitmap`] scan kernels use), so a packed scan and the bitmap index +//! compute overlap with one shared popcount path. It generalises beyond the +//! top bucket: it can be built from any bucket range or top *group* of +//! buckets. +//! - [`BitmapNull`] — the idealized uniform constant-weight bitmap null over +//! 
all weight-`w` bitmaps in `dim` positions. The fibers of the overlap +//! statistic partition that space, so [`BitmapNull::fiber_count`] is the +//! hypergeometric numerator and [`BitmapNull::tail_count`] / +//! [`BitmapNull::space_size`] give an exact upper-tail probability for an +//! overlap cutoff. +//! +//! Ported to reach behavioural parity with the `ordgraph` bitmap prototype +//! (`ordgraph-proto/src/bitmap.rs`); the popcount reduction is *not* +//! re-implemented — it delegates to the crate's shared `crate::util` +//! primitive. +//! +//! # Overflow +//! [`choose`] (and therefore [`BitmapNull::space_size`] / `fiber_count` / +//! `tail_count`) accumulates in `u128`. For very large `dim` the binomial +//! `C(dim, weight)` exceeds `u128::MAX` and the running product overflows — +//! this panics in debug builds and wraps in release, exactly as the reference +//! prototype does. The finite null is intended for the small `dim`/`weight` +//! regime where the exact count is representable; callers operating near the +//! `u128` ceiling must bound their parameters. This matches the prototype's +//! behaviour (see the module-level FLAG in the porting notes — no divergence). + +use crate::bucket_code::BucketCode; +use crate::util::and_popcount; + +/// Constant-weight top-bucket bitmap derived from an ordinal bucket code. +/// +/// Bit `j` is `true` iff coordinate `j` of the code is in the top bucket +/// (`buckets - 1`). Under the fixed-composition invariant every bucket holds +/// exactly `dim / buckets` coordinates, so the bitmap has constant weight +/// `dim / buckets` across all codes of the same spec — the property the +/// constant-weight null relies on. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ConstantWeightBitmap { + dim: usize, + weight: usize, + bits: Vec, +} + +/// Packed constant-weight bitmap with overlap computed by word-level popcount. +/// +/// The membership indicator is packed into `dim.div_ceil(64)` `u64` words. 
+/// [`Self::overlap`] routes through the crate's shared +/// `crate::util::and_popcount` reduction — the same AND-popcount path the +/// production [`crate::Bitmap`] scan kernels use — so a packed scan and the +/// bitmap index agree bit-for-bit on overlap. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct PackedConstantWeightBitmap { + dim: usize, + weight: usize, + words: Vec, +} + +impl ConstantWeightBitmap { + /// Build the top-bucket membership bitmap of `code`. + /// + /// Bit `j` is set iff `code`'s coordinate `j` lands in the top bucket + /// (`buckets - 1`), via [`BucketCode::top_bitmap`]. + pub fn from_top_bucket(code: &BucketCode) -> Self { + let bits = code.top_bitmap(); + let weight = bits.iter().filter(|&&bit| bit).count(); + Self { + dim: bits.len(), + weight, + bits, + } + } + + /// The bitmap dimension (number of coordinates / bits). + pub fn dim(&self) -> usize { + self.dim + } + + /// The number of set bits (constant across codes of the same spec). + pub fn weight(&self) -> usize { + self.weight + } + + /// The raw boolean membership bits. + pub fn bits(&self) -> &[bool] { + &self.bits + } + + /// Count of positions set in **both** bitmaps — the naive shared-set-bit + /// overlap statistic. + /// + /// # Panics + /// Panics if the two bitmaps have different dimensions (a popcount over + /// mismatched supports is meaningless), matching the prototype's + /// fail-loud contract. + pub fn overlap(&self, other: &Self) -> usize { + assert_eq!(self.dim, other.dim, "bitmap dimensions must match"); + self.bits + .iter() + .zip(&other.bits) + .filter(|&(lhs, rhs)| *lhs && *rhs) + .count() + } +} + +impl PackedConstantWeightBitmap { + /// Pack the membership indicator for the bucket range `[start, end]`. + /// + /// Bit `j` is set iff `code`'s coordinate `j` lands in a bucket in the + /// inclusive range `start_bucket..=end_bucket`. 
+ /// + /// # Panics + /// Panics if `start_bucket > end_bucket`, or if `end_bucket` is outside the + /// code's bucket domain (`>= buckets`). + pub fn from_bucket_range(code: &BucketCode, start_bucket: usize, end_bucket: usize) -> Self { + assert!(start_bucket <= end_bucket, "bucket range must be ordered"); + assert!( + end_bucket < code.spec().buckets(), + "bucket range must fit code spec" + ); + let dim = code.codes().len(); + let mut weight = 0usize; + let mut words = vec![0u64; dim.div_ceil(64)]; + for (coordinate, &bucket) in code.codes().iter().enumerate() { + let bucket = bucket as usize; + if (start_bucket..=end_bucket).contains(&bucket) { + weight += 1; + words[coordinate / 64] |= 1u64 << (coordinate % 64); + } + } + Self { dim, weight, words } + } + + /// Pack the membership indicator for the top `width` buckets. + /// + /// Equivalent to [`Self::from_bucket_range`] over `[buckets - width, + /// buckets - 1]`. `from_top_group(code, 1)` is the packed analogue of + /// [`ConstantWeightBitmap::from_top_bucket`]. + /// + /// # Panics + /// Panics if `width == 0` or `width > buckets`. + pub fn from_top_group(code: &BucketCode, width: usize) -> Self { + assert!(width > 0, "top-group width must be positive"); + assert!( + width <= code.spec().buckets(), + "top-group width must fit code spec" + ); + let start = code.spec().buckets() - width; + Self::from_bucket_range(code, start, code.spec().buckets() - 1) + } + + /// The bitmap dimension (number of coordinates). + pub fn dim(&self) -> usize { + self.dim + } + + /// The number of set bits. + pub fn weight(&self) -> usize { + self.weight + } + + /// The packed `u64` membership words. + pub fn words(&self) -> &[u64] { + &self.words + } + + /// Popcount of `self AND other` — the packed overlap statistic. 
+ /// + /// Routes through the crate's shared `crate::util::and_popcount` + /// reduction (scalar `u64::count_ones` over the AND on x86_64, NEON on + /// aarch64, simd128 on wasm), the same primitive the production + /// [`crate::Bitmap`] scan kernels use. Equal to + /// [`ConstantWeightBitmap::overlap`] for the same codes. + /// + /// # Panics + /// Panics if the two bitmaps have different dimensions (their word counts + /// then differ, which `and_popcount` itself rejects). The explicit `dim` + /// check fails loud with the bitmap-specific message before the reduction. + pub fn overlap(&self, other: &Self) -> usize { + assert_eq!(self.dim, other.dim, "bitmap dimensions must match"); + and_popcount(&self.words, &other.words) as usize + } +} + +/// Overlap profile across a set of top-group widths. +/// +/// For each `width` in `widths`, builds the packed top-`width`-group bitmaps of +/// `lhs` and `rhs` and returns their popcount overlap. The result is a vector +/// parallel to `widths`. Both codes must share the same dimension (and, for +/// `from_top_group` to be meaningful, the same spec). +pub fn top_group_overlap_vector( + lhs: &BucketCode, + rhs: &BucketCode, + widths: &[usize], +) -> Vec { + widths + .iter() + .map(|&width| { + let lhs_bitmap = PackedConstantWeightBitmap::from_top_group(lhs, width); + let rhs_bitmap = PackedConstantWeightBitmap::from_top_group(rhs, width); + lhs_bitmap.overlap(&rhs_bitmap) + }) + .collect() +} + +/// Idealized uniform constant-weight bitmap null. +/// +/// Models a uniform distribution over **all** weight-`weight` bitmaps in `dim` +/// positions (there are `C(dim, weight)` of them). The overlap of a random such +/// bitmap with a fixed weight-`weight` bitmap is hypergeometric; this type +/// exposes the exact finite counts: +/// +/// - [`Self::space_size`] = `C(dim, weight)` — the total number of bitmaps. 
+/// - [`Self::fiber_count`] = the number of bitmaps overlapping a fixed one in +/// exactly `overlap` positions (the hypergeometric numerator). +/// - [`Self::tail_count`] = the upper-tail sum `Σ_{o>=threshold} fiber_count(o)`. +/// +/// The fibers partition the space, so `Σ_{o=0..=weight} fiber_count(o) == +/// space_size`, and `tail_count(threshold) / space_size` is the exact upper-tail +/// probability of seeing an overlap `>= threshold` under the null. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct BitmapNull { + dim: usize, + weight: usize, +} + +impl BitmapNull { + /// Build the null over weight-`weight` bitmaps in `dim` positions. + /// + /// # Panics + /// Panics if `dim == 0` or `weight > dim`. + pub fn new(dim: usize, weight: usize) -> Self { + assert!(dim > 0, "dim must be > 0"); + assert!(weight <= dim, "weight must be <= dim"); + Self { dim, weight } + } + + /// The number of positions. + pub fn dim(&self) -> usize { + self.dim + } + + /// The constant bitmap weight. + pub fn weight(&self) -> usize { + self.weight + } + + /// Total number of weight-`weight` bitmaps: `C(dim, weight)`. + pub fn space_size(&self) -> u128 { + choose(self.dim, self.weight) + } + + /// Number of weight-`weight` bitmaps overlapping a fixed weight-`weight` + /// bitmap in exactly `overlap` positions. + /// + /// This is the hypergeometric numerator + /// `C(weight, overlap) * C(dim - weight, weight - overlap)`: choose which + /// `overlap` of the `weight` set bits coincide, then place the remaining + /// `weight - overlap` set bits among the `dim - weight` zero positions. + /// Returns `0` for an infeasible `overlap` (more than `weight`, or leaving + /// more remaining set bits than there are zero positions). 
+ pub fn fiber_count(&self, overlap: usize) -> u128 { + if overlap > self.weight { + return 0; + } + let outside = self.weight - overlap; + if outside > self.dim - self.weight { + return 0; + } + choose(self.weight, overlap) * choose(self.dim - self.weight, outside) + } + + /// Upper-tail count `Σ_{o>=threshold} fiber_count(o)`. + /// + /// `tail_count(0) == space_size` (every bitmap overlaps in `>= 0` + /// positions), and `tail_count(threshold) == 0` for `threshold > weight` + /// (no bitmap overlaps a weight-`weight` bitmap in more than `weight` + /// positions). Monotone non-increasing in `threshold`. Divide by + /// [`Self::space_size`] for the exact upper-tail probability. + pub fn tail_count(&self, threshold: usize) -> u128 { + if threshold == 0 { + return self.space_size(); + } + if threshold > self.weight { + return 0; + } + (threshold..=self.weight) + .map(|overlap| self.fiber_count(overlap)) + .sum() + } +} + +/// Binomial coefficient `C(n, k)` in `u128`. +/// +/// Returns `0` for `k > n`. Uses the symmetric `k.min(n - k)` factor count and +/// the multiply-then-divide recurrence, which stays exact (each partial product +/// `C(n, i+1)` is integral). For large `n` the running product can exceed +/// `u128::MAX` — this panics in debug and wraps in release, matching the +/// reference prototype; see the module-level Overflow note. +pub fn choose(n: usize, k: usize) -> u128 { + if k > n { + return 0; + } + let k = k.min(n - k); + let mut acc = 1u128; + for i in 0..k { + acc = acc * (n - i) as u128 / (i + 1) as u128; + } + acc +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::bucket_code::{BucketCode, CompositionSpec}; + + /// Build a `dim`-length, 4-bucket code from raw bucket ids. 
+ fn code(values: &[u8]) -> BucketCode { + BucketCode::new( + CompositionSpec::new(values.len(), 4).unwrap(), + values.to_vec(), + ) + .unwrap() + } + + /// Naive shared-set-bit overlap over two packed bitmaps — the independent + /// reference both `overlap` implementations must match. Replaces the + /// prototype's `Contingency::top_overlap` cross-check (a retrieval/graph + /// concept deliberately not ported into ordvec under #222); the prototype's + /// literal expected values are reproduced verbatim below. + fn naive_packed_overlap( + a: &PackedConstantWeightBitmap, + b: &PackedConstantWeightBitmap, + ) -> usize { + a.words() + .iter() + .zip(b.words()) + .map(|(x, y)| (x & y).count_ones() as usize) + .sum() + } + + // ---- ordgraph bitmap parity gate ------------------------------------ + // Assertion values reproduced verbatim from the reference + // `ordgraph-proto/src/bitmap.rs` #[cfg(test)] module. The prototype + // cross-checked the overlap against `Contingency::top_overlap`; here the + // same literal (1, and [1, 3, 8]) is asserted directly and cross-checked + // against the naive shared-set-bit count instead. + + #[test] + fn top_bitmap_has_expected_constant_weight() { + let code = code(&[0, 0, 1, 1, 2, 2, 3, 3]); + let bitmap = ConstantWeightBitmap::from_top_bucket(&code); + + assert_eq!(bitmap.dim(), 8); + assert_eq!(bitmap.weight(), 2); + } + + #[test] + fn top_overlap_matches_naive_top_top_count() { + let query = code(&[0, 0, 1, 1, 2, 2, 3, 3]); + let doc = code(&[0, 1, 1, 2, 2, 3, 3, 0]); + let query_bitmap = ConstantWeightBitmap::from_top_bucket(&query); + let doc_bitmap = ConstantWeightBitmap::from_top_bucket(&doc); + + // Prototype literal: top-top overlap is 1. 
+ assert_eq!(query_bitmap.overlap(&doc_bitmap), 1); + } + + #[test] + fn packed_top_overlap_matches_naive_top_top_count() { + let query = code(&[0, 0, 1, 1, 2, 2, 3, 3]); + let doc = code(&[0, 1, 1, 2, 2, 3, 3, 0]); + let query_bitmap = PackedConstantWeightBitmap::from_top_group(&query, 1); + let doc_bitmap = PackedConstantWeightBitmap::from_top_group(&doc, 1); + + assert_eq!(query_bitmap.dim(), 8); + assert_eq!(query_bitmap.weight(), 2); + // Prototype literal: top-top overlap is 1. + assert_eq!(query_bitmap.overlap(&doc_bitmap), 1); + assert_eq!( + query_bitmap.overlap(&doc_bitmap), + naive_packed_overlap(&query_bitmap, &doc_bitmap) + ); + } + + #[test] + fn top_group_overlap_vector_uses_popcount_backed_bitmaps() { + let query = code(&[0, 0, 1, 1, 2, 2, 3, 3]); + let doc = code(&[0, 1, 1, 2, 2, 3, 3, 0]); + + // Prototype literal. + assert_eq!( + top_group_overlap_vector(&query, &doc, &[1, 2, 4]), + [1, 3, 8] + ); + } + + #[test] + fn bitmap_null_fibers_sum_to_space_size() { + let null = BitmapNull::new(10, 3); + let fiber_sum: u128 = (0..=3).map(|overlap| null.fiber_count(overlap)).sum(); + + assert_eq!(fiber_sum, choose(10, 3)); + assert_eq!(null.space_size(), choose(10, 3)); + } + + #[test] + fn bitmap_tail_counts_have_boundary_values_and_are_monotone() { + let null = BitmapNull::new(10, 3); + + assert_eq!(null.tail_count(0), choose(10, 3)); + assert_eq!(null.tail_count(4), 0); + assert!(null.tail_count(2) <= null.tail_count(1)); + assert!(null.tail_count(3) <= null.tail_count(2)); + } + + // ---- ordvec-specific correctness surface ---------------------------- + + #[test] + fn null_fibers_partition_space_for_several_params() { + // The fibers of the overlap statistic partition the whole space, so + // their counts must sum to C(dim, weight) for every (dim, weight). 
+ for (dim, weight) in [(8, 2), (10, 3), (16, 4), (20, 5), (32, 8), (5, 0), (5, 5)] { + let null = BitmapNull::new(dim, weight); + let fiber_sum: u128 = (0..=weight).map(|o| null.fiber_count(o)).sum(); + assert_eq!( + fiber_sum, + null.space_size(), + "fibers must partition the space for (dim={dim}, weight={weight})" + ); + } + } + + #[test] + fn overlap_parity_const_vs_packed_vs_naive() { + // The three overlap definitions — bool-bitmap shared-set-bit count, + // packed AND-popcount (via util::and_popcount), and the standalone + // naive packed reference — must all agree for the same codes, across + // every top-group width. + let query = code(&[0, 0, 1, 1, 2, 2, 3, 3]); + let doc = code(&[3, 2, 1, 0, 0, 1, 2, 3]); + + for width in 1..=4 { + let packed_q = PackedConstantWeightBitmap::from_top_group(&query, width); + let packed_d = PackedConstantWeightBitmap::from_top_group(&doc, width); + let packed_overlap = packed_q.overlap(&packed_d); + let naive = naive_packed_overlap(&packed_q, &packed_d); + assert_eq!(packed_overlap, naive, "packed vs naive at width {width}"); + + if width == 1 { + // Width 1 is exactly the top bucket — the bool bitmap path. + let const_q = ConstantWeightBitmap::from_top_bucket(&query); + let const_d = ConstantWeightBitmap::from_top_bucket(&doc); + assert_eq!( + const_q.overlap(&const_d), + packed_overlap, + "bool vs packed at the top bucket" + ); + } + } + } + + #[test] + fn packed_overlap_handles_multi_word_dim() { + // dim = 128 spans two u64 words, exercising the shared and_popcount + // reduction across word boundaries. A 4-bucket code over 128 coords + // puts 32 coordinates in the top bucket; overlapping a code with + // itself yields exactly its weight. 
+ let values: Vec = (0..128).map(|i| (i % 4) as u8).collect(); + let code = BucketCode::new(CompositionSpec::new(128, 4).unwrap(), values).unwrap(); + let bitmap = PackedConstantWeightBitmap::from_top_group(&code, 1); + assert_eq!(bitmap.dim(), 128); + assert_eq!(bitmap.words().len(), 2); + assert_eq!(bitmap.weight(), 32); + assert_eq!(bitmap.overlap(&bitmap), 32); + } + + #[test] + fn choose_matches_known_small_binomials() { + assert_eq!(choose(0, 0), 1); + assert_eq!(choose(5, 0), 1); + assert_eq!(choose(5, 5), 1); + assert_eq!(choose(5, 2), 10); + assert_eq!(choose(10, 3), 120); + assert_eq!(choose(6, 3), 20); + assert_eq!(choose(52, 5), 2_598_960); + // k > n is empty. + assert_eq!(choose(3, 4), 0); + } + + #[test] + fn choose_is_symmetric() { + for n in 0..=30usize { + for k in 0..=n { + assert_eq!( + choose(n, k), + choose(n, n - k), + "C({n},{k}) == C({n},{})", + n - k + ); + } + } + } + + #[test] + fn fiber_count_zero_outside_feasible_overlap() { + let null = BitmapNull::new(10, 3); + // An overlap larger than the weight is impossible. + assert_eq!(null.fiber_count(4), 0); + // Exactly the weight: all set bits coincide — there is exactly one such + // bitmap (the fixed one itself). + assert_eq!(null.fiber_count(3), 1); + } + + #[test] + fn tail_probability_is_well_formed() { + // tail_count(0) / space_size == 1; the tail at every threshold is a + // valid fraction of the space. + let null = BitmapNull::new(16, 4); + let space = null.space_size(); + assert_eq!(null.tail_count(0), space); + for threshold in 0..=5 { + assert!(null.tail_count(threshold) <= space); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 14097eb9..28beaa47 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,6 +54,9 @@ mod bitmap; /// Index-free, fixed-composition ordinal bucket codes (issue #220). #[cfg(feature = "experimental")] pub mod bucket_code; +/// Constant-weight bitmap overlap + the finite constant-weight null (issue #222). 
+#[cfg(feature = "experimental")] +pub mod const_weight_bitmap; mod fastscan; #[cfg(feature = "experimental")] mod multi_bucket; @@ -96,6 +99,18 @@ pub use multi_bucket::MultiBucketBitmap; #[cfg(feature = "experimental")] pub use bucket_code::{BucketCode, CompositionSpec, CompositionViolation, RankQuantSpec}; +// Constant-weight bitmap overlap + the finite constant-weight null (issue #222). +// The ordinal-kernel evidence surface built on the #220 bucket codes: the +// top-bucket / top-group constant-weight bitmaps, their popcount overlap (routed +// through the crate's shared `util::and_popcount` primitive), and the idealized +// uniform constant-weight null that turns an observed overlap into an exact +// finite tail probability. Behind the `experimental` feature; whether it +// graduates to the stable surface is a deliberate later decision. +#[cfg(feature = "experimental")] +pub use const_weight_bitmap::{ + choose, top_group_overlap_vector, BitmapNull, ConstantWeightBitmap, PackedConstantWeightBitmap, +}; + // `RankQuantFastscan` is an optional FastScan b=2 scan path. It is // re-exported `#[doc(hidden)]` at the crate root — reachable as // `ordvec::RankQuantFastscan` for callers who opt in, but not From 4123a8bcf6375aeba37a93c54a5bb00c4027c012 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 16:01:34 -0500 Subject: [PATCH 03/10] fix(bucket_code): bound buckets<=256 + u64 bucket math (gemini/qodo) (#226) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CompositionSpec::new now rejects buckets>256 (InvalidSpec) — codes are u8, so >256 would silently truncate a computed id via 'as u8' and fail later with a misleading WrongBucketCount. - from_ranks computes rank*buckets in u64 to avoid usize overflow on 32-bit/ wasm32 for large dim (matches util.rs checked-math discipline). - Tests: >256-buckets rejection + boundary; from_vector unsupported-bits path. 
Signed-off-by: Nelson Spence --- src/bucket_code.rs | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/bucket_code.rs b/src/bucket_code.rs index 916d7879..6a9d0448 100644 --- a/src/bucket_code.rs +++ b/src/bucket_code.rs @@ -65,6 +65,15 @@ impl CompositionSpec { if buckets < 2 { return Err(CompositionViolation::InvalidSpec("buckets must be >= 2")); } + // Codes are stored as `u8`, so a bucket id must fit in `0..=255`. Reject + // `buckets > 256` here rather than letting `from_ranks` silently truncate + // a computed id via `as u8` and fail later with a misleading + // `WrongBucketCount`. + if buckets > u8::MAX as usize + 1 { + return Err(CompositionViolation::InvalidSpec( + "buckets must be <= 256 (codes are stored as u8)", + )); + } if !dim.is_multiple_of(buckets) { return Err(CompositionViolation::NonUniformSpec { dim, buckets }); } @@ -269,9 +278,11 @@ impl BucketCode { } seen[rank] = true; // `rank < dim` and `dim % buckets == 0`, so `rank * buckets / dim` - // lands in `[0, buckets)` and `buckets <= 1 << 8` for any code that - // can be a `u8`. (For the RankQuant domain `buckets <= 16`.) - codes.push((rank * buckets / dim) as u8); + // lands in `[0, buckets)` and `buckets <= 256` (enforced in + // `CompositionSpec::new`), so the result fits a `u8`. Compute the + // product in `u64`: `rank * buckets` can exceed `usize::MAX` on + // 32-bit / wasm32 targets for large `dim`. + codes.push(((rank as u64 * buckets as u64) / dim as u64) as u8); } Self::new(spec, codes) } @@ -513,6 +524,11 @@ mod tests { RankQuantSpec::new(8, 3).unwrap_err(), CompositionViolation::InvalidBits { bits: 3 } ); + // `from_vector` surfaces the same unsupported-bits rejection. 
+ assert_eq!( + BucketCode::from_vector(8, 3, &[0.0f32; 8]).unwrap_err(), + CompositionViolation::InvalidBits { bits: 3 } + ); assert_eq!( RankQuantSpec::new(u16::MAX as usize + 1, 2).unwrap_err(), CompositionViolation::DimTooLarge { @@ -522,6 +538,17 @@ mod tests { ); } + #[test] + fn composition_spec_rejects_more_than_256_buckets() { + // Codes are u8: a bucket id must fit 0..=255. + assert_eq!( + CompositionSpec::new(512, 257).unwrap_err(), + CompositionViolation::InvalidSpec("buckets must be <= 256 (codes are stored as u8)") + ); + // 256 is the boundary and is accepted (dim a multiple of it). + assert!(CompositionSpec::new(512, 256).is_ok()); + } + #[test] fn rankquant_spec_rejects_non_divisible_dims() { assert_eq!( From fb7561f312686dd3d69ec7b0970b185b78e04986 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 16:04:30 -0500 Subject: [PATCH 04/10] fix(const_weight_bitmap): fail-loud overflow in choose/fiber_count (gemini/qodo) (#222) Resolves the flagged u128-overflow decision in the safe direction (the bots and the maintainer both wanted fail-loud before stable): - choose(): gcd-cancellation of each (n-i)/(i+1) factor (extends the representable range to the full set of C(n,k) that fit u128) + checked_mul, so a true overflow PANICS (both debug and release) instead of silently wrapping to a wrong count. - fiber_count(): checked_mul on the two binomials. - Overflow docs updated (deliberate divergence from the proto's wrap-in-release). - Tests: range-extension via Pascal's identity (C(128,64)); should_panic on C(300,150) overflow. Signed-off-by: Nelson Spence --- src/const_weight_bitmap.rs | 67 ++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/src/const_weight_bitmap.rs b/src/const_weight_bitmap.rs index 4c655892..eaafadd6 100644 --- a/src/const_weight_bitmap.rs +++ b/src/const_weight_bitmap.rs @@ -36,13 +36,16 @@ //! //! # Overflow //! 
[`choose`] (and therefore [`BitmapNull::space_size`] / `fiber_count` / -//! `tail_count`) accumulates in `u128`. For very large `dim` the binomial -//! `C(dim, weight)` exceeds `u128::MAX` and the running product overflows — -//! this panics in debug builds and wraps in release, exactly as the reference -//! prototype does. The finite null is intended for the small `dim`/`weight` -//! regime where the exact count is representable; callers operating near the -//! `u128` ceiling must bound their parameters. This matches the prototype's -//! behaviour (see the module-level FLAG in the porting notes — no divergence). +//! `tail_count`) accumulates in `u128`. gcd-cancellation keeps the running +//! product minimal, so the representable range is the full set of `(dim, weight)` +//! whose true `C(dim, weight)` fits `u128`. Beyond that the result is not +//! representable and the count **panics (fail-loud)** — in both debug and +//! release — rather than silently wrapping to a wrong value. (This is a +//! deliberate divergence from the reference prototype, which wrapped in release; +//! a public combinatorial that returns a wrong count is unacceptable for an +//! exact null.) The finite null targets the small `dim`/`weight` regime where +//! the exact count is representable; callers near the `u128` ceiling must bound +//! their parameters or pre-check. use crate::bucket_code::BucketCode; use crate::util::and_popcount; @@ -286,7 +289,9 @@ impl BitmapNull { if outside > self.dim - self.weight { return 0; } - choose(self.weight, overlap) * choose(self.dim - self.weight, outside) + choose(self.weight, overlap) + .checked_mul(choose(self.dim - self.weight, outside)) + .expect("fiber count overflows u128") } /// Upper-tail count `Σ_{o>=threshold} fiber_count(o)`. @@ -312,10 +317,11 @@ impl BitmapNull { /// Binomial coefficient `C(n, k)` in `u128`. /// /// Returns `0` for `k > n`. 
Uses the symmetric `k.min(n - k)` factor count and -/// the multiply-then-divide recurrence, which stays exact (each partial product -/// `C(n, i+1)` is integral). For large `n` the running product can exceed -/// `u128::MAX` — this panics in debug and wraps in release, matching the -/// reference prototype; see the module-level Overflow note. +/// an exact multiply-then-divide recurrence, with gcd-cancellation of each +/// `(n - i)/(i + 1)` factor to keep the running product as small as possible +/// before each step. The multiply is `checked_mul`: if the true `C(n, k)` +/// exceeds `u128::MAX` this **panics** (fail-loud) rather than silently wrapping +/// to a wrong count. See the module-level Overflow note. pub fn choose(n: usize, k: usize) -> u128 { if k > n { return 0; @@ -323,11 +329,30 @@ pub fn choose(n: usize, k: usize) -> u128 { let k = k.min(n - k); let mut acc = 1u128; for i in 0..k { - acc = acc * (n - i) as u128 / (i + 1) as u128; + let num = (n - i) as u128; + let den = (i + 1) as u128; + // Cancel the shared factor first: this both shrinks the intermediate + // product (extending the representable range) and keeps the division + // exact — `den / g` is coprime to `num / g`, and the result `C(n, i+1)` + // is integral, so `den / g` divides `acc`. + let g = gcd(num, den); + acc = (acc / (den / g)) + .checked_mul(num / g) + .expect("binomial coefficient C(n, k) overflows u128"); } acc } +/// Greatest common divisor (Euclid), for the exact binomial cancellation above. +fn gcd(mut a: u128, mut b: u128) -> u128 { + while b != 0 { + let t = a % b; + a = b; + b = t; + } + a +} + #[cfg(test)] mod tests { use super::*; @@ -521,6 +546,22 @@ mod tests { } } + #[test] + fn choose_extends_range_via_gcd_cancellation() { + // C(128, 64) fits u128 but the naive multiply-then-divide recurrence + // overflows the intermediate product; gcd-cancellation computes it. + // Validate via Pascal's identity (no huge literal): C(n,k)=C(n-1,k-1)+C(n-1,k). 
+ assert_eq!(choose(128, 64), choose(127, 63) + choose(127, 64)); + assert!(choose(128, 64) > 0); + } + + #[test] + #[should_panic(expected = "overflows u128")] + fn choose_panics_fail_loud_on_overflow() { + // C(300, 150) is far beyond u128::MAX: fail loud, never wrap to a wrong count. + let _ = choose(300, 150); + } + #[test] fn fiber_count_zero_outside_feasible_overlap() { let null = BitmapNull::new(10, 3); From fd1efe89ca48263beb16ab4bd0f7176c7494db15 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 17:39:34 -0500 Subject: [PATCH 05/10] docs(bucket_code): add ordgraph-proto migration note (qodo) (#220) qodo: the PR added the bucket-code surface without explicit migration steps for deleting ordgraph-proto's local CompositionSpec/RankQuantSpec/BucketCode fork. Add a 'Migration from ordgraph-proto' subsection: the drop-in import + delete src/code.rs steps, the rank-math-delegation note, and the two intentional deviations (bits=8 deferred to #221, buckets<=256). Signed-off-by: Nelson Spence --- src/bucket_code.rs | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/bucket_code.rs b/src/bucket_code.rs index 6a9d0448..73e9b1d7 100644 --- a/src/bucket_code.rs +++ b/src/bucket_code.rs @@ -27,9 +27,27 @@ //! the stateless dense-code contingency surface (`Contingency::new`, issue //! #219) without any further transform. //! -//! Ported to reach behavioural parity with the `ordgraph` bucket-code -//! prototype; the rank math is *not* re-implemented here — it delegates to the -//! crate's shared [`crate::rank`] primitives. +//! ## Migration from `ordgraph-proto` +//! +//! This surface replaces the local `CompositionSpec` / `RankQuantSpec` / +//! `BucketCode` / `CompositionViolation` fork in `ordgraph-proto/src/code.rs`. +//! Type names, constructors, accessors, and error values mirror that prototype +//! (its tests' literal expectations are reproduced here as parity checks), so +//! 
the downstream migration is a drop-in: +//! +//! 1. depend on `ordvec` (with the `experimental` feature while this surface is +//! gated) and `use ordvec::{BucketCode, CompositionSpec, RankQuantSpec, +//! CompositionViolation};` +//! 2. delete `ordgraph-proto/src/code.rs` and its `mod code;`, re-pointing +//! callers at the `ordvec` types; +//! 3. the rank math is *not* re-implemented here — [`BucketCode::from_vector`] +//! delegates to the crate's shared [`crate::rank`] primitives, so ordgraph +//! no longer forks rank/bucket semantics. +//! +//! Two intentional differences from the prototype (rationale in the PR): +//! `bits = 8` is rejected here (it lands as a capability-gated width in the +//! separate b=8 work, #221), and [`CompositionSpec::new`] rejects +//! `buckets > 256` (codes are `u8`). use std::error::Error; use std::fmt; From 9d56540d9e99d643694bde3578da238daf7d4e6a Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:22:32 -0500 Subject: [PATCH 06/10] test: pin bits=8 rejection for RankQuantSpec and BucketCode::from_vector Add a focused test `rankquant_spec_rejects_bits_8` that asserts both `RankQuantSpec::new(8, 8)` and `BucketCode::from_vector(8, 8, &v)` return `CompositionViolation::InvalidBits { bits: 8 }`, pinning the deliberate b=8 exclusion so the boundary cannot regress silently. Addresses the qodo finding: "bits=8 behavior untested" (PR #226). Signed-off-by: Nelson Spence --- src/bucket_code.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/bucket_code.rs b/src/bucket_code.rs index 73e9b1d7..17e2fdd8 100644 --- a/src/bucket_code.rs +++ b/src/bucket_code.rs @@ -556,6 +556,22 @@ mod tests { ); } + // Pin the b=8 decision: the reference prototype accepted bits=8 but ordvec + // rejects it. These tests ensure that boundary cannot change silently. 
+ #[test] + fn rankquant_spec_rejects_bits_8() { + assert_eq!( + RankQuantSpec::new(8, 8).unwrap_err(), + CompositionViolation::InvalidBits { bits: 8 } + ); + // `from_vector` takes the same path: bits=8 is rejected at the spec level. + let v: Vec = (0..8).map(|i| i as f32).collect(); + assert_eq!( + BucketCode::from_vector(8, 8, &v).unwrap_err(), + CompositionViolation::InvalidBits { bits: 8 } + ); + } + #[test] fn composition_spec_rejects_more_than_256_buckets() { // Codes are u8: a bucket id must fit 0..=255. From e8b0a8d8f369fcc14b7b5ad17716b9b8672d09a2 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:26:00 -0500 Subject: [PATCH 07/10] fix: harden bitmap overlap and BitmapNull against three qodo findings - PackedConstantWeightBitmap::overlap: add assert!(dim <= u32::MAX) before and_popcount, which accumulates into u32 and would overflow silently for very large dims (Finding 1 / Correctness). - top_group_overlap_vector: add assert_eq!(lhs.spec(), rhs.spec()) precondition with a clear message; comparing bitmaps across specs is meaningless (Finding 2 / Maintainability). - BitmapNull::tail_probability(observed): new method returning tail_count(observed) / space_size as f64, the exact hypergeometric upper-tail P(overlap >= observed); rustdoc + unit tests pin boundary values and a known exact result (Finding 3 / Requirement gap). Tests added: packed_overlap_within_u32_max_does_not_panic, top_group_overlap_vector_panics_on_mismatched_spec, top_group_overlap_vector_passes_on_matching_spec, tail_probability_boundary_values, tail_probability_known_value, tail_probability_is_in_unit_interval_and_monotone. 
Signed-off-by: Nelson Spence Signed-off-by: Nelson Spence --- src/const_weight_bitmap.rs | 141 ++++++++++++++++++++++++++++++++++++- 1 file changed, 139 insertions(+), 2 deletions(-) diff --git a/src/const_weight_bitmap.rs b/src/const_weight_bitmap.rs index eaafadd6..9229e136 100644 --- a/src/const_weight_bitmap.rs +++ b/src/const_weight_bitmap.rs @@ -198,8 +198,15 @@ impl PackedConstantWeightBitmap { /// Panics if the two bitmaps have different dimensions (their word counts /// then differ, which `and_popcount` itself rejects). The explicit `dim` /// check fails loud with the bitmap-specific message before the reduction. + /// Also panics if `dim > u32::MAX`: `and_popcount` accumulates into `u32`, + /// so a larger bitmap could overflow the popcount before the cast to `usize`. pub fn overlap(&self, other: &Self) -> usize { assert_eq!(self.dim, other.dim, "bitmap dimensions must match"); + assert!( + self.dim <= u32::MAX as usize, + "bitmap dim {} exceeds u32::MAX; and_popcount accumulates in u32 and would overflow", + self.dim + ); and_popcount(&self.words, &other.words) as usize } } @@ -208,13 +215,25 @@ impl PackedConstantWeightBitmap { /// /// For each `width` in `widths`, builds the packed top-`width`-group bitmaps of /// `lhs` and `rhs` and returns their popcount overlap. The result is a vector -/// parallel to `widths`. Both codes must share the same dimension (and, for -/// `from_top_group` to be meaningful, the same spec). +/// parallel to `widths`. Both codes must share the same spec (same `dim` and +/// `buckets`): the top-group bitmaps are only comparable when the constant +/// weight per bucket is identical across both codes. +/// +/// # Panics +/// Panics if `lhs` and `rhs` have different specs (dim or bucket count differs). 
pub fn top_group_overlap_vector( lhs: &BucketCode, rhs: &BucketCode, widths: &[usize], ) -> Vec { + assert_eq!( + lhs.spec(), + rhs.spec(), + "top_group_overlap_vector: lhs and rhs must share the same spec \ + (dim and buckets must match); got lhs={:?}, rhs={:?}", + lhs.spec(), + rhs.spec() + ); widths .iter() .map(|&width| { @@ -312,6 +331,39 @@ impl BitmapNull { .map(|overlap| self.fiber_count(overlap)) .sum() } + + /// Exact upper-tail probability `P(overlap >= observed)` under the uniform + /// constant-weight null. + /// + /// Returns `tail_count(observed) / space_size` as an `f64`. This is the + /// fraction of all weight-`weight` bitmaps whose overlap with a fixed + /// weight-`weight` bitmap is at least `observed` — the exact hypergeometric + /// upper tail at the given threshold. + /// + /// Returns `0.0` for `observed > weight` (impossible overlap) and `1.0` + /// for `observed == 0` (all bitmaps overlap in `>= 0` positions). + /// + /// # Example + /// ``` + /// # #[cfg(feature = "experimental")] { + /// use ordvec::const_weight_bitmap::BitmapNull; + /// let null = BitmapNull::new(10, 3); + /// // All bitmaps have overlap >= 0. + /// assert_eq!(null.tail_probability(0), 1.0); + /// // No bitmap overlaps in more than weight positions. + /// assert_eq!(null.tail_probability(4), 0.0); + /// // The probability is in [0, 1]. + /// let p = null.tail_probability(2); + /// assert!(p >= 0.0 && p <= 1.0); + /// # } + /// ``` + pub fn tail_probability(&self, observed: usize) -> f64 { + let space = self.space_size(); + if space == 0 { + return 0.0; + } + self.tail_count(observed) as f64 / space as f64 + } } /// Binomial coefficient `C(n, k)` in `u128`. 
@@ -583,4 +635,89 @@ mod tests { assert!(null.tail_count(threshold) <= space); } } + + // ---- Finding 1: u32 overflow in overlap (assert dim <= u32::MAX) ------- + // + // Constructing a bitmap with dim > u32::MAX would require ~512 MB of u64 + // words, so the guard's panic path is not exercised here. The positive test + // below only confirms that a small, reachable dim (128 coords = 2 u64 words, + // well below u32::MAX) does not trip the guard. + + #[test] + fn packed_overlap_within_u32_max_does_not_panic() { + // dim = 128 (well within u32::MAX) must not trigger the domain guard. + let values: Vec<u8> = (0..128).map(|i| (i % 4) as u8).collect(); + let c = BucketCode::new(CompositionSpec::new(128, 4).unwrap(), values).unwrap(); + let bm = PackedConstantWeightBitmap::from_top_group(&c, 1); + // Should not panic: dim=128 is far below u32::MAX. + let _ = bm.overlap(&bm); + } + + // ---- Finding 2: Unenforced same-spec precondition ---------------------- + + #[test] + #[should_panic(expected = "lhs and rhs must share the same spec")] + fn top_group_overlap_vector_panics_on_mismatched_spec() { + // Two codes with the same dim but different bucket counts — different + // specs — must trigger the precondition assert. + let lhs = BucketCode::new( + CompositionSpec::new(8, 4).unwrap(), + vec![0, 0, 1, 1, 2, 2, 3, 3], + ) + .unwrap(); + let rhs = BucketCode::new( + CompositionSpec::new(8, 2).unwrap(), + vec![0, 0, 0, 0, 1, 1, 1, 1], + ) + .unwrap(); + let _ = top_group_overlap_vector(&lhs, &rhs, &[1]); + } + + #[test] + fn top_group_overlap_vector_passes_on_matching_spec() { + // Two codes with the same spec must not trigger the precondition. + let lhs = code(&[0, 0, 1, 1, 2, 2, 3, 3]); + let rhs = code(&[0, 1, 1, 2, 2, 3, 3, 0]); + // Should not panic: same spec.
+ let _ = top_group_overlap_vector(&lhs, &rhs, &[1]); + } + + // ---- Finding 3: BitmapNull::tail_probability --------------------------- + + #[test] + fn tail_probability_boundary_values() { + // P(overlap >= 0) == 1.0 (every bitmap qualifies). + // P(overlap >= weight + 1) == 0.0 (no bitmap qualifies). + let null = BitmapNull::new(10, 3); + assert_eq!(null.tail_probability(0), 1.0); + assert_eq!(null.tail_probability(4), 0.0); + } + + #[test] + fn tail_probability_known_value() { + // C(10, 3) = 120. fiber_count(3) = C(3,3)*C(7,0) = 1. + // So P(overlap >= 3) = 1/120 = 0.008333... + let null = BitmapNull::new(10, 3); + let expected = 1.0_f64 / 120.0_f64; + let got = null.tail_probability(3); + assert!( + (got - expected).abs() < 1e-12, + "tail_probability(3) expected {expected} got {got}" + ); + } + + #[test] + fn tail_probability_is_in_unit_interval_and_monotone() { + let null = BitmapNull::new(16, 4); + let mut prev = 1.0_f64; + for threshold in 0..=5 { + let p = null.tail_probability(threshold); + assert!( + (0.0..=1.0).contains(&p), + "probability out of [0,1] at threshold={threshold}" + ); + assert!(p <= prev, "tail_probability must be non-increasing"); + prev = p; + } + } } From 1c8d5ab893f670bd69c717fa6cb5bf6f7baec66f Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:54:03 -0500 Subject: [PATCH 08/10] docs: product-clean the bucket-code surface (remove internal-prototype references) Replace the "Migration from ordgraph-proto" rustdoc section with a self-contained "Adopting this API" adoption note describing how external callers can replace local forks with the ordvec types. Rename the "ordgraph bucket-code parity gate" test block comment to "bucket-code behavioral contract", describing the invariants under test rather than a migration relationship. No production behaviour, test logic, or assert values changed. 
Signed-off-by: Nelson Spence --- src/bucket_code.rs | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/bucket_code.rs b/src/bucket_code.rs index 17e2fdd8..46afd985 100644 --- a/src/bucket_code.rs +++ b/src/bucket_code.rs @@ -27,27 +27,25 @@ //! the stateless dense-code contingency surface (`Contingency::new`, issue //! #219) without any further transform. //! -//! ## Migration from `ordgraph-proto` +//! ## Adopting this API — reusable, index-free bucket-code surface //! -//! This surface replaces the local `CompositionSpec` / `RankQuantSpec` / -//! `BucketCode` / `CompositionViolation` fork in `ordgraph-proto/src/code.rs`. -//! Type names, constructors, accessors, and error values mirror that prototype -//! (its tests' literal expectations are reproduced here as parity checks), so -//! the downstream migration is a drop-in: +//! This surface is designed to be reusable outside of any retrieval index. +//! If you maintain a local fork of composition-spec or bucket-code logic, +//! replace it with: //! -//! 1. depend on `ordvec` (with the `experimental` feature while this surface is -//! gated) and `use ordvec::{BucketCode, CompositionSpec, RankQuantSpec, -//! CompositionViolation};` -//! 2. delete `ordgraph-proto/src/code.rs` and its `mod code;`, re-pointing -//! callers at the `ordvec` types; -//! 3. the rank math is *not* re-implemented here — [`BucketCode::from_vector`] -//! delegates to the crate's shared [`crate::rank`] primitives, so ordgraph -//! no longer forks rank/bucket semantics. +//! ```rust,ignore +//! use ordvec::{BucketCode, CompositionSpec, RankQuantSpec, CompositionViolation}; +//! ``` //! -//! Two intentional differences from the prototype (rationale in the PR): -//! `bits = 8` is rejected here (it lands as a capability-gated width in the -//! separate b=8 work, #221), and [`CompositionSpec::new`] rejects -//! `buckets > 256` (codes are `u8`). +//! 
(Enable the `experimental` feature while this surface is gated.) +//! +//! The rank math is not re-implemented here — [`BucketCode::from_vector`] +//! delegates to the crate's shared [`crate::rank`] primitives, so callers +//! no longer need to fork rank or bucket semantics. +//! +//! Two intentional constraints to note: `bits = 8` is rejected (it lands as a +//! capability-gated width in the separate b=8 work, #221), and +//! [`CompositionSpec::new`] rejects `buckets > 256` (codes are `u8`). use std::error::Error; use std::fmt; @@ -502,9 +500,10 @@ impl Error for CompositionViolation {} mod tests { use super::*; - // ---- ordgraph bucket-code parity gate ------------------------------- - // Every assertion value below is reproduced verbatim from the reference - // `code.rs` #[cfg(test)] module. + // ---- bucket-code behavioral contract -------------------------------- + // These tests assert the core behavioral invariants of the bucket-code + // surface: composition balance, rank permutation semantics, and error + // conditions. Every numeric expectation is pinned to catch regressions. #[test] fn from_ranks_builds_uniform_bucket_code() { From f6d35809a45e6b499a4648cf744e28b5fbfe757a Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 18:59:39 -0500 Subject: [PATCH 09/10] docs(const_weight_bitmap): product-clean the bitmap surface (remove internal-prototype references) Replace the module-level "ported to reach parity with..." paragraph with a product-clean adoption note following the pattern established for bucket_code.rs in PR #220. Replace the internal "parity gate" test comment with a behavioral description that explains what the pinned literals assert without naming any internal repository. No production behavior change; no test logic or assertion values changed. 
Signed-off-by: Nelson Spence Signed-off-by: Nelson Spence --- src/const_weight_bitmap.rs | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/const_weight_bitmap.rs b/src/const_weight_bitmap.rs index 9229e136..f3c483e0 100644 --- a/src/const_weight_bitmap.rs +++ b/src/const_weight_bitmap.rs @@ -29,10 +29,23 @@ //! [`BitmapNull::space_size`] give an exact upper-tail probability for an //! overlap cutoff. //! -//! Ported to reach behavioural parity with the `ordgraph` bitmap prototype -//! (`ordgraph-proto/src/bitmap.rs`); the popcount reduction is *not* -//! re-implemented — it delegates to the crate's shared `crate::util` -//! primitive. +//! ## Adopting this API — reusable, index-free bitmap surface +//! +//! This surface is designed to be reusable outside of any retrieval or graph +//! index. If you maintain a local fork of constant-weight bitmap or null logic, +//! replace it with: +//! +//! ```rust,ignore +//! use ordvec::const_weight_bitmap::{ +//! BitmapNull, ConstantWeightBitmap, PackedConstantWeightBitmap, +//! choose, top_group_overlap_vector, +//! }; +//! ``` +//! +//! (Enable the `experimental` feature while this surface is gated.) +//! +//! The popcount reduction is not re-implemented here — it delegates to the +//! crate's shared `crate::util::and_popcount` primitive. //! //! # Overflow //! [`choose`] (and therefore [`BitmapNull::space_size`] / `fiber_count` / @@ -435,12 +448,13 @@ mod tests { .sum() } - // ---- ordgraph bitmap parity gate ------------------------------------ - // Assertion values reproduced verbatim from the reference - // `ordgraph-proto/src/bitmap.rs` #[cfg(test)] module. The prototype - // cross-checked the overlap against `Contingency::top_overlap`; here the - // same literal (1, and [1, 3, 8]) is asserted directly and cross-checked - // against the naive shared-set-bit count instead. 
+ // ---- bitmap behavioral contract — pinned literals ------------------- + // The following assertion values pin the core bitmap overlap contract: a + // top-bucket membership bitmap over a 4-bucket code correctly counts the + // shared set bits, and the overlap vector `[1, 3, 8]` across widths + // `[1, 2, 4]` reproduces the expected cumulative shared-coordinate counts. + // Cross-checked against the naive shared-set-bit count (`naive_packed_overlap`) + // to keep both the bool-bitmap and packed-popcount paths honest. #[test] fn top_bitmap_has_expected_constant_weight() { From 5dc611bfae61fed896fd592974e865c6ee331bb8 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 20:40:22 -0500 Subject: [PATCH 10/10] fix: honest exactness + gcd-reduced f64 for BitmapNull::tail_probability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two qodo findings on the finite constant-weight null: - (Correctness) tail_probability advertised an 'exact' probability but computed `tail_count as f64 / space_size as f64`, double-rounding two large u128 counts; for C(dim, weight) past 2^53 that can mis-round (e.g. report 1.0 when the true value is just under 1). Reframe: the EXACT surface is the u128 `tail_count` / `space_size`; tail_probability returns the nearest f64, now gcd-reducing the two counts first so the conversion is exact whenever the reduced pair fits an f64 mantissa, and short-circuiting the observed==0 (1.0) and observed>weight (0.0) boundaries with no division. New test pins tail_probability == nearest f64 of the gcd-reduced exact ratio across cases up to C(100,50) ~ 1e29. - (Requirement gap) document that this is an in-model finite null under the idealized uniform constant-weight assumption — a selectivity / false-positive statement, NOT a real-corpus or encoder guarantee and not corpus-calibrated evidence strength. 
Signed-off-by: Nelson Spence --- src/const_weight_bitmap.rs | 65 ++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/src/const_weight_bitmap.rs b/src/const_weight_bitmap.rs index f3c483e0..42ed8e13 100644 --- a/src/const_weight_bitmap.rs +++ b/src/const_weight_bitmap.rs @@ -345,14 +345,30 @@ impl BitmapNull { .sum() } - /// Exact upper-tail probability `P(overlap >= observed)` under the uniform - /// constant-weight null. + /// Upper-tail probability `P(overlap >= observed)` under the **idealized + /// uniform constant-weight null**, returned as an `f64`. /// - /// Returns `tail_count(observed) / space_size` as an `f64`. This is the - /// fraction of all weight-`weight` bitmaps whose overlap with a fixed - /// weight-`weight` bitmap is at least `observed` — the exact hypergeometric + /// This is the fraction of all weight-`weight` bitmaps whose overlap with a + /// fixed weight-`weight` bitmap is at least `observed` — the hypergeometric /// upper tail at the given threshold. /// + /// **This is an in-model finite null, not a real-corpus guarantee.** It + /// assumes bitmaps are drawn uniformly at random among all weight-`weight` + /// patterns. Real embeddings need not satisfy that assumption, so a small + /// tail probability is a *selectivity* (false-positive-rate) statement under + /// this idealized model — **not** corpus-calibrated evidence strength and not + /// proof that an observed overlap is meaningful on real data. + /// + /// The **exact** result is the rational `tail_count(observed) / space_size`, + /// both available as exact `u128` via [`Self::tail_count`] and + /// [`Self::space_size`]; callers needing exact reasoning should use those + /// directly. This method returns an `f64` approximation of that rational (the + /// counts are gcd-reduced first, so the result is correctly rounded whenever + /// the reduced numerator and denominator fit in an `f64` mantissa).
For very + /// large `C(dim, weight)` the true value may not be `f64`-representable — + /// e.g. a value just below `1.0` rounds to `1.0`, the nearest `f64` — which + /// is a representation limit, not an inexactness in the underlying counts. + /// /// Returns `0.0` for `observed > weight` (impossible overlap) and `1.0` /// for `observed == 0` (all bitmaps overlap in `>= 0` positions). /// @@ -371,11 +387,24 @@ impl BitmapNull { /// # } /// ``` pub fn tail_probability(&self, observed: usize) -> f64 { + // Exact short-circuits — no division, no rounding. + if observed == 0 { + return 1.0; + } + if observed > self.weight { + return 0.0; + } let space = self.space_size(); if space == 0 { return 0.0; } - self.tail_count(observed) as f64 / space as f64 + let count = self.tail_count(observed); + // Reduce by the gcd so the `f64` conversion uses the smallest equivalent + // integers: when the reduced numerator and denominator both fit in an + // `f64` mantissa both casts are exact and the quotient is correctly + // rounded, avoiding a needless double-rounding of two large `u128`s. + let g = gcd(count, space); + (count / g) as f64 / (space / g) as f64 } } @@ -734,4 +763,28 @@ mod tests { prev = p; } } + + #[test] + fn tail_probability_matches_exact_gcd_reduced_ratio() { + // `tail_probability` must equal the `f64` quotient of the gcd-reduced + // exact `u128` counts `tail_count / space_size`, so that two large + // counts are not needlessly double-rounded. `C(64, 32) ≈ 1.8e18` + // and `C(100, 50) ≈ 1e29` are far past `2^53` (yet still inside `u128`) — + // the regime where the naive `count as f64 / space as f64` cast rounds.
+ for &(dim, weight) in &[(10usize, 3usize), (16, 4), (64, 32), (100, 50)] { + let null = BitmapNull::new(dim, weight); + let space = null.space_size(); + for observed in 0..=weight + 1 { + let count = null.tail_count(observed); + let g = gcd(count, space); + let expected = (count / g) as f64 / (space / g) as f64; + assert_eq!( + null.tail_probability(observed), + expected, + "dim={dim} weight={weight} observed={observed}" + ); + assert!((0.0..=1.0).contains(&null.tail_probability(observed))); + } + } + } }