diff --git a/examples/bench_contingency.rs b/examples/bench_contingency.rs
new file mode 100644
index 00000000..d2f2a8ab
--- /dev/null
+++ b/examples/bench_contingency.rs
@@ -0,0 +1,497 @@
+//! Profiling harness for the contingency / projection surface (issue #219),
+//! on a SYNTHETIC seeded corpus — NOT real data, fully reproducible.
+//!
+//! Two APIs, three regimes:
+//!
+//! * API 1 — stateless dense [`ordvec::Contingency`]: the `nb × nb`
+//!   bucket-overlap table for two `&[u8]` code slices, plus its named
+//!   projections.
+//! * API 2 — the indexed [`ordvec::MultiBucketBitmap`] twin:
+//!   `contingency_row` / `diagonal_overlap_row` / `project_all_batched`
+//!   accumulate the same table straight from the stored bucket bitmaps.
+//!
+//! Regimes (issue #219 acceptance):
+//!
+//!   (a) ONE PAIR / MANY PROJECTIONS — build the dense `Contingency` table
+//!       once, then apply all ~7 projections off the cached integer table,
+//!       vs naively recomputing each projection from the raw `&[u8]` codes
+//!       every time. Shows the build-once-project-many win.
+//!
+//!   (b) ONE QUERY / MANY DOCS / ONE PROJECTION — the indexed diagonal fast
+//!       path over the whole corpus (dispatched = SIMD-when-available, and a
+//!       forced-scalar twin), vs the dense pairwise loop that rebuilds the
+//!       full `nb × nb` table per doc just to read its diagonal.
+//!
+//!   (c) ONE QUERY / MANY DOCS / MANY PROJECTIONS — the indexed batched
+//!       `project_all_batched` (one table per doc, K projections off it), vs
+//!       calling the single-projection indexed path K times (one corpus
+//!       rescan per projection). Shows the no-rescan win, dispatched vs
+//!       forced-scalar.
+//!
+//! Timing is median-of-N wall clock (same discipline as
+//! `examples/bench_retrieval.rs`). Every number is SYNTHETIC.
+//!
+//! NOTE on SIMD: the indexed kernels dispatch to an AVX-512 VPOPCNTDQ
+//! popcount-AND path only when the host advertises `avx512f` AND
+//! `avx512vpopcntdq` (the dense API-1 kernel additionally needs `avx512bw`).
+//! On a host without VPOPCNTDQ the "dispatched" column runs the SAME scalar
+//! kernel as the forced-scalar column, so the SIMD/scalar ratio is ~1.0 —
+//! that is the honest result, not a bug. The header prints the host features
+//! so the ratio can be read in context.
+//!
+//! Run: cargo run --release --features experimental --example bench_contingency
+
+#[cfg(not(feature = "experimental"))]
+fn main() {
+    eprintln!(
+        "bench_contingency requires --features experimental (Contingency / \
+         MultiBucketBitmap). Re-run: cargo run --release --features experimental \
+         --example bench_contingency"
+    );
+}
+
+#[cfg(feature = "experimental")]
+fn main() {
+    bench::run();
+}
+
+#[cfg(feature = "experimental")]
+mod bench {
+    use std::time::Instant;
+
+    use ordvec::{Contingency, MultiBucketBitmap, Projection};
+    use rand::{RngExt, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    // ---- timing -----------------------------------------------------------
+
+    /// Median wall-clock seconds over `reps` timed runs (one warmup first),
+    /// each run doing `iters` repetitions of `f`. Returns seconds PER `f`
+    /// call (total / iters), so callers report cost of a single operation.
+    fn median_secs(reps: usize, iters: usize, mut f: impl FnMut()) -> f64 {
+        // warmup
+        for _ in 0..iters {
+            f();
+        }
+        let mut lats = Vec::with_capacity(reps);
+        for _ in 0..reps {
+            let t = Instant::now();
+            for _ in 0..iters {
+                f();
+            }
+            lats.push(t.elapsed().as_secs_f64() / iters as f64);
+        }
+        lats.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+        lats[lats.len() / 2]
+    }
+
+    // ---- corpus -----------------------------------------------------------
+
+    fn gauss(rng: &mut ChaCha8Rng) -> f32 {
+        let u1: f32 = rng.random_range(1e-9..1.0);
+        let u2: f32 = rng.random_range(0.0..1.0);
+        (-2.0 * u1.ln()).sqrt() * (std::f32::consts::TAU * u2).cos()
+    }
+
+    /// Clustered gaussian corpus + a single query, seeded for reproducibility.
+    /// Each doc/query draws a cluster prototype plus noise (queries quieter),
+    /// so bucket assignments correlate the way real ordinal codes would —
+    /// pure iid noise would make every contingency table look the same.
+    fn make_data(n: usize, dim: usize, n_clusters: usize, seed: u64) -> (Vec<f32>, Vec<f32>) {
+        let mut rng = ChaCha8Rng::seed_from_u64(seed);
+        let mut protos = vec![0.0f32; n_clusters * dim];
+        for x in protos.iter_mut() {
+            *x = gauss(&mut rng);
+        }
+        let emit = |count: usize, noise: f32, rng: &mut ChaCha8Rng| -> Vec<f32> {
+            let mut out = vec![0.0f32; count * dim];
+            for i in 0..count {
+                let c = rng.random_range(0..n_clusters as u32) as usize;
+                for j in 0..dim {
+                    out[i * dim + j] = protos[c * dim + j] + noise * gauss(rng);
+                }
+            }
+            out
+        };
+        let corpus = emit(n, 1.0, &mut rng);
+        let query = emit(1, 0.5, &mut rng);
+        (corpus, query)
+    }
+
+    /// Reconstruct the per-coordinate bucket codes (`0..nb`) a doc was encoded
+    /// with by reading back the index's stored bitmaps via the indexed
+    /// contingency: feeding a one-hot "query" for bucket `b` recovers, per
+    /// coordinate, whether the doc sits in bucket `b`. Simpler here: rebuild
+    /// the codes from `query_bitmaps_from_ranks` of the raw float vector, which
+    /// is exactly the encoding `add` applies.
+    fn codes_from_bitmaps(bitmaps: &[u64], nb: usize, qpb: usize, dim: usize) -> Vec<u8> {
+        let mut codes = vec![0u8; dim];
+        for b in 0..nb {
+            let off = b * qpb;
+            for (j, code) in codes.iter_mut().enumerate() {
+                if (bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 {
+                    *code = b as u8;
+                }
+            }
+        }
+        codes
+    }
+
+    // ---- the ~7 projections, as both named ops (API 1) and weight matrices --
+
+    /// The named-projection battery used in regime (a): the full surface a
+    /// caller might evaluate off one table (top-overlap, top-group, diagonal,
+    /// two bands, L1 transport, centred outer product).
+    fn named_projections(nb: usize) -> Vec<Projection> {
+        let mut p = vec![
+            Projection::TopOverlap,
+            Projection::TopGroupOverlap { width: 2 },
+            Projection::DiagonalAgreement,
+            Projection::BandAgreement { radius: 1 },
+            Projection::BucketL1Distance,
+            Projection::RankQuantSymmetric,
+        ];
+        if nb > 4 {
+            p.push(Projection::BandAgreement { radius: 2 });
+        }
+        p
+    }
+
+    /// The same battery expressed as raw `nb × nb` weight matrices, for the
+    /// indexed `project_all_batched` path in regime (c). (TopOverlap,
+    /// TopGroup, diagonal, band1, band2, outer product — six dense matrices;
+    /// L1 distance is also expressible as a weight matrix `|a-b|`.)
+    fn weight_matrices(nb: usize) -> Vec<Vec<f32>> {
+        let c = (nb as f32 - 1.0) / 2.0;
+        let mut out: Vec<Vec<f32>> = Vec::new();
+
+        // top-overlap: single cell (nb-1, nb-1)
+        let mut w = vec![0.0f32; nb * nb];
+        w[(nb - 1) * nb + (nb - 1)] = 1.0;
+        out.push(w);
+
+        // top-group width 2
+        let mut w = vec![0.0f32; nb * nb];
+        let start = nb - 2;
+        for a in start..nb {
+            for b in start..nb {
+                w[a * nb + b] = 1.0;
+            }
+        }
+        out.push(w);
+
+        // diagonal
+        let mut w = vec![0.0f32; nb * nb];
+        for a in 0..nb {
+            w[a * nb + a] = 1.0;
+        }
+        out.push(w);
+
+        // band radius 1 (unit weights)
+        let mut w = vec![0.0f32; nb * nb];
+        for a in 0..nb {
+            for b in 0..nb {
+                if a.abs_diff(b) <= 1 {
+                    w[a * nb + b] = 1.0;
+                }
+            }
+        }
+        out.push(w);
+
+        // bucket-L1 transport cost |a-b|
+        let mut w = vec![0.0f32; nb * nb];
+        for a in 0..nb {
+            for b in 0..nb {
+                w[a * nb + b] = a.abs_diff(b) as f32;
+            }
+        }
+        out.push(w);
+
+        // centred outer product (a-c)(b-c)
+        let mut w = vec![0.0f32; nb * nb];
+        for a in 0..nb {
+            for b in 0..nb {
+                w[a * nb + b] = (a as f32 - c) * (b as f32 - c);
+            }
+        }
+        out.push(w);
+
+        if nb > 4 {
+            // band radius 2
+            let mut w = vec![0.0f32; nb * nb];
+            for a in 0..nb {
+                for b in 0..nb {
+                    if a.abs_diff(b) <= 2 {
+                        w[a * nb + b] = 1.0;
+                    }
+                }
+            }
+            out.push(w);
+        }
+        out
+    }
+
+    pub fn run() {
+        #[cfg(target_arch = "x86_64")]
+        {
+            println!(
+                "# host SIMD: avx2={} avx512f={} avx512bw={} avx512vpopcntdq={}",
+                is_x86_feature_detected!("avx2"),
+                is_x86_feature_detected!("avx512f"),
+                is_x86_feature_detected!("avx512bw"),
+                is_x86_feature_detected!("avx512vpopcntdq"),
+            );
+            if !is_x86_feature_detected!("avx512vpopcntdq") {
+                println!(
+                    "# NOTE: no avx512vpopcntdq on this host — the 'dispatched' \
+                     and 'scalar' columns run the SAME kernel; ratio ~1.0 is honest."
+                );
+            }
+        }
+        #[cfg(not(target_arch = "x86_64"))]
+        println!("# host SIMD: non-x86_64 target — scalar kernels only");
+
+        println!("# bench_contingency — SYNTHETIC clustered gaussian corpus (not real data)");
+
+        for &bits in &[2u8, 4u8] {
+            run_bits(bits);
+            println!();
+        }
+    }
+
+    fn run_bits(bits: u8) {
+        let dim = 1024usize;
+        let n = 20_000usize;
+        let n_clusters = 64usize;
+        let seed = 219u64;
+        let nb = 1usize << bits;
+        let qpb = dim / 64;
+
+        println!("# ===== bits={bits} (nb={nb})  dim={dim}  n={n}  clusters={n_clusters}  seed={seed} =====");
+
+        let (corpus, query) = make_data(n, dim, n_clusters, seed);
+
+        // Build the indexed surface (API 2).
+        let mut idx = MultiBucketBitmap::new(dim, bits);
+        idx.add(&corpus);
+        let q_bitmaps = idx.query_bitmaps_from_ranks(&query);
+
+        // Per-coord codes for API 1 (dense). `query_bitmaps_from_ranks` applies
+        // the exact same rank→bucket encoding `add` uses, so re-encoding doc 0's
+        // float vector and the query recovers the real codes the index holds.
+        let q_codes = codes_from_bitmaps(&q_bitmaps, nb, qpb, dim);
+        let doc0 = &corpus[0..dim];
+        let d0_bitmaps = idx.query_bitmaps_from_ranks(doc0);
+        let d_codes = codes_from_bitmaps(&d0_bitmaps, nb, qpb, dim);
+
+        // ---- regime (a): ONE PAIR / MANY PROJECTIONS (dense API 1) --------
+        regime_a(&q_codes, &d_codes, nb);
+
+        // ---- regime (b): ONE QUERY / MANY DOCS / ONE PROJECTION ----------
+        regime_b(&idx, &q_bitmaps, n, nb);
+
+        // ---- regime (c): ONE QUERY / MANY DOCS / MANY PROJECTIONS --------
+        regime_c(&idx, &q_bitmaps, nb);
+    }
+
+    /// (a) Build the dense `Contingency` once, apply all projections off the
+    /// cached table — vs recomputing each projection from the raw codes (build
+    /// a fresh table per projection). Single pair, repeated for timing.
+    fn regime_a(q_codes: &[u8], d_codes: &[u8], nb: usize) {
+        let projs = named_projections(nb);
+        let n_proj = projs.len();
+
+        // Build-once-project-many: one table, then all projections.
+        let build_once = median_secs(25, 200, || {
+            let c = Contingency::new(q_codes, d_codes, nb).unwrap();
+            let mut acc = 0.0f32;
+            for p in &projs {
+                acc += p.score(&c);
+            }
+            std::hint::black_box(acc);
+        });
+
+        // Naive: rebuild the table from raw codes for EVERY projection.
+        let rebuild_each = median_secs(25, 200, || {
+            let mut acc = 0.0f32;
+            for p in &projs {
+                let c = Contingency::new(q_codes, d_codes, nb).unwrap();
+                acc += p.score(&c);
+            }
+            std::hint::black_box(acc);
+        });
+
+        let speedup = rebuild_each / build_once;
+        println!("# (a) ONE PAIR / {n_proj} PROJECTIONS — dense Contingency (API 1)");
+        println!("{:<34} {:>14} {:>14}", "approach", "us/pair", "speedup");
+        println!(
+            "{:<34} {:>14.3} {:>14}",
+            "build-once, project-many",
+            build_once * 1e6,
+            "1.00x (ref)"
+        );
+        println!(
+            "{:<34} {:>14.3} {:>13.2}x",
+            "rebuild-per-projection",
+            rebuild_each * 1e6,
+            speedup
+        );
+        println!(
+            "DATA\ta\tbits={}\tbuild_once_us={:.3}\trebuild_each_us={:.3}\tn_proj={}\tspeedup={:.2}",
+            (nb as f32).log2() as u32,
+            build_once * 1e6,
+            rebuild_each * 1e6,
+            n_proj,
+            speedup
+        );
+    }
+
+    /// (b) ONE QUERY / MANY DOCS / ONE PROJECTION — indexed diagonal fast path
+    /// (dispatched + forced-scalar) over the corpus, vs the dense pairwise
+    /// loop (full table per doc, read its diagonal).
+    fn regime_b(idx: &MultiBucketBitmap, q_bitmaps: &[u64], n: usize, nb: usize) {
+        // dispatched diagonal fast path over every doc
+        let diag_dispatched = median_secs(7, 3, || {
+            let mut acc = 0u64;
+            for di in 0..n {
+                let d = idx.diagonal_overlap_row(q_bitmaps, di);
+                acc += d.iter().map(|&x| x as u64).sum::<u64>();
+            }
+            std::hint::black_box(acc);
+        });
+
+        // forced-scalar diagonal fast path
+        let diag_scalar = median_secs(7, 3, || {
+            let mut acc = 0u64;
+            for di in 0..n {
+                let d = idx.diagonal_overlap_row_scalar(q_bitmaps, di);
+                acc += d.iter().map(|&x| x as u64).sum::<u64>();
+            }
+            std::hint::black_box(acc);
+        });
+
+        // dense pairwise loop: build the FULL nb*nb table per doc (dispatched
+        // contingency_row), then take its diagonal trace. This is the work the
+        // diagonal fast path avoids (nb passes vs nb*nb).
+        let dense_full = median_secs(7, 3, || {
+            let mut acc = 0u64;
+            for di in 0..n {
+                let table = idx.contingency_row(q_bitmaps, di);
+                for a in 0..nb {
+                    acc += table[a * nb + a] as u64;
+                }
+            }
+            std::hint::black_box(acc);
+        });
+
+        let simd_speedup = diag_scalar / diag_dispatched;
+        let fastpath_speedup = dense_full / diag_dispatched;
+
+        let per = |s: f64| s / n as f64 * 1e9; // ns per doc
+        println!("# (b) ONE QUERY / {n} DOCS / ONE PROJECTION (diagonal) — indexed (API 2)");
+        println!(
+            "{:<34} {:>14} {:>16}",
+            "approach", "ns/doc", "vs dispatched"
+        );
+        println!(
+            "{:<34} {:>14.2} {:>16}",
+            "diagonal fast path (dispatched)",
+            per(diag_dispatched),
+            "1.00x (ref)"
+        );
+        println!(
+            "{:<34} {:>14.2} {:>15.2}x",
+            "diagonal fast path (scalar)",
+            per(diag_scalar),
+            simd_speedup
+        );
+        println!(
+            "{:<34} {:>14.2} {:>15.2}x",
+            "dense full-table diagonal",
+            per(dense_full),
+            fastpath_speedup
+        );
+        println!(
+            "DATA\tb\tbits={}\tdiag_disp_ns={:.2}\tdiag_scalar_ns={:.2}\tdense_full_ns={:.2}\tsimd_speedup={:.2}\tfastpath_speedup={:.2}",
+            (nb as f32).log2() as u32,
+            per(diag_dispatched),
+            per(diag_scalar),
+            per(dense_full),
+            simd_speedup,
+            fastpath_speedup
+        );
+    }
+
+    /// (c) ONE QUERY / MANY DOCS / MANY PROJECTIONS — batched
+    /// `project_all_batched` (table once, K projections off it) vs calling the
+    /// single-projection path K times (one corpus rescan per projection).
+    /// Both dispatched and forced-scalar.
+    fn regime_c(idx: &MultiBucketBitmap, q_bitmaps: &[u64], nb: usize) {
+        let mats = weight_matrices(nb);
+        let k = mats.len();
+        let weights: Vec<&[f32]> = mats.iter().map(|w| w.as_slice()).collect();
+        let n = idx.len();
+
+        // batched dispatched: one table per doc, all K projections off it.
+        let batched_disp = median_secs(7, 3, || {
+            let out = idx.project_all_batched(q_bitmaps, &weights);
+            std::hint::black_box(out.len());
+        });
+
+        // batched forced-scalar.
+        let batched_scalar = median_secs(7, 3, || {
+            let out = idx.project_all_batched_scalar(q_bitmaps, &weights);
+            std::hint::black_box(out.len());
+        });
+
+        // rescan: call the single-projection batched path K times (each call
+        // rescans the whole corpus building a fresh table per doc). Dispatched.
+        let rescan_disp = median_secs(7, 3, || {
+            let mut acc = 0usize;
+            for w in &weights {
+                let single: [&[f32]; 1] = [w];
+                let out = idx.project_all_batched(q_bitmaps, &single);
+                acc += out.len();
+            }
+            std::hint::black_box(acc);
+        });
+
+        let norescan_speedup = rescan_disp / batched_disp;
+        let simd_speedup = batched_scalar / batched_disp;
+
+        let per = |s: f64| s / n as f64 * 1e9; // ns per doc (over all K)
+        println!("# (c) ONE QUERY / {n} DOCS / {k} PROJECTIONS — indexed batched (API 2)");
+        println!(
+            "{:<38} {:>14} {:>16}",
+            "approach", "ns/doc(allK)", "vs batched"
+        );
+        println!(
+            "{:<38} {:>14.2} {:>16}",
+            "project_all_batched (dispatched)",
+            per(batched_disp),
+            "1.00x (ref)"
+        );
+        println!(
+            "{:<38} {:>14.2} {:>15.2}x",
+            "project_all_batched (scalar)",
+            per(batched_scalar),
+            simd_speedup
+        );
+        println!(
+            "{:<38} {:>14.2} {:>15.2}x",
+            "rescan: single-proj x K (dispatched)",
+            per(rescan_disp),
+            norescan_speedup
+        );
+        println!(
+            "DATA\tc\tbits={}\tbatched_disp_ns={:.2}\tbatched_scalar_ns={:.2}\trescan_disp_ns={:.2}\tk={}\tnorescan_speedup={:.2}\tsimd_speedup={:.2}",
+            (nb as f32).log2() as u32,
+            per(batched_disp),
+            per(batched_scalar),
+            per(rescan_disp),
+            k,
+            norescan_speedup,
+            simd_speedup
+        );
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index b912a100..1daa7f39 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -27,6 +27,14 @@
 //!   coordinate, set when the coordinate is positive) for sign-cosine
 //!   candidate generation.
 //!
+//! These four families are the retrieval surface. The `experimental`
+//! `MultiBucketBitmap` indexed contingency / projection API is a niche
+//! research/analysis substrate for the bilinear bucket-overlap decomposition —
+//! it is **not** a default single-score retrieval path and was never
+//! kernel-optimized for that role. For primary nearest-neighbour retrieval use
+//! [`RankQuant`], [`Bitmap`], or the two-stage candidate-generation → rerank
+//! flow instead.
+//!
 //! The [`Bitmap`] candidate score is the implementation surface with the
 //! strongest formal story: in the companion Lean formalization, literal
 //! constant-weight bitmap overlap is the query-preserving quotient statistic,
diff --git a/src/multi_bucket.rs b/src/multi_bucket.rs
index 0b4a2d9c..a375a363 100644
--- a/src/multi_bucket.rs
+++ b/src/multi_bucket.rs
@@ -1,5 +1,25 @@
 //! MultiBucketBitmap: `2^bits` bitmaps per document, one per bucket.
 //!
+//! **Stability — not a default surface.** [`MultiBucketBitmap`] and the indexed
+//! contingency / projection kernels built on it are gated behind the
+//! **non-default `experimental` Cargo feature**. They are a research scaffold,
+//! **not** part of the stable public API, and are **excluded from semver
+//! guarantees** — they may change or be removed without a major-version bump.
+//! The stable, semver-covered bucket-overlap surface is the stateless dense
+//! `Contingency` / `Projection` API; reach for this indexed path only when you
+//! have explicitly opted into `experimental`.
+//!
+//! **Positioning — not a primary retrieval path.** `MultiBucketBitmap` is a
+//! niche research/analysis substrate for the bilinear bucket-overlap
+//! decomposition; it is **not** the default single-score retrieval path and was
+//! never kernel-optimized for that role. For primary nearest-neighbour
+//! retrieval, use the headline paths instead — [`crate::RankQuant`] (symmetric
+//! and asymmetric float-query scoring), [`crate::Bitmap`] (top-bucket
+//! `popcount(Q AND D)` candidate scoring), and the two-stage
+//! candidate-generation → rerank flow. Reach for this indexed contingency /
+//! projection surface only to *analyze* bucket-overlap structure, never as a
+//! primary retrieval index.
+//!
 //! Represents the constant-composition bucket assignment of each
 //! document explicitly as a set of `2^bits` disjoint bitmaps over the
 //! `dim` coordinates. The bilinear bucket-overlap score
@@ -131,6 +151,42 @@ impl MultiBucketBitmap {
         w
     }
 
+    /// Unit-diagonal weight matrix: `W[a, a] = 1`, off-diagonal `0`. The
+    /// bilinear score reduces to `Σ_a |Q_a ∩ D_a|` — the same-bucket
+    /// agreement count. This is the cheapest truncation: `nb` popcount-AND
+    /// passes per doc instead of the full `nb²`. It is a pure overlap signal
+    /// (no bucket-magnitude weighting), so it is closest in spirit to a
+    /// multi-level [`crate::Bitmap`].
+    pub fn diagonal_weights(&self) -> Vec<f32> {
+        let nb = self.n_buckets;
+        let mut w = vec![0.0f32; nb * nb];
+        for a in 0..nb {
+            w[a * nb + a] = 1.0;
+        }
+        w
+    }
+
+    /// Outer-product weights `(a − c)(b − c)` restricted to the band
+    /// `|a − b| <= half_width` (off-band entries zeroed). `half_width = 0`
+    /// keeps only the magnitude-weighted diagonal; `half_width >= nb − 1`
+    /// recovers the full [`Self::outer_product_weights`] matrix. Sweeping
+    /// `half_width` interpolates candidate-gen cost (non-zero band entries ⇒
+    /// popcount-AND passes per doc) between the diagonal and the exact
+    /// bilinear probe, tracing the recall/latency frontier.
+    pub fn banded_weights(&self, half_width: usize) -> Vec<f32> {
+        let nb = self.n_buckets;
+        let c = (nb as f32 - 1.0) / 2.0;
+        let mut w = vec![0.0f32; nb * nb];
+        for a in 0..nb {
+            for b in 0..nb {
+                if a.abs_diff(b) <= half_width {
+                    w[a * nb + b] = (a as f32 - c) * (b as f32 - c);
+                }
+            }
+        }
+        w
+    }
+
     /// Compute the bilinear bucket-overlap score
     ///   `Σ_{a, b} W[a, b] · |Q_a ∩ D_b|`
     /// for a single (query, doc) pair. Scales nothing — caller
@@ -194,6 +250,250 @@ impl MultiBucketBitmap {
         head
     }
 
+    // -----------------------------------------------------------------
+    // Indexed contingency / projection surface (issue #219, API 2 of 2).
+    //
+    // **Feature gate:** this entire surface is behind the non-default
+    // `experimental` Cargo feature and is excluded from semver guarantees.
+    // It is *not* the default single-score retrieval path — for standard
+    // nearest-neighbour scoring use [`crate::RankQuant`] (asymmetric LUT
+    // or symmetric dot-product) or [`crate::Bitmap`] (top-bucket overlap).
+    // `MultiBucketBitmap` targets niche workloads where the full bilinear
+    // decomposition, multi-projection evidence, or the diagonal fast path
+    // are needed (one query vs. many docs with many weight projections).
+    //
+    // `bilinear_score` / `top_m_bilinear` recompute one weighted `Σ W·C`
+    // per call and rescan the doc's bitmaps once per *projection*. The
+    // methods below decouple the accumulation from the projection: the
+    // `nb × nb` contingency table `C[a, b] = |Q_a ∩ D_b|` is built **once**
+    // per (query, doc) over a single pass of the doc's bitmaps, then any
+    // number of weight matrices are applied as cheap `nb²` dot products
+    // over the cached integer table. This is the in-index analogue of the
+    // stateless [`crate::Contingency`] (API 1) — same `nb × nb` object,
+    // but accumulated straight from the bitmap rows rather than from raw
+    // `&[u8]` codes.
+    //
+    // The hot inner accumulation (the `nb × nb` popcount-AND over `qpb`
+    // u64 words) dispatches at runtime to an AVX-512 VPOPCNTDQ kernel,
+    // mirroring [`crate::Bitmap`]'s masked-tail popcount-AND scan, with a
+    // portable scalar fallback on every other target / unsupported CPU.
+    // -----------------------------------------------------------------
+
+    /// Full `nb × nb` contingency table `C[a, b] = |Q_a ∩ D_b|` for one
+    /// document, accumulated in a single pass over the doc's bitmaps.
+    ///
+    /// Returns a row-major `nb × nb` `Vec<u32>` where `out[a * nb + b]` is
+    /// the count of coordinates the query placed in bucket `a` and the doc
+    /// placed in bucket `b`. This is the indexed twin of
+    /// [`crate::Contingency::counts`]: the same table, built from the
+    /// bitmap rows instead of `&[u8]` codes. A caller can then apply any
+    /// projection (e.g. via [`crate::Contingency`]'s projections, or a raw
+    /// `nb²` weighted sum) without rescanning the bitmaps.
+    ///
+    /// # Panics
+    /// Panics if `doc_idx >= len()`. `q_bitmaps` must be `nb * qpb` words
+    /// (as produced by [`Self::query_bitmaps_from_ranks`]).
+    pub fn contingency_row(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec<u32> {
+        let qpb = self.qwords_per_bitmap;
+        let nb = self.n_buckets;
+        assert!(
+            doc_idx < self.n_vectors,
+            "contingency_row: doc_idx {doc_idx} out of range (n_vectors {})",
+            self.n_vectors,
+        );
+        assert_eq!(
+            q_bitmaps.len(),
+            nb * qpb,
+            "contingency_row: q_bitmaps must be nb * qpb words",
+        );
+        let doc_base = doc_idx * nb * qpb;
+        let doc = &self.bitmaps[doc_base..doc_base + nb * qpb];
+        let mut table = vec![0u32; nb * nb];
+        contingency_accumulate(q_bitmaps, doc, nb, qpb, &mut table);
+        table
+    }
+
+    /// Diagonal-only fast path: the `nb` cells `C[a, a] = |Q_a ∩ D_a|` for
+    /// one document, in a single pass over the doc's bitmaps.
+    ///
+    /// This is the common cheap projection — same-bucket agreement per
+    /// bucket — and costs `nb` popcount-AND passes per doc instead of the
+    /// full `nb²`. `out[a]` is `|Q_a ∩ D_a|`. Summing the result is the
+    /// [`Self::diagonal_weights`] bilinear score.
+    ///
+    /// # Panics
+    /// Panics if `doc_idx >= len()` or `q_bitmaps.len() != nb * qpb`.
+    pub fn diagonal_overlap_row(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec<u32> {
+        let qpb = self.qwords_per_bitmap;
+        let nb = self.n_buckets;
+        assert!(
+            doc_idx < self.n_vectors,
+            "diagonal_overlap_row: doc_idx {doc_idx} out of range (n_vectors {})",
+            self.n_vectors,
+        );
+        assert_eq!(
+            q_bitmaps.len(),
+            nb * qpb,
+            "diagonal_overlap_row: q_bitmaps must be nb * qpb words",
+        );
+        let doc_base = doc_idx * nb * qpb;
+        let doc = &self.bitmaps[doc_base..doc_base + nb * qpb];
+        let mut diag = vec![0u32; nb];
+        diagonal_accumulate(q_bitmaps, doc, nb, qpb, &mut diag);
+        diag
+    }
+
+    /// Batched indexed projection: for every document, build its `nb × nb`
+    /// contingency table **once**, then apply *all* of `weights` to that
+    /// single cached table, returning `docs × projections` scores.
+    ///
+    /// `weights[p]` is the row-major `nb × nb` weight matrix for projection
+    /// `p`. The returned `out[di][p]` is `Σ_{a, b} weights[p][a*nb+b] ·
+    /// C_di[a, b]`. Each document's bitmaps are streamed once regardless of
+    /// how many projections are requested — the projections share the
+    /// accumulated table rather than each rescanning the corpus (which is
+    /// what `bilinear_score` would do per projection).
+    ///
+    /// Runs across documents in parallel via rayon. The per-doc table is a
+    /// small `nb²` integer buffer; the projection dot products are cheap
+    /// relative to the popcount-AND accumulation that dominates.
+    ///
+    /// # Panics
+    /// Panics if `q_bitmaps.len() != nb * qpb` or if any `weights[p].len()
+    /// != nb * nb`.
+    #[must_use = "this scans every doc's bitmaps to build contingency tables; dropping the result discards that work"]
+    pub fn project_all_batched(&self, q_bitmaps: &[u64], weights: &[&[f32]]) -> Vec<Vec<f32>> {
+        let qpb = self.qwords_per_bitmap;
+        let nb = self.n_buckets;
+        assert_eq!(
+            q_bitmaps.len(),
+            nb * qpb,
+            "project_all_batched: q_bitmaps must be nb * qpb words",
+        );
+        for (p, w) in weights.iter().enumerate() {
+            assert_eq!(
+                w.len(),
+                nb * nb,
+                "project_all_batched: weights[{p}] must be an nb * nb matrix",
+            );
+        }
+        let n = self.n_vectors;
+        let n_proj = weights.len();
+        if n == 0 || n_proj == 0 {
+            return vec![Vec::new(); n];
+        }
+        let per_doc = nb * qpb;
+        let bitmaps = &self.bitmaps;
+        (0..n)
+            .into_par_iter()
+            .map(|di| {
+                let doc = &bitmaps[di * per_doc..(di + 1) * per_doc];
+                // One accumulation pass over this doc's bitmaps. nb <= 16 ⇒
+                // nb*nb <= 256, so a stack table avoids a per-doc heap
+                // allocation inside the parallel map (allocator contention).
+                assert!(
+                    nb * nb <= 256,
+                    "project_all_batched: nb={nb} exceeds stack table capacity \
+                     (nb*nb={} > 256); bits must be <=4 so nb<=16",
+                    nb * nb,
+                );
+                let mut table = [0u32; 256];
+                let table = &mut table[..nb * nb];
+                contingency_accumulate(q_bitmaps, doc, nb, qpb, table);
+                project_table(table, weights)
+            })
+            .collect()
+    }
+
+    /// `#[doc(hidden)]` bench-only twin of [`Self::diagonal_overlap_row`] that
+    /// forces the **portable scalar** diagonal kernel, bypassing the runtime
+    /// AVX-512 dispatch. It exists so `examples/bench_contingency` can time the
+    /// scalar and SIMD diagonal paths against each other on the same index
+    /// (mirroring the `#[doc(hidden)]` `search_asymmetric_byte_lut` bench
+    /// reference at the crate root). Not part of the stable API — production
+    /// callers use [`Self::diagonal_overlap_row`], which dispatches to the
+    /// fastest available kernel.
+    ///
+    /// # Panics
+    /// Panics if `doc_idx >= len()` or `q_bitmaps.len() != nb * qpb`.
+    #[doc(hidden)]
+    pub fn diagonal_overlap_row_scalar(&self, q_bitmaps: &[u64], doc_idx: usize) -> Vec<u32> {
+        let qpb = self.qwords_per_bitmap;
+        let nb = self.n_buckets;
+        assert!(
+            doc_idx < self.n_vectors,
+            "diagonal_overlap_row_scalar: doc_idx {doc_idx} out of range (n_vectors {})",
+            self.n_vectors,
+        );
+        assert_eq!(
+            q_bitmaps.len(),
+            nb * qpb,
+            "diagonal_overlap_row_scalar: q_bitmaps must be nb * qpb words",
+        );
+        let doc_base = doc_idx * nb * qpb;
+        let doc = &self.bitmaps[doc_base..doc_base + nb * qpb];
+        let mut diag = vec![0u32; nb];
+        diagonal_accumulate_scalar(q_bitmaps, doc, nb, qpb, &mut diag);
+        diag
+    }
+
+    /// `#[doc(hidden)]` bench-only twin of [`Self::project_all_batched`] that
+    /// forces the **portable scalar** contingency-accumulation kernel,
+    /// bypassing the runtime AVX-512 dispatch. Lets
+    /// `examples/bench_contingency` time the scalar and SIMD batched-projection
+    /// paths on the same index. Not part of the stable API.
+    ///
+    /// # Panics
+    /// Panics if `q_bitmaps.len() != nb * qpb` or if any `weights[p].len()
+    /// != nb * nb`.
+    #[doc(hidden)]
+    #[must_use = "this scans every doc's bitmaps to build contingency tables; dropping the result discards that work"]
+    pub fn project_all_batched_scalar(
+        &self,
+        q_bitmaps: &[u64],
+        weights: &[&[f32]],
+    ) -> Vec<Vec<f32>> {
+        let qpb = self.qwords_per_bitmap;
+        let nb = self.n_buckets;
+        assert_eq!(
+            q_bitmaps.len(),
+            nb * qpb,
+            "project_all_batched_scalar: q_bitmaps must be nb * qpb words",
+        );
+        for (p, w) in weights.iter().enumerate() {
+            assert_eq!(
+                w.len(),
+                nb * nb,
+                "project_all_batched_scalar: weights[{p}] must be an nb * nb matrix",
+            );
+        }
+        let n = self.n_vectors;
+        let n_proj = weights.len();
+        if n == 0 || n_proj == 0 {
+            return vec![Vec::new(); n];
+        }
+        let per_doc = nb * qpb;
+        let bitmaps = &self.bitmaps;
+        (0..n)
+            .into_par_iter()
+            .map(|di| {
+                let doc = &bitmaps[di * per_doc..(di + 1) * per_doc];
+                // Stack table (nb*nb <= 256): no per-doc heap alloc in the
+                // parallel map.
+                assert!(
+                    nb * nb <= 256,
+                    "project_all_batched_scalar: nb={nb} exceeds stack table capacity \
+                     (nb*nb={} > 256); bits must be <=4 so nb<=16",
+                    nb * nb,
+                );
+                let mut table = [0u32; 256];
+                let table = &mut table[..nb * nb];
+                contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, table);
+                project_table(table, weights)
+            })
+            .collect()
+    }
+
     pub fn len(&self) -> usize {
         self.n_vectors
     }
@@ -216,3 +516,497 @@ impl MultiBucketBitmap {
         self.bitmaps.len() * std::mem::size_of::<u64>()
     }
 }
+
+// =====================================================================
+// Contingency-accumulation kernels.
+//
+// `q_bitmaps` and `doc` are both row-major `nb * qpb` u64 layouts: bucket
+// `a`'s bitmap occupies words `[a * qpb, (a + 1) * qpb)`. The full table
+// cell is
+//   C[a, b] = Σ_{w < qpb} popcount(Q_a[w] & D_b[w]),
+// the popcount over the bitwise-AND of bucket `a`'s query bitmap and
+// bucket `b`'s doc bitmap — the exact popcount-AND shape of
+// [`crate::Bitmap`]'s overlap kernel, here run for every (a, b) pair.
+//
+// Each path dispatches to an AVX-512 VPOPCNTDQ kernel when the host
+// supports it (mirroring the masked-tail discipline in `index/bitmap.rs`),
+// otherwise the portable scalar reference. Counts are exact integers, so
+// the SIMD and scalar paths are required to agree bit-for-bit (the parity
+// test enforces this against API 1's dense `Contingency`).
+// =====================================================================
+
+/// Build the full row-major `nb × nb` contingency table for one
+/// (query, doc) pair into `table` (length `nb * nb`).
+///
+/// Dispatches to the AVX-512 VPOPCNTDQ kernel when available, else the
+/// scalar reference. Both paths require `q_bitmaps.len() == doc.len() ==
+/// nb * qpb` and `table.len() == nb * nb`.
+fn contingency_accumulate(
+    q_bitmaps: &[u64],
+    doc: &[u64],
+    nb: usize,
+    qpb: usize,
+    table: &mut [u32],
+) {
+    debug_assert_eq!(q_bitmaps.len(), nb * qpb);
+    debug_assert_eq!(doc.len(), nb * qpb);
+    debug_assert_eq!(table.len(), nb * nb);
+
+    #[cfg(target_arch = "x86_64")]
+    let use_avx512vpop =
+        is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vpopcntdq");
+    #[cfg(not(target_arch = "x86_64"))]
+    let use_avx512vpop = false;
+
+    if use_avx512vpop {
+        #[cfg(target_arch = "x86_64")]
+        unsafe {
+            contingency_accumulate_avx512vpop(q_bitmaps, doc, nb, qpb, table);
+            return;
+        }
+    }
+    contingency_accumulate_scalar(q_bitmaps, doc, nb, qpb, table);
+}
+
+/// Apply every weight matrix in `weights` to a single cached `nb × nb`
+/// contingency `table`, returning one `f32` projection score per matrix.
+/// Each score is `Σ_{cell} weight[cell] · table[cell]` — the cheap dot
+/// product the batched path shares across projections after one accumulation
+/// pass. Caller guarantees every `weights[p].len() == table.len()`.
+fn project_table(table: &[u32], weights: &[&[f32]]) -> Vec<f32> {
+    weights
+        .iter()
+        .map(|w| {
+            w.iter()
+                .zip(table.iter())
+                .map(|(&weight, &c)| weight * c as f32)
+                .sum()
+        })
+        .collect()
+}
+
+/// Portable scalar reference for the full `nb × nb` table. One
+/// popcount-AND reduction per (a, b) cell over the `qpb` u64 words.
+fn contingency_accumulate_scalar(
+    q_bitmaps: &[u64],
+    doc: &[u64],
+    nb: usize,
+    qpb: usize,
+    table: &mut [u32],
+) {
+    for a in 0..nb {
+        let q_off = a * qpb;
+        let row = a * nb;
+        for b in 0..nb {
+            let d_off = b * qpb;
+            let mut overlap: u32 = 0;
+            for w in 0..qpb {
+                overlap += (q_bitmaps[q_off + w] & doc[d_off + w]).count_ones();
+            }
+            table[row + b] = overlap;
+        }
+    }
+}
+
+/// Build only the `nb` diagonal cells `C[a, a] = |Q_a ∩ D_a|` into `diag`
+/// (length `nb`). The common cheap projection.
+///
+/// Dispatches to the AVX-512 VPOPCNTDQ diagonal kernel when available,
+/// else the scalar reference.
+fn diagonal_accumulate(q_bitmaps: &[u64], doc: &[u64], nb: usize, qpb: usize, diag: &mut [u32]) {
+    debug_assert_eq!(q_bitmaps.len(), nb * qpb);
+    debug_assert_eq!(doc.len(), nb * qpb);
+    debug_assert_eq!(diag.len(), nb);
+
+    #[cfg(target_arch = "x86_64")]
+    let use_avx512vpop =
+        is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vpopcntdq");
+    #[cfg(not(target_arch = "x86_64"))]
+    let use_avx512vpop = false;
+
+    if use_avx512vpop {
+        #[cfg(target_arch = "x86_64")]
+        unsafe {
+            diagonal_accumulate_avx512vpop(q_bitmaps, doc, nb, qpb, diag);
+            return;
+        }
+    }
+    diagonal_accumulate_scalar(q_bitmaps, doc, nb, qpb, diag);
+}
+
+/// Portable scalar reference for the `nb` diagonal cells.
+fn diagonal_accumulate_scalar(
+    q_bitmaps: &[u64],
+    doc: &[u64],
+    nb: usize,
+    qpb: usize,
+    diag: &mut [u32],
+) {
+    // `a` is load-bearing: it indexes the bucket bitmap offset (`a * qpb`)
+    // as well as `diag[a]`.
+    #[allow(clippy::needless_range_loop)]
+    for a in 0..nb {
+        let off = a * qpb;
+        let mut overlap: u32 = 0;
+        for w in 0..qpb {
+            overlap += (q_bitmaps[off + w] & doc[off + w]).count_ones();
+        }
+        diag[a] = overlap;
+    }
+}
+
+/// AVX-512 VPOPCNTDQ full-table contingency accumulation.
+///
+/// Tiles the `qpb`-word bucket bitmaps into 512-bit (8×u64) ZMM blocks
+/// with a `bzhi`-style masked tail for the final `qpb % 8` words, exactly
+/// as [`crate::Bitmap`]'s scan kernel does. For each block, every query
+/// bucket `a`'s lane and every doc bucket `b`'s lane is AND-ed,
+/// `_mm512_popcnt_epi64`-counted, and reduced into the `(a, b)` cell — the
+/// masked-tail popcount-AND, run over all `nb²` bucket pairs.
+///
+/// # Safety
+/// Requires the `avx512f` and `avx512vpopcntdq` target features, confirmed
+/// by the `#[target_feature]` gate plus the caller's runtime
+/// `is_x86_feature_detected!`. `q_bitmaps.len() == doc.len() == nb * qpb`
+/// and `table.len() == nb * nb` are caller contracts (asserted at the
+/// public entry points); the tail load uses a `(1 << rem) - 1` word mask
+/// so no lane past `qpb` is read for either operand.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f,avx512vpopcntdq")]
+unsafe fn contingency_accumulate_avx512vpop(
+    q_bitmaps: &[u64],
+    doc: &[u64],
+    nb: usize,
+    qpb: usize,
+    table: &mut [u32],
+) {
+    use std::arch::x86_64::*;
+    // SAFETY: `q_bitmaps.len() == doc.len() == nb * qpb` and `table.len()
+    // == nb * nb` (caller contract). Each ZMM load reads 8 u64 words at a
+    // word offset `< qpb` within bucket `a`/`b`'s `qpb`-word region; the
+    // final partial block uses `_mm512_maskz_loadu_epi64` with a
+    // `(1 << rem) - 1` mask so lanes past `qpb` read as zero and are never
+    // dereferenced. AVX-512 F/VPOPCNTDQ are confirmed by `#[target_feature]`
+    // plus the runtime detection in the caller.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        let full_blocks = qpb / 8;
+        let rem = qpb % 8;
+        let tail_mask: __mmask8 = if rem == 0 { 0 } else { (1u8 << rem) - 1 };
+        let q_ptr = q_bitmaps.as_ptr();
+        let d_ptr = doc.as_ptr();
+
+        for a in 0..nb {
+            let q_base = a * qpb;
+            let row = a * nb;
+            for b in 0..nb {
+                let d_base = b * qpb;
+                let mut acc = _mm512_setzero_si512();
+                // Whole 512-bit blocks.
+                for blk in 0..full_blocks {
+                    let w = blk * 8;
+                    let q_zmm = _mm512_loadu_si512(q_ptr.add(q_base + w) as *const __m512i);
+                    let d_zmm = _mm512_loadu_si512(d_ptr.add(d_base + w) as *const __m512i);
+                    let and_zmm = _mm512_and_si512(q_zmm, d_zmm);
+                    let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                    acc = _mm512_add_epi64(acc, pop_zmm);
+                }
+                // Masked tail: only the low `rem` words are loaded; lanes
+                // past `qpb` read as zero and contribute no popcount.
+                if rem != 0 {
+                    let w = full_blocks * 8;
+                    let q_zmm =
+                        _mm512_maskz_loadu_epi64(tail_mask, q_ptr.add(q_base + w) as *const i64);
+                    let d_zmm =
+                        _mm512_maskz_loadu_epi64(tail_mask, d_ptr.add(d_base + w) as *const i64);
+                    let and_zmm = _mm512_and_si512(q_zmm, d_zmm);
+                    let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                    acc = _mm512_add_epi64(acc, pop_zmm);
+                }
+                let sum: i64 = _mm512_reduce_add_epi64(acc);
+                table[row + b] = sum as u32;
+            }
+        }
+    }
+}
+
+/// AVX-512 VPOPCNTDQ diagonal-only accumulation: the `nb` cells
+/// `C[a, a]`, each a masked-tail popcount-AND over bucket `a`'s query and
+/// doc bitmaps. Same masked-tail discipline as the full kernel, restricted
+/// to the diagonal.
+///
+/// # Safety
+/// Same contract as [`contingency_accumulate_avx512vpop`]:
+/// `q_bitmaps.len() == doc.len() == nb * qpb`, `diag.len() == nb`, the
+/// target features confirmed by the gate plus runtime detection, and the
+/// masked tail bounding every load to the live `qpb` words.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f,avx512vpopcntdq")]
+unsafe fn diagonal_accumulate_avx512vpop(
+    q_bitmaps: &[u64],
+    doc: &[u64],
+    nb: usize,
+    qpb: usize,
+    diag: &mut [u32],
+) {
+    use std::arch::x86_64::*;
+    // SAFETY: identical bounding to `contingency_accumulate_avx512vpop`,
+    // restricted to `b == a`: every load is at a word offset `< qpb` within
+    // bucket `a`'s `qpb`-word region, with the final partial block masked to
+    // the low `rem` words. `diag.len() == nb` bounds every `diag[a]` write.
+    // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        let full_blocks = qpb / 8;
+        let rem = qpb % 8;
+        let tail_mask: __mmask8 = if rem == 0 { 0 } else { (1u8 << rem) - 1 };
+        let q_ptr = q_bitmaps.as_ptr();
+        let d_ptr = doc.as_ptr();
+
+        // `a` indexes the bucket bitmap offset (`a * qpb`) and `diag[a]`.
+        #[allow(clippy::needless_range_loop)]
+        for a in 0..nb {
+            let base = a * qpb;
+            let mut acc = _mm512_setzero_si512();
+            for blk in 0..full_blocks {
+                let w = blk * 8;
+                let q_zmm = _mm512_loadu_si512(q_ptr.add(base + w) as *const __m512i);
+                let d_zmm = _mm512_loadu_si512(d_ptr.add(base + w) as *const __m512i);
+                let and_zmm = _mm512_and_si512(q_zmm, d_zmm);
+                let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                acc = _mm512_add_epi64(acc, pop_zmm);
+            }
+            if rem != 0 {
+                let w = full_blocks * 8;
+                let q_zmm = _mm512_maskz_loadu_epi64(tail_mask, q_ptr.add(base + w) as *const i64);
+                let d_zmm = _mm512_maskz_loadu_epi64(tail_mask, d_ptr.add(base + w) as *const i64);
+                let and_zmm = _mm512_and_si512(q_zmm, d_zmm);
+                let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                acc = _mm512_add_epi64(acc, pop_zmm);
+            }
+            let sum: i64 = _mm512_reduce_add_epi64(acc);
+            diag[a] = sum as u32;
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::Contingency;
+    use rand::{RngExt, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    /// Reconstruct the per-coordinate bucket codes a doc was encoded with
+    /// by reading back the index's stored bitmaps. Used to feed API 1's
+    /// dense `Contingency` the *same* bucket assignment the index holds.
+    fn doc_codes(idx: &MultiBucketBitmap, doc_idx: usize) -> Vec<u8> {
+        let nb = idx.n_buckets();
+        let qpb = idx.qwords_per_bitmap;
+        let dim = idx.dim();
+        let base = doc_idx * nb * qpb;
+        let mut codes = vec![0u8; dim];
+        for b in 0..nb {
+            let off = base + b * qpb;
+            // `j` is load-bearing: it indexes the bit position (`j/64`,
+            // `j%64`) as well as `codes[j]`.
+            #[allow(clippy::needless_range_loop)]
+            for j in 0..dim {
+                if (idx.bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 {
+                    codes[j] = b as u8;
+                }
+            }
+        }
+        codes
+    }
+
+    /// Reconstruct the query bucket codes from the query bitmaps.
+    fn query_codes(q_bitmaps: &[u64], nb: usize, qpb: usize, dim: usize) -> Vec<u8> {
+        let mut codes = vec![0u8; dim];
+        for b in 0..nb {
+            let off = b * qpb;
+            // `j` is load-bearing: bit position and `codes[j]` index.
+            #[allow(clippy::needless_range_loop)]
+            for j in 0..dim {
+                if (q_bitmaps[off + j / 64] >> (j % 64)) & 1 == 1 {
+                    codes[j] = b as u8;
+                }
+            }
+        }
+        codes
+    }
+
+    /// Independent pure-scalar reference for the full table, not routed
+    /// through the dispatch (so the dispatched path can't mask its own bug).
+    fn reference_table(q_bitmaps: &[u64], doc: &[u64], nb: usize, qpb: usize) -> Vec<u32> {
+        let mut table = vec![0u32; nb * nb];
+        for a in 0..nb {
+            for b in 0..nb {
+                let mut overlap = 0u32;
+                for w in 0..qpb {
+                    overlap += (q_bitmaps[a * qpb + w] & doc[b * qpb + w]).count_ones();
+                }
+                table[a * nb + b] = overlap;
+            }
+        }
+        table
+    }
+
+    fn make_corpus(seed: u64, n: usize, dim: usize) -> Vec<f32> {
+        let mut rng = ChaCha8Rng::seed_from_u64(seed);
+        (0..n * dim).map(|_| rng.random_range(-1.0..1.0)).collect()
+    }
+
+    /// CORRECTNESS GATE (issue #219): the dispatched contingency path
+    /// (AVX-512 on this host) and the diagonal fast path must produce
+    /// BIT-IDENTICAL results to (a) an independent scalar reference, the
+    /// forced scalar kernel, and (c) a freshly-built dense `Contingency`
+    /// (API 1) over the same query/doc bucket codes. Counts are exact
+    /// integers — equality is exact, not tolerance-based.
+    ///
+    /// Dims chosen to exercise the masked tail: 384 (qpb=6, all-tail, no
+    /// full ZMM), 768 (qpb=12, one full ZMM + 4-word tail), 1024 (qpb=16,
+    /// two full ZMMs, no tail).
+    #[test]
+    fn parity_scalar_simd_diagonal_dense() {
+        for &dim in &[384usize, 768, 1024] {
+            for bits in [2u8, 4u8] {
+                let nb = 1usize << bits;
+                let qpb = dim / 64;
+                let n = 24;
+                let corpus = make_corpus(0xC0FFEE ^ dim as u64 ^ (bits as u64) << 32, n, dim);
+                let mut idx = MultiBucketBitmap::new(dim, bits);
+                idx.add(&corpus);
+
+                let mut rng = ChaCha8Rng::seed_from_u64(0x5EED ^ dim as u64);
+                let query: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+                let q_bitmaps = idx.query_bitmaps_from_ranks(&query);
+                let q_codes = query_codes(&q_bitmaps, nb, qpb, dim);
+
+                for di in 0..n {
+                    let doc_base = di * nb * qpb;
+                    let doc = &idx.bitmaps[doc_base..doc_base + nb * qpb];
+
+                    // (a) independent scalar reference table.
+                    let want = reference_table(&q_bitmaps, doc, nb, qpb);
+
+                    // forced scalar kernel.
+                    let mut scalar = vec![0u32; nb * nb];
+                    contingency_accumulate_scalar(&q_bitmaps, doc, nb, qpb, &mut scalar);
+                    assert_eq!(
+                        scalar, want,
+                        "scalar kernel != reference (dim={dim}, bits={bits}, doc={di})",
+                    );
+
+                    // dispatched path (AVX-512 on this host).
+                    let dispatched = idx.contingency_row(&q_bitmaps, di);
+                    assert_eq!(
+                        dispatched, want,
+                        "dispatched (SIMD) != reference (dim={dim}, bits={bits}, doc={di})",
+                    );
+
+                    // diagonal fast path == diagonal of the full table.
+                    let diag = idx.diagonal_overlap_row(&q_bitmaps, di);
+                    let want_diag: Vec<u32> = (0..nb).map(|a| want[a * nb + a]).collect();
+                    assert_eq!(
+                        diag, want_diag,
+                        "diagonal fast path != table diagonal (dim={dim}, bits={bits}, doc={di})",
+                    );
+
+                    // dense `Contingency` (API 1) over the same codes.
+                    let d_codes = doc_codes(&idx, di);
+                    let dense = Contingency::new(&q_codes, &d_codes, nb).unwrap();
+                    assert_eq!(
+                        dispatched,
+                        dense.counts(),
+                        "dispatched (SIMD) != dense Contingency (dim={dim}, bits={bits}, doc={di})",
+                    );
+                    // And the dense diagonal trace matches the fast-path sum.
+                    assert_eq!(
+                        diag.iter().sum::<u32>(),
+                        dense.diagonal_agreement(),
+                        "diagonal sum != dense diagonal_agreement (dim={dim}, bits={bits}, doc={di})",
+                    );
+                }
+            }
+        }
+    }
+
+    /// `project_all_batched` applies every projection to each doc's table
+    /// built once, and agrees with per-doc `bilinear_score` (which rescans
+    /// per projection) for the same weight matrices.
+    #[test]
+    fn project_all_batched_matches_bilinear_score() {
+        let dim = 768;
+        let bits = 4u8;
+        let n = 40;
+        let corpus = make_corpus(0xBA7C4, n, dim);
+        let mut idx = MultiBucketBitmap::new(dim, bits);
+        idx.add(&corpus);
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0xD0C5);
+        let query: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+        let q_bitmaps = idx.query_bitmaps_from_ranks(&query);
+
+        let outer = idx.outer_product_weights();
+        let diagonal = idx.diagonal_weights();
+        let banded = idx.banded_weights(1);
+        let weights: Vec<&[f32]> = vec![&outer, &diagonal, &banded];
+
+        let batched = idx.project_all_batched(&q_bitmaps, &weights);
+        assert_eq!(batched.len(), n);
+        for (di, row) in batched.iter().enumerate() {
+            assert_eq!(row.len(), 3);
+            // bilinear_score rescans the doc per weight matrix — the
+            // batched path must reproduce it exactly (same integer table,
+            // same weighted sum, same f32 accumulation order per matrix).
+            for (p, w) in weights.iter().enumerate() {
+                let want = idx.bilinear_score(&q_bitmaps, w, di);
+                assert_eq!(
+                    row[p], want,
+                    "project_all_batched[{di}][{p}] != bilinear_score",
+                );
+            }
+        }
+    }
+
+    /// Diagonal fast path sums to the `diagonal_weights` bilinear score.
+    #[test]
+    fn diagonal_row_sums_to_diagonal_bilinear_score() {
+        let dim = 384;
+        let bits = 2u8;
+        let n = 16;
+        let corpus = make_corpus(0xD1A6, n, dim);
+        let mut idx = MultiBucketBitmap::new(dim, bits);
+        idx.add(&corpus);
+        let mut rng = ChaCha8Rng::seed_from_u64(0xF00D);
+        let query: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+        let q_bitmaps = idx.query_bitmaps_from_ranks(&query);
+        let w = idx.diagonal_weights();
+        for di in 0..n {
+            let diag_sum: u32 = idx.diagonal_overlap_row(&q_bitmaps, di).iter().sum();
+            assert_eq!(diag_sum as f32, idx.bilinear_score(&q_bitmaps, &w, di));
+        }
+    }
+
+    #[test]
+    fn empty_index_project_all_batched_is_empty() {
+        let idx = MultiBucketBitmap::new(256, 2);
+        let q_bitmaps = idx.query_bitmaps_from_ranks(&vec![0.0f32; 256]);
+        let diag = idx.diagonal_weights();
+        let weights: Vec<&[f32]> = vec![&diag];
+        assert!(idx.project_all_batched(&q_bitmaps, &weights).is_empty());
+    }
+
+    #[test]
+    fn project_all_batched_no_weights_yields_empty_rows() {
+        let dim = 256;
+        let mut idx = MultiBucketBitmap::new(dim, 2);
+        idx.add(&make_corpus(1, 5, dim));
+        let q_bitmaps = idx.query_bitmaps_from_ranks(&vec![0.5f32; dim]);
+        let weights: Vec<&[f32]> = Vec::new();
+        let out = idx.project_all_batched(&q_bitmaps, &weights);
+        assert_eq!(out.len(), 5);
+        assert!(out.iter().all(|row| row.is_empty()));
+    }
+}