From 790f4781bb34ae5c539a40ebb551f5d922ea5fa4 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 15:20:41 -0500
Subject: [PATCH 1/9] feat: capability-gated b=8 RankQuant (research/evidence
 width) (#221)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

b=8 is an evidence/refinement-oriented RankQuant width (asymmetric quant after
repair flows, edge-case rerank healing) — a stable, documented surface, NOT
experimental-unstable. Capability matrix:
  - bucket-code generation / pair-evidence / asymmetric scoring: ANY dim
  - symmetric scoring + analytical norm: ONLY dim % 256 == 0

API (additive, no breaking changes to existing signatures):
- RankQuantCapability { AsymmetricOnly, SymmetricAndAsymmetric } + capability() +
  symmetric_supported().
- RankQuant::new(dim, 8) requires dim % 256 == 0 (full capability; fail-loud else,
  directing to new_asymmetric). RankQuant::new_asymmetric(dim, 8): any dim,
  AsymmetricOnly (auto-upgrades to full when 256-aligned). b=1/2/4 unchanged.
- search() on an AsymmetricOnly instance fails loud with the exact message
  'RankQuant b=8 symmetric scoring requires dim % 256 == 0; dim={dim} supports
  asymmetric/evidence APIs only.' (documented; check symmetric_supported() first).
- validate_params(dim, 8): code-validity any dim (no dim%256). Primitives widened
  to bits<=8 (mask in u16 to avoid 1u8<<8 overflow); b=8 packs 1 byte/coord.

Kernel: b=8 asymmetric scoring uses an AVX-512 vgatherdps kernel (dim*256 LUT
gather), runtime-dispatched when avx512f is detected and dim%16==0 (explicit tail
handling), with a scalar LUT fallback otherwise. Measured speedup is ~1.23x over
scalar on the benchmark host; the kernel is gather/LUT-latency bound, so the gain
is modest and is reported as measured. Parity vs scalar is within 1e-4 across
dims 384/400/768/1024/1536.

Persistence: b=8 write() returns io::Error(Unsupported) (no file) — the .tvrq
loader admits {1,2,4} only; b=8 is in-memory this phase (FLAGGED, see PR).

Verified: fmt/clippy(-D warnings, default+experimental)/test(196 default + 206
experimental + no-default-features)/MSRV 1.89 green; b=1/2/4 unchanged; unsafe
gather bounds proven. Closes #221.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/lib.rs              |   2 +-
 src/quant.rs            | 385 +++++++++++++++++++++++++++++-----
 src/quant_kernels.rs    | 450 ++++++++++++++++++++++++++++++++++++++++
 src/rank.rs             | 169 ++++++++++++---
 tests/index/main.rs     |   1 +
 tests/index/quant_b8.rs | 449 +++++++++++++++++++++++++++++++++++++++
 tests/redteam_gamma.rs  |   5 +-
 7 files changed, 1373 insertions(+), 88 deletions(-)
 create mode 100644 tests/index/quant_b8.rs

diff --git a/src/lib.rs b/src/lib.rs
index 85158248..d0408d1a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -63,7 +63,7 @@ pub mod sign_bitmap;
 mod util;
 
 pub use bitmap::Bitmap;
-pub use quant::{rankquant_eval_search, RankQuant, TwoStageCandidatePolicy};
+pub use quant::{rankquant_eval_search, RankQuant, RankQuantCapability, TwoStageCandidatePolicy};
 pub use rank::Rank;
 pub use rank_io::{probe_index_metadata, IndexKind, IndexMetadata, IndexParams};
 pub use sign_bitmap::SignBitmap;
diff --git a/src/quant.rs b/src/quant.rs
index 2c664033..d82d09cd 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -1,9 +1,22 @@
 //! `B`-bit bucketed-rank index ([`RankQuant`]).
 //!
-//! Storage is `dim * bits / 8` bytes per document at `bits ∈ {1, 2, 4}`.
-//! Symmetric search uses a per-query, per-coord LUT; asymmetric search
-//! dispatches AVX-512 → AVX2 → scalar via the kernels in
-//! [`crate::quant_kernels`].
+//! Storage is `dim * bits / 8` bytes per document at `bits ∈ {1, 2, 4, 8}`
+//! (`b=8` is one byte per coordinate). Symmetric search uses a per-query,
+//! per-coord LUT; asymmetric search dispatches AVX-512 → AVX2 → scalar via
+//! the kernels in [`crate::quant_kernels`].
+//!
+//! `b=8` is an evidence/refinement-oriented width: it is supported for
+//! asymmetric scoring and code/projection generation at **any** dimension,
+//! but symmetric scoring uses the equal-bucket analytical norm and therefore
+//! requires `dim % 256 == 0`. For `b ∈ {1, 2, 4}` the existing retrieval
+//! modes remain the stable headline surface; `b=8` is an opt-in,
+//! explicitly-documented high-precision evidence/refinement surface
+//! (e.g. asymmetric quant storage after repair flows, edge-case rerank
+//! healing), not a broad retrieval-quant method. It is **not**
+//! unstable-experimental. See [`RankQuantCapability`] and
+//! [`RankQuant::new_asymmetric`]. Its asymmetric path is a per-coordinate
+//! gather against the `dim * 256` LUT: an AVX-512 `vgatherdps` kernel when
+//! available (`avx512f` + `dim % 16 == 0`), else the portable scalar LUT.
 //!
 //! The byte-LUT path ([`search_asymmetric_byte_lut`]) is re-exported
 //! `#[doc(hidden)]` (reachable as `ordvec::search_asymmetric_byte_lut`)
@@ -13,7 +26,8 @@
 use rayon::prelude::*;
 
 use crate::quant_kernels::{
-    scan_b1_to_topk, scan_b2_to_topk, scan_b4_to_topk, scan_via_lut_scalar,
+    scan_b1_to_topk, scan_b2_to_topk, scan_b4_to_topk, scan_b8_asym, scan_b8_to_topk,
+    scan_via_lut_scalar,
 };
 #[cfg(target_arch = "x86_64")]
 use crate::quant_kernels::{
@@ -61,21 +75,66 @@ fn rankquant_eval_buckets(v: &[f32], bits: u8, out: &mut [u8]) {
     }
 }
 
+/// Which scoring modes a [`RankQuant`] instance supports.
+///
+/// The distinction only matters for `b=8`. For `b ∈ {1, 2, 4}` every
+/// constructor produces a [`SymmetricAndAsymmetric`](Self::SymmetricAndAsymmetric)
+/// instance (the `dim % 2^bits == 0` constructor invariant always holds),
+/// so callers never need to branch on this for the headline widths.
+///
+/// For `b=8` the symmetric analytical L2 norm is exact only when every
+/// bucket receives equal occupancy, i.e. `dim % 256 == 0`. When that
+/// holds the instance is [`SymmetricAndAsymmetric`](Self::SymmetricAndAsymmetric);
+/// otherwise it is [`AsymmetricOnly`](Self::AsymmetricOnly) — code/projection
+/// generation, pair-evidence/contingency, and asymmetric (float-query)
+/// scoring all work at *any* dim, but the symmetric path
+/// ([`RankQuant::search`]) panics.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum RankQuantCapability {
+    /// Asymmetric (float-query) scoring and code/projection generation
+    /// only. Reachable for `b=8` when `dim % 256 != 0`. Symmetric
+    /// scoring ([`RankQuant::search`]) panics on these instances.
+    AsymmetricOnly,
+    /// Full surface: both symmetric and asymmetric scoring. The only
+    /// capability for `b ∈ {1, 2, 4}`, and the capability for `b=8` when
+    /// `dim % 256 == 0`.
+    SymmetricAndAsymmetric,
+}
+
 /// `B`-bit RankQuant index.
 ///
 /// Each document is encoded by bucketing its rank vector into
 /// `1 << bits` equal-width bins on `[0, dim)` and packing `bits` bits
 /// per coordinate. Storage is `dim * bits / 8` bytes per document.
-/// Supported bit widths are `1`, `2`, and `4` (3-bit packing is left
-/// for a follow-up; use `2` or `4` in the interim).
+/// Supported bit widths are `1`, `2`, `4`, and `8` (3-bit packing is
+/// left for a follow-up; use `2` or `4` in the interim).
 ///
 /// The mean-centred bucket vector has fixed analytical L2 norm
 /// `sqrt(dim * (2^(2B) - 1) / 12)` when `dim % (1 << bits) == 0`, so
 /// no per-document norms are stored.
+///
+/// # `b=8` — evidence/refinement width
+/// `b=8` is an evidence/refinement-oriented RankQuant width. It is
+/// supported for asymmetric scoring and code/projection generation at
+/// any dimension; symmetric scoring uses the equal-bucket analytical
+/// norm and therefore requires `dim % 256 == 0`. For `b ∈ {1, 2, 4}`,
+/// the existing retrieval modes remain the stable headline surface;
+/// `b=8` is an opt-in, explicitly-documented high-precision
+/// evidence/refinement surface (e.g. asymmetric quant storage after
+/// repair flows, edge-case rerank healing), not a broad retrieval-quant
+/// method. It is **not** unstable-experimental — it is a stable, core
+/// surface — but it is capability-gated: construct an asymmetric-only
+/// `b=8` index for non-`256`-aligned dims via [`Self::new_asymmetric`]
+/// and check [`Self::symmetric_supported`] before calling
+/// [`Self::search`]. See [`RankQuantCapability`].
 pub struct RankQuant {
     pub(crate) dim: usize,
     pub(crate) bits: u8,
     pub(crate) n_vectors: usize,
+    /// Scoring modes this instance supports — see [`RankQuantCapability`].
+    /// Computed once at construction; for `b ∈ {1, 2, 4}` always
+    /// [`RankQuantCapability::SymmetricAndAsymmetric`].
+    pub(crate) capability: RankQuantCapability,
     /// Row-major packed bucket bytes. `n_vectors * dim * bits / 8` total.
     pub(crate) packed: Vec<u8>,
 }
@@ -178,11 +237,27 @@ fn select_simd_tier(dim: usize, bits: u8) -> SimdTier {
 }
 
 impl RankQuant {
+    /// Validate `(dim, bits)` for **code validity** — the precondition for
+    /// generating bucket codes, projections, and asymmetric scores.
+    ///
+    /// Accepts `bits ∈ {1, 2, 4, 8}` and `dim ∈ [2, u16::MAX]`.
+    ///
+    /// For `b ∈ {1, 2, 4}` this additionally requires `dim % 2^bits == 0`
+    /// (the equal-bucket constant-composition invariant): those widths only
+    /// expose a full symmetric+asymmetric surface, so code validity and
+    /// symmetric-norm validity coincide.
+    ///
+    /// For `b = 8` it validates **only** that codes pack (`codes_per_byte ==
+    /// 1`, so any `dim` works) — it does **not** require `dim % 256 == 0`.
+    /// That `dim % 256 == 0` rule is a *symmetric-scoring* precondition, not
+    /// a code-validity one, and is checked separately on the symmetric path
+    /// (and by [`Self::new`], which constructs a full-capability `b=8`
+    /// instance). Use [`Self::new_asymmetric`] for any-`dim` `b=8`.
     pub fn validate_params(dim: usize, bits: u8) -> Result<(), OrdvecError> {
-        if !matches!(bits, 1 | 2 | 4) {
+        if !matches!(bits, 1 | 2 | 4 | 8) {
             return Err(OrdvecError::InvalidParameter {
                 name: "bits",
-                message: "must be 1, 2, or 4".to_string(),
+                message: "must be 1, 2, 4, or 8".to_string(),
             });
         }
         if dim < 2 {
@@ -204,20 +279,45 @@ impl RankQuant {
                 message: format!("must be a multiple of {codes_per_byte} for bits = {bits}"),
             });
         }
-        let n_buckets = 1usize << bits;
-        if !dim.is_multiple_of(n_buckets) {
-            return Err(OrdvecError::InvalidParameter {
-                name: "dim",
-                message: format!(
-                    "must be divisible by 2^bits = {n_buckets} so every bucket receives exactly dim / 2^bits rank entries"
-                ),
-            });
+        // The constant-composition invariant `dim % 2^bits == 0` exists only to
+        // make the symmetric analytical L2 norm exact (equal bucket occupancy).
+        // For b ∈ {1,2,4} we keep requiring it here (those widths are
+        // full-capability by definition), but for b=8 it is a *symmetric*
+        // precondition checked elsewhere — code/projection/asymmetric paths
+        // never need equal buckets, so a non-256-aligned dim is a valid b=8
+        // *code* configuration.
+        if bits != 8 {
+            let n_buckets = 1usize << bits;
+            if !dim.is_multiple_of(n_buckets) {
+                return Err(OrdvecError::InvalidParameter {
+                    name: "dim",
+                    message: format!(
+                        "must be divisible by 2^bits = {n_buckets} so every bucket receives exactly dim / 2^bits rank entries"
+                    ),
+                });
+            }
         }
         Ok(())
     }
 
+    /// Construct a full-capability (`SymmetricAndAsymmetric`) index.
+    ///
+    /// For `b ∈ {1, 2, 4}` this is unchanged: `bits` must be one of those
+    /// widths and `dim % 2^bits == 0` (and `dim % (8 / bits) == 0`).
+    ///
+    /// For `b = 8` this requires `dim % 256 == 0`, which yields the full
+    /// symmetric+asymmetric surface. If `dim % 256 != 0` it **panics**
+    /// (consistent with this constructor's existing fail-loud style),
+    /// directing the caller to [`Self::new_asymmetric`] for an any-`dim`
+    /// asymmetric-only `b=8` index. See [`RankQuantCapability`].
+    ///
+    /// # Panics
+    /// Panics if `bits ∉ {1, 2, 4, 8}`, if `dim < 2`, if `dim > u16::MAX`,
+    /// if `dim % (8 / bits) != 0`, or — for the equal-bucket symmetric
+    /// invariant — if `dim % 2^bits != 0` (`b ∈ {1,2,4}`) / `dim % 256 != 0`
+    /// (`b = 8`).
     pub fn new(dim: usize, bits: u8) -> Self {
-        assert!(matches!(bits, 1 | 2 | 4), "bits must be 1, 2, or 4");
+        assert!(matches!(bits, 1 | 2 | 4 | 8), "bits must be 1, 2, 4, or 8");
         assert!(dim >= 2, "dim must be >= 2");
         assert!(dim <= u16::MAX as usize, "dim must fit in u16");
         let codes_per_byte = (8 / bits) as usize;
@@ -226,6 +326,27 @@ impl RankQuant {
             0,
             "dim must be a multiple of {codes_per_byte} for bits = {bits}",
         );
+        if bits == 8 {
+            // b=8 full-capability requires dim % 256 == 0 (equal bucket
+            // occupancy → exact symmetric analytical norm). Fail loud and
+            // point at the asymmetric-only constructor so the caller has a
+            // non-surprising path for non-aligned dims.
+            assert_eq!(
+                dim % 256,
+                0,
+                "RankQuant::new(dim, 8) requires dim % 256 == 0 for symmetric \
+                 scoring (equal-bucket analytical norm); dim={dim} is not \
+                 256-aligned. Use RankQuant::new_asymmetric(dim, 8) for an \
+                 asymmetric-only b=8 index at any dim.",
+            );
+            return Self {
+                dim,
+                bits,
+                n_vectors: 0,
+                capability: RankQuantCapability::SymmetricAndAsymmetric,
+                packed: Vec::new(),
+            };
+        }
         // Audit-safety: require dim divisible by 2^bits so every bucket
         // gets exactly dim / (1 << bits) rank entries per document. This
         // is what makes `rankquant_norm` analytically exact (every doc
@@ -245,10 +366,94 @@ impl RankQuant {
             dim,
             bits,
             n_vectors: 0,
+            capability: RankQuantCapability::SymmetricAndAsymmetric,
             packed: Vec::new(),
         }
     }
 
+    /// Construct an asymmetric-capable index at **any** valid `dim`.
+    ///
+    /// This is the non-surprising entry point for `b = 8` at a dimension
+    /// that is not `256`-aligned: it produces a
+    /// [`RankQuantCapability::AsymmetricOnly`] instance whose
+    /// code/projection generation, pair-evidence/contingency, and
+    /// asymmetric (float-query) scoring all work, but whose symmetric path
+    /// ([`Self::search`]) panics (the equal-bucket analytical norm is not
+    /// exact off the `256`-aligned grid). When `dim % 256 == 0`, the `b=8`
+    /// instance is upgraded to full [`RankQuantCapability::SymmetricAndAsymmetric`]
+    /// (there is no reason to withhold symmetric scoring when it is exact).
+    ///
+    /// For `b ∈ {1, 2, 4}` this constructs the same full-capability instance
+    /// as [`Self::new`] (those widths are always symmetric-capable when their
+    /// constructor invariants hold), so it is never *less* capable than
+    /// `new` — it is simply the width-agnostic constructor.
+    ///
+    /// # Panics
+    /// Panics if `(dim, bits)` is not a valid **code** configuration —
+    /// i.e. `bits ∉ {1, 2, 4, 8}`, `dim < 2`, `dim > u16::MAX`, or
+    /// `dim % (8 / bits) != 0`. For `b ∈ {1, 2, 4}` it additionally requires
+    /// `dim % 2^bits == 0` (same as [`Self::new`]).
+    pub fn new_asymmetric(dim: usize, bits: u8) -> Self {
+        // Reuse the code-validity gate (for b=8 any in-range dim passes,
+        // 256-aligned or not; b ∈ {1,2,4} still need dim % 2^bits == 0). Convert the structured
+        // error into a panic so this constructor matches `new`'s fail-loud style.
+        Self::validate_params(dim, bits)
+            .unwrap_or_else(|e| panic!("RankQuant::new_asymmetric invalid params: {e}"));
+        let capability = Self::capability_for(dim, bits);
+        Self {
+            dim,
+            bits,
+            n_vectors: 0,
+            capability,
+            packed: Vec::new(),
+        }
+    }
+
+    /// Compute the capability for a code-valid `(dim, bits)` pair.
+    ///
+    /// `b ∈ {1, 2, 4}` and `256`-aligned `b=8` are full-capability; any
+    /// other (i.e. non-`256`-aligned) `b=8` is asymmetric-only.
+    #[inline]
+    fn capability_for(dim: usize, bits: u8) -> RankQuantCapability {
+        if bits == 8 && !dim.is_multiple_of(256) {
+            RankQuantCapability::AsymmetricOnly
+        } else {
+            RankQuantCapability::SymmetricAndAsymmetric
+        }
+    }
+
+    /// The scoring modes this instance supports — see [`RankQuantCapability`].
+    ///
+    /// Always [`RankQuantCapability::SymmetricAndAsymmetric`] for
+    /// `b ∈ {1, 2, 4}`. For `b=8` it reflects whether `dim % 256 == 0`.
+    #[inline]
+    pub fn capability(&self) -> RankQuantCapability {
+        self.capability
+    }
+
+    /// Whether [`Self::search`] (symmetric scoring) is supported on this
+    /// instance. `true` for `b ∈ {1, 2, 4}` and for `256`-aligned `b=8`;
+    /// `false` for `b=8` at a non-`256`-aligned dim (asymmetric-only).
+    ///
+    /// Callers should check this before invoking [`Self::search`] on a
+    /// `b=8` index built via [`Self::new_asymmetric`].
+    #[inline]
+    pub fn symmetric_supported(&self) -> bool {
+        matches!(self.capability, RankQuantCapability::SymmetricAndAsymmetric)
+    }
+
+    /// Fail loud with the exact symmetric-gating message when symmetric
+    /// scoring is invoked on an asymmetric-only (`b=8`, non-`256`-aligned)
+    /// instance. No-op for symmetric-capable instances.
+    #[inline]
+    fn assert_symmetric_supported(&self) {
+        assert!(
+            self.symmetric_supported(),
+            "RankQuant b=8 symmetric scoring requires dim % 256 == 0; dim={} supports asymmetric/evidence APIs only.",
+            self.dim,
+        );
+    }
+
     /// Add documents. Each vector is rank-transformed, bucketed to `bits`
     /// bits/coord, and bit-packed row-major.
     ///
@@ -288,7 +493,21 @@ impl RankQuant {
 
     /// Symmetric search: bucket the query and score against bucketed
     /// docs.
+    ///
+    /// # Panics
+    /// For a `b=8` index built via [`Self::new_asymmetric`] at a
+    /// non-`256`-aligned dim (an [`RankQuantCapability::AsymmetricOnly`]
+    /// instance), this **panics**: the symmetric analytical norm requires
+    /// equal bucket occupancy (`dim % 256 == 0`). Check
+    /// [`Self::symmetric_supported`] first, or use [`Self::search_asymmetric`],
+    /// which works at any dim. (`b ∈ {1, 2, 4}` and `256`-aligned `b=8`
+    /// instances never trip this.) The panic message is:
+    /// `RankQuant b=8 symmetric scoring requires dim % 256 == 0; dim={dim}
+    /// supports asymmetric/evidence APIs only.`
     pub fn search(&self, queries: &[f32], k: usize) -> SearchResults {
+        // Symmetric gating: fail loud (with the exact message) for an
+        // asymmetric-only b=8 instance before doing any work.
+        self.assert_symmetric_supported();
         let nq = queries.len() / self.dim;
         assert_eq!(queries.len(), nq * self.dim);
         assert_all_finite(queries);
@@ -338,6 +557,7 @@ impl RankQuant {
                     1 => scan_b1_to_topk(&self.packed, n, dim, &lut, inv_norm_sq, &mut top),
                     2 => scan_b2_to_topk(&self.packed, n, dim, &lut, inv_norm_sq, &mut top),
                     4 => scan_b4_to_topk(&self.packed, n, dim, &lut, inv_norm_sq, &mut top),
+                    8 => scan_b8_to_topk(&self.packed, n, dim, &lut, inv_norm_sq, &mut top),
                     _ => unreachable!(),
                 }
                 top.finalize_into(out_scores, out_indices);
@@ -359,6 +579,15 @@ impl RankQuant {
     /// (`LUT[d][b] = q_unit[d] * bucket_centre(b)`). The scan unpacks
     /// `8 / bits` codes per byte and accumulates via LUT lookups; the
     /// compiler autovectorises the inner sum.
+    ///
+    /// Works at **any** valid dim for all supported widths including `b=8`
+    /// (the asymmetric path needs no equal-bucket precondition). For `b=8`
+    /// the score is a per-coordinate gather `Σ_d lut[d*256 + code[d]]`
+    /// against the `dim * 256` LUT: it dispatches to the AVX-512
+    /// `vgatherdps` kernel (`scan_b8_asym` → `scan_b8_asym_avx512_gather`)
+    /// when `avx512f` is present and `dim % 16 == 0`, else the portable
+    /// scalar LUT reference (`scan_b8_to_topk`). Unlike [`Self::search`],
+    /// this does not reject asymmetric-only instances (input asserts still apply).
     pub fn search_asymmetric(&self, queries: &[f32], k: usize) -> SearchResults {
         let nq = queries.len() / self.dim;
         assert_eq!(queries.len(), nq * self.dim);
@@ -421,6 +650,18 @@ impl RankQuant {
             .for_each(|((q, out_scores), out_indices)| {
                 let q_unit = l2_normalise(q);
                 let mut top = TopK::new(k_eff);
+
+                // b=8 is a per-coordinate gather (`Σ_d lut[d*256 + code[d]]`),
+                // not a centre-drop dot product — it routes to its own
+                // dispatch (AVX-512 vgatherdps → scalar LUT) and never uses
+                // the centre-drop offset (its LUT bakes the centre in).
+                if bits == 8 {
+                    scan_b8_asym(&self.packed, n, dim, &q_unit, inv_norm, &mut top);
+                    top.finalize_into(out_scores, out_indices);
+                    let _ = bytes_per_vec; // shape clarity
+                    return;
+                }
+
                 #[cfg(target_arch = "x86_64")]
                 let centre_offset = {
                     let q_sum: f32 = q_unit.iter().sum();
@@ -518,7 +759,23 @@ impl RankQuant {
     }
 
     /// Persist to a `.tvrq` file. Format: 14-byte header + packed bytes.
+    ///
+    /// # `b=8`
+    /// The `.tvrq` on-disk format and its loader currently support only
+    /// `bits ∈ {1, 2, 4}`. `b=8` is an in-memory evidence/refinement surface
+    /// in this phase; persisting it is a follow-up. To avoid writing a file
+    /// that [`Self::load`] would then reject (a silent broken round-trip),
+    /// this returns `io::Error` (kind `Unsupported`) for a `b=8` index rather
+    /// than emitting an unloadable file.
     pub fn write(&self, path: impl AsRef<std::path::Path>) -> std::io::Result<()> {
+        if self.bits == 8 {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::Unsupported,
+                "RankQuant b=8 persistence is not supported yet (the .tvrq loader \
+                 accepts bits ∈ {1, 2, 4}); b=8 is an in-memory evidence surface \
+                 in this phase",
+            ));
+        }
         crate::rank_io::write_rankquant(path, self.bits, self.dim, self.n_vectors, &self.packed)
     }
 
@@ -578,10 +835,16 @@ impl RankQuant {
                 ),
             ));
         }
+        // `load_rankquant` only admits bits ∈ {1,2,4} (b=8 is not persistable
+        // in this phase — see `write`), and those widths are always
+        // full-capability, so the loaded instance is SymmetricAndAsymmetric.
+        // `capability_for` keeps that derivation in one place.
+        let capability = Self::capability_for(dim, bits);
         Ok(Self {
             dim,
             bits,
             n_vectors,
+            capability,
             packed,
         })
     }
@@ -682,48 +945,56 @@ impl RankQuant {
         #[cfg_attr(not(target_arch = "x86_64"), allow(unused_variables))]
         let simd_tier = select_simd_tier(dim, bits);
         let mut top = TopK::new_with_tie_keys(k_eff, candidates);
-        #[cfg(target_arch = "x86_64")]
-        unsafe {
-            match (simd_tier, bits) {
-                (SimdTier::Avx512, 2) => {
-                    top.set_score_offset(centre_offset);
-                    scan_b2_asym_avx512(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
-                }
-                (SimdTier::Avx512, 4) => {
-                    top.set_score_offset(centre_offset);
-                    scan_b4_asym_avx512(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
-                }
-                (SimdTier::Avx2, 2) => {
-                    top.set_score_offset(centre_offset);
-                    scan_b2_asym_avx2(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
-                }
-                (SimdTier::Avx2, 4) => {
-                    top.set_score_offset(centre_offset);
-                    scan_b4_asym_avx2(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
+        // b=8 routes to its own gather dispatch (AVX-512 vgatherdps → scalar
+        // LUT), with the centre baked into the LUT (no score-offset trick).
+        // The tie keys on `top` still map local scratch positions → global
+        // row IDs exactly as for b ∈ {1,2,4}.
+        if bits == 8 {
+            scan_b8_asym(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
+        } else {
+            #[cfg(target_arch = "x86_64")]
+            unsafe {
+                match (simd_tier, bits) {
+                    (SimdTier::Avx512, 2) => {
+                        top.set_score_offset(centre_offset);
+                        scan_b2_asym_avx512(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
+                    }
+                    (SimdTier::Avx512, 4) => {
+                        top.set_score_offset(centre_offset);
+                        scan_b4_asym_avx512(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
+                    }
+                    (SimdTier::Avx2, 2) => {
+                        top.set_score_offset(centre_offset);
+                        scan_b2_asym_avx2(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
+                    }
+                    (SimdTier::Avx2, 4) => {
+                        top.set_score_offset(centre_offset);
+                        scan_b4_asym_avx2(&sub_packed, m, dim, &q_unit, inv_norm, &mut top);
+                    }
+                    _ => scan_via_lut_scalar(
+                        &sub_packed,
+                        m,
+                        dim,
+                        bits,
+                        n_buckets,
+                        &q_unit,
+                        inv_norm,
+                        &mut top,
+                    ),
                 }
-                _ => scan_via_lut_scalar(
-                    &sub_packed,
-                    m,
-                    dim,
-                    bits,
-                    n_buckets,
-                    &q_unit,
-                    inv_norm,
-                    &mut top,
-                ),
             }
+            #[cfg(not(target_arch = "x86_64"))]
+            scan_via_lut_scalar(
+                &sub_packed,
+                m,
+                dim,
+                bits,
+                n_buckets,
+                &q_unit,
+                inv_norm,
+                &mut top,
+            );
         }
-        #[cfg(not(target_arch = "x86_64"))]
-        scan_via_lut_scalar(
-            &sub_packed,
-            m,
-            dim,
-            bits,
-            n_buckets,
-            &q_unit,
-            inv_norm,
-            &mut top,
-        );
 
         let mut scores = vec![f32::NEG_INFINITY; k_eff];
         let mut local_indices = vec![-1i64; k_eff];
diff --git a/src/quant_kernels.rs b/src/quant_kernels.rs
index 92f790c7..f4319a0f 100644
--- a/src/quant_kernels.rs
+++ b/src/quant_kernels.rs
@@ -40,6 +40,7 @@ pub(crate) fn scan_via_lut_scalar(
         1 => scan_b1_to_topk(packed, n, dim, &lut, scale, top),
         2 => scan_b2_to_topk(packed, n, dim, &lut, scale, top),
         4 => scan_b4_to_topk(packed, n, dim, &lut, scale, top),
+        8 => scan_b8_to_topk(packed, n, dim, &lut, scale, top),
         _ => unreachable!("bits validated in new()"),
     }
 }
@@ -127,6 +128,57 @@ pub(crate) fn scan_b4_to_topk(
     }
 }
 
+/// Build the `dim * 256` per-coordinate asymmetric LUT for `b=8`:
+/// `lut[d * 256 + code] = q_unit[d] * bucket_centre(code, 8)`. This is the
+/// shared input to both the scalar [`scan_b8_to_topk`] reference and the
+/// AVX-512 [`scan_b8_asym_avx512_gather`] kernel, so both sum the same entries.
+///
+/// `bucket_centre(code, 8) = code - 127.5`, so each row is the query
+/// coordinate scaled across the 256 centred bucket values.
+pub(crate) fn build_b8_asym_lut(q_unit: &[f32]) -> Vec<f32> {
+    let dim = q_unit.len();
+    let mut lut = vec![0.0f32; dim * 256];
+    for d in 0..dim {
+        let qd = q_unit[d];
+        let row = &mut lut[d * 256..(d + 1) * 256];
+        for (code, slot) in row.iter_mut().enumerate() {
+            *slot = qd * bucket_centre(code as u8, 8);
+        }
+    }
+    lut
+}
+
+/// 8-bit scan. 1 code per byte; n_buckets = 256. The degenerate
+/// one-code-per-byte case: `doc[d]` is the code at coordinate `d`, so the
+/// inner loop is a single LUT lookup per byte against the `dim * 256`
+/// per-coord LUT. Used by both the symmetric path (`bucket_centre` LUT)
+/// and the asymmetric scalar LUT path (`q_unit[d] * bucket_centre(b)`).
+///
+/// This is also the **portable scalar reference** for the `b=8` asymmetric
+/// gather: it sums in strict coordinate order, one lookup + add per byte,
+/// so it is the fixed-order baseline the AVX-512 gather kernel is parity-
+/// tested against (to the crate's 1e-4 cross-backend tolerance, not bitwise).
+pub(crate) fn scan_b8_to_topk(
+    packed: &[u8],
+    n: usize,
+    dim: usize,
+    lut: &[f32],
+    scale: f32,
+    top: &mut TopK,
+) {
+    let bytes_per_vec = dim; // 1 byte per coordinate
+    for di in 0..n {
+        let doc = &packed[di * bytes_per_vec..(di + 1) * bytes_per_vec];
+        let mut acc = 0.0f32;
+        for (d, &code) in doc.iter().enumerate() {
+            // LUT row `d` has 256 entries (one per code value); the code is
+            // already the bucket index for b=8.
+            acc += lut[d * 256 + code as usize];
+        }
+        top.maybe_insert(acc * scale, di);
+    }
+}
+
 // -------------------------------------------------------------------
 // AVX2 + FMA kernels for the asymmetric path.
 //
@@ -499,3 +551,401 @@ pub(crate) unsafe fn scan_b4_asym_avx512(
         }
     }
 }
+
+/// Single entry point for the `b=8` asymmetric scan.
+///
+/// Builds the `dim * 256` LUT once ([`build_b8_asym_lut`]), then runs the
+/// AVX-512 gather kernel ([`scan_b8_asym_avx512_gather`]) when `avx512f`
+/// is detected at runtime and `dim % 16 == 0`, else the portable scalar
+/// reference ([`scan_b8_to_topk`]); the `unsafe` SIMD reach stays here.
+///
+/// # Panics
+/// Panics if `q_unit.len() != dim` or `packed.len() < n * dim`.
+pub(crate) fn scan_b8_asym(
+    packed: &[u8],
+    n: usize,
+    dim: usize,
+    q_unit: &[f32],
+    scale: f32,
+    top: &mut TopK,
+) {
+    // Checked in every build: this safe fn feeds raw-pointer reads below.
+    assert_eq!(q_unit.len(), dim, "b=8 query length must equal dim");
+    assert!(packed.len() >= n * dim, "b=8 packed buffer shorter than n * dim");
+    let lut = build_b8_asym_lut(q_unit);
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("avx512f") && dim.is_multiple_of(16) {
+            // SAFETY: `avx512f` was detected at runtime and `dim % 16 == 0`
+            // holds; the asserts above give `packed.len() >= n * dim`, and
+            // the LUT built from a `dim`-long query has `dim * 256` entries.
+            unsafe {
+                scan_b8_asym_avx512_gather(packed, n, dim, &lut, scale, top);
+            }
+            return;
+        }
+    }
+    scan_b8_to_topk(packed, n, dim, &lut, scale, top);
+}
+
+// -------------------------------------------------------------------
+// AVX-512 gather kernel for the b=8 asymmetric path.
+//
+// Unlike b ∈ {2, 4} — whose tiny per-byte arithmetic (shift/mask/cvt/FMA)
+// beats any memory indirection — b=8 carries a large per-coordinate
+// 256-entry float LUT (`lut[d * 256 + code]`), so the score is an honest
+// gather: `Σ_d lut[d * 256 + doc_code[d]]`. The dominant cost is the
+// gather, which `vgatherdps` (`_mm512_i32gather_ps`) issues 16-wide in a
+// single instruction.
+//
+// Per 16-coordinate chunk:
+//   * load 16 doc bytes, zero-extend to i32 lanes (`_mm512_cvtepu8_epi32`);
+//   * add the per-position row-base vector `[d*256, (d+1)*256, …]` so lane
+//     `j` indexes `lut[(d+j) * 256 + code[d+j]]`;
+//   * `_mm512_i32gather_ps(idx, lut_ptr, 4)` gathers all 16 contributions;
+//   * accumulate (plain add — the LUT already encodes `q · centre`).
+// Four independent accumulators break the add dependency chain, matching
+// the b=2/b=4 AVX-512 kernels. Unlike those, b=8 needs no centre-drop
+// trick: the asymmetric LUT bakes the per-coordinate query weight in, so
+// there is no per-query constant offset to reapply at finalize.
+//
+// Caller must verify `is_x86_feature_detected!("avx512f")` once. The LUT
+// is the same `dim * 256` f32 layout the scalar `scan_b8_to_topk` consumes,
+// so the two paths are score-parity (modulo f32 summation order, within
+// the crate's 1e-4 cross-backend tolerance).
+// -------------------------------------------------------------------
+
+/// # Safety
+/// The CPU must support `avx512f`; callers verify this once via
+/// `is_x86_feature_detected!("avx512f")` before calling.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+pub(crate) unsafe fn scan_b8_asym_avx512_gather(
+    packed: &[u8],
+    n: usize,
+    dim: usize,
+    lut: &[f32],
+    scale: f32,
+    top: &mut TopK,
+) {
+    use std::arch::x86_64::*;
+
+    // Hard backstops, checked in every build (see `scan_b2_asym_avx2`): a
+    // mis-dispatch or mis-shaped buffer must fail loudly instead of dropping
+    // the trailing chunk or reading out of bounds through the raw pointers.
+    assert_eq!(dim % 16, 0, "b=8 AVX-512 gather path needs dim % 16 == 0");
+    assert_eq!(lut.len(), dim * 256, "b=8 LUT must be dim * 256 entries");
+    assert!(packed.len() >= n * dim, "b=8 packed buffer shorter than n * dim");
+    // Gather indices live in i32 lanes, so every LUT index must fit in one.
+    assert!(lut.len() <= i32::MAX as usize, "b=8 LUT too large for i32 gather");
+
+    // SAFETY: the caller guarantees `avx512f` (this fn's contract); the
+    // asserts above establish everything the raw reads below depend on:
+    //   * `packed.len() >= n * dim` (b=8 stores one byte per coordinate), so
+    //     each 16-byte load at `di * dim + c * 16` stays inside `packed`;
+    //   * `lut.len() == dim * 256` (one 256-entry row per coordinate);
+    //   * `dim % 16 == 0`, so the 16-lane chunk loop tiles each doc exactly
+    //     with no tail.
+    // Every gather index `(d + j) * 256 + code` is `< dim * 256` because
+    // `d + j < dim` and `code <= 255`, so each gathered f32 is in-bounds, and
+    // the index fits the i32 lane because `dim * 256 <= i32::MAX`.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        let bytes_per_vec = dim; // one byte per coordinate
+        let lut_ptr = lut.as_ptr();
+
+        // Per-position row bases for one 16-lane chunk: lane j contributes
+        // `j * 256`. The chunk's coordinate offset `c * 16 * 256` is added
+        // as a broadcast `chunk_base` in `step!` below.
+        let lane_row_base = _mm512_setr_epi32(
+            0, 256, 512, 768, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584,
+            3840,
+        );
+        let chunks_per_vec = bytes_per_vec / 16;
+        // Round chunks down to a multiple of 4 for the unrolled body; a
+        // `dim % 64 != 0` (but `% 16 == 0`) dim leaves a ≤3-chunk tail.
+        let unrolled = chunks_per_vec & !3;
+
+        for di in 0..n {
+            let doc = packed.as_ptr().add(di * bytes_per_vec);
+            let mut acc0 = _mm512_setzero_ps();
+            let mut acc1 = _mm512_setzero_ps();
+            let mut acc2 = _mm512_setzero_ps();
+            let mut acc3 = _mm512_setzero_ps();
+
+            // One 16-coordinate chunk: gather its LUT entries into `$acc`.
+            // Shared by the unrolled body and the tail so the index math
+            // exists in exactly one place.
+            macro_rules! step {
+                ($cc:expr, $acc:expr) => {{
+                    // Coordinate base for this chunk: `cc * 16 * 256`.
+                    let chunk_base = _mm512_set1_epi32(($cc * 16 * 256) as i32);
+                    // Load 16 doc bytes, zero-extend to 16 i32 lanes.
+                    let bytes = _mm_loadu_si128(doc.add($cc * 16) as *const __m128i);
+                    let codes = _mm512_cvtepu8_epi32(bytes);
+                    // idx[j] = chunk_base + (j * 256) + code[j]
+                    //        = (cc*16 + j) * 256 + code[cc*16 + j]
+                    let idx = _mm512_add_epi32(_mm512_add_epi32(chunk_base, lane_row_base), codes);
+                    // Gather 16 LUT contributions (scale = 4 bytes/f32).
+                    let vals = _mm512_i32gather_ps::<4>(idx, lut_ptr);
+                    $acc = _mm512_add_ps($acc, vals);
+                }};
+            }
+
+            let mut c = 0usize;
+            while c < unrolled {
+                step!(c, acc0);
+                step!(c + 1, acc1);
+                step!(c + 2, acc2);
+                step!(c + 3, acc3);
+                c += 4;
+            }
+            // Tail: remaining (< 4) chunks fold into acc0.
+            while c < chunks_per_vec {
+                step!(c, acc0);
+                c += 1;
+            }
+
+            let s01 = _mm512_add_ps(acc0, acc1);
+            let s23 = _mm512_add_ps(acc2, acc3);
+            let total = _mm512_add_ps(s01, s23);
+            let raw = _mm512_reduce_add_ps(total);
+            top.maybe_insert(raw * scale, di);
+        }
+    }
+}
+
+#[cfg(all(test, target_arch = "x86_64"))]
+mod b8_gather_tests {
+    use super::{build_b8_asym_lut, scan_b8_asym_avx512_gather, scan_b8_to_topk};
+    use crate::util::TopK;
+    use rand::{RngExt, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    /// Finalize a `k`-slot `TopK` into parallel `(scores, idxs)` vecs in the
+    /// collector's own output order, so the two kernels are compared on the
+    /// exact values a caller would receive.
+    fn drain(top: &TopK, k: usize) -> (Vec<f32>, Vec<i64>) {
+        let mut scores = vec![f32::NEG_INFINITY; k];
+        let mut idxs = vec![-1i64; k];
+        top.finalize_into(&mut scores, &mut idxs);
+        (scores, idxs)
+    }
+
+    /// The AVX-512 `vgatherdps` b=8 kernel must match the scalar LUT
+    /// reference within the crate's 1e-4 cross-backend score tolerance,
+    /// across the headline embedding dims (all `% 16 == 0`, so the gather
+    /// path is actually exercised). 768/1536 are `% 64 == 0` (full
+    /// 4-way-unrolled body); to also cover the ≤3-chunk tail path we add
+    /// dim=400 (`400 % 16 == 0`, `400 % 64 == 16`).
+    #[test]
+    fn b8_gather_matches_scalar_reference() {
+        if !is_x86_feature_detected!("avx512f") {
+            eprintln!("skipping b8 gather parity: no avx512f on this host");
+            return;
+        }
+        for &dim in &[384usize, 400, 768, 1024, 1536] {
+            assert_eq!(dim % 16, 0, "test dims must be % 16 for the gather path");
+            let n = 64;
+            let k = 10;
+            let mut rng = ChaCha8Rng::seed_from_u64(0x00B8_0000 + dim as u64);
+
+            // Random doc codes (any byte 0..=255) and a random unit-ish query.
+            let packed: Vec<u8> = (0..n * dim).map(|_| rng.random::<u8>()).collect();
+            let q: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+            let qn: f32 = q.iter().map(|x| x * x).sum::<f32>().sqrt();
+            let q_unit: Vec<f32> = q.iter().map(|x| x / qn).collect();
+            let scale = 1.0f32 / 137.0; // arbitrary inv_norm-like scale
+
+            let lut = build_b8_asym_lut(&q_unit);
+
+            let mut top_scalar = TopK::new(k);
+            scan_b8_to_topk(&packed, n, dim, &lut, scale, &mut top_scalar);
+            let (s_scalar, i_scalar) = drain(&top_scalar, k);
+
+            let mut top_gather = TopK::new(k);
+            // SAFETY: avx512f confirmed above; dim % 16 == 0; packed has
+            // n*dim bytes and lut has dim*256 entries by construction.
+            unsafe {
+                scan_b8_asym_avx512_gather(&packed, n, dim, &lut, scale, &mut top_gather);
+            }
+            let (s_gather, i_gather) = drain(&top_gather, k);
+
+            for slot in 0..k {
+                assert!(
+                    (s_scalar[slot] - s_gather[slot]).abs() < 1e-4,
+                    "dim={dim} slot={slot}: scalar {} vs gather {}",
+                    s_scalar[slot],
+                    s_gather[slot],
+                );
+            }
+            // With well-separated random scores the top-k id sets agree too.
+            assert_eq!(
+                i_scalar, i_gather,
+                "dim={dim}: top-{k} id ordering diverged between scalar and gather"
+            );
+        }
+    }
+
+    /// The gather kernel's per-doc raw score equals the brute-force
+    /// `Σ_d lut[d*256 + code[d]]` (before the `scale` multiply), confirming
+    /// the index math `idx[j] = (c*16 + j) * 256 + code` is exact.
+    ///
+    /// This compares the *unscaled* sum, whose magnitude (~10² for centred
+    /// b=8 codes up to ±127.5 over `dim` terms) is far larger than the
+    /// `inv_norm`-scaled score a caller sees. The SIMD kernel's 4-way
+    /// parallel accumulation rounds in a different order from the strict
+    /// sequential brute-force, so the check is *relative* (< 1e-4): the
+    /// production 1e-4 *absolute* tolerance applies to the small final
+    /// scaled scores, which the parity test above covers.
+    #[test]
+    fn b8_gather_raw_score_is_exact_gather_sum() {
+        if !is_x86_feature_detected!("avx512f") {
+            return;
+        }
+        let dim = 256usize;
+        let n = 8;
+        let k = n;
+        let mut rng = ChaCha8Rng::seed_from_u64(0x00B8_FACE);
+        let packed: Vec<u8> = (0..n * dim).map(|_| rng.random::<u8>()).collect();
+        let q_unit: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+        let lut = build_b8_asym_lut(&q_unit);
+
+        let mut top = TopK::new(k);
+        // SAFETY: avx512f confirmed; dim % 16 == 0; shapes match.
+        unsafe {
+            scan_b8_asym_avx512_gather(&packed, n, dim, &lut, 1.0, &mut top);
+        }
+        let (scores, idxs) = drain(&top, k);
+
+        // Brute-force reference, indexed by returned doc id.
+        let want: Vec<f32> = (0..n)
+            .map(|di| {
+                let doc = &packed[di * dim..(di + 1) * dim];
+                doc.iter()
+                    .enumerate()
+                    .map(|(d, &code)| lut[d * 256 + code as usize])
+                    .sum::<f32>()
+            })
+            .collect();
+        for slot in 0..k {
+            let di = idxs[slot] as usize;
+            let rel = (scores[slot] - want[di]).abs() / want[di].abs().max(1.0);
+            assert!(
+                rel < 1e-4,
+                "doc {di}: gather {} vs brute {} (rel {rel})",
+                scores[slot],
+                want[di]
+            );
+        }
+    }
+
+    /// Honest, kernel-isolated micro-benchmark: b=8 scalar LUT vs b=8
+    /// AVX-512 gather vs the b=4 AVX-512 asym kernel, on the same N×dim
+    /// corpus. `#[ignore]` so it does not run in the default gate — invoke
+    /// with:
+    ///
+    /// ```text
+    /// cargo test --release --lib b8_kernel_microbench -- --ignored --nocapture
+    /// ```
+    ///
+    /// It times the scan only (the b=8 LUT is built once, before timing), so the
+    /// scalar-vs-SIMD decision is measured directly, not inferred. Per-iteration
+    /// wall time is reported in ms and as ns/doc/dim so the cost is
+    /// comparable across widths. Numbers are wall-clock and vary run-to-run;
+    /// the parity tests above are the correctness gate.
+    #[test]
+    #[ignore = "perf micro-bench; run explicitly with --ignored --nocapture --release"]
+    fn b8_kernel_microbench() {
+        use crate::quant_kernels::{scan_b4_asym_avx512, scan_b8_asym_avx512_gather};
+        use std::time::Instant;
+
+        let have_avx512 =
+            is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512dq");
+        let dim = 1024usize; // % 64 == 0 → valid for both b=4 and b=8 SIMD
+        let n = 50_000usize;
+        let k = 10usize;
+        let iters = 20usize;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x00B8_4BE4);
+        let q: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+        let qn: f32 = q.iter().map(|x| x * x).sum::<f32>().sqrt();
+        let q_unit: Vec<f32> = q.iter().map(|x| x / qn).collect();
+        let scale = 1.0f32 / 137.0;
+
+        // b=8 corpus: one byte per coord.
+        let packed8: Vec<u8> = (0..n * dim).map(|_| rng.random::<u8>()).collect();
+        // b=4 corpus: two codes per byte → dim/2 bytes per doc.
+        let packed4: Vec<u8> = (0..n * dim / 2).map(|_| rng.random::<u8>()).collect();
+
+        let lut8 = build_b8_asym_lut(&q_unit);
+
+        let bench = |label: &str, mut f: Box<dyn FnMut()>| {
+            f(); // warmup
+            let t0 = Instant::now();
+            for _ in 0..iters {
+                f();
+            }
+            let per = t0.elapsed().as_secs_f64() / iters as f64;
+            let ns_per_doc_dim = per * 1e9 / (n as f64 * dim as f64);
+            let gdocs = n as f64 / per / 1e9;
+            println!(
+                "  {label:<26} {:>8.3} ms/scan  {:>7.3} ns/doc/dim  {:>7.3} Gdoc/s",
+                per * 1e3,
+                ns_per_doc_dim,
+                gdocs,
+            );
+        };
+
+        println!(
+            "\nb=8 asymmetric kernel micro-bench (dim={dim}, n={n}, k={k}, iters={iters}, avx512={have_avx512})"
+        );
+
+        {
+            let packed8 = packed8.clone();
+            let lut8 = lut8.clone();
+            bench(
+                "b=8 scalar LUT",
+                Box::new(move || {
+                    let mut top = TopK::new(k);
+                    scan_b8_to_topk(&packed8, n, dim, &lut8, scale, &mut top);
+                    std::hint::black_box(&top);
+                }),
+            );
+        }
+
+        if have_avx512 {
+            let packed8 = packed8.clone();
+            let lut8 = lut8.clone();
+            bench(
+                "b=8 AVX-512 gather",
+                Box::new(move || {
+                    let mut top = TopK::new(k);
+                    // SAFETY: avx512f confirmed; dim % 16 == 0; shapes match.
+                    unsafe {
+                        scan_b8_asym_avx512_gather(&packed8, n, dim, &lut8, scale, &mut top);
+                    }
+                    std::hint::black_box(&top);
+                }),
+            );
+
+            // b=4 AVX-512 asym for cross-width context (raw codes, no LUT;
+            // dim % 64 == 0 satisfies its lane invariant).
+            let packed4 = packed4.clone();
+            let q_unit4 = q_unit.clone();
+            bench(
+                "b=4 AVX-512 asym (context)",
+                Box::new(move || {
+                    let mut top = TopK::new(k);
+                    // SAFETY: avx512f+dq confirmed; dim % 64 == 0; shapes match.
+                    unsafe {
+                        scan_b4_asym_avx512(&packed4, n, dim, &q_unit4, scale, &mut top);
+                    }
+                    std::hint::black_box(&top);
+                }),
+            );
+        } else {
+            println!("  (avx512 unavailable — SIMD rows skipped)");
+        }
+    }
+}
diff --git a/src/rank.rs b/src/rank.rs
index 27005bf2..c74bba8f 100644
--- a/src/rank.rs
+++ b/src/rank.rs
@@ -74,8 +74,12 @@ pub fn rank_transform_into(v: &[f32], out: &mut [u16]) {
 /// Bucket a single rank into one of `1 << bits` equal-width bins on
 /// `[0, d)`. Returns a value in `[0, 1 << bits)`.
 ///
+/// For `bits == 8` the codomain is the full `u8` range `[0, 256)`; a
+/// valid `rank < d` keeps the quotient `rank * 256 / d < 256`, so the
+/// result still fits the returned `u8`.
+///
 /// # Panics
-/// Panics if `bits > 7`, if `d == 0`, or if `rank >= d`. The `rank < d`
+/// Panics if `bits > 8`, if `d == 0`, or if `rank >= d`. The `rank < d`
 /// guard fails loud in *every* build — like the sibling [`pack_buckets`] and
 /// [`bucket_centre`] checks — rather than silently clamping an out-of-range
 /// rank into the top bucket. Internal callers feed ranks straight from
@@ -83,12 +87,14 @@ pub fn rank_transform_into(v: &[f32], out: &mut [u16]) {
 /// hot path.
 #[inline]
 pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 {
-    // `bits` is a `u8`, so a caller could pass e.g. 8 or 255. `1u32 << bits`
+    // `bits` is a `u8`, so a caller could pass e.g. 9 or 255. `1u32 << bits`
     // overflows for `bits >= 32` (in release that silently wraps and yields a
     // wrong bucket; in debug it panics inconsistently), and the result must
-    // also fit in the returned `u8`, so cap at 7. `d == 0` would divide by
-    // zero. Guard both up front so the failure is loud in every build.
-    assert!(bits <= 7, "bits too large");
+    // also fit in the returned `u8`, so cap at 8 — the widest RankQuant width
+    // (b=8 yields one bucket per code value in `[0, 256)`, which still fits a
+    // `u8`). `d == 0` would divide by zero. Guard both up front so the failure
+    // is loud in every build.
+    assert!(bits <= 8, "bits too large");
     assert!(d > 0, "d must be positive");
     // A valid rank is a position in `[0, d)`. Reject `rank >= d` loudly instead
     // of silently clamping the quotient back into range: the rest of the public
@@ -121,7 +127,7 @@ pub fn bucket_ranks(ranks: &[u16], bits: u8) -> Vec<u8> {
     // input — an empty `ranks` skips the per-entry `rank_to_bucket` check and
     // would otherwise silently return an empty vec. Mirrors the Python binding,
     // which checks `bits` before its empty short-circuit.
-    assert!(bits <= 7, "bits too large");
+    assert!(bits <= 8, "bits too large");
     let d = ranks.len();
     ranks.iter().map(|&r| rank_to_bucket(r, d, bits)).collect()
 }
@@ -130,19 +136,22 @@ pub fn bucket_ranks(ranks: &[u16], bits: u8) -> Vec<u8> {
 /// dense byte stream.
 ///
 /// Layout: the bucket with index 0 occupies the most-significant bits
-/// of the first byte. Requires `bits ∈ {1, 2, 4}` and `d`'s length to
-/// be a multiple of `8 / bits`.
+/// of the first byte. Requires `bits ∈ {1, 2, 4, 8}` and `d`'s length to
+/// be a multiple of `8 / bits`. For `bits == 8` the packing is the
+/// degenerate one-code-per-byte case: each code is copied verbatim into
+/// its own byte (no sub-byte shifting), so any `d` is valid.
 ///
 /// # Panics
-/// Panics if `bits ∉ {1, 2, 4}`, if `buckets.len()` is not a multiple
+/// Panics if `bits ∉ {1, 2, 4, 8}`, if `buckets.len()` is not a multiple
 /// of `8 / bits`, or if any code is `>= 1 << bits`. The last guard is
 /// the public-contract backstop: an out-of-range code would otherwise
 /// be silently truncated to `code & ((1 << bits) - 1)` and corrupt the
 /// packed stream. (Internal callers feed codes straight from
 /// [`rank_to_bucket`], which is always in range; this protects direct
-/// callers of the primitive.)
+/// callers of the primitive.) Note the `b=8` code range is the full
+/// `u8`, so the range guard is vacuously satisfied for that width.
 pub fn pack_buckets(buckets: &[u8], bits: u8) -> Vec<u8> {
-    assert!(matches!(bits, 1 | 2 | 4), "bits must be 1, 2, or 4");
+    assert!(matches!(bits, 1 | 2 | 4 | 8), "bits must be 1, 2, 4, or 8");
     let codes_per_byte = (8 / bits) as usize;
     let d = buckets.len();
     assert_eq!(
@@ -150,7 +159,10 @@ pub fn pack_buckets(buckets: &[u8], bits: u8) -> Vec<u8> {
         0,
         "d ({d}) must be a multiple of codes_per_byte ({codes_per_byte}) for bits = {bits}",
     );
-    let mask = (1u8 << bits) - 1;
+    // `(1u8 << 8)` overflows a `u8`, so compute the mask in `u16` and narrow
+    // it back: `b=8` yields the full byte (`0xFF`). For `b ∈ {1,2,4}` this is
+    // the same value the old `(1u8 << bits) - 1` produced.
+    let mask = ((1u16 << bits) - 1) as u8;
     let n_bytes = d / codes_per_byte;
     let mut out = vec![0u8; n_bytes];
     let bits_u = bits as usize;
@@ -160,6 +172,8 @@ pub fn pack_buckets(buckets: &[u8], bits: u8) -> Vec<u8> {
     // fail-loud guarantee without a second O(d) pass over `buckets`; the
     // branch is loop-invariant-predictable for the always-valid internal
     // callers. Asserting `b <= mask` makes the trailing `& mask` redundant.
+    // At `b=8`, `codes_per_byte == 1`, so `shift == 0` and each byte holds one
+    // code verbatim.
     for (i, &b) in buckets.iter().enumerate() {
         assert!(
             b <= mask,
@@ -178,10 +192,12 @@ pub fn pack_buckets(buckets: &[u8], bits: u8) -> Vec<u8> {
 ///
 /// Inverse of [`pack_buckets`].
 pub fn unpack_buckets(packed: &[u8], d: usize, bits: u8) -> Vec<u8> {
-    assert!(matches!(bits, 1 | 2 | 4), "bits must be 1, 2, or 4");
+    assert!(matches!(bits, 1 | 2 | 4 | 8), "bits must be 1, 2, 4, or 8");
     let codes_per_byte = (8 / bits) as usize;
     assert_eq!(packed.len() * codes_per_byte, d);
-    let mask = (1u8 << bits) - 1;
+    // `(1u8 << 8)` overflows a `u8`; compute in `u16` and narrow so `b=8`
+    // yields the full-byte mask `0xFF` (each byte already holds one code).
+    let mask = ((1u16 << bits) - 1) as u8;
     let bits_u = bits as usize;
     let mut out = vec![0u8; d];
     #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout
@@ -195,13 +211,16 @@ pub fn unpack_buckets(packed: &[u8], d: usize, bits: u8) -> Vec<u8> {
 }
 
 /// Number of bytes per packed RankQuant document at dimension `d` and
-/// bit width `bits ∈ {1, 2, 4}`.
+/// bit width `bits ∈ {1, 2, 4, 8}`.
+///
+/// At `bits == 8` each coordinate occupies its own byte (`codes_per_byte
+/// == 1`), so the storage is exactly `d` bytes per document.
 #[inline]
 pub fn rankquant_bytes_per_vec(d: usize, bits: u8) -> usize {
     // Guard the same domain as the sibling pack/unpack helpers: `bits == 0`
-    // would divide by zero computing `codes_per_byte`, and only 1/2/4 give an
+    // would divide by zero computing `codes_per_byte`, and only 1/2/4/8 give an
     // integral codes-per-byte.
-    assert!(matches!(bits, 1 | 2 | 4), "bits must be 1,2,4");
+    assert!(matches!(bits, 1 | 2 | 4 | 8), "bits must be 1,2,4,8");
     let codes_per_byte = (8 / bits) as usize;
     assert_eq!(
         d % codes_per_byte,
@@ -219,17 +238,20 @@ pub fn rankquant_bytes_per_vec(d: usize, bits: u8) -> usize {
 /// pattern `..., -1.5, -0.5, +0.5, +1.5, ...` for `B = 2`.
 ///
 /// # Panics
-/// Panics if `bits > 7` — bucket codes are `u8`, so the bit width is
+/// Panics if `bits > 8` — bucket codes are `u8`, so the bit width is
 /// capped at the representable bucketing range, matching
-/// [`rank_to_bucket`] (the RankQuant family uses `bits ∈ {1, 2, 4}`).
+/// [`rank_to_bucket`] (the RankQuant family uses `bits ∈ {1, 2, 4, 8}`).
 /// Also panics if `bucket >= 1 << bits`; this guard fails loud in *every*
 /// build — like the sibling [`pack_buckets`] check — so a direct caller
 /// cannot silently receive a centre outside the symmetric range. The
 /// internal LUT builders only ever pass `bucket ∈ [0, 1 << bits)` (the
 /// loop bound *is* `1 << bits`), so the assert never trips on the hot path.
+/// For `bits == 8` the centres span `..., -0.5, +0.5, ...` around zero
+/// with `bucket - 127.5`; the range guard is vacuous (every `u8` is a
+/// valid code).
 #[inline]
 pub fn bucket_centre(bucket: u8, bits: u8) -> f32 {
-    assert!(bits <= 7, "bits too large");
+    assert!(bits <= 8, "bits too large");
     assert!(
         (bucket as u32) < (1u32 << bits),
         "bucket {bucket} out of range for bits = {bits}",
@@ -270,14 +292,21 @@ pub fn rank_norm(d: usize) -> f32 {
 /// The mean-centred bucket index has variance `(2^(2B) - 1) / 12`, so
 /// the per-vector L2 norm is `sqrt(d * (2^(2B) - 1) / 12)`.
 ///
+/// This is the **symmetric analytical** norm: it is exact only when
+/// every bucket receives exactly `d / 2^B` coordinates, i.e. when
+/// `d % 2^B == 0`. For `bits == 8` that precondition is `d % 256 == 0`;
+/// the [`crate::RankQuant`] symmetric path enforces it before calling
+/// this (see `RankQuant::new` / `symmetric_supported`). This primitive
+/// itself only computes the closed form and does not re-check occupancy.
+///
 /// # Panics
-/// Panics if `bits ∉ {1, 2, 4}`, mirroring the [`crate::RankQuant`]
+/// Panics if `bits ∉ {1, 2, 4, 8}`, mirroring the [`crate::RankQuant`]
 /// bit-width domain (and [`rankquant_bytes_per_vec`]). Without it a
 /// nonsensical `bits` would return a norm for a scheme that does not
 /// exist (or overflow `1 << bits`).
 #[inline]
 pub fn rankquant_norm(d: usize, bits: u8) -> f32 {
-    assert!(matches!(bits, 1 | 2 | 4), "bits must be 1,2,4");
+    assert!(matches!(bits, 1 | 2 | 4 | 8), "bits must be 1,2,4,8");
     let n = (1u32 << bits) as f64;
     let var = (n * n - 1.0) / 12.0;
     ((d as f64) * var).sqrt() as f32
@@ -629,10 +658,20 @@ mod tests {
 
     #[test]
     #[should_panic(expected = "bits too large")]
-    fn bucket_ranks_rejects_bits_above_7_even_when_empty() {
+    fn bucket_ranks_rejects_bits_above_8_even_when_empty() {
         // `bits` is validated up front, so an invalid width fails loud even on
         // empty input — which never reaches the per-entry rank_to_bucket guard.
-        let _ = bucket_ranks(&[], 8);
+        // The valid bucketing range now extends to b=8 (the widest RankQuant
+        // width), so b=9 is the first rejected width.
+        let _ = bucket_ranks(&[], 9);
+    }
+
+    #[test]
+    fn bucket_ranks_accepts_bits_8() {
+        // b=8 is a supported width: with d = 4 each rank maps to
+        // `rank * 256 / 4`, so the codes land on exact multiples of 64.
+        let codes = bucket_ranks(&[0, 1, 2, 3], 8);
+        assert_eq!(codes, [0u8, 64, 128, 192]);
     }
 
     #[test]
@@ -662,6 +701,72 @@ mod tests {
         assert_eq!(unpacked, buckets);
     }
 
+    #[test]
+    fn pack_unpack_round_trip_bits8() {
+        // At b=8 packing degenerates to one code per byte, so the packed
+        // stream has as many bytes as there are codes and equals the input.
+        // Exercise every code value, 0 through 255, so neither end of the
+        // byte range can be lost on the way in or out.
+        let every_code: Vec<u8> = (0..=u8::MAX).collect();
+        let stream = pack_buckets(&every_code, 8);
+        assert_eq!(stream.len(), 256, "b=8 stores one byte per code");
+        assert_eq!(stream, every_code, "b=8 packing is the identity byte stream");
+        let round_trip = unpack_buckets(&stream, every_code.len(), 8);
+        assert_eq!(round_trip, every_code);
+    }
+
+    #[test]
+    fn pack_unpack_round_trip_bits8_arbitrary_len() {
+        // With one code per byte every length packs at b=8. 384 is not a
+        // multiple of 256 and must still round-trip: only the symmetric norm
+        // needs equal bucket occupancy, code generation does not.
+        let codes: Vec<u8> = (0..=u8::MAX).cycle().take(384).collect();
+        let stream = pack_buckets(&codes, 8);
+        assert_eq!(stream.len(), 384);
+        let round_trip = unpack_buckets(&stream, 384, 8);
+        assert_eq!(round_trip, codes);
+    }
+
+    #[test]
+    fn rank_to_bucket_b8_spans_full_byte_range() {
+        // rank in [0, d) with bits=8 must land in [0, 256); pin both extremes.
+        let d = 256usize;
+        assert_eq!(rank_to_bucket(0, d, 8), 0);
+        assert_eq!(rank_to_bucket(255, d, 8), 255);
+        // A wider d still tops out at 255: 383 * 256 / 384 truncates to 255.
+        assert_eq!(rank_to_bucket(383, 384, 8), 255);
+        for rank in 0..=u8::MAX {
+            // With d == 256 the quotient `rank * 256 / d` is the rank itself.
+            assert_eq!(rank_to_bucket(u16::from(rank), d, 8), rank);
+        }
+    }
+
+    #[test]
+    fn bucket_centre_b8_is_symmetric_around_zero() {
+        // The 256 b=8 centres run from -127.5 up to +127.5 and cancel out.
+        assert_eq!(bucket_centre(u8::MIN, 8), -127.5);
+        assert_eq!(bucket_centre(u8::MAX, 8), 127.5);
+        let sum: f32 = (0..=u8::MAX).map(|code| bucket_centre(code, 8)).sum();
+        assert!(sum.abs() < 1e-3, "b=8 centres should sum to ~0, got {sum}");
+    }
+
+    #[test]
+    fn rankquant_norm_b8_matches_direct_computation() {
+        // 512 is a multiple of 256, so every bucket receives exactly d/256
+        // entries and the closed-form norm must match the directly summed one.
+        let (d, bits) = (512usize, 8u8);
+        let ranks: Vec<u16> = (0..d as u16).collect();
+        let direct: f32 = bucket_ranks(&ranks, bits)
+            .into_iter()
+            .map(|code| bucket_centre(code, bits))
+            .map(|centre| centre * centre)
+            .sum::<f32>()
+            .sqrt();
+        let analytical = rankquant_norm(d, bits);
+        let rel_err = (analytical - direct).abs() / direct;
+        assert!(rel_err < 1e-5, "analytical {analytical}, direct {direct}");
+    }
+
     #[test]
     fn bucket_centres_are_symmetric_around_zero() {
         // For B = 2: bucket values are {-1.5, -0.5, +0.5, +1.5}.
@@ -718,10 +823,18 @@ mod tests {
 
     #[test]
     #[should_panic(expected = "bits too large")]
-    fn bucket_centre_rejects_bits_above_7() {
-        // bits >= 32 overflows `1 << bits`; the guard caps at 7 (the u8
-        // bucket domain), matching `rank_to_bucket`.
-        let _ = bucket_centre(0, 8);
+    fn bucket_centre_rejects_bits_above_8() {
+        // bits >= 32 overflows `1 << bits`; the guard caps at 8 (the widest
+        // RankQuant width, whose codes still fit a u8), matching
+        // `rank_to_bucket`. b=9 is the first rejected width.
+        let _ = bucket_centre(0, 9);
+    }
+
+    #[test]
+    fn bucket_centre_accepts_bits_8() {
+        // b=8 centres are valid: code 0 → -127.5, code 255 → +127.5.
+        assert_eq!(bucket_centre(0, 8), -127.5);
+        assert_eq!(bucket_centre(255, 8), 127.5);
     }
 
     #[test]
diff --git a/tests/index/main.rs b/tests/index/main.rs
index 3a1177de..63c23a1c 100644
--- a/tests/index/main.rs
+++ b/tests/index/main.rs
@@ -33,6 +33,7 @@ mod rank;
 #[cfg(feature = "experimental")]
 mod multi_bucket;
 mod quant;
+mod quant_b8;
 mod two_stage;
 
 pub const D: usize = 128;
diff --git a/tests/index/quant_b8.rs b/tests/index/quant_b8.rs
new file mode 100644
index 00000000..429014ac
--- /dev/null
+++ b/tests/index/quant_b8.rs
@@ -0,0 +1,449 @@
+//! Capability-gated `b=8` RankQuant integration tests (#221).
+//!
+//! `b=8` is a stable/core evidence-refinement width, not experimental:
+//!
+//! - code generation, pair-evidence, and asymmetric (float-query) scoring
+//!   work at **any** dim;
+//! - symmetric scoring (and the symmetric analytical norm) require
+//!   `dim % 256 == 0` (equal bucket occupancy), so a non-`256`-aligned
+//!   `b=8` index is `AsymmetricOnly` and its `search` panics with an exact,
+//!   directing message.
+//!
+//! These tests pin the maintainer's capability matrix plus a brute-force
+//! parity check of the scalar `b=8` asymmetric path against a naive
+//! reference.
+
+use ordvec::rank::{bucket_centre, bucket_ranks, rank_transform, rankquant_norm};
+use ordvec::{RankQuant, RankQuantCapability};
+use rand::{RngExt, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+
+/// Naive reference for `b=8` asymmetric scoring of one float query against
+/// one float doc: L2-normalise the query, rank-transform + bucket the doc to
+/// `b=8` codes, score `Σ_d q_unit[d] * bucket_centre(code[d]) / norm`. This
+/// mirrors `ref_rankquant_asymmetric` in the shared helpers but is duplicated
+/// here so the b=8 module is self-contained.
+fn ref_b8_asymmetric(q: &[f32], doc: &[f32]) -> f32 {
+    let d = q.len();
+    let q_norm: f32 = q.iter().map(|x| x * x).sum::<f32>().sqrt();
+    let q_unit: Vec<f32> = q.iter().map(|x| x / q_norm).collect();
+    let r = rank_transform(doc);
+    let codes = bucket_ranks(&r, 8);
+    let norm = rankquant_norm(d, 8);
+    let mut acc = 0.0f32;
+    for i in 0..d {
+        acc += q_unit[i] * bucket_centre(codes[i], 8);
+    }
+    acc / norm
+}
+
+fn random_corpus(seed: u64, n: usize, dim: usize) -> Vec<f32> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..n * dim).map(|_| rng.random_range(-1.0..1.0)).collect()
+}
+
+// ---------------------------------------------------------------------
+// Capability reporting.
+// ---------------------------------------------------------------------
+
+#[test]
+fn b8_new_asymmetric_384_is_asymmetric_only() {
+    let idx = RankQuant::new_asymmetric(384, 8);
+    assert_eq!(idx.capability(), RankQuantCapability::AsymmetricOnly);
+    assert!(!idx.symmetric_supported());
+    assert_eq!(idx.bits(), 8);
+    assert_eq!(idx.dim(), 384);
+    // b=8 stores one byte per coordinate.
+    assert_eq!(idx.bytes_per_vec(), 384);
+}
+
+#[test]
+fn b8_new_1024_is_symmetric_and_asymmetric() {
+    let idx = RankQuant::new(1024, 8);
+    assert_eq!(
+        idx.capability(),
+        RankQuantCapability::SymmetricAndAsymmetric
+    );
+    assert!(idx.symmetric_supported());
+    assert_eq!(idx.bits(), 8);
+}
+
+#[test]
+fn b8_new_asymmetric_256_aligned_upgrades_to_full() {
+    // new_asymmetric on a 256-aligned dim should NOT withhold symmetric
+    // scoring — there is no reason to, the analytical norm is exact.
+    let idx = RankQuant::new_asymmetric(768, 8);
+    assert_eq!(
+        idx.capability(),
+        RankQuantCapability::SymmetricAndAsymmetric
+    );
+    assert!(idx.symmetric_supported());
+}
+
+#[test]
+fn b124_constructors_are_always_full_capability() {
+    for &(dim, bits) in &[(384usize, 4u8), (384, 2), (256, 1), (1024, 4)] {
+        let a = RankQuant::new(dim, bits);
+        assert_eq!(a.capability(), RankQuantCapability::SymmetricAndAsymmetric);
+        assert!(a.symmetric_supported());
+        // new_asymmetric for b ∈ {1,2,4} is never less capable than new.
+        let b = RankQuant::new_asymmetric(dim, bits);
+        assert_eq!(b.capability(), RankQuantCapability::SymmetricAndAsymmetric);
+        assert!(b.symmetric_supported());
+    }
+}
+
+// ---------------------------------------------------------------------
+// new() fail-loud for non-256-aligned b=8.
+// ---------------------------------------------------------------------
+
+#[test]
+fn b8_new_panics_for_non_256_aligned_dim_directing_to_new_asymmetric() {
+    let res = std::panic::catch_unwind(|| RankQuant::new(384, 8));
+    assert!(res.is_err(), "new(384, 8) must panic (384 % 256 != 0)");
+    let payload = res.err().expect("panic payload present");
+    let msg = *payload
+        .downcast::<String>()
+        .expect("panic payload should be a String");
+    assert!(
+        msg.contains("dim % 256 == 0"),
+        "panic should explain the 256-alignment requirement: {msg}"
+    );
+    assert!(
+        msg.contains("new_asymmetric"),
+        "panic should direct to new_asymmetric: {msg}"
+    );
+}
+
+// ---------------------------------------------------------------------
+// dim=384 b=8: code-gen passes, asymmetric passes, symmetric REJECTS.
+// ---------------------------------------------------------------------
+
+#[test]
+fn b8_384_code_gen_and_asymmetric_work() {
+    let dim = 384;
+    let n = 50;
+    let corpus = random_corpus(8384, n, dim);
+    let mut idx = RankQuant::new_asymmetric(dim, 8);
+    // add() runs the rank → bucket → pack pipeline (the code-gen path).
+    idx.add(&corpus);
+    assert_eq!(idx.len(), n);
+    assert_eq!(idx.byte_size(), n * dim); // one byte per coord per doc
+
+    // Asymmetric scoring works at this non-256-aligned dim.
+    let query = random_corpus(8385, 1, dim);
+    let res = idx.search_asymmetric(&query, 10);
+    assert_eq!(res.nq, 1);
+    assert_eq!(res.k, 10);
+    for slot in 0..10 {
+        assert!(res.scores_for_query(0)[slot].is_finite());
+        let id = res.indices_for_query(0)[slot];
+        assert!(id >= 0 && (id as usize) < n);
+    }
+}
+
+#[test]
+fn b8_384_symmetric_search_rejects_with_exact_message() {
+    let dim = 384;
+    let mut idx = RankQuant::new_asymmetric(dim, 8);
+    idx.add(&random_corpus(8386, 8, dim));
+    let query = random_corpus(8387, 1, dim);
+
+    let res = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+        let _ = idx.search(&query, 5);
+    }));
+    assert!(
+        res.is_err(),
+        "symmetric search on AsymmetricOnly must panic"
+    );
+    let msg = *res
+        .unwrap_err()
+        .downcast::<String>()
+        .expect("panic payload should be a String");
+    // The EXACT wording shape from the spec.
+    let expected = format!(
+        "RankQuant b=8 symmetric scoring requires dim % 256 == 0; dim={dim} supports asymmetric/evidence APIs only."
+    );
+    assert_eq!(msg, expected, "symmetric-gating message must match exactly");
+}
+
+// ---------------------------------------------------------------------
+// dim=768/1024/1536 b=8: full path incl. symmetric passes.
+// ---------------------------------------------------------------------
+
+#[test]
+fn b8_aligned_dims_full_path_including_symmetric() {
+    for &dim in &[768usize, 1024, 1536] {
+        let n = 40;
+        let corpus = random_corpus(9000 + dim as u64, n, dim);
+        // Both constructors should yield a full-capability instance here.
+        let mut idx = RankQuant::new(dim, 8);
+        assert!(
+            idx.symmetric_supported(),
+            "dim={dim} should be symmetric-capable"
+        );
+        idx.add(&corpus);
+
+        let queries = random_corpus(9500 + dim as u64, 3, dim);
+
+        // Symmetric path runs without panicking and returns well-formed,
+        // descending, in-range results.
+        let sym = idx.search(&queries, 10);
+        assert_eq!(sym.nq, 3);
+        assert_eq!(sym.k, 10);
+        for qi in 0..3 {
+            let scores = sym.scores_for_query(qi);
+            let ids = sym.indices_for_query(qi);
+            for slot in 0..10 {
+                assert!(scores[slot].is_finite(), "dim={dim} non-finite sym score");
+                assert!(ids[slot] >= 0 && (ids[slot] as usize) < n);
+            }
+            for slot in 1..10 {
+                assert!(
+                    scores[slot].total_cmp(&scores[slot - 1]).is_le(),
+                    "dim={dim} symmetric results not sorted descending"
+                );
+            }
+        }
+
+        // Asymmetric path runs too.
+        let asym = idx.search_asymmetric(&queries, 10);
+        assert_eq!(asym.nq, 3);
+        assert_eq!(asym.k, 10);
+    }
+}
+
+// ---------------------------------------------------------------------
+// dim=384 b=4 UNCHANGED (sanity that the b=8 work didn't disturb b=4).
+// ---------------------------------------------------------------------
+
+#[test]
+fn b4_384_unchanged_full_capability_and_search() {
+    let dim = 384;
+    let n = 40;
+    let corpus = random_corpus(4384, n, dim);
+    let mut idx = RankQuant::new(dim, 4);
+    assert_eq!(
+        idx.capability(),
+        RankQuantCapability::SymmetricAndAsymmetric
+    );
+    assert!(idx.symmetric_supported());
+    idx.add(&corpus);
+    let queries = random_corpus(4385, 3, dim);
+    let sym = idx.search(&queries, 10);
+    assert_eq!(sym.k, 10);
+    let asym = idx.search_asymmetric(&queries, 10);
+    assert_eq!(asym.k, 10);
+}
+
+// ---------------------------------------------------------------------
+// Brute-force parity: b=8 asymmetric scores match a naive reference.
+// ---------------------------------------------------------------------
+
+#[test]
+fn b8_asymmetric_matches_naive_reference_any_dim() {
+    // Cover both an asymmetric-only (384) and a full-capability (768) dim;
+    // the asymmetric scalar path is identical for both.
+    for &dim in &[384usize, 768] {
+        let n = 60;
+        let corpus = random_corpus(7000 + dim as u64, n, dim);
+        let mut idx = RankQuant::new_asymmetric(dim, 8);
+        idx.add(&corpus);
+
+        let mut rng = ChaCha8Rng::seed_from_u64(7777 + dim as u64);
+        let query: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+        let res = idx.search_asymmetric(&query, 10);
+
+        let ref_scores: Vec<f32> = (0..n)
+            .map(|di| ref_b8_asymmetric(&query, &corpus[di * dim..(di + 1) * dim]))
+            .collect();
+
+        // Every returned score must agree with the reference at its doc id.
+        for slot in 0..10 {
+            let di = res.indices_for_query(0)[slot] as usize;
+            let got = res.scores_for_query(0)[slot];
+            let want = ref_scores[di];
+            assert!(
+                (got - want).abs() < 1e-4,
+                "dim={dim} slot {slot} doc {di}: {got} vs ref {want}"
+            );
+        }
+
+        // And the returned top-10 set must equal the reference top-10 set.
+        let mut ref_sorted: Vec<(usize, f32)> = ref_scores
+            .iter()
+            .enumerate()
+            .map(|(i, &s)| (i, s))
+            .collect();
+        ref_sorted.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
+        let top_ref: std::collections::HashSet<usize> =
+            ref_sorted[..10].iter().map(|x| x.0).collect();
+        let top_got: std::collections::HashSet<usize> = res
+            .indices_for_query(0)
+            .iter()
+            .map(|&i| i as usize)
+            .collect();
+        assert_eq!(top_got, top_ref, "dim={dim} b=8 top-10 set mismatch");
+    }
+}
+
+// ---------------------------------------------------------------------
+// Optimized (AVX-512 gather) b=8 asymmetric path is parity-correct vs the
+// naive reference across the headline embedding dims.
+//
+// On an AVX-512 host `search_asymmetric` dispatches the b=8 score to the
+// `vgatherdps` kernel; on every other host it takes the scalar LUT path.
+// Either way the returned top-k scores must agree with the naive per-doc
+// reference within the crate's 1e-4 cross-backend score tolerance, and the
+// returned top-k *set* must equal the reference top-k set. This is the
+// end-to-end parity gate for the optimized kernel at dims 384/768/1024/1536.
+// ---------------------------------------------------------------------
+
+#[test]
+fn b8_asymmetric_optimized_path_parity_headline_dims() {
+    for &dim in &[384usize, 768, 1024, 1536] {
+        let n = 200;
+        let corpus = random_corpus(6000 + dim as u64, n, dim);
+        let mut idx = RankQuant::new_asymmetric(dim, 8);
+        idx.add(&corpus);
+
+        let mut rng = ChaCha8Rng::seed_from_u64(6666 + dim as u64);
+        let query: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+
+        let k = 25;
+        let res = idx.search_asymmetric(&query, k);
+
+        // Naive scalar reference score per doc.
+        let ref_scores: Vec<f32> = (0..n)
+            .map(|di| ref_b8_asymmetric(&query, &corpus[di * dim..(di + 1) * dim]))
+            .collect();
+
+        // (a) every returned score agrees with the reference at its doc id.
+        for slot in 0..k {
+            let di = res.indices_for_query(0)[slot] as usize;
+            let got = res.scores_for_query(0)[slot];
+            let want = ref_scores[di];
+            assert!(
+                (got - want).abs() < 1e-4,
+                "dim={dim} slot {slot} doc {di}: optimized {got} vs ref {want}"
+            );
+        }
+
+        // (b) the returned top-k *set* equals the reference top-k set.
+        let mut ref_sorted: Vec<(usize, f32)> = ref_scores
+            .iter()
+            .enumerate()
+            .map(|(i, &s)| (i, s))
+            .collect();
+        ref_sorted.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
+        let top_ref: std::collections::HashSet<usize> =
+            ref_sorted[..k].iter().map(|x| x.0).collect();
+        let top_got: std::collections::HashSet<usize> = res
+            .indices_for_query(0)
+            .iter()
+            .map(|&i| i as usize)
+            .collect();
+        assert_eq!(
+            top_got, top_ref,
+            "dim={dim} optimized b=8 top-{k} set mismatch vs reference"
+        );
+    }
+}
+
+// The optimized b=8 path must also be parity-correct through the subset
+// rerank entry point (`search_asymmetric_subset`), which gathers candidate
+// bytes into a scratch buffer and runs the same gather kernel.
+#[test]
+fn b8_asymmetric_subset_optimized_path_parity() {
+    let dim = 768;
+    let n = 300;
+    let corpus = random_corpus(6321, n, dim);
+    let mut idx = RankQuant::new_asymmetric(dim, 8);
+    idx.add(&corpus);
+
+    let mut rng = ChaCha8Rng::seed_from_u64(6322);
+    let query: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+
+    // An arbitrary, intentionally-unsorted candidate subset.
+    let candidates: Vec<u32> = (0..n as u32).rev().step_by(3).collect();
+    let k = 10;
+    let (scores, indices) = idx.search_asymmetric_subset(&query, &candidates, k);
+
+    for slot in 0..k {
+        let di = indices[slot] as usize;
+        let want = ref_b8_asymmetric(&query, &corpus[di * dim..(di + 1) * dim]);
+        assert!(
+            (scores[slot] - want).abs() < 1e-4,
+            "subset slot {slot} doc {di}: optimized {} vs ref {want}",
+            scores[slot]
+        );
+    }
+}
+
+// ---------------------------------------------------------------------
+// validate_params: b=8 is code-valid at any dim; b ∈ {1,2,4} unchanged.
+// ---------------------------------------------------------------------
+
+#[test]
+fn validate_params_b8_any_dim_but_b124_still_require_alignment() {
+    // b=8 accepts any dim >= 2 (no dim % 256 requirement).
+    assert!(RankQuant::validate_params(384, 8).is_ok());
+    assert!(RankQuant::validate_params(2, 8).is_ok());
+    assert!(RankQuant::validate_params(1000, 8).is_ok());
+    assert!(
+        RankQuant::validate_params(1, 8).is_err(),
+        "dim < 2 rejected"
+    );
+
+    // b ∈ {1,2,4} keep their 2^bits divisibility requirement.
+    assert!(RankQuant::validate_params(6, 2).is_err(), "6 % 4 != 0");
+    assert!(RankQuant::validate_params(8, 2).is_ok());
+    assert!(RankQuant::validate_params(384, 4).is_ok());
+    // b=3 is still not a packable width.
+    assert!(RankQuant::validate_params(384, 3).is_err());
+}
+
+// ---------------------------------------------------------------------
+// Symmetric b=8 (256-aligned) matches a naive symmetric reference.
+// ---------------------------------------------------------------------
+
+#[test]
+fn b8_symmetric_matches_naive_reference_aligned_dim() {
+    let dim = 512; // 256-aligned → exact analytical norm
+    let n = 40;
+    let corpus = random_corpus(5512, n, dim);
+    let mut idx = RankQuant::new(dim, 8);
+    idx.add(&corpus);
+
+    let mut rng = ChaCha8Rng::seed_from_u64(5513);
+    let query: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+    let res = idx.search(&query, 10);
+
+    // Naive symmetric reference: bucket query + doc to b=8, dot the centred
+    // bucket vectors, divide by norm^2.
+    let norm = rankquant_norm(dim, 8);
+    let inv_norm_sq = 1.0f32 / (norm * norm);
+    let q_codes = bucket_ranks(&rank_transform(&query), 8);
+    let ref_scores: Vec<f32> = (0..n)
+        .map(|di| {
+            let doc = &corpus[di * dim..(di + 1) * dim];
+            let d_codes = bucket_ranks(&rank_transform(doc), 8);
+            let acc: f32 = q_codes
+                .iter()
+                .zip(&d_codes)
+                .map(|(&qc, &dc)| bucket_centre(qc, 8) * bucket_centre(dc, 8))
+                .sum();
+            acc * inv_norm_sq
+        })
+        .collect();
+
+    for slot in 0..10 {
+        let di = res.indices_for_query(0)[slot] as usize;
+        let got = res.scores_for_query(0)[slot];
+        assert!(
+            (got - ref_scores[di]).abs() < 1e-4,
+            "b=8 symmetric slot {slot} doc {di}: {got} vs ref {}",
+            ref_scores[di]
+        );
+    }
+}
diff --git a/tests/redteam_gamma.rs b/tests/redteam_gamma.rs
index 6a2f9282..a95c0968 100644
--- a/tests/redteam_gamma.rs
+++ b/tests/redteam_gamma.rs
@@ -22,8 +22,9 @@ fn rank_to_bucket_large_bits_panics() {
     // Signature is `rank_to_bucket(rank, d, bits)`, so this is rank=3, d=8,
     // bits=200 — the `bits` value is what's under test. `bits >= 32` makes
     // `1u32 << bits` overflow (silently-wrong bucket in release), so the
-    // function guards with `assert!(bits <= 7, "bits too large")`. bits=200
-    // trips that guard; the panic must fire in release as well as debug.
+    // function guards with `assert!(bits <= 8, "bits too large")` (b=8 is the
+    // widest RankQuant width whose codes still fit a u8). bits=200 trips that
+    // guard; the panic must fire in release as well as debug.
     let _ = rank_to_bucket(3, 8, 200);
 }
 

From 61a39c78e03c0d4796a45722f0f1f7efb0500b1c Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 16:07:18 -0500
Subject: [PATCH 2/9] fix(quant): allow b=8 in the eval/empirical scoring path
 (qodo) (#221)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

check_eval_bits capped at 1..=7, rejecting b=8 from rankquant_eval_search — but
b=8 codes fit u8 and the eval norm is computed empirically (valid at any dim,
no dim%256). Widen to 1..=8 (b=9 is the first u8-overflowing width). Test: eval
search with b=8 at a non-256-aligned dim (384).

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/quant.rs            |  5 ++++-
 tests/index/quant_b8.rs | 26 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/quant.rs b/src/quant.rs
index d82d09cd..4ee88d30 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -42,7 +42,10 @@ use crate::util::{assert_all_finite, l2_normalise, result_buffer_len, TopK};
 use crate::{validate_candidate_ids, OrdvecError, SearchResults};
 
 fn check_eval_bits(bits: u8) {
-    assert!((1..=7).contains(&bits), "bits must be in 1..=7");
+    // b=8 codes still fit a u8 (0..=255); the eval norm is computed empirically
+    // (not the analytical b=8 norm), so it is valid at any dim. b=9 is the first
+    // width whose codes overflow u8.
+    assert!((1..=8).contains(&bits), "bits must be in 1..=8");
 }
 
 fn rankquant_eval_norm(dim: usize, bits: u8) -> f32 {
diff --git a/tests/index/quant_b8.rs b/tests/index/quant_b8.rs
index 429014ac..63ebe6f9 100644
--- a/tests/index/quant_b8.rs
+++ b/tests/index/quant_b8.rs
@@ -447,3 +447,29 @@ fn b8_symmetric_matches_naive_reference_aligned_dim() {
         );
     }
 }
+
+#[test]
+fn rankquant_eval_search_supports_b8_at_any_dim() {
+    // The eval/empirical path (check_eval_bits widened to 1..=8) accepts b=8 even
+    // at a non-256-aligned dim, where the analytical symmetric norm is
+    // unavailable — it computes the norm empirically. Returns ranked results
+    // without panicking.
+    let dim = 384usize; // not a multiple of 256
+    let n = 32usize;
+    let nq = 2usize;
+    let corpus: Vec<f32> = (0..n * dim)
+        .map(|i| ((i * 7 % 101) as f32) - 50.0)
+        .collect();
+    let queries: Vec<f32> = (0..nq * dim)
+        .map(|i| ((i * 13 % 97) as f32) - 48.0)
+        .collect();
+    let res = ordvec::rankquant_eval_search(&corpus, &queries, dim, 8, 5);
+    assert_eq!(res.k, 5);
+    assert_eq!(res.nq, nq);
+    for &id in &res.indices {
+        assert!(
+            id >= 0 && (id as usize) < n,
+            "eval-search id out of range: {id}"
+        );
+    }
+}

From 124b5a179a4cf4211fd8fa75a49e27accf41f043 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 16:12:55 -0500
Subject: [PATCH 3/9] fix(quant_kernels): gate b8 AVX-512 gather on avx512bw
 (qodo) (#221)

qodo flagged the b=8 gather (scan_b8_asym_avx512_gather, uses
_mm512_cvtepu8_epi32) dispatching on avx512f alone. VPMOVZXBD is AVX-512F per
Intel, but gating on avx512f+avx512bw matches the rest of the crate's AVX-512
kernels (which require avx512dq), keeps the byte-widening conservatively gated,
and adds no real exclusion (F-without-BW CPUs like KNL/KNM are already excluded
by the dq requirement). Updated both the runtime dispatch and the
`#[target_feature]` attribute, plus the direct test/bench callers' guards.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/quant_kernels.rs | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/src/quant_kernels.rs b/src/quant_kernels.rs
index f4319a0f..5ba5e0ac 100644
--- a/src/quant_kernels.rs
+++ b/src/quant_kernels.rs
@@ -572,8 +572,11 @@ pub(crate) fn scan_b8_asym(
     let lut = build_b8_asym_lut(q_unit);
     #[cfg(target_arch = "x86_64")]
     {
-        if is_x86_feature_detected!("avx512f") && dim.is_multiple_of(16) {
-            // SAFETY: `avx512f` is confirmed by the runtime detection above
+        if is_x86_feature_detected!("avx512f")
+            && is_x86_feature_detected!("avx512bw")
+            && dim.is_multiple_of(16)
+        {
+            // SAFETY: `avx512f`+`avx512bw` are confirmed by the runtime detection above
             // and `dim % 16 == 0` satisfies the kernel's lane invariant;
             // `packed.len() == n * dim` and `lut.len() == dim * 256` hold by
             // construction (b=8 packs one byte/coord; the LUT is built just
@@ -609,14 +612,18 @@ pub(crate) fn scan_b8_asym(
 // trick: the asymmetric LUT bakes the per-coordinate query weight in, so
 // there is no per-query constant offset to reapply at finalize.
 //
-// Caller must verify `is_x86_feature_detected!("avx512f")` once. The LUT
-// is the same `dim * 256` f32 layout the scalar `scan_b8_to_topk` consumes,
-// so the two paths are score-parity (modulo f32 summation order, within
-// the crate's 1e-4 cross-backend tolerance).
+// Caller must verify `is_x86_feature_detected!("avx512f") && ..("avx512bw")`
+// once. `avx512bw` is gated alongside `avx512f` to match the rest of the
+// crate's AVX-512 kernels (which require `avx512dq`) and to keep the byte
+// widening (`_mm512_cvtepu8_epi32`) conservatively gated — the F-without-BW
+// CPUs (KNL/KNM) are already excluded by the crate's `dq` requirement, so this
+// adds no real exclusion. The LUT is the same `dim * 256` f32 layout the scalar
+// `scan_b8_to_topk` consumes, so the two paths are score-parity (modulo f32
+// summation order, within the crate's 1e-4 cross-backend tolerance).
 // -------------------------------------------------------------------
 
 #[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx512f")]
+#[target_feature(enable = "avx512f,avx512bw")]
 pub(crate) unsafe fn scan_b8_asym_avx512_gather(
     packed: &[u8],
     n: usize,
@@ -740,8 +747,8 @@ mod b8_gather_tests {
     /// dim=400 (`400 % 16 == 0`, `400 % 64 == 16`).
     #[test]
     fn b8_gather_matches_scalar_reference() {
-        if !is_x86_feature_detected!("avx512f") {
-            eprintln!("skipping b8 gather parity: no avx512f on this host");
+        if !(is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512bw")) {
+            eprintln!("skipping b8 gather parity: no avx512f+avx512bw on this host");
             return;
         }
         for &dim in &[384usize, 400, 768, 1024, 1536] {
@@ -800,7 +807,7 @@ mod b8_gather_tests {
     /// scaled scores, which the parity test above covers.
     #[test]
     fn b8_gather_raw_score_is_exact_gather_sum() {
-        if !is_x86_feature_detected!("avx512f") {
+        if !(is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512bw")) {
             return;
         }
         let dim = 256usize;
@@ -860,8 +867,9 @@ mod b8_gather_tests {
         use crate::quant_kernels::{scan_b4_asym_avx512, scan_b8_asym_avx512_gather};
         use std::time::Instant;
 
-        let have_avx512 =
-            is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512dq");
+        let have_avx512 = is_x86_feature_detected!("avx512f")
+            && is_x86_feature_detected!("avx512dq")
+            && is_x86_feature_detected!("avx512bw"); // b=4 path needs dq, b=8 gather needs bw
         let dim = 1024usize; // % 64 == 0 → valid for both b=4 and b=8 SIMD
         let n = 50_000usize;
         let k = 10usize;

From 44f1f2d46261d901ea6fcf0f80f0c2355d6edce9 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 16:19:43 -0500
Subject: [PATCH 4/9] docs(quant): b=8 gather dispatch docs say
 avx512f+avx512bw (Codex)

Two rustdoc/comment sites (scan_b8_asym dispatch note; RankQuant::search_asymmetric
b=8 doc) still described the gate as avx512f-only after 124b5a1 widened it to
avx512f+avx512bw. Docs now match the code.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/quant.rs         | 4 ++--
 src/quant_kernels.rs | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/quant.rs b/src/quant.rs
index 4ee88d30..80a233e0 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -588,8 +588,8 @@ impl RankQuant {
     /// the score is a per-coordinate gather `Σ_d lut[d*256 + code[d]]`
     /// against the `dim * 256` LUT: it dispatches to the AVX-512
     /// `vgatherdps` kernel (`scan_b8_asym` → `scan_b8_asym_avx512_gather`)
-    /// when `avx512f` is present and `dim % 16 == 0`, else the portable
-    /// scalar LUT reference (`scan_b8_to_topk`). Unlike [`Self::search`],
+    /// when `avx512f` + `avx512bw` are present and `dim % 16 == 0`, else the
+    /// portable scalar LUT reference (`scan_b8_to_topk`). Unlike [`Self::search`],
     /// this never panics on an asymmetric-only instance.
     pub fn search_asymmetric(&self, queries: &[f32], k: usize) -> SearchResults {
         let nq = queries.len() / self.dim;
diff --git a/src/quant_kernels.rs b/src/quant_kernels.rs
index 5ba5e0ac..28670948 100644
--- a/src/quant_kernels.rs
+++ b/src/quant_kernels.rs
@@ -556,8 +556,8 @@ pub(crate) unsafe fn scan_b4_asym_avx512(
 ///
 /// Builds the shared `dim * 256` per-coordinate LUT once
 /// ([`build_b8_asym_lut`]), then dispatches to the AVX-512 gather kernel
-/// ([`scan_b8_asym_avx512_gather`]) when `avx512f` is detected at runtime
-/// and `dim % 16 == 0`, falling back to the portable scalar reference
+/// ([`scan_b8_asym_avx512_gather`]) when `avx512f` + `avx512bw` are detected at
+/// runtime and `dim % 16 == 0`, falling back to the portable scalar reference
 /// ([`scan_b8_to_topk`]) on every other target / CPU / dim. Centralising
 /// the dispatch here keeps the `unsafe` SIMD reach in one place and out of
 /// `quant.rs`.

From b64f035eeec65a3bdd0e754ac0c993bd0db242de Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 16:23:07 -0500
Subject: [PATCH 5/9] =?UTF-8?q?docs(quant):=20finish=20b=3D8=20gather=20ga?=
 =?UTF-8?q?te=20doc=20sweep=20=E2=80=94=20avx512f+avx512bw=20(Codex)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remaining stale sites after 44f1f2d: the quant.rs module-level b=8 dispatch note,
and the three SAFETY comments at the b=8 gather's test/bench call sites. All now
say avx512f+avx512bw, matching the dispatch + #[target_feature]. Non-b8 kernels
(bitmap vpopcntdq, b2/b4 dq, fastscan, sign) are unchanged.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/quant.rs         | 2 +-
 src/quant_kernels.rs | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/quant.rs b/src/quant.rs
index 80a233e0..c05948d0 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -16,7 +16,7 @@
 //! unstable-experimental. See [`RankQuantCapability`] and
 //! [`RankQuant::new_asymmetric`]. Its asymmetric path is a per-coordinate
 //! gather against the `dim * 256` LUT: an AVX-512 `vgatherdps` kernel when
-//! available (`avx512f` + `dim % 16 == 0`), else the portable scalar LUT.
+//! available (`avx512f` + `avx512bw` + `dim % 16 == 0`), else the portable scalar LUT.
 //!
 //! The byte-LUT path ([`search_asymmetric_byte_lut`]) is re-exported
 //! `#[doc(hidden)]` (reachable as `ordvec::search_asymmetric_byte_lut`)
diff --git a/src/quant_kernels.rs b/src/quant_kernels.rs
index 28670948..59742cd0 100644
--- a/src/quant_kernels.rs
+++ b/src/quant_kernels.rs
@@ -771,7 +771,7 @@ mod b8_gather_tests {
             let (s_scalar, i_scalar) = drain(&top_scalar, k);
 
             let mut top_gather = TopK::new(k);
-            // SAFETY: avx512f confirmed above; dim % 16 == 0; packed has
+            // SAFETY: avx512f+avx512bw confirmed above; dim % 16 == 0; packed has
             // n*dim bytes and lut has dim*256 entries by construction.
             unsafe {
                 scan_b8_asym_avx512_gather(&packed, n, dim, &lut, scale, &mut top_gather);
@@ -819,7 +819,7 @@ mod b8_gather_tests {
         let lut = build_b8_asym_lut(&q_unit);
 
         let mut top = TopK::new(k);
-        // SAFETY: avx512f confirmed; dim % 16 == 0; shapes match.
+        // SAFETY: avx512f+avx512bw confirmed; dim % 16 == 0; shapes match.
         unsafe {
             scan_b8_asym_avx512_gather(&packed, n, dim, &lut, 1.0, &mut top);
         }
@@ -929,7 +929,7 @@ mod b8_gather_tests {
                 "b=8 AVX-512 gather",
                 Box::new(move || {
                     let mut top = TopK::new(k);
-                    // SAFETY: avx512f confirmed; dim % 16 == 0; shapes match.
+                    // SAFETY: avx512f+avx512bw confirmed; dim % 16 == 0; shapes match.
                     unsafe {
                         scan_b8_asym_avx512_gather(&packed8, n, dim, &lut8, scale, &mut top);
                     }

From 9c7923a0db727c19dba65c370ebbbbf95c7706b7 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 17:38:36 -0500
Subject: [PATCH 6/9] docs: surface b=8 evidence-width in top-level docs (qodo)
 (#221)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

qodo: crate-level lib.rs and README still described RankQuant widths as
bits ∈ {1,2,4}. Add the b=8 note (capability-gated evidence/refinement width:
asymmetric + code/projection at any dim; symmetric only when dim % 256 == 0),
so the headline docs match the new surface and don't mislead on b=8 scope.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 README.md  | 4 +++-
 src/lib.rs | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ca8ebbf8..e9d57686 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,9 @@ vector on its own:
   the product / scalar / binary quantization most crates use.
 - **Predictable footprint.** Exactly `dim * bits / 8` bytes per document —
   known before you see any data (256 B at dim = 1024, 2-bit), with
-  `bits ∈ {1, 2, 4}` the size/recall knob.
+  `bits ∈ {1, 2, 4}` the size/recall knob. (`b = 8` is an opt-in
+  evidence/refinement width — asymmetric scoring at any dim, symmetric only
+  when `dim % 256 == 0` — not a broad retrieval mode.)
 - **Two-stage retrieval, built in.** A cheap bitmap / sign-popcount
   prefilter feeds an exact rerank — the coarse→fine pipeline ships as
   library primitives.
diff --git a/src/lib.rs b/src/lib.rs
index d0408d1a..36784c3c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,7 +13,10 @@
 //!   coordinate, `2 * dim` bytes per document).
 //! - [`RankQuant`] buckets each rank into `1 << bits` equal-width
 //!   bins and packs `bits` bits per coordinate (`dim * bits / 8` bytes
-//!   per document).
+//!   per document). `bits ∈ {1, 2, 4}` are the stable retrieval widths;
+//!   `b = 8` is a capability-gated evidence/refinement width — asymmetric
+//!   scoring and code/projection generation at any dim, symmetric scoring
+//!   only when `dim % 256 == 0` (see [`RankQuant::new_asymmetric`]).
 //! - [`Bitmap`] stores a top-bucket bitmap per document (one bit
 //!   per coordinate) and scores via `popcount(Q AND D)`.
 //! - [`SignBitmap`] stores a sign bitmap per document (one bit per

From e27bb1cdf222219d34eb4a545be46ee7b0c27fef Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 18:13:24 -0500
Subject: [PATCH 7/9] fix: use exact empirical norm for b=8 asymmetric scoring
 at non-256 dims

The closed-form `rankquant_norm` (`sqrt(dim * var)`, where
`var = (4^bits - 1) / 12` is the variance of a centred bucket id under uniform
occupancy) assumes exactly-uniform bucket occupancy, which only holds for b in
{1,2,4}
and for b=8 when `dim % 256 == 0`. At a b=8 dim not divisible by 256 the
buckets are unequally occupied, so the closed form mis-scales the absolute
asymmetric scores. The ranking is unaffected (the norm is one global constant
shared by every document), but `search_asymmetric` / `search_asymmetric_subset`
report cosine-like scores that must be correctly scaled.

Add `asymmetric_norm(dim, bits)`: closed form for the uniform regimes,
exact empirical norm (`rankquant_eval_norm`, summing realised squared bucket
centres) for b=8 at non-256 dims. Wire it into both asymmetric scoring sites.
The symmetric path is untouched (it is gated to dim % 256 == 0, where the
closed form is exact).

Update `ref_b8_asymmetric` to compute the exact norm from each document's own
bucket codes, so the parity tests validate against the true cosine at dim=384
(previously both production and reference shared the same wrong norm, masking
the mis-scale).

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/quant.rs            | 23 +++++++++++++++++++++--
 tests/index/quant_b8.rs | 18 +++++++++++++++++-
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/src/quant.rs b/src/quant.rs
index c05948d0..ee76555a 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -61,6 +61,25 @@ fn rankquant_eval_norm(dim: usize, bits: u8) -> f32 {
     acc.sqrt() as f32
 }
 
+/// L2 norm of a document's bucket-centre vector, for asymmetric scoring.
+///
+/// For `bits ∈ {1, 2, 4}` (and `b = 8` when `dim % 256 == 0`) the bucket
+/// occupancy is exactly uniform, so the closed-form [`rankquant_norm`]
+/// (`sqrt(dim * var)`) is exact and cheaper. For `b = 8` at a `dim` not
+/// divisible by 256 the buckets are *not* equally occupied, so the closed
+/// form mis-scales the absolute scores (the *ranking* is unaffected — the
+/// norm is one global constant shared by every document — but
+/// `search_asymmetric` reports cosine-like scores, which must be correctly
+/// scaled). In that regime we fall back to the exact empirical norm, which
+/// sums the squared bucket centres over the realised rank→bucket map.
+fn asymmetric_norm(dim: usize, bits: u8) -> f32 {
+    if bits == 8 && !dim.is_multiple_of(256) {
+        rankquant_eval_norm(dim, bits)
+    } else {
+        rankquant_norm(dim, bits)
+    }
+}
+
 fn rankquant_eval_centres(v: &[f32], bits: u8, out: &mut [f32]) {
     debug_assert_eq!(v.len(), out.len());
     let ranks = rank_transform(v);
@@ -612,7 +631,7 @@ impl RankQuant {
         let dim = self.dim;
         let bits = self.bits;
         let n = self.n_vectors;
-        let norm = rankquant_norm(dim, bits);
+        let norm = asymmetric_norm(dim, bits);
         let inv_norm = 1.0_f32 / norm;
         let n_buckets = 1usize << bits;
         let bytes_per_vec = rankquant_bytes_per_vec(dim, bits);
@@ -915,7 +934,7 @@ impl RankQuant {
             return (Vec::new(), Vec::new());
         }
 
-        let norm = rankquant_norm(dim, bits);
+        let norm = asymmetric_norm(dim, bits);
         let inv_norm = 1.0_f32 / norm;
         #[cfg(target_arch = "x86_64")]
         let centre = ((1u32 << bits) as f32 - 1.0) / 2.0;
diff --git a/tests/index/quant_b8.rs b/tests/index/quant_b8.rs
index 63ebe6f9..016ba25b 100644
--- a/tests/index/quant_b8.rs
+++ b/tests/index/quant_b8.rs
@@ -29,7 +29,23 @@ fn ref_b8_asymmetric(q: &[f32], doc: &[f32]) -> f32 {
     let q_unit: Vec<f32> = q.iter().map(|x| x / q_norm).collect();
     let r = rank_transform(doc);
     let codes = bucket_ranks(&r, 8);
-    let norm = rankquant_norm(d, 8);
+    // Exact L2 norm of this doc's centred bucket vector. For b=8 the bucket
+    // occupancy is uniform only when `dim % 256 == 0`; at other dims (e.g. 384)
+    // the closed-form `rankquant_norm` mis-scales the absolute score, so the
+    // reference — like production's `asymmetric_norm` — sums the realised
+    // squared centres (f64-accumulated, matching `rankquant_eval_norm`). The
+    // ranks are a permutation of `0..d` for every doc, so this equals the
+    // closed form exactly at 256-aligned dims.
+    let norm = {
+        let acc: f64 = codes
+            .iter()
+            .map(|&c| {
+                let cc = bucket_centre(c, 8) as f64;
+                cc * cc
+            })
+            .sum();
+        acc.sqrt() as f32
+    };
     let mut acc = 0.0f32;
     for i in 0..d {
         acc += q_unit[i] * bucket_centre(codes[i], 8);

From d527c6929826cb45ce41ffbc0d5b5e77c279a327 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 18:15:38 -0500
Subject: [PATCH 8/9] docs: disambiguate b=8 eval-search empirical norm from
 analytical-norm gating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

qodo flagged an apparent contradiction: the crate docs state b=8 symmetric
scoring requires dim % 256 == 0, yet `rankquant_eval_search` supports b=8 at
non-256 dims. These are two distinct surfaces and there is no correctness bug —
clarify the docs so the capability matrix reads consistently:

- `rankquant_eval_search` rustdoc: fix the inaccurate 'analytical norm' (it has
  always used the *empirical* norm) and state explicitly that the empirical
  norm is exact under any bucket occupancy, which is why this path is unbound
  by the dim % 256 gate that the analytical-norm `RankQuant::search` carries.
- lib.rs crate doc: scope the dim % 256 restriction to analytical-norm
  symmetric `RankQuant::search`; note the empirical eval path has no such limit.
- check_eval_bits + the eval-at-any-dim test: spell out the relationship to the
  gated symmetric path.

No functional change; doc-only.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/lib.rs              |  8 ++++++--
 src/quant.rs            | 20 +++++++++++++++++---
 tests/index/quant_b8.rs |  5 +++++
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 36784c3c..2eab1fb4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,8 +15,12 @@
 //!   bins and packs `bits` bits per coordinate (`dim * bits / 8` bytes
 //!   per document). `bits ∈ {1, 2, 4}` are the stable retrieval widths;
 //!   `b = 8` is a capability-gated evidence/refinement width — asymmetric
-//!   scoring and code/projection generation at any dim, symmetric scoring
-//!   only when `dim % 256 == 0` (see [`RankQuant::new_asymmetric`]).
+//!   scoring and code/projection generation at any dim, *analytical-norm*
+//!   symmetric scoring (via [`RankQuant::search`]) only when
+//!   `dim % 256 == 0` (see [`RankQuant::new_asymmetric`]). The standalone
+//!   [`rankquant_eval_search`] computes its norm *empirically*, so it scores
+//!   any `bits ∈ 1..=8` at any dim (including `b = 8` off the 256 grid) and
+//!   carries no such restriction.
 //! - [`Bitmap`] stores a top-bucket bitmap per document (one bit
 //!   per coordinate) and scores via `popcount(Q AND D)`.
 //! - [`SignBitmap`] stores a sign bitmap per document (one bit per
diff --git a/src/quant.rs b/src/quant.rs
index ee76555a..33b8d83e 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -43,7 +43,10 @@ use crate::{validate_candidate_ids, OrdvecError, SearchResults};
 
 fn check_eval_bits(bits: u8) {
     // b=8 codes still fit a u8 (0..=255); the eval norm is computed empirically
-    // (not the analytical b=8 norm), so it is valid at any dim. b=9 is the first
+    // (not the analytical b=8 norm), so it is valid at any dim. This is *why*
+    // the eval path is not bound by the `dim % 256 == 0` gate that the
+    // analytical-norm symmetric `RankQuant::search` carries for b=8 — the
+    // empirical norm is exact under any bucket occupancy. b=9 is the first
     // width whose codes overflow u8.
     assert!((1..=8).contains(&bits), "bits must be in 1..=8");
 }
@@ -1108,10 +1111,21 @@ fn validate_finite(values: &[f32], name: &'static str) -> Result<(), OrdvecError
 /// This does **not** use [`RankQuant`] storage and does not change the `.tvrq`
 /// packing contract. It rank-transforms `corpus` and `queries`, buckets each
 /// rank into `1 << bits` equal-width bins, mean-centres bucket ids, normalises
-/// by the analytical norm for that `(dim, bits)`, and returns top-`k` results.
+/// by the **empirical** norm for that `(dim, bits)` (the exact L2 norm of the
+/// realised bucket-centre vector, summed over `0..dim`), and returns top-`k`
+/// results.
+///
+/// Because the norm is computed empirically rather than from the closed form,
+/// this path is valid for **any** `dim` and **any** `bits ∈ 1..=8`, including
+/// `bits = 8` at a `dim` not divisible by `256`. It therefore does *not* carry
+/// the `dim % 256 == 0` restriction that applies to the analytical-norm
+/// symmetric [`RankQuant::search`] (see [`RankQuant::new_asymmetric`]): that
+/// restriction exists only because the closed-form `rankquant_norm` is exact
+/// solely under uniform bucket occupancy, which this empirical path sidesteps.
 ///
 /// Intended for research/eval sweeps where non-byte-aligned widths such as
-/// `bits = 3` need to be scored without inventing a persistent packed format.
+/// `bits = 3`, or `b = 8` at arbitrary dims, need to be scored without
+/// inventing a persistent packed format.
 pub fn rankquant_eval_search(
     corpus: &[f32],
     queries: &[f32],
diff --git a/tests/index/quant_b8.rs b/tests/index/quant_b8.rs
index 016ba25b..164f1c7a 100644
--- a/tests/index/quant_b8.rs
+++ b/tests/index/quant_b8.rs
@@ -470,6 +470,11 @@ fn rankquant_eval_search_supports_b8_at_any_dim() {
     // at a non-256-aligned dim, where the analytical symmetric norm is
     // unavailable — it computes the norm empirically. Returns ranked results
     // without panicking.
+    //
+    // This is a *distinct* surface from the analytical-norm `RankQuant::search`,
+    // whose b=8 symmetric scoring is gated to `dim % 256 == 0`. There is no
+    // contradiction: the eval path's empirical norm is exact under any bucket
+    // occupancy, which is precisely why it is unbound by the 256 gate.
     let dim = 384usize; // not a multiple of 256
     let n = 32usize;
     let nq = 2usize;

From e6e79821892009cefc07200426012941e739a38b Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Sun, 14 Jun 2026 18:28:10 -0500
Subject: [PATCH 9/9] test: cover b=8 routing through the batched two-stage
 rerank path

The merge of main (SubsetScratch batched rerank) brought the b=8 asymmetric
routing into `search_asymmetric_subset_batched_serial_into` via the reused
scratch buffers. Add a parity test through the public
`search_asymmetric_subset_batched_serial` entry point covering both a
non-256-aligned dim (384, empirical asymmetric norm) and an aligned dim (768),
with two queries on distinct CSR candidate rows so scratch reuse across rows is
exercised. Every returned score matches the naive per-doc reference.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 tests/index/quant_b8.rs | 56 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/tests/index/quant_b8.rs b/tests/index/quant_b8.rs
index 164f1c7a..f3d25847 100644
--- a/tests/index/quant_b8.rs
+++ b/tests/index/quant_b8.rs
@@ -396,6 +396,62 @@ fn b8_asymmetric_subset_optimized_path_parity() {
     }
 }
 
+// The b=8 routing also runs through the *batched* two-stage rerank entry point
+// (`search_asymmetric_subset_batched_serial`), which packs each query's
+// candidate row into a reused `SubsetScratch` and scans it with the same b=8
+// gather kernel. Cover both a non-256-aligned dim (384, exercising the
+// empirical asymmetric norm) and an aligned dim (768), with two queries that
+// have distinct candidate rows (exercising the CSR offsets and scratch reuse
+// across rows). Every returned score must match the per-doc naive reference.
+#[test]
+fn b8_asymmetric_subset_batched_serial_path_parity() {
+    for &dim in &[384usize, 768] {
+        let n = 256;
+        let corpus = random_corpus(8100 + dim as u64, n, dim);
+        let mut idx = RankQuant::new_asymmetric(dim, 8);
+        idx.add(&corpus);
+
+        let mut rng = ChaCha8Rng::seed_from_u64(8200 + dim as u64);
+        let q0: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+        let q1: Vec<f32> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+        let mut queries = q0.clone();
+        queries.extend_from_slice(&q1);
+
+        // Two distinct, intentionally-unsorted candidate rows in CSR layout.
+        let cand0: Vec<u32> = (0..n as u32).rev().step_by(3).collect();
+        let cand1: Vec<u32> = (0..n as u32).step_by(5).collect();
+        let mut candidates = cand0.clone();
+        candidates.extend_from_slice(&cand1);
+        let candidate_offsets = [0usize, cand0.len(), cand0.len() + cand1.len()];
+
+        let k = 10;
+        let res = idx.search_asymmetric_subset_batched_serial(
+            &queries,
+            &candidate_offsets,
+            &candidates,
+            k,
+        );
+
+        for (qi, q) in [&q0, &q1].into_iter().enumerate() {
+            let got_scores = res.scores_for_query(qi);
+            let got_indices = res.indices_for_query(qi);
+            for slot in 0..k {
+                let di = got_indices[slot];
+                if di < 0 {
+                    continue; // fewer candidates than k in this row
+                }
+                let di = di as usize;
+                let want = ref_b8_asymmetric(q, &corpus[di * dim..(di + 1) * dim]);
+                assert!(
+                    (got_scores[slot] - want).abs() < 1e-4,
+                    "dim={dim} q{qi} slot {slot} doc {di}: batched {} vs ref {want}",
+                    got_scores[slot]
+                );
+            }
+        }
+    }
+}
+
 // ---------------------------------------------------------------------
 // validate_params: b=8 is code-valid at any dim; b ∈ {1,2,4} unchanged.
 // ---------------------------------------------------------------------