diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cee4d1e8..e22d96c0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -189,8 +189,8 @@ jobs:
         run: cargo test --features test-utils
       - name: cargo test (no default features)
         run: cargo test --no-default-features
-      - name: cargo build --release --example bench_rank
-        run: cargo build --release --example bench_rank
+      - name: cargo build --release --features bench-utils --example bench_rank
+        run: cargo build --release --features bench-utils --example bench_rank
 
   # ----------------------------------------------------------------------
   # Prove the declared MSRV (1.89.0) actually builds and tests. There is
@@ -445,6 +445,7 @@ jobs:
           set -euo pipefail
           cargo test
           cargo test --features experimental
+          cargo test --features bench-utils
 
   # ----------------------------------------------------------------------
   # WASM: the bitmap/sign popcount kernels have a `simd128` path
@@ -542,4 +543,4 @@ jobs:
           toolchain: stable
       - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
       - name: run bench_rank (scaled, seeded synthetic corpus)
-        run: cargo run --release --example bench_rank -- --n 10000 --queries 100
+        run: cargo run --release --features bench-utils --example bench_rank -- --n 10000 --queries 100
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index bbe46149..fac626d9 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -259,6 +259,7 @@ jobs:
           set -euo pipefail
           cargo test
           cargo test --features experimental
+          cargo test --features bench-utils
 
   notes:
     name: release notes (git-cliff) + draft Release
diff --git a/Cargo.toml b/Cargo.toml
index 5cb07d96..065d8e2c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -81,11 +81,19 @@ rand_chacha = "0.10"
 # (research scaffold), kept off the stable surface.
 experimental = []
 serde = ["dep:serde"]
+# `bench-utils` exposes benchmark-only reference paths used by examples and
+# parity tests. These helpers are not part of the default public API.
+bench-utils = []
 # `test-utils` exposes internal dispatch probes used by the crate's own integration
 # tests (e.g. the allocation-free guarantee check). Gated off the default surface
 # because these helpers are not part of the public API and carry no semver guarantee.
 test-utils = []
 
+[[example]]
+name = "bench_rank"
+path = "examples/bench_rank.rs"
+required-features = ["bench-utils"]
+
 [profile.release]
 lto = true
 codegen-units = 1
diff --git a/README.md b/README.md
index c7f4a768..23dc7095 100644
--- a/README.md
+++ b/README.md
@@ -437,7 +437,7 @@ the **quality numbers (R@10, candidate-recall, bytes/vec) are deterministic**
 and regenerable from a clean checkout with no external corpus file:
 
 ```sh
-cargo run --release --example bench_rank
+cargo run --release --features bench-utils --example bench_rank
 ```
 
 A few operating points from the committed run
diff --git a/benchmarks/rank_modes_results.txt b/benchmarks/rank_modes_results.txt
index a94702da..da96d2d5 100644
--- a/benchmarks/rank_modes_results.txt
+++ b/benchmarks/rank_modes_results.txt
@@ -4,7 +4,7 @@
 # external corpus file required — the corpus is generated in-process from
 # a fixed seed):
 #
-#     cargo run --release --example bench_rank
+#     cargo run --release --features bench-utils --example bench_rank
 #
 # No system dependencies are required — ordvec links no BLAS.
 #
@@ -47,7 +47,7 @@
 # To benchmark a real public corpus instead (e.g. GloVe / OpenAI
 # text-embedding-3 dumps), pass 2-D little-endian float32 .npy files (C
 # order); not required for the default run:
-#     cargo run --release --example bench_rank -- \
+#     cargo run --release --features bench-utils --example bench_rank -- \
 #         --corpus-npy /path/to/corpus.npy --queries-npy /path/to/queries.npy
 # ===========================================================================
 
diff --git a/docs/RANK_MODES.md b/docs/RANK_MODES.md
index 8409bd1e..65a092d7 100644
--- a/docs/RANK_MODES.md
+++ b/docs/RANK_MODES.md
@@ -20,7 +20,7 @@ with a single command, no external data and no system dependencies
 (ordvec links no BLAS):
 
 ```bash
-cargo run --release --example bench_rank
+cargo run --release --features bench-utils --example bench_rank
 ```
 
 That runs the head-to-head on a structured synthetic corpus (D=256,
@@ -201,7 +201,7 @@ This is the clean-checkout stress test — regenerated by the default
 `bench_rank` run, no external data required:
 
 ```bash
-cargo run --release --example bench_rank
+cargo run --release --features bench-utils --example bench_rank
 ```
 
 Setup: D=256, N=30,000 documents, 200 queries, k=10. Low-rank
@@ -276,7 +276,7 @@ table](#synthetic-stress-test-numbers). The default
 `bench_rank` run uses these parameters; the explicit form is:
 
 ```bash
-cargo run --release --example bench_rank -- \
+cargo run --release --features bench-utils --example bench_rank -- \
   --dim 256 --n 30000 --queries 200 --clusters 200 --latent 64
 ```
 
@@ -371,7 +371,7 @@ facts qualify this:
   overfitting top-k order at near-tolerance boundaries.
 
 The byte-LUT scorer remains in the codebase as a labelled reference
-path (`ordvec::search_asymmetric_byte_lut`,
+path behind the `bench-utils` feature (`ordvec::search_asymmetric_byte_lut`,
 benched as the `RankQuant b=… asym byte-LUT` rows) but is not the
 production scoring route — streaming SIMD math beats query-LUT cache
 traffic on the hardware tested.
@@ -406,7 +406,7 @@ To check the modes on real embeddings, point the same bench at your own
 `.npy` arrays:
 
 ```bash
-cargo run --release --example bench_rank -- \
+cargo run --release --features bench-utils --example bench_rank -- \
   --corpus-npy  /path/to/embeddings.npy \
   --queries-npy /path/to/queries.npy \
   --queries 200 --k 10
@@ -525,10 +525,10 @@ cargo test --features experimental                   # + MultiBucket tests
 
 # Headline benchmark (synthetic clustered corpus — no external data,
 # no BLAS).
-cargo run --release --example bench_rank
+cargo run --release --features bench-utils --example bench_rank
 
 # Same bench against your own real-embedding arrays.
-cargo run --release --example bench_rank -- \
+cargo run --release --features bench-utils --example bench_rank -- \
     --corpus-npy  /path/to/embeddings.npy \
     --queries-npy /path/to/queries.npy \
     --queries 200 --k 10
diff --git a/docs/compatibility-policy.md b/docs/compatibility-policy.md
index 4efede6f..a471b51c 100644
--- a/docs/compatibility-policy.md
+++ b/docs/compatibility-policy.md
@@ -65,9 +65,9 @@ The `experimental` feature is a default-off research surface. Today it exposes
 normal pre-1.0 compatibility policy above. Its direct `.ovfs`
 `RankQuantFastscan::{write,load}` path is supported, but in v0.5.0 `.ovfs` is
 not yet part of the primitive persisted-format, `probe_index_metadata()`, or
-`ordvec-manifest` v1 contract. `#[doc(hidden)]` exports such as
+`ordvec-manifest` v1 contract. Feature-gated `#[doc(hidden)]` exports such as
 `search_asymmetric_byte_lut` are reachable for internal benchmarks and parity
-tests, but are not part of the stable default API.
+tests only with their gating feature (e.g. `bench-utils`) enabled, and are not part of the stable default API.
 
 New feature flags must declare their stability class before merging:
 
diff --git a/examples/bench_rank.rs b/examples/bench_rank.rs
index ff6e1026..7adb5d5b 100644
--- a/examples/bench_rank.rs
+++ b/examples/bench_rank.rs
@@ -7,7 +7,7 @@
 //! synthetic corpus in-process, so the headline numbers are regenerable
 //! from a clean checkout with a single command:
 //!
-//!     cargo run --release --example bench_rank
+//!     cargo run --release --features bench-utils --example bench_rank
 //!
 //! No system dependencies are required — ordvec links no BLAS.
 //!
@@ -31,11 +31,11 @@
 //! `benchmarks/rank_modes_results.txt`.
 //!
 //! Larger sweeps / real public corpora:
-//!     cargo run --release --example bench_rank -- --dim 1024 --n 100000 --queries 200
+//!     cargo run --release --features bench-utils --example bench_rank -- --dim 1024 --n 100000 --queries 200
 //!     # Point at a real public embedding corpus (no file required for
 //!     # the default run). Both must be 2-D little-endian float32 .npy
 //!     # (C order). For GloVe or OpenAI text-embedding-3 dumps:
-//!     cargo run --release --example bench_rank -- \
+//!     cargo run --release --features bench-utils --example bench_rank -- \
 //!         --corpus-npy /path/to/corpus.npy --queries-npy /path/to/queries.npy
 //!
 //! Output is a human-readable table followed by a JSON line for
diff --git a/ordvec-python/Cargo.toml b/ordvec-python/Cargo.toml
index 44ab6768..174fe13f 100644
--- a/ordvec-python/Cargo.toml
+++ b/ordvec-python/Cargo.toml
@@ -14,6 +14,9 @@ publish = false
 name = "_ordvec"
 crate-type = ["cdylib"]
 
+[features]
+bench-utils = ["ordvec_core/bench-utils"]
+
 [dependencies]
 # Alias the core crate as `ordvec_core` so binding code is unambiguous and never
 # mixes `ordvec::` with the Python-facing `ordvec` package name.
diff --git a/ordvec-python/python/ordvec/__init__.py b/ordvec-python/python/ordvec/__init__.py
index 596c6572..4726b895 100644
--- a/ordvec-python/python/ordvec/__init__.py
+++ b/ordvec-python/python/ordvec/__init__.py
@@ -10,11 +10,10 @@
 ``rank_to_bucket``, ``bucket_ranks``, ``pack_buckets``, ``unpack_buckets``,
 ``rankquant_bytes_per_vec``, ``bucket_centre``, ``rank_norm``,
 ``rankquant_norm``), the eval-only arbitrary-width scorer
-``rankquant_eval_search``, the byte-LUT scoring helper
-``search_asymmetric_byte_lut``, and the loader limit constants (``MAX_DIM``,
+``rankquant_eval_search``, and the loader limit constants (``MAX_DIM``,
 ``MAX_SIGN_BITMAP_DIM``, ``MAX_VECTORS``). Together with the four classes'
-methods this mirrors the headline Rust retrieval API. Rust-only metadata
-probing and manifest-verification helpers remain available through the Rust
+methods this mirrors the headline Rust retrieval API. Rust-only metadata,
+benchmark, and manifest-verification helpers remain available through the Rust
 crates and the ``ordvec-manifest`` CLI; the low-level ``rank_io`` read/write
 functions are reached through the classes' ``write()`` / ``load()`` methods
 rather than exposed as standalone free functions. The specialized
@@ -77,7 +76,6 @@
     rankquant_eval_search,
     rankquant_bytes_per_vec,
     rankquant_norm,
-    search_asymmetric_byte_lut,
     unpack_buckets,
 )
 
@@ -106,7 +104,6 @@
     "rank_norm",
     "rankquant_norm",
     "rankquant_eval_search",
-    "search_asymmetric_byte_lut",
     # loader limit constants
     "MAX_DIM",
     "MAX_SIGN_BITMAP_DIM",
diff --git a/ordvec-python/python/ordvec/_ordvec.pyi b/ordvec-python/python/ordvec/_ordvec.pyi
index de4dc7f0..0bc7e9dc 100644
--- a/ordvec-python/python/ordvec/_ordvec.pyi
+++ b/ordvec-python/python/ordvec/_ordvec.pyi
@@ -2,8 +2,8 @@
 
 Hand-written to mirror the PyO3 surface in ``ordvec-python/src/lib.rs`` exactly
 — the four index classes (``Rank``, ``RankQuant``, ``Bitmap``, ``SignBitmap``),
-the module-level rank-math primitives, the byte-LUT / eval scorers, and the
-``MAX_*`` loader limit constants. abi3 wheels carry no embedded type
+the module-level rank-math primitives, the eval scorer, and the ``MAX_*``
+loader limit constants. abi3 wheels carry no embedded type
 information, so without this stub (and the ``py.typed`` marker) editors and
 ``mypy`` see ``Any`` for the whole package.
 
@@ -171,7 +171,7 @@ class SignBitmap:
 
 # ---------------------------------------------------------------------------
 # Module-level rank-math primitives (parity with ``ordvec::rank::*``) and the
-# byte-LUT / eval scoring helpers.
+# eval scoring helper.
 # ---------------------------------------------------------------------------
 
 def rank_transform(v: NDArray[Any]) -> NDArray[np.uint16]: ...
@@ -183,9 +183,6 @@ def rankquant_bytes_per_vec(d: int, bits: int) -> int: ...
 def bucket_centre(bucket: int, bits: int) -> float: ...
 def rank_norm(d: int) -> float: ...
 def rankquant_norm(d: int, bits: int) -> float: ...
-def search_asymmetric_byte_lut(
-    index: RankQuant, queries: NDArray[Any], k: int
-) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ...
 def rankquant_eval_search(
     corpus: NDArray[Any], queries: NDArray[Any], bits: int, k: int
 ) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ...
diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs
index be269151..3056e77f 100644
--- a/ordvec-python/src/lib.rs
+++ b/ordvec-python/src/lib.rs
@@ -1483,9 +1483,8 @@ impl SignBitmap {
 // The four classes above give object-level parity with the Rust API; these
 // free functions expose the `ordvec::rank` math primitives (the data-oblivious
 // kernels the OrdVec/RankQuant paper's Python pipeline verifies against numpy)
-// and the byte-LUT scoring path, so the crate's `pub` surface is fully
-// reachable from Python. Each mirrors the core's argument asserts as a typed
-// `ValueError` instead of letting them surface as a `PanicException`.
+// and the eval-only scoring path (plus the byte-LUT path under `bench-utils`). Each
+// mirrors the core's argument asserts as a typed `ValueError` instead of a `PanicException`.
 // =====================================================================
 
 /// Dimension-wise rank transform: `out[k]` = rank of `v[k]` among `v` (ties
@@ -1680,6 +1679,7 @@ fn rankquant_norm(d: usize, bits: u8) -> PyResult<f32> {
 /// Asymmetric search via the byte-LUT scoring path (a benchmark/parity helper;
 /// requires `bits ∈ {2, 4}`). Returns `(scores, indices)` matching
 /// `RankQuant.search_asymmetric`.
+#[cfg(feature = "bench-utils")]
 #[pyfunction]
 fn search_asymmetric_byte_lut<'py>(
     py: Python<'py>,
@@ -1773,8 +1773,7 @@ fn _ordvec(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<Bitmap>()?;
     m.add_class::<SignBitmap>()?;
 
-    // Module-level rank-math primitives (parity with `ordvec::rank::*` and the
-    // crate-root `search_asymmetric_byte_lut`).
+    // Module-level rank-math primitives (parity with `ordvec::rank::*`).
     m.add_function(wrap_pyfunction!(rank_transform, m)?)?;
     m.add_function(wrap_pyfunction!(rank_to_bucket, m)?)?;
     m.add_function(wrap_pyfunction!(bucket_ranks, m)?)?;
@@ -1784,6 +1783,7 @@ fn _ordvec(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(bucket_centre, m)?)?;
     m.add_function(wrap_pyfunction!(rank_norm, m)?)?;
     m.add_function(wrap_pyfunction!(rankquant_norm, m)?)?;
+    #[cfg(feature = "bench-utils")]
     m.add_function(wrap_pyfunction!(search_asymmetric_byte_lut, m)?)?;
     m.add_function(wrap_pyfunction!(rankquant_eval_search, m)?)?;
 
diff --git a/ordvec-python/tests/test_primitives.py b/ordvec-python/tests/test_primitives.py
index ab93682a..9cb5a12f 100644
--- a/ordvec-python/tests/test_primitives.py
+++ b/ordvec-python/tests/test_primitives.py
@@ -1,11 +1,9 @@
 """Tests for the module-level rank-math primitives and limit constants.
 
-These free functions mirror ``ordvec::rank::*``, the crate-root
-``search_asymmetric_byte_lut``, and the ``ordvec::rank_io`` limit constants,
-giving the Python package 1:1 parity with the Rust public surface. Algorithmic
-correctness is proven in the crate's Rust tests; these cover the FFI boundary,
-the numpy round-trips, and the argument guards (bad input → typed exception,
-never a PanicException).
+These free functions mirror ``ordvec::rank::*`` and the ``ordvec::rank_io``
+limit constants. Algorithmic correctness is proven in the crate's Rust tests;
+these cover the FFI boundary, the numpy round-trips, and the argument guards
+(bad input → typed exception, never a PanicException).
 """
 from __future__ import annotations
 
@@ -17,7 +15,6 @@
     MAX_DIM,
     MAX_SIGN_BITMAP_DIM,
     MAX_VECTORS,
-    RankQuant,
     bucket_centre,
     bucket_ranks,
     pack_buckets,
@@ -26,7 +23,6 @@
     rank_transform,
     rankquant_bytes_per_vec,
     rankquant_norm,
-    search_asymmetric_byte_lut,
     unpack_buckets,
 )
 
@@ -149,32 +145,6 @@ def test_primitive_bits_guards():
         rank_to_bucket(0, 1024, 8)
 
 
-def test_search_asymmetric_byte_lut_self_retrieves_top1():
-    rng = np.random.default_rng(0)
-    vectors = rng.standard_normal((40, 128)).astype(np.float32)
-    vectors /= np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-9
-    idx = RankQuant(dim=128, bits=2)
-    idx.add(vectors)
-    queries = vectors[:3]
-    s_lut, i_lut = search_asymmetric_byte_lut(idx, queries, k=10)
-    _, i_ref = idx.search_asymmetric(queries, k=10)
-    assert s_lut.shape == (3, 10)
-    # Both the byte-LUT and the production kernel are the asymmetric path, so a
-    # self-query must self-rank at top-1 in both.
-    for bi in range(3):
-        assert int(i_lut[bi][0]) == bi
-        assert int(i_ref[bi][0]) == bi
-
-
-def test_search_asymmetric_byte_lut_rejects_b1():
-    rng = np.random.default_rng(0)
-    vectors = rng.standard_normal((10, 128)).astype(np.float32)
-    idx = RankQuant(dim=128, bits=1)
-    idx.add(vectors)
-    with pytest.raises(ValueError, match="benchmark-only"):
-        search_asymmetric_byte_lut(idx, vectors[:2], k=5)
-
-
 def test_constants_exposed():
     assert MAX_DIM == 65535
     assert MAX_SIGN_BITMAP_DIM == (1 << 24)
diff --git a/ordvec-python/tests/test_redteam_fuzz.py b/ordvec-python/tests/test_redteam_fuzz.py
index 0d2b168c..a47ee66e 100644
--- a/ordvec-python/tests/test_redteam_fuzz.py
+++ b/ordvec-python/tests/test_redteam_fuzz.py
@@ -20,8 +20,7 @@
   files and a forged-huge-dim DoS-allocation header;
 * exotic dtypes (bool / float16 / object / complex / int families) and NaN bit
   patterns (signaling + quiet) across every f32 entry point;
-* type confusion on the ``search_asymmetric_byte_lut`` ``PyRef<RankQuant>`` arg
-  and on every ``None`` / list / str argument;
+* type confusion on every ``None`` / list / str argument;
 * the documented PyO3 borrow-flag reentrancy contract (a ``__index__`` callback
   that re-enters a ``&mut self`` method on the object a ``&self`` method already
   borrowed → clean ``Already borrowed`` ``RuntimeError``, never a data race).
@@ -61,7 +60,6 @@
     rank_transform,
     rankquant_bytes_per_vec,
     rankquant_norm,
-    search_asymmetric_byte_lut,
     unpack_buckets,
 )
 
@@ -527,26 +525,10 @@ def test_signbitmap_batched_fortran_order_raises_value_error():
 
 # =====================================================================
 # Type confusion on non-array params: None / list / str must be a clean
-# TypeError everywhere, including the search_asymmetric_byte_lut PyRef arg.
+# TypeError everywhere.
 # =====================================================================
 
 
-@pytest.mark.parametrize("bad_first", [None, [1, 2, 3], "rq", 42])
-def test_byte_lut_wrong_index_type_raises_type_error(bad_first):
-    q = unit_vectors(2, 64)
-    with pytest.raises(TypeError):
-        search_asymmetric_byte_lut(bad_first, q, k=3)
-
-
-def test_byte_lut_rank_instead_of_rankquant_raises_type_error():
-    # A Rank (wrong index type) where RankQuant is required → TypeError, not a
-    # mis-cast that reads RankQuant fields off a Rank.
-    rk = Rank(dim=64)
-    rk.add(unit_vectors(10, 64))
-    with pytest.raises(TypeError):
-        search_asymmetric_byte_lut(rk, unit_vectors(2, 64), k=3)
-
-
 @pytest.mark.parametrize("bad", [None, [[1.0] * 64] * 4, "hello"])
 def test_rank_add_non_array_raises_type_error(bad):
     with pytest.raises(TypeError):
diff --git a/src/bitmap.rs b/src/bitmap.rs
index 46e527b4..4869d24c 100644
--- a/src/bitmap.rs
+++ b/src/bitmap.rs
@@ -136,12 +136,12 @@ impl Bitmap {
     /// loader's `n_vectors` ceiling. (Bounds the count, not the byte payload —
     /// see the loaders' separate `MAX_PAYLOAD` cap.) Also panics if the
     /// resulting row-major buffer length would overflow `usize` (reachable only
-    /// on 32-bit targets — see `util::checked_new_len`).
+    /// on 32-bit targets — see `util::checked_new_count`).
     pub fn add(&mut self, vectors: &[f32]) {
         let n = vectors.len() / self.dim;
         assert_eq!(vectors.len(), n * self.dim);
         assert_all_finite(vectors);
-        let new_n = crate::util::checked_new_len(self.n_vectors, n, self.qwords_per_vec);
+        let new_n = crate::util::checked_new_count(self.n_vectors, n, self.qwords_per_vec);
         let qpv = self.qwords_per_vec;
         let cutoff = (self.dim - self.n_top) as u16;
         let start = self.bitmaps.len();
diff --git a/src/bucket_code.rs b/src/bucket_code.rs
index 46afd985..00a41eed 100644
--- a/src/bucket_code.rs
+++ b/src/bucket_code.rs
@@ -14,8 +14,8 @@
 //!   exactly `dim / buckets` coordinates. It owns the code-validation rules:
 //!   length, range, and per-bucket occupancy.
 //! - [`RankQuantSpec`] — the RankQuant-shaped specialisation: `buckets`
-//!   derived as `1 << bits` for `bits ∈ {1, 2, 4}`, matching the crate's
-//!   [`crate::RankQuant`] bit-width domain.
+//!   derived as `1 << bits` for `bits ∈ {1, 2, 4, 8}` when the fixed-composition
+//!   invariant holds, i.e. `dim` is divisible by `1 << bits`.
 //! - [`BucketCode`] — a single validated code vector against a
 //!   [`CompositionSpec`], built from raw codes, from a rank permutation
 //!   ([`BucketCode::from_ranks`]), or directly from a float vector
@@ -43,8 +43,8 @@
 //! delegates to the crate's shared [`crate::rank`] primitives, so callers
 //! no longer need to fork rank or bucket semantics.
 //!
-//! Two intentional constraints to note: `bits = 8` is rejected (it lands as a
-//! capability-gated width in the separate b=8 work, #221), and
+//! Two intentional constraints to note: `bits = 8` requires `dim % 256 == 0`
+//! because this surface validates fixed-composition codes, and
 //! [`CompositionSpec::new`] rejects `buckets > 256` (codes are `u8`).
 
 use std::error::Error;
@@ -177,9 +177,13 @@ impl CompositionSpec {
 /// RankQuant-shaped fixed-composition code parameters.
 ///
 /// Specialises [`CompositionSpec`] to the crate's RankQuant bit-width domain:
-/// the bucket count is `1 << bits` for `bits ∈ {1, 2, 4}`, and `dim` is capped
-/// at `u16::MAX` to mirror the crate-wide rank invariant (a rank vector is a
-/// permutation of `[0, dim)` stored as `u16`).
+/// the bucket count is `1 << bits` for `bits ∈ {1, 2, 4, 8}`. Because this type
+/// models fixed-composition codes, `bits = 8` is valid only when
+/// `dim % 256 == 0`; arbitrary-dimension asymmetric-only b=8 remains a
+/// [`crate::RankQuant`] index capability, not a composition spec.
+///
+/// `dim` is capped at `u16::MAX` to mirror the crate-wide rank invariant
+/// (a rank vector is a permutation of `[0, dim)` stored as `u16`).
 #[derive(Clone, Debug, PartialEq, Eq, Hash)]
 pub struct RankQuantSpec {
     bits: u8,
@@ -190,14 +194,11 @@ impl RankQuantSpec {
     /// Build a RankQuant spec for `dim` coordinates at `bits` bits/coordinate.
     ///
     /// # Errors
-    /// - [`CompositionViolation::InvalidBits`] if `bits ∉ {1, 2, 4}`. This is
-    ///   the crate's [`crate::RankQuant`] bit-width domain — the reference
-    ///   prototype also accepted `8`, but ordvec's packed format and analytical
-    ///   norm are defined only for `{1, 2, 4}`, so 8-bit is rejected here.
+    /// - [`CompositionViolation::InvalidBits`] if `bits ∉ {1, 2, 4, 8}`.
     /// - [`CompositionViolation::DimTooLarge`] if `dim > u16::MAX`.
     /// - the [`CompositionSpec::new`] errors (non-divisible `dim`).
     pub fn new(dim: usize, bits: u8) -> Result<Self, CompositionViolation> {
-        if !matches!(bits, 1 | 2 | 4) {
+        if !matches!(bits, 1 | 2 | 4 | 8) {
             return Err(CompositionViolation::InvalidBits { bits });
         }
         if dim > u16::MAX as usize {
@@ -213,7 +214,7 @@ impl RankQuantSpec {
         })
     }
 
-    /// Bits per coordinate (`1`, `2`, or `4`).
+    /// Bits per coordinate (`1`, `2`, `4`, or fixed-composition `8`).
     pub fn bits(&self) -> u8 {
         self.bits
     }
@@ -317,8 +318,8 @@ impl BucketCode {
     /// panicking inside the rank primitives.
     ///
     /// # Errors
-    /// - the [`RankQuantSpec::new`] errors (`bits ∉ {1, 2, 4}`, `dim` too large
-    ///   or non-divisible).
+    /// - the [`RankQuantSpec::new`] errors (`bits ∉ {1, 2, 4, 8}`, `dim` too
+    ///   large or non-divisible).
     /// - [`CompositionViolation::WrongLength`] if `vector.len() != dim`.
     /// - [`CompositionViolation::NonFiniteValue`] on the first non-finite
     ///   coordinate.
@@ -383,7 +384,7 @@ impl BucketCode {
 pub enum CompositionViolation {
     /// A structural spec parameter was invalid (`dim == 0`, `buckets < 2`).
     InvalidSpec(&'static str),
-    /// `bits` was outside the supported RankQuant set `{1, 2, 4}`.
+    /// `bits` was outside the supported RankQuant set `{1, 2, 4, 8}`.
     InvalidBits {
         /// The rejected bit width.
         bits: u8,
@@ -453,7 +454,7 @@ impl fmt::Display for CompositionViolation {
         match self {
             Self::InvalidSpec(message) => write!(f, "{message}"),
             Self::InvalidBits { bits } => {
-                write!(f, "bits {bits} is invalid; expected one of 1, 2, 4")
+                write!(f, "bits {bits} is invalid; expected one of 1, 2, 4, 8")
             }
             Self::DimTooLarge { dim, max } => write!(f, "dim {dim} exceeds maximum {max}"),
             Self::NonUniformSpec { dim, buckets } => {
@@ -555,19 +556,29 @@ mod tests {
         );
     }
 
-    // Pin the b=8 decision: the reference prototype accepted bits=8 but ordvec
-    // rejects it. These tests ensure that boundary cannot change silently.
     #[test]
-    fn rankquant_spec_rejects_bits_8() {
-        assert_eq!(
-            RankQuantSpec::new(8, 8).unwrap_err(),
-            CompositionViolation::InvalidBits { bits: 8 }
-        );
-        // `from_vector` takes the same path: bits=8 is rejected at the spec level.
-        let v: Vec<f32> = (0..8).map(|i| i as f32).collect();
+    fn rankquant_spec_accepts_fixed_composition_bits_8() {
+        let spec = RankQuantSpec::new(512, 8).unwrap();
+        assert_eq!(spec.bits(), 8);
+        assert_eq!(spec.composition().buckets(), 256);
+        assert_eq!(spec.composition().expected_per_bucket(), 2);
+
+        let v: Vec<f32> = (0..512).map(|i| i as f32).collect();
+        let code = BucketCode::from_vector(512, 8, &v).unwrap();
+        assert_eq!(code.spec().buckets(), 256);
+        assert_eq!(code.spec().expected_per_bucket(), 2);
+        assert_eq!(code.codes()[0], 0);
+        assert_eq!(code.codes()[511], 255);
+    }
+
+    #[test]
+    fn rankquant_spec_rejects_non_fixed_composition_bits_8() {
         assert_eq!(
-            BucketCode::from_vector(8, 8, &v).unwrap_err(),
-            CompositionViolation::InvalidBits { bits: 8 }
+            RankQuantSpec::new(384, 8).unwrap_err(),
+            CompositionViolation::NonUniformSpec {
+                dim: 384,
+                buckets: 256,
+            }
         );
     }
 
diff --git a/src/lib.rs b/src/lib.rs
index 44c419fb..754b6a4d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -108,13 +108,10 @@ pub use rank_io::{probe_index_metadata, IndexKind, IndexMetadata, IndexParams};
 pub use sign_bitmap::CandidateBatch;
 pub use sign_bitmap::SignBitmap;
 
-// `search_asymmetric_byte_lut` is a bench-only scoring reference: it
-// panics on b=1 and exists so `examples/bench_rank` can compare the
-// byte-LUT path against the production AVX kernels on the same data.
-// Re-exported `#[doc(hidden)]` — reachable for the example and the
-// red-team parity tests, but not part of the headline API. Production
-// callers use `RankQuant::search_asymmetric`, whose dispatch routes
-// every supported bit width to a non-panicking kernel.
+// Bench-only byte-LUT scoring reference for `examples/bench_rank` and parity
+// tests; it panics on b=1. Gated off the default public API surface;
+// production callers use `RankQuant::search_asymmetric`.
+#[cfg(feature = "bench-utils")]
 #[doc(hidden)]
 pub use quant::search_asymmetric_byte_lut;
 
diff --git a/src/multi_bucket.rs b/src/multi_bucket.rs
index a375a363..1d32af06 100644
--- a/src/multi_bucket.rs
+++ b/src/multi_bucket.rs
@@ -409,10 +409,9 @@ impl MultiBucketBitmap {
     /// forces the **portable scalar** diagonal kernel, bypassing the runtime
     /// AVX-512 dispatch. It exists so `examples/bench_contingency` can time the
     /// scalar and SIMD diagonal paths against each other on the same index
-    /// (mirroring the `#[doc(hidden)]` `search_asymmetric_byte_lut` bench
-    /// reference at the crate root). Not part of the stable API — production
-    /// callers use [`Self::diagonal_overlap_row`], which dispatches to the
-    /// fastest available kernel.
+    /// (mirroring the feature-gated byte-LUT bench reference). Not part of the
+    /// stable API — production callers use [`Self::diagonal_overlap_row`], which
+    /// dispatches to the fastest available kernel.
     ///
     /// # Panics
     /// Panics if `doc_idx >= len()` or `q_bitmaps.len() != nb * qpb`.
diff --git a/src/quant.rs b/src/quant.rs
index 831f54bc..1c8ce627 100644
--- a/src/quant.rs
+++ b/src/quant.rs
@@ -18,10 +18,9 @@
 //! gather against the `dim * 256` LUT: an AVX-512 `vgatherdps` kernel when
 //! available (`avx512f` + `avx512bw` + `dim % 16 == 0`), else the portable scalar LUT.
 //!
-//! The byte-LUT path ([`search_asymmetric_byte_lut`]) is re-exported
-//! `#[doc(hidden)]` (reachable as `ordvec::search_asymmetric_byte_lut`)
-//! so `examples/bench_rank.rs` can compare it against the production
-//! AVX path on the same data.
+//! The byte-LUT reference path is available only with the non-default
+//! `bench-utils` feature so `examples/bench_rank.rs` can compare it against
+//! the production AVX path on the same data.
 
 use rayon::prelude::*;
 
@@ -584,7 +583,7 @@ impl RankQuant {
     /// loader's `n_vectors` ceiling. (Bounds the count, not the byte payload —
     /// see the loaders' separate `MAX_PAYLOAD` cap.) Also panics if the
     /// resulting row-major buffer length would overflow `usize` (reachable only
-    /// on 32-bit targets — see `util::checked_new_len`).
+    /// on 32-bit targets — see `util::checked_new_count`).
     pub fn add(&mut self, vectors: &[f32]) {
         let n = vectors.len() / self.dim;
         assert_eq!(
@@ -594,7 +593,7 @@ impl RankQuant {
         );
         assert_all_finite(vectors);
         let bytes_per_vec = rankquant_bytes_per_vec(self.dim, self.bits);
-        let new_n = crate::util::checked_new_len(self.n_vectors, n, bytes_per_vec);
+        let new_n = crate::util::checked_new_count(self.n_vectors, n, bytes_per_vec);
         let start = self.packed.len();
         self.packed.resize(start + n * bytes_per_vec, 0);
         let dim = self.dim;
@@ -1624,14 +1623,15 @@ pub fn rankquant_eval_search(
 //   B=2: 256 groups × 256 entries × 4 B = 256 KiB per query (fits L2)
 //   B=4: 512 groups × 256 entries × 4 B = 512 KiB per query (spills L2 a little)
 //
-// Re-exported `#[doc(hidden)]` for benchmarking. Production callers should reach
-// for [`RankQuant::search_asymmetric`] which dispatches to the
-// fastest implementation for the current CPU.
+// Available behind `bench-utils` for benchmarking. Production callers should
+// reach for `RankQuant::search_asymmetric`, which dispatches to the fastest
+// implementation for the current CPU.
 // -------------------------------------------------------------------
 
 /// Build the byte-LUT for B=2 asymmetric: `lut[g * 256 + byte]` is the
 /// f32 contribution of `doc[g] == byte` to the score, summed across
 /// the 4 coordinates packed into that byte.
+#[cfg(feature = "bench-utils")]
 fn build_byte_lut_b2(q_unit: &[f32]) -> Vec<f32> {
     let dim = q_unit.len();
     debug_assert_eq!(dim % 4, 0);
@@ -1654,6 +1654,7 @@ fn build_byte_lut_b2(q_unit: &[f32]) -> Vec<f32> {
 }
 
 /// Build the byte-LUT for B=4 asymmetric.
+#[cfg(feature = "bench-utils")]
 fn build_byte_lut_b4(q_unit: &[f32]) -> Vec<f32> {
     let dim = q_unit.len();
     debug_assert_eq!(dim % 2, 0);
@@ -1672,6 +1673,7 @@ fn build_byte_lut_b4(q_unit: &[f32]) -> Vec<f32> {
 }
 
 /// Scalar byte-LUT scan for B=2 asymmetric. One add per doc byte.
+#[cfg(feature = "bench-utils")]
 fn scan_b2_asym_byte_lut(
     packed: &[u8],
     n: usize,
@@ -1693,6 +1695,7 @@ fn scan_b2_asym_byte_lut(
 }
 
 /// Scalar byte-LUT scan for B=4 asymmetric.
+#[cfg(feature = "bench-utils")]
 fn scan_b4_asym_byte_lut(
     packed: &[u8],
     n: usize,
@@ -1728,6 +1731,7 @@ fn scan_b4_asym_byte_lut(
 ///
 /// Returns the raw `Vec<i64>` of doc indices per query, length
 /// `queries.len() / dim * k`.
+#[cfg(feature = "bench-utils")]
 pub fn search_asymmetric_byte_lut(index: &RankQuant, queries: &[f32], k: usize) -> SearchResults {
     let dim = index.dim;
     let bits = index.bits;
diff --git a/src/rank.rs b/src/rank.rs
index 5c511edf..5a2bcf1d 100644
--- a/src/rank.rs
+++ b/src/rank.rs
@@ -87,31 +87,10 @@ pub fn rank_transform_into(v: &[f32], out: &mut [u16]) {
 /// hot path.
 #[inline]
 pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 {
-    // `bits` is a `u8`, so a caller could pass e.g. 9 or 255. `1u32 << bits`
-    // overflows for `bits >= 32` (in release that silently wraps and yields a
-    // wrong bucket; in debug it panics inconsistently), and the result must
-    // also fit in the returned `u8`, so cap at 8 — the widest RankQuant width
-    // (b=8 yields one bucket per code value in `[0, 256)`, which still fits a
-    // `u8`). `d == 0` would divide by zero. Guard both up front so the failure
-    // is loud in every build.
     assert!(bits <= 8, "bits too large");
     assert!(d > 0, "d must be positive");
-    // A valid rank is a position in `[0, d)`. Reject `rank >= d` loudly instead
-    // of silently clamping the quotient back into range: the rest of the public
-    // bucket API ([`pack_buckets`] / [`bucket_centre`]) fails loud on an
-    // out-of-domain argument, so a direct caller that miscomputes a rank should
-    // hear about it rather than receive a plausible-but-wrong bucket.
     assert!((rank as usize) < d, "rank ({rank}) must be < d ({d})");
     let n_buckets = 1u32 << bits;
-    // u64 math: `d` is a `usize` and reaches this from the Python binding as a
-    // free argument, so `d as u32` could truncate a `d >= 2^32` (e.g. to 0,
-    // which would divide by zero and panic). rank ≤ u16::MAX and n_buckets ≤
-    // 128, so the product fits u64 comfortably; over the realistic d ≤ u16::MAX
-    // domain this is bit-identical to the previous u32 form.
-    //
-    // With `rank < d` guaranteed above, `rank * n_buckets / d < n_buckets`
-    // (integer division floors), so the quotient already lands in
-    // `[0, n_buckets)` and fits the returned `u8` without a clamp.
     ((rank as u64 * n_buckets as u64) / d as u64) as u8
 }
 
@@ -123,10 +102,6 @@ pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 {
 /// valid rank vector is a permutation of `[0, ranks.len())`, so well-formed
 /// input never trips the per-entry guard.
 pub fn bucket_ranks(ranks: &[u16], bits: u8) -> Vec<u8> {
-    // Validate `bits` up front so an invalid width fails loud even for empty
-    // input — an empty `ranks` skips the per-entry `rank_to_bucket` check and
-    // would otherwise silently return an empty vec. Mirrors the Python binding,
-    // which checks `bits` before its empty short-circuit.
     assert!(bits <= 8, "bits too large");
     let d = ranks.len();
     ranks.iter().map(|&r| rank_to_bucket(r, d, bits)).collect()
@@ -166,14 +141,6 @@ pub fn pack_buckets(buckets: &[u8], bits: u8) -> Vec<u8> {
     let n_bytes = d / codes_per_byte;
     let mut out = vec![0u8; n_bytes];
     let bits_u = bits as usize;
-    // Pack in a single pass, failing loud on an out-of-range code rather than
-    // silently masking it (`code & mask` would turn e.g. 7 at bits=2 into 3,
-    // packing a different vector). Checking inside the loop keeps the
-    // fail-loud guarantee without a second O(d) pass over `buckets`; the
-    // branch is loop-invariant-predictable for the always-valid internal
-    // callers. Asserting `b <= mask` makes the trailing `& mask` redundant.
-    // At `b=8`, `codes_per_byte == 1`, so `shift == 0` and each byte holds one
-    // code verbatim.
     for (i, &b) in buckets.iter().enumerate() {
         assert!(
             b <= mask,
@@ -366,7 +333,7 @@ impl Rank {
     /// loader's `n_vectors` ceiling. (Bounds the count, not the byte payload —
     /// see the loaders' separate `MAX_PAYLOAD` cap.) Also panics if the
     /// resulting row-major buffer length would overflow `usize` (reachable only
-    /// on 32-bit targets — see `util::checked_new_len`).
+    /// on 32-bit targets — see `util::checked_new_count`).
     pub fn add(&mut self, vectors: &[f32]) {
         let n = vectors.len() / self.dim;
         assert_eq!(
@@ -375,7 +342,7 @@ impl Rank {
             "vectors length must be a multiple of dim",
         );
         assert_all_finite(vectors);
-        let new_n = crate::util::checked_new_len(self.n_vectors, n, self.dim);
+        let new_n = crate::util::checked_new_count(self.n_vectors, n, self.dim);
         let start = self.ranks.len();
         self.ranks.resize(start + n * self.dim, 0);
         let dim = self.dim;
@@ -886,10 +853,10 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "bits must be 1,2,4")]
+    #[should_panic(expected = "bits must be 1,2,4,8")]
     fn rankquant_norm_rejects_invalid_bits() {
-        // 3-bit packing has no RankQuant scheme; the norm must refuse it
-        // rather than return a value for a non-existent layout.
+        // Only bit widths that divide a byte evenly (1, 2, 4, 8) are valid for RankQuant;
+        // any other width must fail loud instead of returning a norm for a non-existent layout.
         let _ = rankquant_norm(64, 3);
     }
 }
diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs
index 7d6bbcc6..66f971ab 100644
--- a/src/sign_bitmap.rs
+++ b/src/sign_bitmap.rs
@@ -175,12 +175,12 @@ impl SignBitmap {
     /// loader's `n_vectors` ceiling. (Bounds the count, not the byte payload —
     /// see the loaders' separate `MAX_PAYLOAD` cap.) Also panics if the
     /// resulting row-major buffer length would overflow `usize` (reachable only
-    /// on 32-bit targets — see `util::checked_new_len`).
+    /// on 32-bit targets — see `util::checked_new_count`).
     pub fn add(&mut self, vectors: &[f32]) {
         crate::util::assert_all_finite(vectors);
         let n = vectors.len() / self.dim;
         assert_eq!(vectors.len(), n * self.dim);
-        let new_n = crate::util::checked_new_len(self.n_vectors, n, self.qwords_per_vec);
+        let new_n = crate::util::checked_new_count(self.n_vectors, n, self.qwords_per_vec);
         let qpv = self.qwords_per_vec;
         let dim = self.dim;
         let start = self.bitmaps.len();
diff --git a/src/util.rs b/src/util.rs
index 8e229f59..5f9eb1dd 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -4,7 +4,7 @@
 //!   path (full ranks, bucketed ranks, bitmap overlap).
 //! - [`l2_normalise`] returns the unit-norm copy of a query vector for
 //!   the asymmetric scoring path.
-//! - The checked-allocation guards (`result_buffer_len`, `checked_new_len`),
+//! - The checked-allocation guards (`result_buffer_len`, `checked_new_count`),
 //!   the finite-input assert (`assert_all_finite`), and the portable AND/XOR
 //!   popcount reductions (`and_popcount` / `xor_popcount`) round out the
 //!   shared helpers.
@@ -47,7 +47,7 @@ pub(crate) fn result_buffer_len(nq: usize, k: usize) -> usize {
 
 /// Validate that an `add` would not grow an index past
 /// `rank_io::MAX_VECTORS`, **and** that the resulting row-major buffer of
-/// `new_n * elems_per_vec` elements still fits `usize`. Returns the new length.
+/// `new_n * elems_per_vec` elements still fits `usize`. Returns the new vector count.
 ///
 /// The on-disk loaders cap `n_vectors` at `MAX_VECTORS` (64 Mi); the four
 /// in-memory growth paths (`Rank` / `RankQuant` / `Bitmap` / `SignBitmap`
@@ -66,7 +66,7 @@ pub(crate) fn result_buffer_len(nq: usize, k: usize) -> usize {
 /// buffer (issue #25). The *count* cap is the `u32` / round-trip contract; the
 /// byte payload is bounded separately by the loaders' `MAX_PAYLOAD` cap.
 #[inline]
-pub(crate) fn checked_new_len(current: usize, adding: usize, elems_per_vec: usize) -> usize {
+pub(crate) fn checked_new_count(current: usize, adding: usize, elems_per_vec: usize) -> usize {
     let new_n = current
         .checked_add(adding)
         .expect("ordvec: n_vectors overflows usize");
@@ -81,21 +81,18 @@ pub(crate) fn checked_new_len(current: usize, adding: usize, elems_per_vec: usiz
     new_n
 }
 
+const L2_NORMALISE_EPSILON: f32 = 1e-12;
+
 /// Unit-L2 copy of `v`, used by the asymmetric scoring path.
 ///
 /// **Degenerate queries are intentional, not errors.** A query with L2 norm
-/// `≤ 1e-12` (the all-zero vector, or one numerically indistinguishable from
-/// it) has no direction, so its unit copy is the zero vector. The asymmetric
-/// score is then `0` for every document: they all tie, and the returned top-k
-/// is an arbitrary — though deterministic, via the `(score, doc_id)`
-/// tie-break — prefix of the corpus. This is the correct outcome for a
-/// retrieval substrate (a directionless query has no nearest neighbour), and
-/// it is deliberately *silent*: the input is finite and valid, so it is not
-/// rejected the way NaN/±Inf are by [`assert_all_finite`]. Callers that treat
-/// an all-zero query as an upstream bug should check `‖q‖` before searching.
+/// `≤ L2_NORMALISE_EPSILON` (the all-zero vector, or one numerically
+/// indistinguishable from it) has no direction, so its unit copy is the zero
+/// vector. Callers that treat this as an upstream bug should check `‖q‖`
+/// before searching.
 pub(crate) fn l2_normalise(v: &[f32]) -> Vec<f32> {
     let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
-    if norm <= 1e-12 {
+    if norm <= L2_NORMALISE_EPSILON {
         vec![0.0; v.len()]
     } else {
         let inv = 1.0 / norm;
@@ -109,7 +106,7 @@ pub(crate) fn l2_normalise(v: &[f32]) -> Vec<f32> {
 pub(crate) fn l2_normalise_into(out: &mut Vec<f32>, v: &[f32]) {
     out.clear();
     let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
-    if norm <= 1e-12 {
+    if norm <= L2_NORMALISE_EPSILON {
         out.resize(v.len(), 0.0);
     } else {
         let inv = 1.0 / norm;
@@ -600,7 +597,10 @@ impl TopK {
 
 #[cfg(test)]
 mod tests {
-    use super::{and_popcount, checked_new_len, xor_popcount, TopK};
+    use super::{
+        and_popcount, checked_new_count, l2_normalise, l2_normalise_into, xor_popcount, TopK,
+        L2_NORMALISE_EPSILON,
+    };
     use rand::{RngExt, SeedableRng};
     use rand_chacha::ChaCha8Rng;
 
@@ -676,47 +676,47 @@ mod tests {
     }
 
     #[test]
-    fn checked_new_len_accepts_up_to_max() {
+    fn checked_new_count_accepts_up_to_max() {
         use crate::rank_io::MAX_VECTORS;
         // Exactly MAX_VECTORS is allowed — the loaders accept the same ceiling,
         // so a freshly grown index stays write/load round-trippable. (elems=1
         // isolates the count cap from the buffer-size check.)
-        assert_eq!(checked_new_len(0, MAX_VECTORS, 1), MAX_VECTORS);
-        assert_eq!(checked_new_len(MAX_VECTORS - 1, 1, 1), MAX_VECTORS);
+        assert_eq!(checked_new_count(0, MAX_VECTORS, 1), MAX_VECTORS);
+        assert_eq!(checked_new_count(MAX_VECTORS - 1, 1, 1), MAX_VECTORS);
         // An empty add never trips the guard.
-        assert_eq!(checked_new_len(MAX_VECTORS, 0, 1), MAX_VECTORS);
+        assert_eq!(checked_new_count(MAX_VECTORS, 0, 1), MAX_VECTORS);
         // MAX_VECTORS * 4096 = 2^38 fits usize on 64-bit; on 32-bit it overflows,
         // which the guard correctly panics on (see
-        // `checked_new_len_rejects_buffer_overflow`). Gate the success assertion
+        // `checked_new_count_rejects_buffer_overflow`). Gate the success assertion
         // to 64-bit so the suite stays portable (wasm32 / armv7).
         #[cfg(target_pointer_width = "64")]
         {
-            assert_eq!(checked_new_len(0, MAX_VECTORS, 4096), MAX_VECTORS);
+            assert_eq!(checked_new_count(0, MAX_VECTORS, 4096), MAX_VECTORS);
         }
     }
 
     #[test]
     #[should_panic(expected = "MAX_VECTORS")]
-    fn checked_new_len_rejects_one_past_max() {
+    fn checked_new_count_rejects_one_past_max() {
         use crate::rank_io::MAX_VECTORS;
         // One past the loader ceiling must fail loud rather than build an index
         // that write/load would refuse to round-trip.
-        let _ = checked_new_len(MAX_VECTORS, 1, 1);
+        let _ = checked_new_count(MAX_VECTORS, 1, 1);
     }
 
     #[test]
     #[should_panic(expected = "n_vectors overflows usize")]
-    fn checked_new_len_rejects_usize_overflow() {
+    fn checked_new_count_rejects_usize_overflow() {
         // The running count itself must not wrap before the cap is checked.
-        let _ = checked_new_len(usize::MAX, 1, 1);
+        let _ = checked_new_count(usize::MAX, 1, 1);
     }
 
     #[test]
     #[should_panic(expected = "buffer length")]
-    fn checked_new_len_rejects_buffer_overflow() {
+    fn checked_new_count_rejects_buffer_overflow() {
         // Count is within MAX_VECTORS, but new_n * elems_per_vec overflows
         // usize — the 32-bit (wasm32) hazard the `resize` in `add` would hit.
-        let _ = checked_new_len(0, 2, usize::MAX);
+        let _ = checked_new_count(0, 2, usize::MAX);
     }
 
     #[test]
@@ -767,7 +767,6 @@ mod tests {
 
     #[test]
     fn l2_normalise_into_matches_l2_normalise_and_reuses_capacity() {
-        use super::{l2_normalise, l2_normalise_into};
         let v = vec![3.0f32, 0.0, 4.0, 0.0]; // norm 5
         let expected = l2_normalise(&v);
         let mut out: Vec<f32> = Vec::new();
@@ -785,4 +784,22 @@ mod tests {
         l2_normalise_into(&mut out, &v);
         assert_eq!(out.capacity(), cap, "l2_normalise_into must reuse capacity");
     }
+
+    #[test]
+    fn l2_normalise_threshold_edges_are_pinned() {
+        let below = vec![L2_NORMALISE_EPSILON * 0.5, 0.0];
+        assert_eq!(l2_normalise(&below), vec![0.0, 0.0]);
+
+        let at = vec![L2_NORMALISE_EPSILON, 0.0];
+        assert_eq!(l2_normalise(&at), vec![0.0, 0.0]);
+
+        let above = vec![L2_NORMALISE_EPSILON * 2.0, 0.0];
+        assert_eq!(l2_normalise(&above), vec![1.0, 0.0]);
+
+        let mut out = Vec::new();
+        l2_normalise_into(&mut out, &below);
+        assert_eq!(out, vec![0.0, 0.0]);
+        l2_normalise_into(&mut out, &above);
+        assert_eq!(out, vec![1.0, 0.0]);
+    }
 }
diff --git a/tests/determinism_contract.rs b/tests/determinism_contract.rs
index 56c8f486..daceed3e 100644
--- a/tests/determinism_contract.rs
+++ b/tests/determinism_contract.rs
@@ -1,4 +1,4 @@
-use ordvec::{search_asymmetric_byte_lut, Bitmap, Rank, RankQuant, SignBitmap};
+use ordvec::{Bitmap, Rank, RankQuant, SignBitmap};
 
 fn repeated_docs(n: usize, dim: usize, value: f32) -> Vec<f32> {
     vec![value; n * dim]
@@ -49,7 +49,10 @@ fn full_search_ties_return_lowest_row_ids() {
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn rankquant_dispatch_matches_scalar_reference_on_ordered_ties() {
+    use ordvec::search_asymmetric_byte_lut;
+
     for &dim in &[20usize, 64] {
         let docs = repeated_docs(8, dim, 1.0);
         let query = vec![0.0; dim];
diff --git a/tests/index/finite.rs b/tests/index/finite.rs
index a2dfbef5..701a04a5 100644
--- a/tests/index/finite.rs
+++ b/tests/index/finite.rs
@@ -65,6 +65,7 @@ fn rank_transform_rejects_nan() {
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 #[should_panic(expected = "non-finite")]
 fn search_asymmetric_byte_lut_rejects_inf() {
     let mut idx = RankQuant::new(D, 2);
diff --git a/tests/index/two_stage.rs b/tests/index/two_stage.rs
index 1b10c3b4..85855d48 100644
--- a/tests/index/two_stage.rs
+++ b/tests/index/two_stage.rs
@@ -2,7 +2,9 @@ use ordvec::{
     validate_candidate_ids, validate_flat_vectors_len, Bitmap, OrdvecError, RankQuant, SignBitmap,
     TwoStageCandidatePolicy,
 };
+#[cfg(feature = "bench-utils")]
 use rand::{RngExt, SeedableRng};
+#[cfg(feature = "bench-utils")]
 use rand_chacha::ChaCha8Rng;
 
 use crate::{make_corpus, D, N};
@@ -697,6 +699,7 @@ fn batched_serial_wrapper_matches_into_and_full_set_matches_search_asymmetric()
 /// Scores compared within the existing kernel parity tolerance, NOT byte-identical
 /// across tiers. (Same convention as redteam_beta + determinism_contract.)
 #[test]
+#[cfg(feature = "bench-utils")]
 fn batched_subset_rerank_matches_scalar_reference_across_tiers() {
     use ordvec::search_asymmetric_byte_lut;
     for dim in [64usize, 80, 128] {
diff --git a/tests/redteam_beta.rs b/tests/redteam_beta.rs
index 12d0e663..58d55e8e 100644
--- a/tests/redteam_beta.rs
+++ b/tests/redteam_beta.rs
@@ -27,8 +27,9 @@ use rand::{RngExt, SeedableRng};
 use rand_chacha::ChaCha8Rng;
 
 use ordvec::rank::{bucket_centre, bucket_ranks, rank_transform, rankquant_norm};
-use ordvec::search_asymmetric_byte_lut;
-use ordvec::{Rank, RankQuant, SearchResults, SignBitmap};
+#[cfg(feature = "bench-utils")]
+use ordvec::SearchResults;
+use ordvec::{Rank, RankQuant, SignBitmap};
 
 fn make_corpus(seed: u64, n: usize, dim: usize) -> Vec<f32> {
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
@@ -71,7 +72,10 @@ fn ref_rankquant_asymmetric(query: &[f32], doc: &[f32], bits: u8) -> f32 {
 //   - 768 b4: production-scale AVX-512 happy path
 // -------------------------------------------------------------------
 
+#[cfg(feature = "bench-utils")]
 fn assert_asym_matches_byte_lut(dim: usize, bits: u8, seed: u64) {
+    use ordvec::search_asymmetric_byte_lut;
+
     let n = 64;
     let corpus = make_corpus(seed, n, dim);
     let mut idx = RankQuant::new(dim, bits);
@@ -115,36 +119,43 @@ fn assert_asym_matches_byte_lut(dim: usize, bits: u8, seed: u64) {
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn rt2_asym_b2_dim48_matches_scalar() {
     assert_asym_matches_byte_lut(48, 2, 101);
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn rt2_asym_b4_dim80_matches_scalar() {
     assert_asym_matches_byte_lut(80, 4, 102);
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn rt2_asym_b2_dim20_matches_scalar() {
     assert_asym_matches_byte_lut(20, 2, 103);
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn rt2_asym_b2_dim4_matches_scalar() {
     assert_asym_matches_byte_lut(4, 2, 104);
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn rt2_asym_b2_dim64_happy_path_matches_scalar() {
     assert_asym_matches_byte_lut(64, 2, 105);
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn rt2_asym_b4_dim128_happy_path_matches_scalar() {
     assert_asym_matches_byte_lut(128, 4, 106);
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn rt2_asym_b4_dim768_happy_path_matches_scalar() {
     assert_asym_matches_byte_lut(768, 4, 107);
 }
@@ -288,7 +299,10 @@ fn sign_bitmap_top_m_huge_m_clamps() {
 // -------------------------------------------------------------------
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn byte_lut_huge_k_clamps_no_overflow() {
+    use ordvec::search_asymmetric_byte_lut;
+
     let dim = 64;
     let n = 16;
     let corpus = make_corpus(501, n, dim);
@@ -312,7 +326,10 @@ fn byte_lut_huge_k_clamps_no_overflow() {
 }
 
 #[test]
+#[cfg(feature = "bench-utils")]
 fn byte_lut_huge_k_multi_query_clamps_no_overflow() {
+    use ordvec::search_asymmetric_byte_lut;
+
     // Multi-query exercises the `nq * k` result-buffer axis (Finding 1):
     // with the raw `usize::MAX` the product `nq * k` overflows usize and
     // would silently wrap to a too-small Vec; `result_buffer_len` turns
diff --git a/tests/redteam_delta.rs b/tests/redteam_delta.rs
index f891ad6a..7187692a 100644
--- a/tests/redteam_delta.rs
+++ b/tests/redteam_delta.rs
@@ -49,7 +49,7 @@ use rand::{RngExt, SeedableRng};
 use rand_chacha::ChaCha8Rng;
 
 use ordvec::rank::rank_norm;
-use ordvec::{search_asymmetric_byte_lut, Bitmap, Rank, RankQuant, SignBitmap};
+use ordvec::{Bitmap, Rank, RankQuant, SignBitmap};
 
 /// `MAX_VECTORS` from `rank_io` — the on-disk document-count ceiling.
 /// Re-declared here (not imported) to keep the test independent of
@@ -707,8 +707,11 @@ fn delta_d4_large_nq_small_k() {
 /// `b = 1` to the scalar LUT and is unaffected — covered by the `beta`
 /// suite). This is an intentional, documented contract, not a bug.
 #[test]
+#[cfg(feature = "bench-utils")]
 #[should_panic(expected = "byte-LUT path only supports bits")]
 fn delta_e1_byte_lut_panics_on_b1_index() {
+    use ordvec::search_asymmetric_byte_lut;
+
     let dim = 64;
     let mut idx = RankQuant::new(dim, 1);
     idx.add(&make_corpus(8901, 8, dim));
diff --git a/tests/redteam_gamma.rs b/tests/redteam_gamma.rs
index a95c0968..2ff14dfc 100644
--- a/tests/redteam_gamma.rs
+++ b/tests/redteam_gamma.rs
@@ -39,7 +39,7 @@ fn rank_to_bucket_zero_d_panics() {
 // ---------------------------------------------------------------------------
 
 #[test]
-#[should_panic(expected = "bits must be 1,2,4")]
+#[should_panic(expected = "bits must be 1,2,4,8")]
 fn rankquant_bytes_per_vec_zero_bits_panics() {
     let _ = rankquant_bytes_per_vec(64, 0);
 }