From 804e571f9ec4a3a6d240d24eec66ac025c8d5610 Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Mon, 25 May 2026 22:04:17 -0500
Subject: [PATCH 1/7] refactor: enforce unsafe_op_in_unsafe_fn crate-wide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move #![deny(unsafe_op_in_unsafe_fn)] from fastscan.rs to the crate root
(lib.rs) so every unsafe operation in the SIMD kernels must sit in an
explicit `unsafe {}` block rather than leaning on an enclosing `unsafe fn`.
Wraps the AVX-512/AVX2 kernel bodies in bitmap.rs, sign_bitmap.rs and
quant_kernels.rs, and the NEON popcount in util.rs. horizontal_sum_avx2 is
register-only (no memory access), so its intrinsics are safe under the
#[target_feature] gate and need no block (an explicit one would be
unused_unsafe). Purely additive — no kernel logic changes. Keeps the unsafe
surface visible to future edits (THREAT-SIMD-001).

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/bitmap.rs        | 251 +++++++++++++-----------
 src/fastscan.rs      |   7 -
 src/lib.rs           |   7 +
 src/quant_kernels.rs | 442 ++++++++++++++++++++++---------------------
 src/sign_bitmap.rs   | 147 +++++++-------
 src/util.rs          |  71 ++++---
 6 files changed, 498 insertions(+), 427 deletions(-)

diff --git a/src/bitmap.rs b/src/bitmap.rs
index 5de75a99..394c8c57 100644
--- a/src/bitmap.rs
+++ b/src/bitmap.rs
@@ -491,26 +491,30 @@ unsafe fn bitmap_scan_avx512vpop(bitmaps: &[u64], n: usize, qpv: usize, q: &[u64
     // (`di < n`) each stay within their slice. AVX-512 F/VPOPCNTDQ are confirmed
     // by the `#[target_feature]` gate plus the caller's runtime
     // `is_x86_feature_detected!`.
-    debug_assert_eq!(qpv % 8, 0, "AVX-512 bitmap scan needs qpv % 8 == 0");
-    let lanes = qpv / 8;
-    let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes);
-    #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout
-    for l in 0..lanes {
-        q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i));
-    }
-    for di in 0..n {
-        let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
-        let mut acc_zmm = _mm512_setzero_si512();
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        debug_assert_eq!(qpv % 8, 0, "AVX-512 bitmap scan needs qpv % 8 == 0");
+        let lanes = qpv / 8;
+        let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes);
         #[allow(clippy::needless_range_loop)]
         // indexed access is clearer / matches the kernel layout
         for l in 0..lanes {
-            let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
-            let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]);
-            let pop_zmm = _mm512_popcnt_epi64(and_zmm);
-            acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm);
+            q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i));
+        }
+        for di in 0..n {
+            let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
+            let mut acc_zmm = _mm512_setzero_si512();
+            #[allow(clippy::needless_range_loop)]
+            // indexed access is clearer / matches the kernel layout
+            for l in 0..lanes {
+                let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+                let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]);
+                let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm);
+            }
+            let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm);
+            top.maybe_insert(acc_sum as f32, di);
         }
-        let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm);
-        top.maybe_insert(acc_sum as f32, di);
     }
 }
 
@@ -553,25 +557,34 @@ unsafe fn bitmap_scan_collect_avx512vpop(
     scores: &mut [u32],
 ) {
     use std::arch::x86_64::*;
-    debug_assert_eq!(qpv % 8, 0);
-    let lanes = qpv / 8;
-    let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes);
-    #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout
-    for l in 0..lanes {
-        q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i));
-    }
-    #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout
-    for di in 0..n {
-        let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
-        let mut acc_zmm = _mm512_setzero_si512();
+    // SAFETY: same contract as the sibling `bitmap_scan_avx512vpop` — the caller
+    // (`bitmap_scan_collect`) gates dispatch on `qpv.is_multiple_of(8)`,
+    // `q.len() == qpv`, and `bitmaps.len() == n * qpv`, bounding all raw loads.
+    // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        debug_assert_eq!(qpv % 8, 0);
+        let lanes = qpv / 8;
+        let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes);
+        #[allow(clippy::needless_range_loop)]
+        // indexed access is clearer / matches the kernel layout
         for l in 0..lanes {
-            let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
-            let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]);
-            let pop_zmm = _mm512_popcnt_epi64(and_zmm);
-            acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm);
+            q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i));
+        }
+        #[allow(clippy::needless_range_loop)]
+        // indexed access is clearer / matches the kernel layout
+        for di in 0..n {
+            let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
+            let mut acc_zmm = _mm512_setzero_si512();
+            for l in 0..lanes {
+                let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+                let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]);
+                let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm);
+            }
+            let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm);
+            scores[di] = acc_sum as u32;
         }
-        let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm);
-        scores[di] = acc_sum as u32;
     }
 }
 
@@ -635,77 +648,85 @@ unsafe fn bitmap_scan_collect_batched_avx512vpop(
     scores: &mut [u32],
 ) {
     use std::arch::x86_64::*;
-    debug_assert_eq!(qpv % 8, 0);
-    debug_assert_eq!(q_batch.len(), batch * qpv);
-    debug_assert_eq!(scores.len(), batch * n);
-    let lanes = qpv / 8;
-    const CHUNK: usize = BATCHED_AVX512_CHUNK;
-
-    // Pre-load all batch * lanes query ZMMs once. For typical
-    // (batch=8, lanes=2) this is 16 __m512i of register-equivalent
-    // state, which fits in the 32-ZMM file alongside the per-chunk
-    // accs and doc lane temps.
-    let mut q_zmms: Vec<__m512i> = Vec::with_capacity(batch * lanes);
-    for bi in 0..batch {
-        for l in 0..lanes {
-            q_zmms.push(_mm512_loadu_si512(
-                q_batch.as_ptr().add(bi * qpv + l * 8) as *const __m512i
-            ));
+    // SAFETY: same contract as the sibling `bitmap_scan_avx512vpop` — the caller
+    // (`bitmap_scan_collect_batched`) gates dispatch on `qpv.is_multiple_of(8)`,
+    // `q_batch.len() == batch * qpv`, `bitmaps.len() == n * qpv`, and
+    // `scores.len() == batch * n`, bounding all raw loads and `scores[…]` writes.
+    // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        debug_assert_eq!(qpv % 8, 0);
+        debug_assert_eq!(q_batch.len(), batch * qpv);
+        debug_assert_eq!(scores.len(), batch * n);
+        let lanes = qpv / 8;
+        const CHUNK: usize = BATCHED_AVX512_CHUNK;
+
+        // Pre-load all batch * lanes query ZMMs once. For typical
+        // (batch=8, lanes=2) this is 16 __m512i of register-equivalent
+        // state, which fits in the 32-ZMM file alongside the per-chunk
+        // accs and doc lane temps.
+        let mut q_zmms: Vec<__m512i> = Vec::with_capacity(batch * lanes);
+        for bi in 0..batch {
+            for l in 0..lanes {
+                q_zmms.push(_mm512_loadu_si512(
+                    q_batch.as_ptr().add(bi * qpv + l * 8) as *const __m512i
+                ));
+            }
         }
-    }
 
-    // Hot path: process whole CHUNK-sized groups. The inner `for bi
-    // in 0..CHUNK` is bounded by a *const*, so LLVM unrolls it and
-    // promotes the `accs: [__m512i; CHUNK]` stack array to ZMM
-    // registers — that's the property that keeps the kernel
-    // competitive with the single-query AVX-512 path on a per-query
-    // basis, plus the bandwidth amortisation. A runtime-bounded
-    // `0..chunk` loop would force `accs[bi]` to spill to stack
-    // memory and double per-doc latency.
-    let mut chunk_start = 0usize;
-    while chunk_start + CHUNK <= batch {
-        for di in 0..n {
-            let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK];
-            let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
-            for l in 0..lanes {
-                let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+        // Hot path: process whole CHUNK-sized groups. The inner `for bi
+        // in 0..CHUNK` is bounded by a *const*, so LLVM unrolls it and
+        // promotes the `accs: [__m512i; CHUNK]` stack array to ZMM
+        // registers — that's the property that keeps the kernel
+        // competitive with the single-query AVX-512 path on a per-query
+        // basis, plus the bandwidth amortisation. A runtime-bounded
+        // `0..chunk` loop would force `accs[bi]` to spill to stack
+        // memory and double per-doc latency.
+        let mut chunk_start = 0usize;
+        while chunk_start + CHUNK <= batch {
+            for di in 0..n {
+                let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK];
+                let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
+                for l in 0..lanes {
+                    let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+                    for bi in 0..CHUNK {
+                        let q_zmm = q_zmms[(chunk_start + bi) * lanes + l];
+                        let and_zmm = _mm512_and_si512(d_zmm, q_zmm);
+                        let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                        accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm);
+                    }
+                }
                 for bi in 0..CHUNK {
-                    let q_zmm = q_zmms[(chunk_start + bi) * lanes + l];
-                    let and_zmm = _mm512_and_si512(d_zmm, q_zmm);
-                    let pop_zmm = _mm512_popcnt_epi64(and_zmm);
-                    accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm);
+                    let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]);
+                    scores[(chunk_start + bi) * n + di] = acc_sum as u32;
                 }
             }
-            for bi in 0..CHUNK {
-                let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]);
-                scores[(chunk_start + bi) * n + di] = acc_sum as u32;
-            }
+            chunk_start += CHUNK;
         }
-        chunk_start += CHUNK;
-    }
-    // Tail path: any remaining `batch % CHUNK` queries. Slower per
-    // doc (runtime-bounded inner loop, accs[bi] may spill) but the
-    // tail runs once per kernel call, not once per doc — total cost
-    // is at most CHUNK-1 queries of slower scan, dominated by the
-    // hot path for any batch > 1.
-    let tail = batch - chunk_start;
-    if tail > 0 {
-        for di in 0..n {
-            let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK];
-            let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
-            for l in 0..lanes {
-                let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+        // Tail path: any remaining `batch % CHUNK` queries. Slower per
+        // doc (runtime-bounded inner loop, accs[bi] may spill) but the
+        // tail runs once per kernel call, not once per doc — total cost
+        // is at most CHUNK-1 queries of slower scan, dominated by the
+        // hot path for any batch > 1.
+        let tail = batch - chunk_start;
+        if tail > 0 {
+            for di in 0..n {
+                let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK];
+                let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
+                for l in 0..lanes {
+                    let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+                    for bi in 0..tail {
+                        let q_zmm = q_zmms[(chunk_start + bi) * lanes + l];
+                        let and_zmm = _mm512_and_si512(d_zmm, q_zmm);
+                        let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                        accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm);
+                    }
+                }
                 for bi in 0..tail {
-                    let q_zmm = q_zmms[(chunk_start + bi) * lanes + l];
-                    let and_zmm = _mm512_and_si512(d_zmm, q_zmm);
-                    let pop_zmm = _mm512_popcnt_epi64(and_zmm);
-                    accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm);
+                    let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]);
+                    scores[(chunk_start + bi) * n + di] = acc_sum as u32;
                 }
             }
-            for bi in 0..tail {
-                let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]);
-                scores[(chunk_start + bi) * n + di] = acc_sum as u32;
-            }
         }
     }
 }
@@ -758,27 +779,31 @@ unsafe fn body_overlap_scores_subset_avx512vpop(
     // `n_vectors*qpv`-word buffer; and `out.len() == doc_ids.len()` bounds the
     // `out[i]` writes. AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` +
     // runtime detection.
-    debug_assert_eq!(qpv % 8, 0);
-    let lanes = qpv / 8;
-    let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes);
-    #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout
-    for l in 0..lanes {
-        q_zmms.push(_mm512_loadu_si512(
-            q_bitmap.as_ptr().add(l * 8) as *const __m512i
-        ));
-    }
-    for (i, &di) in doc_ids.iter().enumerate() {
-        let doc_ptr = bitmaps.as_ptr().add((di as usize) * qpv) as *const __m512i;
-        let mut acc_zmm = _mm512_setzero_si512();
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        debug_assert_eq!(qpv % 8, 0);
+        let lanes = qpv / 8;
+        let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes);
         #[allow(clippy::needless_range_loop)]
         // indexed access is clearer / matches the kernel layout
         for l in 0..lanes {
-            let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
-            let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]);
-            let pop_zmm = _mm512_popcnt_epi64(and_zmm);
-            acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm);
+            q_zmms.push(_mm512_loadu_si512(
+                q_bitmap.as_ptr().add(l * 8) as *const __m512i
+            ));
+        }
+        for (i, &di) in doc_ids.iter().enumerate() {
+            let doc_ptr = bitmaps.as_ptr().add((di as usize) * qpv) as *const __m512i;
+            let mut acc_zmm = _mm512_setzero_si512();
+            #[allow(clippy::needless_range_loop)]
+            // indexed access is clearer / matches the kernel layout
+            for l in 0..lanes {
+                let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+                let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]);
+                let pop_zmm = _mm512_popcnt_epi64(and_zmm);
+                acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm);
+            }
+            let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm);
+            out[i] = acc_sum as u32;
         }
-        let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm);
-        out[i] = acc_sum as u32;
     }
 }
diff --git a/src/fastscan.rs b/src/fastscan.rs
index 7a757aed..a39b8de4 100644
--- a/src/fastscan.rs
+++ b/src/fastscan.rs
@@ -35,13 +35,6 @@
 //! [`l2_normalise`](crate::util::l2_normalise), and `k` is clamped to
 //! `n_vectors` exactly as the sibling search methods do.
 
-// Make every unsafe operation inside an `unsafe fn` require an explicit
-// `unsafe {}` block rather than leaning on the fn-level `unsafe`. This is
-// defense-in-depth for the AVX-512 FastScan kernel below: it keeps the kernel's
-// unsafe surface visible to future edits. Crate-wide rollout to the other SIMD
-// modules is tracked separately (see THREAT_MODEL.md, THREAT-SIMD-001).
-#![deny(unsafe_op_in_unsafe_fn)]
-
 use rayon::prelude::*;
 
 use crate::rank::{bucket_ranks, rank_transform, rankquant_norm};
diff --git a/src/lib.rs b/src/lib.rs
index 6fc1f35e..2182437e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -32,6 +32,13 @@
 //! assert_eq!(res.k, 10);
 //! ```
 
+// Every unsafe operation in the crate must sit inside an explicit `unsafe {}`
+// block rather than leaning on an enclosing `unsafe fn`. This keeps the unsafe
+// surface of the SIMD kernels (fastscan / bitmap / sign_bitmap / quant_kernels,
+// plus the NEON popcount in util) visible to every future edit
+// (THREAT_MODEL.md, THREAT-SIMD-001).
+#![deny(unsafe_op_in_unsafe_fn)]
+
 mod bitmap;
 mod fastscan;
 #[cfg(feature = "experimental")]
diff --git a/src/quant_kernels.rs b/src/quant_kernels.rs
index 0b5fb8ad..92f790c7 100644
--- a/src/quant_kernels.rs
+++ b/src/quant_kernels.rs
@@ -162,61 +162,63 @@ pub(crate) unsafe fn scan_b2_asym_avx2(
     //   * `dim % K == 0` (asserted immediately below).
     // `RankQuant::{new,add}` pack exactly `bytes_per_vec` bytes/doc and
     // `load_rankquant` re-validates the shape, so this holds on every path here.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        // Hard backstop: the dispatch in `quant.rs` must only route here
+        // when `dim % 16 == 0`. Kept as a real `assert!` (not debug-only)
+        // so a mis-dispatch fails loudly in release instead of silently
+        // dropping the trailing chunk and returning wrong top-k.
+        assert_eq!(dim % 16, 0, "b=2 AVX2 path needs dim % 16 == 0");
+        let bytes_per_vec = dim / 4;
+        // For each chunk of 4 doc bytes we extract 16 codes (top byte first,
+        // most-significant 2 bits first within a byte). Shift amounts:
+        //   chunk u32 = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3,
+        //   code k = (chunk >> ((15 - k) * 2)) & 3, for k in 0..16.
+        //
+        // Centre-drop: score(d) = Σ q[j]·(code[j] - 1.5)
+        //                       = Σ q[j]·code[j] - 1.5·Σ q[j]
+        // The second term is per-query constant and is added back to the
+        // TopK scores at finalize time. The hot loop only does the raw
+        // dot product against unsigned code values.
+        let shifts_hi = _mm256_setr_epi32(30, 28, 26, 24, 22, 20, 18, 16);
+        let shifts_lo = _mm256_setr_epi32(14, 12, 10, 8, 6, 4, 2, 0);
+        let mask3 = _mm256_set1_epi32(3);
+
+        let bytes_per_chunk = 4usize;
+        let chunks_per_vec = bytes_per_vec / bytes_per_chunk;
+
+        for di in 0..n {
+            let doc = packed.as_ptr().add(di * bytes_per_vec);
+            let mut acc_hi = _mm256_setzero_ps();
+            let mut acc_lo = _mm256_setzero_ps();
+
+            for c in 0..chunks_per_vec {
+                let chunk_ptr = doc.add(c * bytes_per_chunk);
+                let b0 = *chunk_ptr as u32;
+                let b1 = *chunk_ptr.add(1) as u32;
+                let b2 = *chunk_ptr.add(2) as u32;
+                let b3 = *chunk_ptr.add(3) as u32;
+                let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
+                let broadcast = _mm256_set1_epi32(chunk as i32);
+
+                let codes_hi = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts_hi), mask3);
+                let codes_lo = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts_lo), mask3);
+
+                let codes_f_hi = _mm256_cvtepi32_ps(codes_hi);
+                let codes_f_lo = _mm256_cvtepi32_ps(codes_lo);
+
+                let d_base = c * 16;
+                let q_hi = _mm256_loadu_ps(q.as_ptr().add(d_base));
+                let q_lo = _mm256_loadu_ps(q.as_ptr().add(d_base + 8));
+
+                acc_hi = _mm256_fmadd_ps(codes_f_hi, q_hi, acc_hi);
+                acc_lo = _mm256_fmadd_ps(codes_f_lo, q_lo, acc_lo);
+            }
 
-    // Hard backstop: the dispatch in `quant.rs` must only route here
-    // when `dim % 16 == 0`. Kept as a real `assert!` (not debug-only)
-    // so a mis-dispatch fails loudly in release instead of silently
-    // dropping the trailing chunk and returning wrong top-k.
-    assert_eq!(dim % 16, 0, "b=2 AVX2 path needs dim % 16 == 0");
-    let bytes_per_vec = dim / 4;
-    // For each chunk of 4 doc bytes we extract 16 codes (top byte first,
-    // most-significant 2 bits first within a byte). Shift amounts:
-    //   chunk u32 = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3,
-    //   code k = (chunk >> ((15 - k) * 2)) & 3, for k in 0..16.
-    //
-    // Centre-drop: score(d) = Σ q[j]·(code[j] - 1.5)
-    //                       = Σ q[j]·code[j] - 1.5·Σ q[j]
-    // The second term is per-query constant and is added back to the
-    // TopK scores at finalize time. The hot loop only does the raw
-    // dot product against unsigned code values.
-    let shifts_hi = _mm256_setr_epi32(30, 28, 26, 24, 22, 20, 18, 16);
-    let shifts_lo = _mm256_setr_epi32(14, 12, 10, 8, 6, 4, 2, 0);
-    let mask3 = _mm256_set1_epi32(3);
-
-    let bytes_per_chunk = 4usize;
-    let chunks_per_vec = bytes_per_vec / bytes_per_chunk;
-
-    for di in 0..n {
-        let doc = packed.as_ptr().add(di * bytes_per_vec);
-        let mut acc_hi = _mm256_setzero_ps();
-        let mut acc_lo = _mm256_setzero_ps();
-
-        for c in 0..chunks_per_vec {
-            let chunk_ptr = doc.add(c * bytes_per_chunk);
-            let b0 = *chunk_ptr as u32;
-            let b1 = *chunk_ptr.add(1) as u32;
-            let b2 = *chunk_ptr.add(2) as u32;
-            let b3 = *chunk_ptr.add(3) as u32;
-            let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
-            let broadcast = _mm256_set1_epi32(chunk as i32);
-
-            let codes_hi = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts_hi), mask3);
-            let codes_lo = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts_lo), mask3);
-
-            let codes_f_hi = _mm256_cvtepi32_ps(codes_hi);
-            let codes_f_lo = _mm256_cvtepi32_ps(codes_lo);
-
-            let d_base = c * 16;
-            let q_hi = _mm256_loadu_ps(q.as_ptr().add(d_base));
-            let q_lo = _mm256_loadu_ps(q.as_ptr().add(d_base + 8));
-
-            acc_hi = _mm256_fmadd_ps(codes_f_hi, q_hi, acc_hi);
-            acc_lo = _mm256_fmadd_ps(codes_f_lo, q_lo, acc_lo);
+            let total = _mm256_add_ps(acc_hi, acc_lo);
+            let raw = horizontal_sum_avx2(total);
+            top.maybe_insert(raw * scale, di);
         }
-
-        let total = _mm256_add_ps(acc_hi, acc_lo);
-        let raw = horizontal_sum_avx2(total);
-        top.maybe_insert(raw * scale, di);
     }
 }
 
@@ -241,46 +243,48 @@ pub(crate) unsafe fn scan_b4_asym_avx2(
     //   * `dim % K == 0` (asserted immediately below).
     // `RankQuant::{new,add}` pack exactly `bytes_per_vec` bytes/doc and
     // `load_rankquant` re-validates the shape, so this holds on every path here.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail
+        // loudly in release, not silently drop the trailing chunk.
+        assert_eq!(dim % 8, 0, "b=4 AVX2 path needs dim % 8 == 0");
+        let bytes_per_vec = dim / 2;
+        // For each chunk of 4 doc bytes we extract 8 codes (one nibble each).
+        //   chunk u32 = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3,
+        //   code k = (chunk >> ((7 - k) * 4)) & 0xF, for k in 0..8.
+        // Centre-drop: -7.5·Σq[j] is added back to the TopK scores at
+        // finalize time; the hot loop scores raw nibble values.
+        let shifts = _mm256_setr_epi32(28, 24, 20, 16, 12, 8, 4, 0);
+        let mask_f = _mm256_set1_epi32(0xF);
+
+        let bytes_per_chunk = 4usize;
+        let chunks_per_vec = bytes_per_vec / bytes_per_chunk;
+
+        for di in 0..n {
+            let doc = packed.as_ptr().add(di * bytes_per_vec);
+            let mut acc = _mm256_setzero_ps();
+
+            for c in 0..chunks_per_vec {
+                let chunk_ptr = doc.add(c * bytes_per_chunk);
+                let b0 = *chunk_ptr as u32;
+                let b1 = *chunk_ptr.add(1) as u32;
+                let b2 = *chunk_ptr.add(2) as u32;
+                let b3 = *chunk_ptr.add(3) as u32;
+                let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
+                let broadcast = _mm256_set1_epi32(chunk as i32);
+
+                let codes = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts), mask_f);
+                let codes_f = _mm256_cvtepi32_ps(codes);
+
+                let d_base = c * 8;
+                let q_vec = _mm256_loadu_ps(q.as_ptr().add(d_base));
+
+                acc = _mm256_fmadd_ps(codes_f, q_vec, acc);
+            }
 
-    // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail
-    // loudly in release, not silently drop the trailing chunk.
-    assert_eq!(dim % 8, 0, "b=4 AVX2 path needs dim % 8 == 0");
-    let bytes_per_vec = dim / 2;
-    // For each chunk of 4 doc bytes we extract 8 codes (one nibble each).
-    //   chunk u32 = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3,
-    //   code k = (chunk >> ((7 - k) * 4)) & 0xF, for k in 0..8.
-    // Centre-drop: -7.5·Σq[j] is added back to the TopK scores at
-    // finalize time; the hot loop scores raw nibble values.
-    let shifts = _mm256_setr_epi32(28, 24, 20, 16, 12, 8, 4, 0);
-    let mask_f = _mm256_set1_epi32(0xF);
-
-    let bytes_per_chunk = 4usize;
-    let chunks_per_vec = bytes_per_vec / bytes_per_chunk;
-
-    for di in 0..n {
-        let doc = packed.as_ptr().add(di * bytes_per_vec);
-        let mut acc = _mm256_setzero_ps();
-
-        for c in 0..chunks_per_vec {
-            let chunk_ptr = doc.add(c * bytes_per_chunk);
-            let b0 = *chunk_ptr as u32;
-            let b1 = *chunk_ptr.add(1) as u32;
-            let b2 = *chunk_ptr.add(2) as u32;
-            let b3 = *chunk_ptr.add(3) as u32;
-            let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
-            let broadcast = _mm256_set1_epi32(chunk as i32);
-
-            let codes = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts), mask_f);
-            let codes_f = _mm256_cvtepi32_ps(codes);
-
-            let d_base = c * 8;
-            let q_vec = _mm256_loadu_ps(q.as_ptr().add(d_base));
-
-            acc = _mm256_fmadd_ps(codes_f, q_vec, acc);
+            let raw = horizontal_sum_avx2(acc);
+            top.maybe_insert(raw * scale, di);
         }
-
-        let raw = horizontal_sum_avx2(acc);
-        top.maybe_insert(raw * scale, di);
     }
 }
 
@@ -288,6 +292,12 @@ pub(crate) unsafe fn scan_b4_asym_avx2(
 #[target_feature(enable = "avx2,fma")]
 unsafe fn horizontal_sum_avx2(v: std::arch::x86_64::__m256) -> f32 {
     use std::arch::x86_64::*;
+    // SAFETY: called only from `scan_b{2,4}_asym_avx2` which are themselves
+    // guarded by the AVX2+FMA `#[target_feature]` and the caller's runtime
+    // detection — the intrinsics here are always feature-available.
+    // All intrinsics in this body (extract/cast/add/cvt) are safe under the
+    // `avx2,fma` `#[target_feature]` gate; no explicit `unsafe {}` block is
+    // needed or permitted (`unused_unsafe` under `-D warnings`).
     let hi128 = _mm256_extractf128_ps(v, 1);
     let lo128 = _mm256_castps256_ps128(v);
     let sum128 = _mm_add_ps(lo128, hi128);
@@ -332,67 +342,69 @@ pub(crate) unsafe fn scan_b2_asym_avx512(
     //   * `dim % K == 0` (asserted immediately below).
     // `RankQuant::{new,add}` pack exactly `bytes_per_vec` bytes/doc and
     // `load_rankquant` re-validates the shape, so this holds on every path here.
-
-    // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail
-    // loudly in release, not silently drop the trailing 64-code block.
-    assert_eq!(
-        dim % 64,
-        0,
-        "b=2 AVX-512 path needs dim % 64 == 0 for 4-way unroll"
-    );
-    let bytes_per_vec = dim / 4;
-    let shifts = _mm512_setr_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
-    let mask3 = _mm512_set1_epi32(3);
-
-    let bytes_per_chunk = 4usize;
-    let chunks_per_vec = bytes_per_vec / bytes_per_chunk;
-    // Process 4 chunks per outer iteration with 4 independent
-    // accumulators. Breaks the FMA dependency chain so the two Zen 5
-    // FMA ports can both fire each cycle instead of waiting on a
-    // single-acc dep chain.
-    let outer_iters = chunks_per_vec / 4;
-    debug_assert_eq!(chunks_per_vec % 4, 0);
-
-    for di in 0..n {
-        let doc = packed.as_ptr().add(di * bytes_per_vec);
-        let mut acc0 = _mm512_setzero_ps();
-        let mut acc1 = _mm512_setzero_ps();
-        let mut acc2 = _mm512_setzero_ps();
-        let mut acc3 = _mm512_setzero_ps();
-
-        for outer in 0..outer_iters {
-            let c0 = outer * 4;
-            let c1 = c0 + 1;
-            let c2 = c0 + 2;
-            let c3 = c0 + 3;
-
-            macro_rules! step {
-                ($c:expr, $acc:expr) => {{
-                    let chunk_ptr = doc.add($c * bytes_per_chunk);
-                    let b0 = *chunk_ptr as u32;
-                    let b1 = *chunk_ptr.add(1) as u32;
-                    let b2 = *chunk_ptr.add(2) as u32;
-                    let b3 = *chunk_ptr.add(3) as u32;
-                    let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
-                    let broadcast = _mm512_set1_epi32(chunk as i32);
-                    let codes = _mm512_and_si512(_mm512_srlv_epi32(broadcast, shifts), mask3);
-                    let codes_f = _mm512_cvtepi32_ps(codes);
-                    let d_base = $c * 16;
-                    let q_vec = _mm512_loadu_ps(q.as_ptr().add(d_base));
-                    $acc = _mm512_fmadd_ps(codes_f, q_vec, $acc);
-                }};
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail
+        // loudly in release, not silently drop the trailing 64-code block.
+        assert_eq!(
+            dim % 64,
+            0,
+            "b=2 AVX-512 path needs dim % 64 == 0 for 4-way unroll"
+        );
+        let bytes_per_vec = dim / 4;
+        let shifts = _mm512_setr_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+        let mask3 = _mm512_set1_epi32(3);
+
+        let bytes_per_chunk = 4usize;
+        let chunks_per_vec = bytes_per_vec / bytes_per_chunk;
+        // Process 4 chunks per outer iteration with 4 independent
+        // accumulators. Breaks the FMA dependency chain so the two Zen 5
+        // FMA ports can both fire each cycle instead of waiting on a
+        // single-acc dep chain.
+        let outer_iters = chunks_per_vec / 4;
+        debug_assert_eq!(chunks_per_vec % 4, 0);
+
+        for di in 0..n {
+            let doc = packed.as_ptr().add(di * bytes_per_vec);
+            let mut acc0 = _mm512_setzero_ps();
+            let mut acc1 = _mm512_setzero_ps();
+            let mut acc2 = _mm512_setzero_ps();
+            let mut acc3 = _mm512_setzero_ps();
+
+            for outer in 0..outer_iters {
+                let c0 = outer * 4;
+                let c1 = c0 + 1;
+                let c2 = c0 + 2;
+                let c3 = c0 + 3;
+
+                macro_rules! step {
+                    ($c:expr, $acc:expr) => {{
+                        let chunk_ptr = doc.add($c * bytes_per_chunk);
+                        let b0 = *chunk_ptr as u32;
+                        let b1 = *chunk_ptr.add(1) as u32;
+                        let b2 = *chunk_ptr.add(2) as u32;
+                        let b3 = *chunk_ptr.add(3) as u32;
+                        let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
+                        let broadcast = _mm512_set1_epi32(chunk as i32);
+                        let codes = _mm512_and_si512(_mm512_srlv_epi32(broadcast, shifts), mask3);
+                        let codes_f = _mm512_cvtepi32_ps(codes);
+                        let d_base = $c * 16;
+                        let q_vec = _mm512_loadu_ps(q.as_ptr().add(d_base));
+                        $acc = _mm512_fmadd_ps(codes_f, q_vec, $acc);
+                    }};
+                }
+                step!(c0, acc0);
+                step!(c1, acc1);
+                step!(c2, acc2);
+                step!(c3, acc3);
             }
-            step!(c0, acc0);
-            step!(c1, acc1);
-            step!(c2, acc2);
-            step!(c3, acc3);
-        }
 
-        let s01 = _mm512_add_ps(acc0, acc1);
-        let s23 = _mm512_add_ps(acc2, acc3);
-        let total = _mm512_add_ps(s01, s23);
-        let raw = _mm512_reduce_add_ps(total);
-        top.maybe_insert(raw * scale, di);
+            let s01 = _mm512_add_ps(acc0, acc1);
+            let s23 = _mm512_add_ps(acc2, acc3);
+            let total = _mm512_add_ps(s01, s23);
+            let raw = _mm512_reduce_add_ps(total);
+            top.maybe_insert(raw * scale, di);
+        }
     }
 }
 
@@ -417,71 +429,73 @@ pub(crate) unsafe fn scan_b4_asym_avx512(
     //   * `dim % K == 0` (asserted immediately below).
     // `RankQuant::{new,add}` pack exactly `bytes_per_vec` bytes/doc and
     // `load_rankquant` re-validates the shape, so this holds on every path here.
-
-    // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail
-    // loudly in release, not silently drop the trailing 64-code block.
-    assert_eq!(
-        dim % 64,
-        0,
-        "b=4 AVX-512 path needs dim % 64 == 0 for 4-way unroll"
-    );
-    let bytes_per_vec = dim / 2;
-    let shifts = _mm512_setr_epi32(28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0);
-    let mask_f = _mm512_set1_epi32(0xF);
-
-    let bytes_per_chunk = 8usize;
-    let chunks_per_vec = bytes_per_vec / bytes_per_chunk;
-    let outer_iters = chunks_per_vec / 4;
-    debug_assert_eq!(chunks_per_vec % 4, 0);
-
-    for di in 0..n {
-        let doc = packed.as_ptr().add(di * bytes_per_vec);
-        let mut acc0 = _mm512_setzero_ps();
-        let mut acc1 = _mm512_setzero_ps();
-        let mut acc2 = _mm512_setzero_ps();
-        let mut acc3 = _mm512_setzero_ps();
-
-        for outer in 0..outer_iters {
-            macro_rules! step {
-                ($c:expr, $acc:expr) => {{
-                    let chunk_ptr = doc.add($c * bytes_per_chunk);
-                    let lo0 = *chunk_ptr as u32;
-                    let lo1 = *chunk_ptr.add(1) as u32;
-                    let lo2 = *chunk_ptr.add(2) as u32;
-                    let lo3 = *chunk_ptr.add(3) as u32;
-                    let hi0 = *chunk_ptr.add(4) as u32;
-                    let hi1 = *chunk_ptr.add(5) as u32;
-                    let hi2 = *chunk_ptr.add(6) as u32;
-                    let hi3 = *chunk_ptr.add(7) as u32;
-                    let chunk_lo = (lo0 << 24) | (lo1 << 16) | (lo2 << 8) | lo3;
-                    let chunk_hi = (hi0 << 24) | (hi1 << 16) | (hi2 << 8) | hi3;
-                    let lo_zmm = _mm512_set1_epi32(chunk_lo as i32);
-                    let hi_zmm = _mm512_set1_epi32(chunk_hi as i32);
-                    // Blend mask 0xFF00 (bits 8-15 set): _mm512_mask_blend_epi32
-                    // takes lane i from `hi_zmm` where bit i is set, else from
-                    // `lo_zmm` — so lanes 0-7 <- chunk_lo, lanes 8-15 <- chunk_hi.
-                    // Pairs with `shifts` = [28,24,20,16,12,8,4,0] x2: lanes 0-7
-                    // extract chunk_lo's 8 nibbles (codes 0-7), lanes 8-15 extract
-                    // chunk_hi's (codes 8-15), most-significant nibble first.
-                    let combined = _mm512_mask_blend_epi32(0xFF00u16, lo_zmm, hi_zmm);
-                    let codes = _mm512_and_si512(_mm512_srlv_epi32(combined, shifts), mask_f);
-                    let codes_f = _mm512_cvtepi32_ps(codes);
-                    let d_base = $c * 16;
-                    let q_vec = _mm512_loadu_ps(q.as_ptr().add(d_base));
-                    $acc = _mm512_fmadd_ps(codes_f, q_vec, $acc);
-                }};
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail
+        // loudly in release, not silently drop the trailing 64-code block.
+        assert_eq!(
+            dim % 64,
+            0,
+            "b=4 AVX-512 path needs dim % 64 == 0 for 4-way unroll"
+        );
+        let bytes_per_vec = dim / 2;
+        let shifts = _mm512_setr_epi32(28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0);
+        let mask_f = _mm512_set1_epi32(0xF);
+
+        let bytes_per_chunk = 8usize;
+        let chunks_per_vec = bytes_per_vec / bytes_per_chunk;
+        let outer_iters = chunks_per_vec / 4;
+        debug_assert_eq!(chunks_per_vec % 4, 0);
+
+        for di in 0..n {
+            let doc = packed.as_ptr().add(di * bytes_per_vec);
+            let mut acc0 = _mm512_setzero_ps();
+            let mut acc1 = _mm512_setzero_ps();
+            let mut acc2 = _mm512_setzero_ps();
+            let mut acc3 = _mm512_setzero_ps();
+
+            for outer in 0..outer_iters {
+                macro_rules! step {
+                    ($c:expr, $acc:expr) => {{
+                        let chunk_ptr = doc.add($c * bytes_per_chunk);
+                        let lo0 = *chunk_ptr as u32;
+                        let lo1 = *chunk_ptr.add(1) as u32;
+                        let lo2 = *chunk_ptr.add(2) as u32;
+                        let lo3 = *chunk_ptr.add(3) as u32;
+                        let hi0 = *chunk_ptr.add(4) as u32;
+                        let hi1 = *chunk_ptr.add(5) as u32;
+                        let hi2 = *chunk_ptr.add(6) as u32;
+                        let hi3 = *chunk_ptr.add(7) as u32;
+                        let chunk_lo = (lo0 << 24) | (lo1 << 16) | (lo2 << 8) | lo3;
+                        let chunk_hi = (hi0 << 24) | (hi1 << 16) | (hi2 << 8) | hi3;
+                        let lo_zmm = _mm512_set1_epi32(chunk_lo as i32);
+                        let hi_zmm = _mm512_set1_epi32(chunk_hi as i32);
+                        // Blend mask 0xFF00 (bits 8-15 set): _mm512_mask_blend_epi32
+                        // takes lane i from `hi_zmm` where bit i is set, else from
+                        // `lo_zmm` — so lanes 0-7 <- chunk_lo, lanes 8-15 <- chunk_hi.
+                        // Pairs with `shifts` = [28,24,20,16,12,8,4,0] x2: lanes 0-7
+                        // extract chunk_lo's 8 nibbles (codes 0-7), lanes 8-15 extract
+                        // chunk_hi's (codes 8-15), most-significant nibble first.
+                        let combined = _mm512_mask_blend_epi32(0xFF00u16, lo_zmm, hi_zmm);
+                        let codes = _mm512_and_si512(_mm512_srlv_epi32(combined, shifts), mask_f);
+                        let codes_f = _mm512_cvtepi32_ps(codes);
+                        let d_base = $c * 16;
+                        let q_vec = _mm512_loadu_ps(q.as_ptr().add(d_base));
+                        $acc = _mm512_fmadd_ps(codes_f, q_vec, $acc);
+                    }};
+                }
+                let c0 = outer * 4;
+                step!(c0, acc0);
+                step!(c0 + 1, acc1);
+                step!(c0 + 2, acc2);
+                step!(c0 + 3, acc3);
             }
-            let c0 = outer * 4;
-            step!(c0, acc0);
-            step!(c0 + 1, acc1);
-            step!(c0 + 2, acc2);
-            step!(c0 + 3, acc3);
-        }
 
-        let s01 = _mm512_add_ps(acc0, acc1);
-        let s23 = _mm512_add_ps(acc2, acc3);
-        let total = _mm512_add_ps(s01, s23);
-        let raw = _mm512_reduce_add_ps(total);
-        top.maybe_insert(raw * scale, di);
+            let s01 = _mm512_add_ps(acc0, acc1);
+            let s23 = _mm512_add_ps(acc2, acc3);
+            let total = _mm512_add_ps(s01, s23);
+            let raw = _mm512_reduce_add_ps(total);
+            top.maybe_insert(raw * scale, di);
+        }
     }
 }
diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs
index 4cf98a02..dd6066e6 100644
--- a/src/sign_bitmap.rs
+++ b/src/sign_bitmap.rs
@@ -307,25 +307,34 @@ unsafe fn sign_scan_collect_avx512vpop(
     scores: &mut [u32],
 ) {
     use std::arch::x86_64::*;
-    debug_assert_eq!(qpv % 8, 0);
-    let lanes = qpv / 8;
-    let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes);
-    #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout
-    for l in 0..lanes {
-        q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i));
-    }
-    #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout
-    for di in 0..n {
-        let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
-        let mut acc_zmm = _mm512_setzero_si512();
+    // SAFETY: mirrors `bitmap_scan_collect_avx512vpop` — the caller
+    // (`sign_scan_collect`) gates dispatch on `qpv.is_multiple_of(8)`,
+    // `q.len() == qpv`, and `bitmaps.len() == n * qpv`, bounding all raw loads.
+    // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        debug_assert_eq!(qpv % 8, 0);
+        let lanes = qpv / 8;
+        let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes);
+        #[allow(clippy::needless_range_loop)]
+        // indexed access is clearer / matches the kernel layout
         for l in 0..lanes {
-            let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
-            let xor_zmm = _mm512_xor_si512(d_zmm, q_zmms[l]);
-            let pop_zmm = _mm512_popcnt_epi64(xor_zmm);
-            acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm);
+            q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i));
+        }
+        #[allow(clippy::needless_range_loop)]
+        // indexed access is clearer / matches the kernel layout
+        for di in 0..n {
+            let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
+            let mut acc_zmm = _mm512_setzero_si512();
+            for l in 0..lanes {
+                let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+                let xor_zmm = _mm512_xor_si512(d_zmm, q_zmms[l]);
+                let pop_zmm = _mm512_popcnt_epi64(xor_zmm);
+                acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm);
+            }
+            let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm);
+            scores[di] = acc_sum as u32;
         }
-        let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm);
-        scores[di] = acc_sum as u32;
     }
 }
 
@@ -380,63 +389,71 @@ unsafe fn sign_scan_collect_batched_avx512vpop(
     scores: &mut [u32],
 ) {
     use std::arch::x86_64::*;
-    debug_assert_eq!(qpv % 8, 0);
-    debug_assert_eq!(q_batch.len(), batch * qpv);
-    debug_assert_eq!(scores.len(), batch * n);
-    let lanes = qpv / 8;
-    const CHUNK: usize = BATCHED_AVX512_CHUNK;
-
-    let mut q_zmms: Vec<__m512i> = Vec::with_capacity(batch * lanes);
-    for bi in 0..batch {
-        for l in 0..lanes {
-            q_zmms.push(_mm512_loadu_si512(
-                q_batch.as_ptr().add(bi * qpv + l * 8) as *const __m512i
-            ));
+    // SAFETY: mirrors `bitmap_scan_collect_batched_avx512vpop` — the caller
+    // (`sign_scan_collect_batched`) gates dispatch on `qpv.is_multiple_of(8)`,
+    // `q_batch.len() == batch * qpv`, `bitmaps.len() == n * qpv`, and
+    // `scores.len() == batch * n`, bounding all raw loads and `scores[…]` writes.
+    // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection.
+    // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        debug_assert_eq!(qpv % 8, 0);
+        debug_assert_eq!(q_batch.len(), batch * qpv);
+        debug_assert_eq!(scores.len(), batch * n);
+        let lanes = qpv / 8;
+        const CHUNK: usize = BATCHED_AVX512_CHUNK;
+
+        let mut q_zmms: Vec<__m512i> = Vec::with_capacity(batch * lanes);
+        for bi in 0..batch {
+            for l in 0..lanes {
+                q_zmms.push(_mm512_loadu_si512(
+                    q_batch.as_ptr().add(bi * qpv + l * 8) as *const __m512i
+                ));
+            }
         }
-    }
 
-    // Hot path: CHUNK-sized groups; const-bounded inner bi loop so
-    // LLVM unrolls and promotes the accs array to ZMM registers.
-    let mut chunk_start = 0usize;
-    while chunk_start + CHUNK <= batch {
-        for di in 0..n {
-            let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK];
-            let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
-            for l in 0..lanes {
-                let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+        // Hot path: CHUNK-sized groups; const-bounded inner bi loop so
+        // LLVM unrolls and promotes the accs array to ZMM registers.
+        let mut chunk_start = 0usize;
+        while chunk_start + CHUNK <= batch {
+            for di in 0..n {
+                let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK];
+                let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
+                for l in 0..lanes {
+                    let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+                    for bi in 0..CHUNK {
+                        let q_zmm = q_zmms[(chunk_start + bi) * lanes + l];
+                        let xor_zmm = _mm512_xor_si512(d_zmm, q_zmm);
+                        let pop_zmm = _mm512_popcnt_epi64(xor_zmm);
+                        accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm);
+                    }
+                }
                 for bi in 0..CHUNK {
-                    let q_zmm = q_zmms[(chunk_start + bi) * lanes + l];
-                    let xor_zmm = _mm512_xor_si512(d_zmm, q_zmm);
-                    let pop_zmm = _mm512_popcnt_epi64(xor_zmm);
-                    accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm);
+                    let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]);
+                    scores[(chunk_start + bi) * n + di] = acc_sum as u32;
                 }
             }
-            for bi in 0..CHUNK {
-                let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]);
-                scores[(chunk_start + bi) * n + di] = acc_sum as u32;
-            }
+            chunk_start += CHUNK;
         }
-        chunk_start += CHUNK;
-    }
-    // Tail.
-    let tail = batch - chunk_start;
-    if tail > 0 {
-        for di in 0..n {
-            let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK];
-            let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
-            for l in 0..lanes {
-                let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+        // Tail.
+        let tail = batch - chunk_start;
+        if tail > 0 {
+            for di in 0..n {
+                let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK];
+                let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i;
+                for l in 0..lanes {
+                    let d_zmm = _mm512_loadu_si512(doc_ptr.add(l));
+                    for bi in 0..tail {
+                        let q_zmm = q_zmms[(chunk_start + bi) * lanes + l];
+                        let xor_zmm = _mm512_xor_si512(d_zmm, q_zmm);
+                        let pop_zmm = _mm512_popcnt_epi64(xor_zmm);
+                        accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm);
+                    }
+                }
                 for bi in 0..tail {
-                    let q_zmm = q_zmms[(chunk_start + bi) * lanes + l];
-                    let xor_zmm = _mm512_xor_si512(d_zmm, q_zmm);
-                    let pop_zmm = _mm512_popcnt_epi64(xor_zmm);
-                    accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm);
+                    let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]);
+                    scores[(chunk_start + bi) * n + di] = acc_sum as u32;
                 }
             }
-            for bi in 0..tail {
-                let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]);
-                scores[(chunk_start + bi) * n + di] = acc_sum as u32;
-            }
         }
     }
 }
diff --git a/src/util.rs b/src/util.rs
index 933dc0c5..047444fa 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -204,21 +204,30 @@ fn xor_popcount_scalar(doc: &[u64], q: &[u64]) -> u32 {
 #[inline]
 unsafe fn and_popcount_neon(doc: &[u64], q: &[u64]) -> u32 {
     use std::arch::aarch64::*;
-    let qpv = doc.len();
-    let dptr = doc.as_ptr() as *const u8;
-    let qptr = q.as_ptr() as *const u8;
-    let mut acc = 0u32;
-    let mut w = 0usize;
-    while w + 2 <= qpv {
-        let dv = vld1q_u8(dptr.add(w * 8));
-        let qv = vld1q_u8(qptr.add(w * 8));
-        acc += vaddvq_u8(vcntq_u8(vandq_u8(dv, qv))) as u32;
-        w += 2;
-    }
-    if w < qpv {
-        acc += (doc[w] & q[w]).count_ones();
+    // SAFETY: NEON is part of the aarch64 baseline ABI — these intrinsics
+    // are unconditionally available on aarch64. The `vld1q_u8` loads read
+    // 16 bytes starting at `dptr/qptr + w*8`; `w + 2 <= qpv` guarantees
+    // both offsets are within the slice (each u64 is 8 bytes, so 2×u64 = 16
+    // bytes). The trailing scalar path reads `doc[w]`/`q[w]` with a safe
+    // slice index. The explicit block is required by
+    // `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        let qpv = doc.len();
+        let dptr = doc.as_ptr() as *const u8;
+        let qptr = q.as_ptr() as *const u8;
+        let mut acc = 0u32;
+        let mut w = 0usize;
+        while w + 2 <= qpv {
+            let dv = vld1q_u8(dptr.add(w * 8));
+            let qv = vld1q_u8(qptr.add(w * 8));
+            acc += vaddvq_u8(vcntq_u8(vandq_u8(dv, qv))) as u32;
+            w += 2;
+        }
+        if w < qpv {
+            acc += (doc[w] & q[w]).count_ones();
+        }
+        acc
     }
-    acc
 }
 
 /// NEON XOR-popcount (sign-bitmap Hamming); see [`and_popcount_neon`].
@@ -226,21 +235,27 @@ unsafe fn and_popcount_neon(doc: &[u64], q: &[u64]) -> u32 {
 #[inline]
 unsafe fn xor_popcount_neon(doc: &[u64], q: &[u64]) -> u32 {
     use std::arch::aarch64::*;
-    let qpv = doc.len();
-    let dptr = doc.as_ptr() as *const u8;
-    let qptr = q.as_ptr() as *const u8;
-    let mut acc = 0u32;
-    let mut w = 0usize;
-    while w + 2 <= qpv {
-        let dv = vld1q_u8(dptr.add(w * 8));
-        let qv = vld1q_u8(qptr.add(w * 8));
-        acc += vaddvq_u8(vcntq_u8(veorq_u8(dv, qv))) as u32;
-        w += 2;
-    }
-    if w < qpv {
-        acc += (doc[w] ^ q[w]).count_ones();
+    // SAFETY: same contract as `and_popcount_neon` — NEON baseline ABI,
+    // `vld1q_u8` loads bounded by `w + 2 <= qpv`, trailing word via safe
+    // index. The explicit block is required by
+    // `#![deny(unsafe_op_in_unsafe_fn)]`.
+    unsafe {
+        let qpv = doc.len();
+        let dptr = doc.as_ptr() as *const u8;
+        let qptr = q.as_ptr() as *const u8;
+        let mut acc = 0u32;
+        let mut w = 0usize;
+        while w + 2 <= qpv {
+            let dv = vld1q_u8(dptr.add(w * 8));
+            let qv = vld1q_u8(qptr.add(w * 8));
+            acc += vaddvq_u8(vcntq_u8(veorq_u8(dv, qv))) as u32;
+            w += 2;
+        }
+        if w < qpv {
+            acc += (doc[w] ^ q[w]).count_ones();
+        }
+        acc
     }
-    acc
 }
 
 /// WASM `simd128` AND-popcount: 16 bytes (2×`u64`) per `u8x16_popcnt`,

From a5de3e79e8da2891cc130814531a481c6002260a Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Mon, 25 May 2026 22:04:22 -0500
Subject: [PATCH 2/7] ci: add bounded cargo-fuzz smoke (PR + weekly sweep)

Adds .github/workflows/fuzz.yml: a 60s/target smoke over load_rank,
load_rankquant and fastscan_b2 on every pull request / push to main, plus a
weekly full sweep over all seven targets. Surfaces loader / write->load
round-trip / FastScan-kernel regressions in CI between manual campaigns
(THREAT-FUZZ-002). cargo-fuzz is version-pinned (0.13.1) and every action is
SHA-pinned; read-only token; the matrix target is passed via env (no run:
injection surface, per THREAT-CICD-001).

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 .github/workflows/fuzz.yml | 98 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 .github/workflows/fuzz.yml

diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
new file mode 100644
index 00000000..99d775bd
--- /dev/null
+++ b/.github/workflows/fuzz.yml
@@ -0,0 +1,98 @@
+name: fuzz
+
+# Bounded cargo-fuzz smoke. The seven targets in fuzz/ are normally exercised
+# in manual campaigns; this adds CI cadence so a regression that reintroduces a
+# loader panic / OOM, breaks the write->load round-trip, or destabilises the
+# FastScan kernel surfaces in CI rather than only at the next manual run
+# (THREAT-FUZZ-002 in THREAT_MODEL.md).
+#
+#   * pull_request / push(main): a SHORT smoke (60s/target) over the
+#     highest-value targets — fast enough to run on every change.
+#   * schedule (weekly) / workflow_dispatch: a LONGER sweep (300s/target)
+#     across ALL seven targets.
+#
+# This runs UNATTENDED on a cron schedule, so every third-party action is
+# SHA-pinned (not the mutable @vN tags ci.yml uses on human-triggered push/PR)
+# and cargo-fuzz is version-pinned — a fuzz smoke must not itself become a
+# supply-chain hole. Read-only token; the only `run:` interpolation is the
+# matrix target name, passed through `env:` (never inlined into the shell) so
+# there is no template-injection surface (THREAT-CICD-001).
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+  schedule:
+    - cron: "0 5 * * 4" # 05:00 UTC every Thursday (clear of audit/scorecard Mon + codeql Wed)
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: fuzz-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Short per-change smoke over the highest-value targets: two loaders plus the
+  # FastScan b=2 kernel (the one unsafe-heavy scan path the loader targets do
+  # not reach).
+  smoke:
+    name: fuzz smoke (60s)
+    if: github.event_name == 'pull_request' || github.event_name == 'push'
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    strategy:
+      fail-fast: false
+      matrix:
+        target: [load_rank, load_rankquant, fastscan_b2]
+    steps:
+      - uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
+        with:
+          egress-policy: audit
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # nightly; channel via toolchain: below
+        with:
+          toolchain: nightly
+      - name: Install cargo-fuzz (pinned)
+        run: cargo install cargo-fuzz --version 0.13.1 --locked
+      - name: Smoke
+        env:
+          TARGET: ${{ matrix.target }}
+        run: cargo +nightly fuzz run "$TARGET" -- -max_total_time=60 -rss_limit_mb=4096
+
+  # Weekly full sweep over all seven targets at a larger time budget.
+  weekly:
+    name: fuzz weekly (300s)
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    timeout-minutes: 45
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - load_rank
+          - load_rankquant
+          - load_bitmap
+          - load_sign_bitmap
+          - roundtrip_rankquant
+          - search_rankquant
+          - fastscan_b2
+    steps:
+      - uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
+        with:
+          egress-policy: audit
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # nightly; channel via toolchain: below
+        with:
+          toolchain: nightly
+      - name: Install cargo-fuzz (pinned)
+        run: cargo install cargo-fuzz --version 0.13.1 --locked
+      - name: Fuzz
+        env:
+          TARGET: ${{ matrix.target }}
+        run: cargo +nightly fuzz run "$TARGET" -- -max_total_time=300 -rss_limit_mb=4096

From 73d0638583c61006344f548eeef04857df64d0ac Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Mon, 25 May 2026 22:04:27 -0500
Subject: [PATCH 3/7] fix: reject rank >= d in rank_to_bucket (fail-loud)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rank_to_bucket clamped the computed bucket with `b.min(n_buckets - 1)`,
silently accepting rank >= d. A valid rank is a position in [0, d), so it now
asserts `rank < d` and drops the clamp — matching the fail-loud contract of
the sibling primitives pack_buckets / bucket_centre (#26). Valid rank vectors
(a permutation of [0, d)) are unaffected.

The Python bindings gain matching guards on rank_to_bucket and bucket_ranks so
the new assert surfaces as a clean ValueError rather than a PanicException,
with red-team tests for both. Also refreshes the stale pack_buckets binding
comment (the core now asserts in-range, no longer masks).

Closes #28.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 ordvec-python/src/lib.rs                 | 27 ++++++++++++++++--
 ordvec-python/tests/test_redteam_fuzz.py | 14 +++++++++
 src/rank.rs                              | 36 ++++++++++++++++++++++--
 3 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs
index 301980ec..922f3a3c 100644
--- a/ordvec-python/src/lib.rs
+++ b/ordvec-python/src/lib.rs
@@ -1084,6 +1084,14 @@ fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> PyResult<u8> {
     if d == 0 {
         return Err(pyo3::exceptions::PyValueError::new_err("d must be > 0"));
     }
+    // The core `rank_to_bucket` now asserts `rank < d` (fail-loud, matching the
+    // other bucket primitives); surface that as a clean `ValueError` rather
+    // than letting the assert escape as a `PanicException`.
+    if rank as usize >= d {
+        return Err(pyo3::exceptions::PyValueError::new_err(format!(
+            "rank ({rank}) must be < d ({d})"
+        )));
+    }
     Ok(ordvec_core::rank::rank_to_bucket(rank, d, bits))
 }
 
@@ -1110,6 +1118,17 @@ fn bucket_ranks<'py>(
     if slice.is_empty() {
         return Ok(Vec::<u8>::new().into_pyarray(py));
     }
+    // `bucket_ranks` treats the input as a rank vector: each entry indexes into
+    // `[0, len)`, and the core `rank_to_bucket` now asserts `rank < len`. Reject
+    // an out-of-range entry here with a clean `ValueError` rather than letting
+    // that assert surface as a `PanicException`. A valid rank vector (a
+    // permutation of `[0, len)`) never trips this.
+    let d = slice.len();
+    if let Some(&bad) = slice.iter().find(|&&r| r as usize >= d) {
+        return Err(pyo3::exceptions::PyValueError::new_err(format!(
+            "rank ({bad}) must be < d ({d})"
+        )));
+    }
     Ok(ordvec_core::rank::bucket_ranks(slice, bits).into_pyarray(py))
 }
 
@@ -1135,9 +1154,11 @@ fn pack_buckets<'py>(
             slice.len()
         )));
     }
-    // Reject out-of-range bucket codes rather than silently masking them: the
-    // core packs `b & ((1 << bits) - 1)`, so a value with high bits set would be
-    // truncated to a different bucket. The bucket alphabet is [0, 1 << bits).
+    // Reject out-of-range bucket codes here so the caller gets a clean
+    // `ValueError`: the core `pack_buckets` now *asserts* every code is in
+    // `[0, 1 << bits)` (it fails loud rather than masking), so an unchecked
+    // out-of-range value would otherwise escape as a `PanicException`. The
+    // bucket alphabet is [0, 1 << bits).
     let max_code = (1u16 << bits) - 1;
     if let Some(&bad) = slice.iter().find(|&&b| b as u16 > max_code) {
         return Err(pyo3::exceptions::PyValueError::new_err(format!(
diff --git a/ordvec-python/tests/test_redteam_fuzz.py b/ordvec-python/tests/test_redteam_fuzz.py
index 9ec151ce..a1216c38 100644
--- a/ordvec-python/tests/test_redteam_fuzz.py
+++ b/ordvec-python/tests/test_redteam_fuzz.py
@@ -617,6 +617,20 @@ def test_rank_to_bucket_d_zero_value_error():
         rank_to_bucket(0, 0, 2)
 
 
+def test_rank_to_bucket_rank_ge_d_value_error():
+    # The core asserts rank < d (fail-loud, like the other bucket primitives);
+    # the binding surfaces it as a clean ValueError, not a PanicException.
+    with pytest.raises(ValueError, match="must be < d"):
+        rank_to_bucket(8, 8, 2)
+
+
+def test_bucket_ranks_out_of_range_value_error():
+    # An entry >= len() would trip the core's `rank < d` assert; the binding
+    # rejects it as a ValueError rather than letting it surface as a panic.
+    with pytest.raises(ValueError, match="must be < d"):
+        bucket_ranks(np.array([0, 5, 2, 3], dtype=np.uint16), 2)
+
+
 @pytest.mark.parametrize("bits", [8, 255])
 def test_bucket_centre_bits_above_7_value_error(bits):
     with pytest.raises(ValueError, match="bits"):
diff --git a/src/rank.rs b/src/rank.rs
index 5f4077fa..e35535df 100644
--- a/src/rank.rs
+++ b/src/rank.rs
@@ -64,6 +64,14 @@ pub fn rank_transform_into(v: &[f32], out: &mut [u16]) {
 
 /// Bucket a single rank into one of `1 << bits` equal-width bins on
 /// `[0, d)`. Returns a value in `[0, 1 << bits)`.
+///
+/// # Panics
+/// Panics if `bits > 7`, if `d == 0`, or if `rank >= d`. The `rank < d`
+/// guard fails loud in *every* build — like the sibling [`pack_buckets`] and
+/// [`bucket_centre`] checks — rather than silently clamping an out-of-range
+/// rank into the top bucket. Internal callers feed ranks straight from
+/// [`rank_transform`] (a permutation of `[0, d)`), so it never trips on the
+/// hot path.
 #[inline]
 pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 {
     // `bits` is a `u8`, so a caller could pass e.g. 8 or 255. `1u32 << bits`
@@ -73,17 +81,32 @@ pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 {
     // zero. Guard both up front so the failure is loud in every build.
     assert!(bits <= 7, "bits too large");
     assert!(d > 0, "d must be positive");
+    // A valid rank is a position in `[0, d)`. Reject `rank >= d` loudly instead
+    // of silently clamping the quotient back into range: the rest of the public
+    // bucket API ([`pack_buckets`] / [`bucket_centre`]) fails loud on an
+    // out-of-domain argument, so a direct caller that miscomputes a rank should
+    // hear about it rather than receive a plausible-but-wrong bucket.
+    assert!((rank as usize) < d, "rank ({rank}) must be < d ({d})");
     let n_buckets = 1u32 << bits;
     // u64 math: `d` is a `usize` and reaches this from the Python binding as a
     // free argument, so `d as u32` could truncate a `d >= 2^32` (e.g. to 0,
     // which would divide by zero and panic). rank ≤ u16::MAX and n_buckets ≤
     // 128, so the product fits u64 comfortably; over the realistic d ≤ u16::MAX
     // domain this is bit-identical to the previous u32 form.
-    let b = (rank as u64 * n_buckets as u64) / d as u64;
-    b.min(n_buckets as u64 - 1) as u8
+    //
+    // With `rank < d` guaranteed above, `rank * n_buckets / d < n_buckets`
+    // (integer division floors), so the quotient already lands in
+    // `[0, n_buckets)` and fits the returned `u8` without a clamp.
+    ((rank as u64 * n_buckets as u64) / d as u64) as u8
 }
 
 /// Bucket every entry of a full rank vector.
+///
+/// # Panics
+/// Panics if `bits > 7`, or — via [`rank_to_bucket`] — if any entry is
+/// `>= ranks.len()`. A valid rank vector is a permutation of
+/// `[0, ranks.len())`, so a well-formed input never trips the latter; empty
+/// input returns empty without invoking the per-entry guard.
 pub fn bucket_ranks(ranks: &[u16], bits: u8) -> Vec<u8> {
     let d = ranks.len();
     ranks.iter().map(|&r| rank_to_bucket(r, d, bits)).collect()
@@ -567,6 +590,15 @@ mod tests {
         assert!(rank_to_bucket(u16::MAX, huge_d, 2) < 4);
     }
 
+    #[test]
+    #[should_panic(expected = "must be < d")]
+    fn rank_to_bucket_rejects_rank_ge_d() {
+        // A valid rank lives in `[0, d)`; `rank == d` is out of range. It used
+        // to clamp silently to the top bucket — now it fails loud in release
+        // too, matching pack_buckets / bucket_centre.
+        let _ = rank_to_bucket(8, 8, 2);
+    }
+
     #[test]
     fn pack_unpack_round_trip_bits2() {
         let buckets: Vec<u8> = (0..16).map(|i| (i % 4) as u8).collect();

From 4ac782232dbb4727f5f550bb2ecbef9e2283519d Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Mon, 25 May 2026 22:04:33 -0500
Subject: [PATCH 4/7] docs: mark SUPPLY-002 / SIMD-001 / FUZZ-002 mitigated

Threat model + RELEASING now reflect the in-place controls:
- SUPPLY-002: GitHub immutable releases enabled + main branch protection
  (PR review required, force-pushes and deletions blocked), closing the
  release-tag/asset mutability surface the registries already close.
- SIMD-001: unsafe_op_in_unsafe_fn now denied crate-wide.
- FUZZ-002: fuzz.yml CI smoke + weekly sweep.
Corrects the workflow count (12 -> 13) and adds the CHANGELOG entries.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 CHANGELOG.md    | 18 +++++++++++
 RELEASING.md    | 14 +++++----
 THREAT_MODEL.md | 84 +++++++++++++++++++++++++++----------------------
 3 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index be58468d..bfa5ce55 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,8 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- **CI fuzz smoke** (`.github/workflows/fuzz.yml`): a bounded cargo-fuzz run on
+  every pull request / push to `main` (60s each over `load_rank`,
+  `load_rankquant`, and `fastscan_b2`) plus a weekly full sweep over all seven
+  targets, so a loader, write→load round-trip, or FastScan-kernel regression
+  surfaces in CI between manual campaigns (THREAT-FUZZ-002). cargo-fuzz is
+  version-pinned and the actions are SHA-pinned.
+
 ### Changed
 
+- **`#![deny(unsafe_op_in_unsafe_fn)]` is now enforced crate-wide** (previously
+  only in `fastscan.rs`): every unsafe operation in the `bitmap`, `sign_bitmap`,
+  `quant_kernels`, and `util` (NEON) SIMD kernels now sits in an explicit
+  `unsafe {}` block, keeping the unsafe surface visible to future edits
+  (THREAT-SIMD-001).
+- **`rank::rank_to_bucket` rejects `rank >= d`** — it now panics (and the Python
+  binding raises `ValueError`) instead of silently clamping the result into
+  range, matching the fail-loud contract of `pack_buckets` / `bucket_centre`.
+  Valid rank vectors (a permutation of `[0, d)`) are unaffected.
 - **Python bindings (`ordvec-python`):** raised the floor to **Python 3.10** and
   **numpy 2.0**; the abi3 wheel target moves to `abi3-py310`. Python 3.9 reached
   end-of-life (October 2025) and pytest's CVE-2025-71176 fix dropped 3.9 support.
diff --git a/RELEASING.md b/RELEASING.md
index 605dca36..5166dbb3 100644
--- a/RELEASING.md
+++ b/RELEASING.md
@@ -39,13 +39,15 @@ Trusted Publishing step.
 > These two settings are the supply-chain backstop the workflow code cannot
 > express on its own (THREAT-SUPPLY-001 in [THREAT_MODEL.md](THREAT_MODEL.md)).
 
-### Recommended (open)
+### Tag and branch protection
 
-- A **`v*` tag-protection ruleset** (block update + deletion) and a basic
-  `main` ruleset, so a release tag cannot be force-moved and `main` cannot be
-  force-pushed/deleted (THREAT-SUPPLY-002). Registries are already immutable
-  (crates.io is yank-only; PyPI burns a version on delete), so this closes the
-  remaining GitHub-side mutability surface.
+- **Immutable releases** is enabled, so a published release's `v*` tag cannot be
+  force-moved or deleted and its assets cannot be replaced after publication.
+  This closes the GitHub-side mutability surface the registries already close on
+  their end (crates.io is yank-only; PyPI burns a version on delete).
+- **`main` is a protected branch** — pull-request review is required and
+  force-pushes and deletions are blocked, so the branch a release dispatches
+  from cannot be rewritten (THREAT-SUPPLY-002).
 
 ## Checklist
 
diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md
index 3786f416..f2890fe2 100644
--- a/THREAT_MODEL.md
+++ b/THREAT_MODEL.md
@@ -69,7 +69,7 @@ absence of a second maintainer is itself a tracked supply-chain residual
 | **Compute kernels** | `fastscan.rs`, `quant_kernels.rs`, `bitmap.rs`, `sign_bitmap.rs` | Trust established after format validation |
 | **Index API** | `rank.rs`, `quant.rs`, `bitmap.rs`, `sign_bitmap.rs` | Caller-controlled query embeddings |
 | **Python FFI** | `ordvec-python` (PyO3 / maturin) | Python ↔ Rust boundary; NumPy buffers |
-| **CI / supply chain** | 12 GitHub Actions workflows; `Cargo.lock`; crates.io + PyPI | GitHub OIDC, crates.io, PyPI trust chains |
+| **CI / supply chain** | 13 GitHub Actions workflows; `Cargo.lock`; crates.io + PyPI | GitHub OIDC, crates.io, PyPI trust chains |
 
 The `fuzz/` directory holds **seven** cargo-fuzz targets: `load_rank`,
 `load_rankquant`, `load_bitmap`, `load_sign_bitmap` (deserialization);
@@ -161,7 +161,7 @@ to this kernel.
 
 ### 3.2 Risks
 
-**THREAT-SIMD-001 (P1, mitigated this cycle; crate-wide rollout tracked):
+**THREAT-SIMD-001 (P1, mitigated this cycle):
 Unsafe-kernel invariant preservation under future refactors.**
 `scan_b2_fastscan_avx512` safety depends on caller-established invariants —
 `packed_fs.len() == n_blocks * pairs * 32` (formed via `checked_mul`, overflow
@@ -172,11 +172,13 @@ by construction. A future refactor calling the inner function directly could
 bypass the asserts. *Mitigations:* the runtime asserts + the type wrapper are
 the primary boundary; the scalar-vs-SIMD equivalence test
 (`fastscan_b2_top10_matches_avx512_kernel`) guards behavior; and
-**`#![deny(unsafe_op_in_unsafe_fn)]` is now enforced in `fastscan.rs`**, so
-every unsafe operation in the kernel sits in an explicit `unsafe {}` block and
-stays visible to future edits. *Open:* roll the lint out crate-wide to the
-other SIMD modules (`bitmap.rs`, `sign_bitmap.rs`, `quant_kernels.rs`,
-`util.rs` NEON) — tracked as a follow-up.
+**`#![deny(unsafe_op_in_unsafe_fn)]` is now enforced crate-wide** (at the crate
+root in `lib.rs`), so every unsafe operation in every SIMD kernel —
+`fastscan.rs`, `bitmap.rs`, `sign_bitmap.rs`, `quant_kernels.rs`, and the
+`util.rs` NEON popcount — sits in an explicit `unsafe {}` block and stays
+visible to future edits. (The lone exception, `horizontal_sum_avx2`, is
+register-only with no memory access, so its intrinsics are safe under the
+`#[target_feature]` gate and an explicit block would be `unused_unsafe`.)
 
 **THREAT-SIMD-002 (P4, deployment note): Microarchitectural side channels in
 co-tenancy.** `ordvec` does not claim protection against microarchitectural
@@ -237,7 +239,7 @@ applications must validate paths before calling").
 
 ### 5.1 Existing controls (verified)
 
-**Workflow code (all 12 workflows):** third-party actions pinned by commit
+**Workflow code (all 13 workflows):** third-party actions pinned by commit
 SHA; `persist-credentials: false` on every checkout; `permissions: contents:
 read` default. **Release workflows** (`release-crate.yml`, `release-python.yml`)
 are `workflow_dispatch`-only (no tag/push trigger), run a `require-ci-green`
@@ -270,17 +272,22 @@ passkeys on the maintainer account; recruiting a **second owner/maintainer**
 deployment **wait timer** worthwhile (a second party able to cancel a bad
 release during the window). See [`RELEASING.md`](RELEASING.md).
 
-**THREAT-SUPPLY-002 (P3): Release immutability and tag integrity.** Published
-artifacts are **immutable by registry design** — crates.io is yank-only (a
-published version's bytes can never be overwritten) and PyPI burns a version on
-delete (no different artifact may be re-uploaded under the same version). So
-post-publish "silent replacement" of a version is not possible on either
-registry, and consumers can verify artifacts against the SLSA / PEP 740
-provenance above. *Residual (GitHub-side):* `changelog.yml` cuts tagged GitHub
-Releases, but the repo currently has **no tag-protection ruleset and no `main`
-ruleset**, so a tag could be force-moved or a release asset replaced.
-*Mitigation:* add a `v*` **tag ruleset** (block update + deletion) and a basic
-`main` ruleset; optionally enable GitHub immutable releases.
+**THREAT-SUPPLY-002 (mitigated): Release immutability and tag integrity.**
+Published artifacts are **immutable by registry design** — crates.io is
+yank-only (a published version's bytes can never be overwritten) and PyPI burns
+a version on delete (no different artifact may be re-uploaded under the same
+version). So post-publish "silent replacement" of a version is not possible on
+either registry, and consumers can verify artifacts against the SLSA / PEP 740
+provenance above. The GitHub-side mutability surface is now closed too:
+`changelog.yml` cuts tagged GitHub Releases, and **GitHub immutable releases is
+enabled**, so a published release's `v*` tag cannot be force-moved or deleted
+and its assets cannot be replaced after publication; the **`main` branch is
+protected** (pull-request review required, force-pushes and deletions blocked)
+and is the **only deployment branch** permitted for the `pypi` / `crates-io`
+release environments. *Residual:* draft / non-release tags are not covered by
+release immutability, and — as with the registries — these GitHub controls
+ultimately trust the single maintainer account; that residual folds into
+THREAT-SUPPLY-001.
 
 **THREAT-SUPPLY-003 (P3): Typosquatting adjacent names.** Namespace-adjacent
 crate/package names (`ord-vec`, `ordvecs`, `order-vec`) could be registered to
@@ -358,11 +365,15 @@ single-rate compute path, and (new) the FastScan kernel.
 non-AVX-512 CI runners it exercises the scalar reference kernel; under Intel SDE
 it exercises the AVX-512 kernel.
 
-**THREAT-FUZZ-002 (P3): No CI-bound fuzzing for continuous regression.** Fuzzing
-is run manually; there is no CI gate. A bounded weekly smoke job (e.g.
-`-runs=50000` on `load_rank`, `load_rankquant`, and `fastscan_b2`) would catch
-regressions between manual runs. (Low overhead; weighed against maintenance
-budget.)
+**THREAT-FUZZ-002 (mitigated this cycle): CI-bound fuzzing for continuous
+regression.** A `fuzz.yml` workflow now runs a bounded smoke on every pull
+request and push to `main` (`-max_total_time=60` over `load_rank`,
+`load_rankquant`, and `fastscan_b2`) plus a weekly full sweep
+(`-max_total_time=300` over all seven targets), so a regression that
+reintroduces a loader panic / OOM, breaks the write→load round-trip, or
+destabilises the FastScan kernel surfaces in CI rather than only at the next
+manual campaign. cargo-fuzz is version-pinned and the actions are SHA-pinned,
+matching the repo's scheduled-workflow hardening.
 
 *Note on `load_sign_bitmap`:* all bit patterns are structurally valid for sign
 bitmaps (no per-row invariant), so that target is correctly scoped to parser
@@ -386,16 +397,16 @@ blast radius of a compromised dependency separately.
 
 | ID | Category | Owner | Description | Likelihood | Impact | Status / priority |
 |---|---|---|---|---|---|---|
-| THREAT-SIMD-001 | Memory safety | Library | Unsafe-kernel invariant bypass on refactor | Medium | High | **P1** — lint enforced in `fastscan.rs`; crate-wide rollout tracked |
+| THREAT-SIMD-001 | Memory safety | Library | Unsafe-kernel invariant bypass on refactor | Medium | High | **Mitigated** — `unsafe_op_in_unsafe_fn` denied crate-wide + type wrapper + equivalence test |
 | THREAT-FFI-001 | FFI | Binding | Concurrent input mutation during released-GIL call | Medium | Medium | **P2** — documented contract |
 | THREAT-FFI-002 | FFI | Binding | Unsanitized path forwarding | Medium | Medium | **P2** — documented contract |
 | THREAT-SUPPLY-001 | Supply chain | Config | Release config / single-owner | Low | Critical | **Mitigated** (reviewer + main-only); residual = account compromise / 2nd owner |
-| THREAT-SUPPLY-002 | Supply chain | Config | Release immutability / tag integrity | Low | High | **P3** — registries immutable; add tag ruleset |
+| THREAT-SUPPLY-002 | Supply chain | Config | Release immutability / tag integrity | Low | High | **Mitigated** — registries immutable; GitHub immutable releases on + `main` protected |
 | THREAT-SUPPLY-003 | Supply chain | Config | Typosquatting adjacent names | Medium | Medium | P3 |
 | THREAT-QUERY-001 | Resource | Deployment | Batch / `k` exhaustion in serving | Medium | Medium | **P2** — deployment docs |
 | THREAT-QUERY-002 | Resource | Deployment | Panic on contract violation (Rust servers) | Low | Medium | P3 |
 | THREAT-FUZZ-001 | Fuzzing | Library | FastScan path unfuzzed | Medium | High | **Closed** (`fastscan_b2` added) |
-| THREAT-FUZZ-002 | Fuzzing | Library | No CI-bound fuzzing | Medium | Medium | P3 |
+| THREAT-FUZZ-002 | Fuzzing | Library | No CI-bound fuzzing | Medium | Medium | **Mitigated** — `fuzz.yml` PR smoke + weekly sweep |
 | THREAT-DESER-001 | Deserialization | Library | TOCTOU on shared mounts | Very Low | Low | P4 |
 | THREAT-DESER-002 | Provenance | Deployment | Malicious-but-valid index | Medium | High | P3 (docs — `INDEX_PROVENANCE.md`) |
 | THREAT-CICD-001 | CI/CD | Library | Workflow injection via PR metadata | Low | High | P3 — mitigated by `zizmor` |
@@ -409,19 +420,18 @@ blast radius of a compromised dependency separately.
 
 ## 11. Open mitigations
 
-**Done this cycle:** `#![deny(unsafe_op_in_unsafe_fn)]` in `fastscan.rs`
-(SIMD-001); `fastscan_b2` fuzz target (FUZZ-001); release-environment reviewers
-+ main-only deployment (SUPPLY-001); [`docs/INDEX_PROVENANCE.md`](docs/INDEX_PROVENANCE.md)
-(DESER-002); [`RELEASING.md`](RELEASING.md) (SUPPLY-001).
+**Done this cycle:** `#![deny(unsafe_op_in_unsafe_fn)]` enforced **crate-wide**
+across all SIMD modules (SIMD-001); the `fastscan_b2` fuzz target (FUZZ-001)
+plus a CI `fuzz.yml` — PR smoke + weekly sweep (FUZZ-002); the `rank_to_bucket`
+primitive made fail-loud (`rank < d`) to match the rest of the bucket API, with
+matching binding guards; release-environment reviewers + main-only deployment
+(SUPPLY-001); **GitHub immutable releases enabled + `main` branch protection**
+(SUPPLY-002); [`docs/INDEX_PROVENANCE.md`](docs/INDEX_PROVENANCE.md) (DESER-002);
+[`RELEASING.md`](RELEASING.md) (SUPPLY-001).
 
 **Open, low cost:**
 
-1. Add a `v*` tag-protection ruleset (+ basic `main` ruleset) and optionally
-   enable GitHub immutable releases (THREAT-SUPPLY-002).
-2. Roll `#![deny(unsafe_op_in_unsafe_fn)]` out crate-wide across the remaining
-   SIMD modules (THREAT-SIMD-001).
-3. Add a bounded weekly CI fuzz smoke job (THREAT-FUZZ-002).
-4. Document recommended `nq` / `k` / corpus bounds for single-process serving
+1. Document recommended `nq` / `k` / corpus bounds for single-process serving
    in the Rust and Python API docs (THREAT-QUERY-001).
 
 **Later (not release blockers):** a second maintainer/owner (then a release

From 14f57153c3f937b90f1405b22a072f6a74d175cd Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Mon, 25 May 2026 22:07:58 -0500
Subject: [PATCH 5/7] ci: install cargo-fuzz without --locked (nightly compat)

cargo-fuzz 0.13.1's bundled Cargo.lock pins rustix 0.36.x, which no longer compiles on current nightly (rustc_layout_scalar_valid_range_start is now a reserved attribute), so 'cargo install --locked' failed the three fuzz smoke jobs. Drop --locked; the tool stays version-pinned (0.13.1) and its build deps resolve to nightly-compatible versions. Verified locally on nightly.

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 .github/workflows/fuzz.yml | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
index 99d775bd..2c3b8e59 100644
--- a/.github/workflows/fuzz.yml
+++ b/.github/workflows/fuzz.yml
@@ -56,8 +56,12 @@ jobs:
       - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # nightly; channel via toolchain: below
         with:
           toolchain: nightly
-      - name: Install cargo-fuzz (pinned)
-        run: cargo install cargo-fuzz --version 0.13.1 --locked
+      - name: Install cargo-fuzz (version-pinned)
+        # NB: no `--locked` — cargo-fuzz 0.13.1's bundled Cargo.lock pins an old
+        # rustix (0.36.x) that no longer compiles on current nightly. The tool
+        # itself stays version-pinned; its build deps resolve to compatible
+        # versions.
+        run: cargo install cargo-fuzz --version 0.13.1
       - name: Smoke
         env:
           TARGET: ${{ matrix.target }}
@@ -90,8 +94,12 @@ jobs:
       - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # nightly; channel via toolchain: below
         with:
           toolchain: nightly
-      - name: Install cargo-fuzz (pinned)
-        run: cargo install cargo-fuzz --version 0.13.1 --locked
+      - name: Install cargo-fuzz (version-pinned)
+        # NB: no `--locked` — cargo-fuzz 0.13.1's bundled Cargo.lock pins an old
+        # rustix (0.36.x) that no longer compiles on current nightly. The tool
+        # itself stays version-pinned; its build deps resolve to compatible
+        # versions.
+        run: cargo install cargo-fuzz --version 0.13.1
       - name: Fuzz
         env:
           TARGET: ${{ matrix.target }}

From 169abe3ee4e08e1d40a628fea8a40cbdfd4c86cd Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Tue, 26 May 2026 08:23:04 -0500
Subject: [PATCH 6/7] fix: validate bits up front in bucket_ranks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

bucket_ranks delegated the bits<=7 check to the per-entry rank_to_bucket call, so bucket_ranks(&[], bits>7) skipped it and silently returned an empty vec instead of failing loud. Assert bits<=7 up front so an invalid width is rejected for empty input too — matching the Python binding (which checks bits before its empty short-circuit) and making the documented panic contract accurate. Addresses a PR-review finding (copilot).

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 src/rank.rs | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/rank.rs b/src/rank.rs
index e35535df..cdcb22ff 100644
--- a/src/rank.rs
+++ b/src/rank.rs
@@ -103,11 +103,16 @@ pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 {
 /// Bucket every entry of a full rank vector.
 ///
 /// # Panics
-/// Panics if `bits > 7`, or — via [`rank_to_bucket`] — if any entry is
-/// `>= ranks.len()`. A valid rank vector is a permutation of
-/// `[0, ranks.len())`, so a well-formed input never trips the latter; empty
-/// input returns empty without invoking the per-entry guard.
+/// Panics if `bits > 7` (validated up front, so this holds for empty input
+/// too), or — via [`rank_to_bucket`] — if any entry is `>= ranks.len()`. A
+/// valid rank vector is a permutation of `[0, ranks.len())`, so well-formed
+/// input never trips the per-entry guard.
 pub fn bucket_ranks(ranks: &[u16], bits: u8) -> Vec<u8> {
+    // Validate `bits` up front so an invalid width fails loud even for empty
+    // input — an empty `ranks` skips the per-entry `rank_to_bucket` check and
+    // would otherwise silently return an empty vec. Mirrors the Python binding,
+    // which checks `bits` before its empty short-circuit.
+    assert!(bits <= 7, "bits too large");
     let d = ranks.len();
     ranks.iter().map(|&r| rank_to_bucket(r, d, bits)).collect()
 }
@@ -599,6 +604,14 @@ mod tests {
         let _ = rank_to_bucket(8, 8, 2);
     }
 
+    #[test]
+    #[should_panic(expected = "bits too large")]
+    fn bucket_ranks_rejects_bits_above_7_even_when_empty() {
+        // `bits` is validated up front, so an invalid width fails loud even on
+        // empty input — which never reaches the per-entry rank_to_bucket guard.
+        let _ = bucket_ranks(&[], 8);
+    }
+
     #[test]
     fn pack_unpack_round_trip_bits2() {
         let buckets: Vec<u8> = (0..16).map(|i| (i % 4) as u8).collect();

From 246209a6561d75a2a58a5c2d8f358b9b09b3346f Mon Sep 17 00:00:00 2001
From: Nelson Spence <nelson@projectnavi.ai>
Date: Tue, 26 May 2026 08:23:04 -0500
Subject: [PATCH 7/7] docs: drop stale ci.yml claim from fuzz.yml comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fuzz.yml header said actions are SHA-pinned 'unlike the mutable @vN tags ci.yml uses' — but ci.yml is now fully SHA-pinned, so the comparison was wrong. Drop it; the comment states fuzz.yml's own pinning without an inaccurate baseline. Addresses a PR-review finding (copilot).

Signed-off-by: Nelson Spence <nelson@projectnavi.ai>
---
 .github/workflows/fuzz.yml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
index 2c3b8e59..7403a165 100644
--- a/.github/workflows/fuzz.yml
+++ b/.github/workflows/fuzz.yml
@@ -12,11 +12,10 @@ name: fuzz
 #     across ALL seven targets.
 #
 # This runs UNATTENDED on a cron schedule, so every third-party action is
-# SHA-pinned (not the mutable @vN tags ci.yml uses on human-triggered push/PR)
-# and cargo-fuzz is version-pinned — a fuzz smoke must not itself become a
-# supply-chain hole. Read-only token; the only `run:` interpolation is the
-# matrix target name, passed through `env:` (never inlined into the shell) so
-# there is no template-injection surface (THREAT-CICD-001).
+# SHA-pinned and cargo-fuzz is version-pinned — a fuzz smoke must not itself
+# become a supply-chain hole. Read-only token; the only `run:` interpolation is
+# the matrix target name, passed through `env:` (never inlined into the shell)
+# so there is no template-injection surface (THREAT-CICD-001).
 
 on:
   pull_request: