From 804e571f9ec4a3a6d240d24eec66ac025c8d5610 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Mon, 25 May 2026 22:04:17 -0500 Subject: [PATCH 1/7] refactor: enforce unsafe_op_in_unsafe_fn crate-wide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move #![deny(unsafe_op_in_unsafe_fn)] from fastscan.rs to the crate root (lib.rs) so every unsafe operation in the SIMD kernels must sit in an explicit `unsafe {}` block rather than leaning on an enclosing `unsafe fn`. Wraps the AVX-512/AVX2 kernel bodies in bitmap.rs, sign_bitmap.rs and quant_kernels.rs, and the NEON popcount in util.rs. horizontal_sum_avx2 is register-only (no memory access), so its intrinsics are safe under the #[target_feature] gate and need no block (an explicit one would be unused_unsafe). Purely additive — no kernel logic changes. Keeps the unsafe surface visible to future edits (THREAT-SIMD-001). Signed-off-by: Nelson Spence --- src/bitmap.rs | 251 +++++++++++++----------- src/fastscan.rs | 7 - src/lib.rs | 7 + src/quant_kernels.rs | 442 ++++++++++++++++++++++--------------------- src/sign_bitmap.rs | 147 +++++++------- src/util.rs | 71 ++++--- 6 files changed, 498 insertions(+), 427 deletions(-) diff --git a/src/bitmap.rs b/src/bitmap.rs index 5de75a99..394c8c57 100644 --- a/src/bitmap.rs +++ b/src/bitmap.rs @@ -491,26 +491,30 @@ unsafe fn bitmap_scan_avx512vpop(bitmaps: &[u64], n: usize, qpv: usize, q: &[u64 // (`di < n`) each stay within their slice. AVX-512 F/VPOPCNTDQ are confirmed // by the `#[target_feature]` gate plus the caller's runtime // `is_x86_feature_detected!`. - debug_assert_eq!(qpv % 8, 0, "AVX-512 bitmap scan needs qpv % 8 == 0"); - let lanes = qpv / 8; - let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes); - #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout - for l in 0..lanes { - q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i)); - } - for di in 0..n { - let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; - let mut acc_zmm = _mm512_setzero_si512(); + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + debug_assert_eq!(qpv % 8, 0, "AVX-512 bitmap scan needs qpv % 8 == 0"); + let lanes = qpv / 8; + let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes); #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout for l in 0..lanes { - let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); - let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]); - let pop_zmm = _mm512_popcnt_epi64(and_zmm); - acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm); + q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i)); + } + for di in 0..n { + let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; + let mut acc_zmm = _mm512_setzero_si512(); + #[allow(clippy::needless_range_loop)] + // indexed access is clearer / matches the kernel layout + for l in 0..lanes { + let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm); + } + let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm); + top.maybe_insert(acc_sum as f32, di); } - let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm); - top.maybe_insert(acc_sum as f32, di); } } @@ -553,25 +557,34 @@ unsafe fn bitmap_scan_collect_avx512vpop( scores: &mut [u32], ) { use std::arch::x86_64::*; - debug_assert_eq!(qpv % 8, 0); - let lanes = qpv / 8; - let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes); - #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout - for l in 0..lanes { - q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i)); - } - #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout - for di in 0..n { - let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; - let mut acc_zmm = _mm512_setzero_si512(); + // SAFETY: same contract as the sibling `bitmap_scan_avx512vpop` — the caller + // (`bitmap_scan_collect`) gates dispatch on `qpv.is_multiple_of(8)`, + // `q.len() == qpv`, and `bitmaps.len() == n * qpv`, bounding all raw loads. + // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + debug_assert_eq!(qpv % 8, 0); + let lanes = qpv / 8; + let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes); + #[allow(clippy::needless_range_loop)] + // indexed access is clearer / matches the kernel layout for l in 0..lanes { - let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); - let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]); - let pop_zmm = _mm512_popcnt_epi64(and_zmm); - acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm); + q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i)); + } + #[allow(clippy::needless_range_loop)] + // indexed access is clearer / matches the kernel layout + for di in 0..n { + let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; + let mut acc_zmm = _mm512_setzero_si512(); + for l in 0..lanes { + let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm); + } + let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm); + scores[di] = acc_sum as u32; } - let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm); - scores[di] = acc_sum as u32; } } @@ -635,77 +648,85 @@ unsafe fn bitmap_scan_collect_batched_avx512vpop( scores: &mut [u32], ) { use std::arch::x86_64::*; - debug_assert_eq!(qpv % 8, 0); - debug_assert_eq!(q_batch.len(), batch * qpv); - debug_assert_eq!(scores.len(), batch * n); - let lanes = qpv / 8; - const CHUNK: usize = BATCHED_AVX512_CHUNK; - - // Pre-load all batch * lanes query ZMMs once. For typical - // (batch=8, lanes=2) this is 16 __m512i of register-equivalent - // state, which fits in the 32-ZMM file alongside the per-chunk - // accs and doc lane temps. - let mut q_zmms: Vec<__m512i> = Vec::with_capacity(batch * lanes); - for bi in 0..batch { - for l in 0..lanes { - q_zmms.push(_mm512_loadu_si512( - q_batch.as_ptr().add(bi * qpv + l * 8) as *const __m512i - )); + // SAFETY: same contract as the sibling `bitmap_scan_avx512vpop` — the caller + // (`bitmap_scan_collect_batched`) gates dispatch on `qpv.is_multiple_of(8)`, + // `q_batch.len() == batch * qpv`, `bitmaps.len() == n * qpv`, and + // `scores.len() == batch * n`, bounding all raw loads and `scores[…]` writes. + // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + debug_assert_eq!(qpv % 8, 0); + debug_assert_eq!(q_batch.len(), batch * qpv); + debug_assert_eq!(scores.len(), batch * n); + let lanes = qpv / 8; + const CHUNK: usize = BATCHED_AVX512_CHUNK; + + // Pre-load all batch * lanes query ZMMs once. For typical + // (batch=8, lanes=2) this is 16 __m512i of register-equivalent + // state, which fits in the 32-ZMM file alongside the per-chunk + // accs and doc lane temps. + let mut q_zmms: Vec<__m512i> = Vec::with_capacity(batch * lanes); + for bi in 0..batch { + for l in 0..lanes { + q_zmms.push(_mm512_loadu_si512( + q_batch.as_ptr().add(bi * qpv + l * 8) as *const __m512i + )); + } } - } - // Hot path: process whole CHUNK-sized groups. The inner `for bi - // in 0..CHUNK` is bounded by a *const*, so LLVM unrolls it and - // promotes the `accs: [__m512i; CHUNK]` stack array to ZMM - // registers — that's the property that keeps the kernel - // competitive with the single-query AVX-512 path on a per-query - // basis, plus the bandwidth amortisation. A runtime-bounded - // `0..chunk` loop would force `accs[bi]` to spill to stack - // memory and double per-doc latency. - let mut chunk_start = 0usize; - while chunk_start + CHUNK <= batch { - for di in 0..n { - let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK]; - let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; - for l in 0..lanes { - let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + // Hot path: process whole CHUNK-sized groups. The inner `for bi + // in 0..CHUNK` is bounded by a *const*, so LLVM unrolls it and + // promotes the `accs: [__m512i; CHUNK]` stack array to ZMM + // registers — that's the property that keeps the kernel + // competitive with the single-query AVX-512 path on a per-query + // basis, plus the bandwidth amortisation. A runtime-bounded + // `0..chunk` loop would force `accs[bi]` to spill to stack + // memory and double per-doc latency. + let mut chunk_start = 0usize; + while chunk_start + CHUNK <= batch { + for di in 0..n { + let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK]; + let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; + for l in 0..lanes { + let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + for bi in 0..CHUNK { + let q_zmm = q_zmms[(chunk_start + bi) * lanes + l]; + let and_zmm = _mm512_and_si512(d_zmm, q_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm); + } + } for bi in 0..CHUNK { - let q_zmm = q_zmms[(chunk_start + bi) * lanes + l]; - let and_zmm = _mm512_and_si512(d_zmm, q_zmm); - let pop_zmm = _mm512_popcnt_epi64(and_zmm); - accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm); + let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]); + scores[(chunk_start + bi) * n + di] = acc_sum as u32; } } - for bi in 0..CHUNK { - let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]); - scores[(chunk_start + bi) * n + di] = acc_sum as u32; - } + chunk_start += CHUNK; } - chunk_start += CHUNK; - } - // Tail path: any remaining `batch % CHUNK` queries. Slower per - // doc (runtime-bounded inner loop, accs[bi] may spill) but the - // tail runs once per kernel call, not once per doc — total cost - // is at most CHUNK-1 queries of slower scan, dominated by the - // hot path for any batch > 1. - let tail = batch - chunk_start; - if tail > 0 { - for di in 0..n { - let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK]; - let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; - for l in 0..lanes { - let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + // Tail path: any remaining `batch % CHUNK` queries. Slower per + // doc (runtime-bounded inner loop, accs[bi] may spill) but the + // tail runs once per kernel call, not once per doc — total cost + // is at most CHUNK-1 queries of slower scan, dominated by the + // hot path for any batch > 1. + let tail = batch - chunk_start; + if tail > 0 { + for di in 0..n { + let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK]; + let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; + for l in 0..lanes { + let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + for bi in 0..tail { + let q_zmm = q_zmms[(chunk_start + bi) * lanes + l]; + let and_zmm = _mm512_and_si512(d_zmm, q_zmm); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm); + } + } for bi in 0..tail { - let q_zmm = q_zmms[(chunk_start + bi) * lanes + l]; - let and_zmm = _mm512_and_si512(d_zmm, q_zmm); - let pop_zmm = _mm512_popcnt_epi64(and_zmm); - accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm); + let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]); + scores[(chunk_start + bi) * n + di] = acc_sum as u32; } } - for bi in 0..tail { - let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]); - scores[(chunk_start + bi) * n + di] = acc_sum as u32; - } } } } @@ -758,27 +779,31 @@ unsafe fn body_overlap_scores_subset_avx512vpop( // `n_vectors*qpv`-word buffer; and `out.len() == doc_ids.len()` bounds the // `out[i]` writes. AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + // runtime detection. - debug_assert_eq!(qpv % 8, 0); - let lanes = qpv / 8; - let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes); - #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout - for l in 0..lanes { - q_zmms.push(_mm512_loadu_si512( - q_bitmap.as_ptr().add(l * 8) as *const __m512i - )); - } - for (i, &di) in doc_ids.iter().enumerate() { - let doc_ptr = bitmaps.as_ptr().add((di as usize) * qpv) as *const __m512i; - let mut acc_zmm = _mm512_setzero_si512(); + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + debug_assert_eq!(qpv % 8, 0); + let lanes = qpv / 8; + let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes); #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout for l in 0..lanes { - let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); - let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]); - let pop_zmm = _mm512_popcnt_epi64(and_zmm); - acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm); + q_zmms.push(_mm512_loadu_si512( + q_bitmap.as_ptr().add(l * 8) as *const __m512i + )); + } + for (i, &di) in doc_ids.iter().enumerate() { + let doc_ptr = bitmaps.as_ptr().add((di as usize) * qpv) as *const __m512i; + let mut acc_zmm = _mm512_setzero_si512(); + #[allow(clippy::needless_range_loop)] + // indexed access is clearer / matches the kernel layout + for l in 0..lanes { + let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + let and_zmm = _mm512_and_si512(d_zmm, q_zmms[l]); + let pop_zmm = _mm512_popcnt_epi64(and_zmm); + acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm); + } + let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm); + out[i] = acc_sum as u32; } - let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm); - out[i] = acc_sum as u32; } } diff --git a/src/fastscan.rs b/src/fastscan.rs index 7a757aed..a39b8de4 100644 --- a/src/fastscan.rs +++ b/src/fastscan.rs @@ -35,13 +35,6 @@ //! [`l2_normalise`](crate::util::l2_normalise), and `k` is clamped to //! `n_vectors` exactly as the sibling search methods do. -// Make every unsafe operation inside an `unsafe fn` require an explicit -// `unsafe {}` block rather than leaning on the fn-level `unsafe`. This is -// defense-in-depth for the AVX-512 FastScan kernel below: it keeps the kernel's -// unsafe surface visible to future edits. Crate-wide rollout to the other SIMD -// modules is tracked separately (see THREAT_MODEL.md, THREAT-SIMD-001). -#![deny(unsafe_op_in_unsafe_fn)] - use rayon::prelude::*; use crate::rank::{bucket_ranks, rank_transform, rankquant_norm}; diff --git a/src/lib.rs b/src/lib.rs index 6fc1f35e..2182437e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,6 +32,13 @@ //! assert_eq!(res.k, 10); //! ``` +// Every unsafe operation in the crate must sit inside an explicit `unsafe {}` +// block rather than leaning on an enclosing `unsafe fn`. This keeps the unsafe +// surface of the SIMD kernels (fastscan / bitmap / sign_bitmap / quant_kernels, +// plus the NEON popcount in util) visible to every future edit +// (THREAT_MODEL.md, THREAT-SIMD-001). +#![deny(unsafe_op_in_unsafe_fn)] + mod bitmap; mod fastscan; #[cfg(feature = "experimental")] diff --git a/src/quant_kernels.rs b/src/quant_kernels.rs index 0b5fb8ad..92f790c7 100644 --- a/src/quant_kernels.rs +++ b/src/quant_kernels.rs @@ -162,61 +162,63 @@ pub(crate) unsafe fn scan_b2_asym_avx2( // * `dim % K == 0` (asserted immediately below). // `RankQuant::{new,add}` pack exactly `bytes_per_vec` bytes/doc and // `load_rankquant` re-validates the shape, so this holds on every path here. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + // Hard backstop: the dispatch in `quant.rs` must only route here + // when `dim % 16 == 0`. Kept as a real `assert!` (not debug-only) + // so a mis-dispatch fails loudly in release instead of silently + // dropping the trailing chunk and returning wrong top-k. + assert_eq!(dim % 16, 0, "b=2 AVX2 path needs dim % 16 == 0"); + let bytes_per_vec = dim / 4; + // For each chunk of 4 doc bytes we extract 16 codes (top byte first, + // most-significant 2 bits first within a byte). Shift amounts: + // chunk u32 = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3, + // code k = (chunk >> ((15 - k) * 2)) & 3, for k in 0..16. + // + // Centre-drop: score(d) = Σ q[j]·(code[j] - 1.5) + // = Σ q[j]·code[j] - 1.5·Σ q[j] + // The second term is per-query constant and is added back to the + // TopK scores at finalize time. The hot loop only does the raw + // dot product against unsigned code values. + let shifts_hi = _mm256_setr_epi32(30, 28, 26, 24, 22, 20, 18, 16); + let shifts_lo = _mm256_setr_epi32(14, 12, 10, 8, 6, 4, 2, 0); + let mask3 = _mm256_set1_epi32(3); + + let bytes_per_chunk = 4usize; + let chunks_per_vec = bytes_per_vec / bytes_per_chunk; + + for di in 0..n { + let doc = packed.as_ptr().add(di * bytes_per_vec); + let mut acc_hi = _mm256_setzero_ps(); + let mut acc_lo = _mm256_setzero_ps(); + + for c in 0..chunks_per_vec { + let chunk_ptr = doc.add(c * bytes_per_chunk); + let b0 = *chunk_ptr as u32; + let b1 = *chunk_ptr.add(1) as u32; + let b2 = *chunk_ptr.add(2) as u32; + let b3 = *chunk_ptr.add(3) as u32; + let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3; + let broadcast = _mm256_set1_epi32(chunk as i32); + + let codes_hi = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts_hi), mask3); + let codes_lo = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts_lo), mask3); + + let codes_f_hi = _mm256_cvtepi32_ps(codes_hi); + let codes_f_lo = _mm256_cvtepi32_ps(codes_lo); + + let d_base = c * 16; + let q_hi = _mm256_loadu_ps(q.as_ptr().add(d_base)); + let q_lo = _mm256_loadu_ps(q.as_ptr().add(d_base + 8)); + + acc_hi = _mm256_fmadd_ps(codes_f_hi, q_hi, acc_hi); + acc_lo = _mm256_fmadd_ps(codes_f_lo, q_lo, acc_lo); + } - // Hard backstop: the dispatch in `quant.rs` must only route here - // when `dim % 16 == 0`. Kept as a real `assert!` (not debug-only) - // so a mis-dispatch fails loudly in release instead of silently - // dropping the trailing chunk and returning wrong top-k. - assert_eq!(dim % 16, 0, "b=2 AVX2 path needs dim % 16 == 0"); - let bytes_per_vec = dim / 4; - // For each chunk of 4 doc bytes we extract 16 codes (top byte first, - // most-significant 2 bits first within a byte). Shift amounts: - // chunk u32 = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3, - // code k = (chunk >> ((15 - k) * 2)) & 3, for k in 0..16. - // - // Centre-drop: score(d) = Σ q[j]·(code[j] - 1.5) - // = Σ q[j]·code[j] - 1.5·Σ q[j] - // The second term is per-query constant and is added back to the - // TopK scores at finalize time. The hot loop only does the raw - // dot product against unsigned code values. - let shifts_hi = _mm256_setr_epi32(30, 28, 26, 24, 22, 20, 18, 16); - let shifts_lo = _mm256_setr_epi32(14, 12, 10, 8, 6, 4, 2, 0); - let mask3 = _mm256_set1_epi32(3); - - let bytes_per_chunk = 4usize; - let chunks_per_vec = bytes_per_vec / bytes_per_chunk; - - for di in 0..n { - let doc = packed.as_ptr().add(di * bytes_per_vec); - let mut acc_hi = _mm256_setzero_ps(); - let mut acc_lo = _mm256_setzero_ps(); - - for c in 0..chunks_per_vec { - let chunk_ptr = doc.add(c * bytes_per_chunk); - let b0 = *chunk_ptr as u32; - let b1 = *chunk_ptr.add(1) as u32; - let b2 = *chunk_ptr.add(2) as u32; - let b3 = *chunk_ptr.add(3) as u32; - let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3; - let broadcast = _mm256_set1_epi32(chunk as i32); - - let codes_hi = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts_hi), mask3); - let codes_lo = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts_lo), mask3); - - let codes_f_hi = _mm256_cvtepi32_ps(codes_hi); - let codes_f_lo = _mm256_cvtepi32_ps(codes_lo); - - let d_base = c * 16; - let q_hi = _mm256_loadu_ps(q.as_ptr().add(d_base)); - let q_lo = _mm256_loadu_ps(q.as_ptr().add(d_base + 8)); - - acc_hi = _mm256_fmadd_ps(codes_f_hi, q_hi, acc_hi); - acc_lo = _mm256_fmadd_ps(codes_f_lo, q_lo, acc_lo); + let total = _mm256_add_ps(acc_hi, acc_lo); + let raw = horizontal_sum_avx2(total); + top.maybe_insert(raw * scale, di); } - - let total = _mm256_add_ps(acc_hi, acc_lo); - let raw = horizontal_sum_avx2(total); - top.maybe_insert(raw * scale, di); } } @@ -241,46 +243,48 @@ pub(crate) unsafe fn scan_b4_asym_avx2( // * `dim % K == 0` (asserted immediately below). // `RankQuant::{new,add}` pack exactly `bytes_per_vec` bytes/doc and // `load_rankquant` re-validates the shape, so this holds on every path here. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail + // loudly in release, not silently drop the trailing chunk. + assert_eq!(dim % 8, 0, "b=4 AVX2 path needs dim % 8 == 0"); + let bytes_per_vec = dim / 2; + // For each chunk of 4 doc bytes we extract 8 codes (one nibble each). + // chunk u32 = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3, + // code k = (chunk >> ((7 - k) * 4)) & 0xF, for k in 0..8. + // Centre-drop: -7.5·Σq[j] is added back to the TopK scores at + // finalize time; the hot loop scores raw nibble values. + let shifts = _mm256_setr_epi32(28, 24, 20, 16, 12, 8, 4, 0); + let mask_f = _mm256_set1_epi32(0xF); + + let bytes_per_chunk = 4usize; + let chunks_per_vec = bytes_per_vec / bytes_per_chunk; + + for di in 0..n { + let doc = packed.as_ptr().add(di * bytes_per_vec); + let mut acc = _mm256_setzero_ps(); + + for c in 0..chunks_per_vec { + let chunk_ptr = doc.add(c * bytes_per_chunk); + let b0 = *chunk_ptr as u32; + let b1 = *chunk_ptr.add(1) as u32; + let b2 = *chunk_ptr.add(2) as u32; + let b3 = *chunk_ptr.add(3) as u32; + let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3; + let broadcast = _mm256_set1_epi32(chunk as i32); + + let codes = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts), mask_f); + let codes_f = _mm256_cvtepi32_ps(codes); + + let d_base = c * 8; + let q_vec = _mm256_loadu_ps(q.as_ptr().add(d_base)); + + acc = _mm256_fmadd_ps(codes_f, q_vec, acc); + } - // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail - // loudly in release, not silently drop the trailing chunk. - assert_eq!(dim % 8, 0, "b=4 AVX2 path needs dim % 8 == 0"); - let bytes_per_vec = dim / 2; - // For each chunk of 4 doc bytes we extract 8 codes (one nibble each). - // chunk u32 = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3, - // code k = (chunk >> ((7 - k) * 4)) & 0xF, for k in 0..8. - // Centre-drop: -7.5·Σq[j] is added back to the TopK scores at - // finalize time; the hot loop scores raw nibble values. - let shifts = _mm256_setr_epi32(28, 24, 20, 16, 12, 8, 4, 0); - let mask_f = _mm256_set1_epi32(0xF); - - let bytes_per_chunk = 4usize; - let chunks_per_vec = bytes_per_vec / bytes_per_chunk; - - for di in 0..n { - let doc = packed.as_ptr().add(di * bytes_per_vec); - let mut acc = _mm256_setzero_ps(); - - for c in 0..chunks_per_vec { - let chunk_ptr = doc.add(c * bytes_per_chunk); - let b0 = *chunk_ptr as u32; - let b1 = *chunk_ptr.add(1) as u32; - let b2 = *chunk_ptr.add(2) as u32; - let b3 = *chunk_ptr.add(3) as u32; - let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3; - let broadcast = _mm256_set1_epi32(chunk as i32); - - let codes = _mm256_and_si256(_mm256_srlv_epi32(broadcast, shifts), mask_f); - let codes_f = _mm256_cvtepi32_ps(codes); - - let d_base = c * 8; - let q_vec = _mm256_loadu_ps(q.as_ptr().add(d_base)); - - acc = _mm256_fmadd_ps(codes_f, q_vec, acc); + let raw = horizontal_sum_avx2(acc); + top.maybe_insert(raw * scale, di); } - - let raw = horizontal_sum_avx2(acc); - top.maybe_insert(raw * scale, di); } } @@ -288,6 +292,12 @@ pub(crate) unsafe fn scan_b4_asym_avx2( #[target_feature(enable = "avx2,fma")] unsafe fn horizontal_sum_avx2(v: std::arch::x86_64::__m256) -> f32 { use std::arch::x86_64::*; + // SAFETY: called only from `scan_b{2,4}_asym_avx2` which are themselves + // guarded by the AVX2+FMA `#[target_feature]` and the caller's runtime + // detection — the intrinsics here are always feature-available. + // All intrinsics in this body (extract/cast/add/cvt) are safe under the + // `avx2,fma` `#[target_feature]` gate; no explicit `unsafe {}` block is + // needed or permitted (`unused_unsafe` under `-D warnings`). let hi128 = _mm256_extractf128_ps(v, 1); let lo128 = _mm256_castps256_ps128(v); let sum128 = _mm_add_ps(lo128, hi128); @@ -332,67 +342,69 @@ pub(crate) unsafe fn scan_b2_asym_avx512( // * `dim % K == 0` (asserted immediately below). // `RankQuant::{new,add}` pack exactly `bytes_per_vec` bytes/doc and // `load_rankquant` re-validates the shape, so this holds on every path here. - - // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail - // loudly in release, not silently drop the trailing 64-code block. - assert_eq!( - dim % 64, - 0, - "b=2 AVX-512 path needs dim % 64 == 0 for 4-way unroll" - ); - let bytes_per_vec = dim / 4; - let shifts = _mm512_setr_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); - let mask3 = _mm512_set1_epi32(3); - - let bytes_per_chunk = 4usize; - let chunks_per_vec = bytes_per_vec / bytes_per_chunk; - // Process 4 chunks per outer iteration with 4 independent - // accumulators. Breaks the FMA dependency chain so the two Zen 5 - // FMA ports can both fire each cycle instead of waiting on a - // single-acc dep chain. - let outer_iters = chunks_per_vec / 4; - debug_assert_eq!(chunks_per_vec % 4, 0); - - for di in 0..n { - let doc = packed.as_ptr().add(di * bytes_per_vec); - let mut acc0 = _mm512_setzero_ps(); - let mut acc1 = _mm512_setzero_ps(); - let mut acc2 = _mm512_setzero_ps(); - let mut acc3 = _mm512_setzero_ps(); - - for outer in 0..outer_iters { - let c0 = outer * 4; - let c1 = c0 + 1; - let c2 = c0 + 2; - let c3 = c0 + 3; - - macro_rules! step { - ($c:expr, $acc:expr) => {{ - let chunk_ptr = doc.add($c * bytes_per_chunk); - let b0 = *chunk_ptr as u32; - let b1 = *chunk_ptr.add(1) as u32; - let b2 = *chunk_ptr.add(2) as u32; - let b3 = *chunk_ptr.add(3) as u32; - let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3; - let broadcast = _mm512_set1_epi32(chunk as i32); - let codes = _mm512_and_si512(_mm512_srlv_epi32(broadcast, shifts), mask3); - let codes_f = _mm512_cvtepi32_ps(codes); - let d_base = $c * 16; - let q_vec = _mm512_loadu_ps(q.as_ptr().add(d_base)); - $acc = _mm512_fmadd_ps(codes_f, q_vec, $acc); - }}; + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail + // loudly in release, not silently drop the trailing 64-code block. + assert_eq!( + dim % 64, + 0, + "b=2 AVX-512 path needs dim % 64 == 0 for 4-way unroll" + ); + let bytes_per_vec = dim / 4; + let shifts = _mm512_setr_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + let mask3 = _mm512_set1_epi32(3); + + let bytes_per_chunk = 4usize; + let chunks_per_vec = bytes_per_vec / bytes_per_chunk; + // Process 4 chunks per outer iteration with 4 independent + // accumulators. Breaks the FMA dependency chain so the two Zen 5 + // FMA ports can both fire each cycle instead of waiting on a + // single-acc dep chain. + let outer_iters = chunks_per_vec / 4; + debug_assert_eq!(chunks_per_vec % 4, 0); + + for di in 0..n { + let doc = packed.as_ptr().add(di * bytes_per_vec); + let mut acc0 = _mm512_setzero_ps(); + let mut acc1 = _mm512_setzero_ps(); + let mut acc2 = _mm512_setzero_ps(); + let mut acc3 = _mm512_setzero_ps(); + + for outer in 0..outer_iters { + let c0 = outer * 4; + let c1 = c0 + 1; + let c2 = c0 + 2; + let c3 = c0 + 3; + + macro_rules! step { + ($c:expr, $acc:expr) => {{ + let chunk_ptr = doc.add($c * bytes_per_chunk); + let b0 = *chunk_ptr as u32; + let b1 = *chunk_ptr.add(1) as u32; + let b2 = *chunk_ptr.add(2) as u32; + let b3 = *chunk_ptr.add(3) as u32; + let chunk = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3; + let broadcast = _mm512_set1_epi32(chunk as i32); + let codes = _mm512_and_si512(_mm512_srlv_epi32(broadcast, shifts), mask3); + let codes_f = _mm512_cvtepi32_ps(codes); + let d_base = $c * 16; + let q_vec = _mm512_loadu_ps(q.as_ptr().add(d_base)); + $acc = _mm512_fmadd_ps(codes_f, q_vec, $acc); + }}; + } + step!(c0, acc0); + step!(c1, acc1); + step!(c2, acc2); + step!(c3, acc3); } - step!(c0, acc0); - step!(c1, acc1); - step!(c2, acc2); - step!(c3, acc3); - } - let s01 = _mm512_add_ps(acc0, acc1); - let s23 = _mm512_add_ps(acc2, acc3); - let total = _mm512_add_ps(s01, s23); - let raw = _mm512_reduce_add_ps(total); - top.maybe_insert(raw * scale, di); + let s01 = _mm512_add_ps(acc0, acc1); + let s23 = _mm512_add_ps(acc2, acc3); + let total = _mm512_add_ps(s01, s23); + let raw = _mm512_reduce_add_ps(total); + top.maybe_insert(raw * scale, di); + } } } @@ -417,71 +429,73 @@ pub(crate) unsafe fn scan_b4_asym_avx512( // * `dim % K == 0` (asserted immediately below). // `RankQuant::{new,add}` pack exactly `bytes_per_vec` bytes/doc and // `load_rankquant` re-validates the shape, so this holds on every path here. - - // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail - // loudly in release, not silently drop the trailing 64-code block. - assert_eq!( - dim % 64, - 0, - "b=4 AVX-512 path needs dim % 64 == 0 for 4-way unroll" - ); - let bytes_per_vec = dim / 2; - let shifts = _mm512_setr_epi32(28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0); - let mask_f = _mm512_set1_epi32(0xF); - - let bytes_per_chunk = 8usize; - let chunks_per_vec = bytes_per_vec / bytes_per_chunk; - let outer_iters = chunks_per_vec / 4; - debug_assert_eq!(chunks_per_vec % 4, 0); - - for di in 0..n { - let doc = packed.as_ptr().add(di * bytes_per_vec); - let mut acc0 = _mm512_setzero_ps(); - let mut acc1 = _mm512_setzero_ps(); - let mut acc2 = _mm512_setzero_ps(); - let mut acc3 = _mm512_setzero_ps(); - - for outer in 0..outer_iters { - macro_rules! step { - ($c:expr, $acc:expr) => {{ - let chunk_ptr = doc.add($c * bytes_per_chunk); - let lo0 = *chunk_ptr as u32; - let lo1 = *chunk_ptr.add(1) as u32; - let lo2 = *chunk_ptr.add(2) as u32; - let lo3 = *chunk_ptr.add(3) as u32; - let hi0 = *chunk_ptr.add(4) as u32; - let hi1 = *chunk_ptr.add(5) as u32; - let hi2 = *chunk_ptr.add(6) as u32; - let hi3 = *chunk_ptr.add(7) as u32; - let chunk_lo = (lo0 << 24) | (lo1 << 16) | (lo2 << 8) | lo3; - let chunk_hi = (hi0 << 24) | (hi1 << 16) | (hi2 << 8) | hi3; - let lo_zmm = _mm512_set1_epi32(chunk_lo as i32); - let hi_zmm = _mm512_set1_epi32(chunk_hi as i32); - // Blend mask 0xFF00 (bits 8-15 set): _mm512_mask_blend_epi32 - // takes lane i from `hi_zmm` where bit i is set, else from - // `lo_zmm` — so lanes 0-7 <- chunk_lo, lanes 8-15 <- chunk_hi. - // Pairs with `shifts` = [28,24,20,16,12,8,4,0] x2: lanes 0-7 - // extract chunk_lo's 8 nibbles (codes 0-7), lanes 8-15 extract - // chunk_hi's (codes 8-15), most-significant nibble first. - let combined = _mm512_mask_blend_epi32(0xFF00u16, lo_zmm, hi_zmm); - let codes = _mm512_and_si512(_mm512_srlv_epi32(combined, shifts), mask_f); - let codes_f = _mm512_cvtepi32_ps(codes); - let d_base = $c * 16; - let q_vec = _mm512_loadu_ps(q.as_ptr().add(d_base)); - $acc = _mm512_fmadd_ps(codes_f, q_vec, $acc); - }}; + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + // Hard backstop (see `scan_b2_asym_avx2`): mis-dispatch must fail + // loudly in release, not silently drop the trailing 64-code block. + assert_eq!( + dim % 64, + 0, + "b=4 AVX-512 path needs dim % 64 == 0 for 4-way unroll" + ); + let bytes_per_vec = dim / 2; + let shifts = _mm512_setr_epi32(28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0); + let mask_f = _mm512_set1_epi32(0xF); + + let bytes_per_chunk = 8usize; + let chunks_per_vec = bytes_per_vec / bytes_per_chunk; + let outer_iters = chunks_per_vec / 4; + debug_assert_eq!(chunks_per_vec % 4, 0); + + for di in 0..n { + let doc = packed.as_ptr().add(di * bytes_per_vec); + let mut acc0 = _mm512_setzero_ps(); + let mut acc1 = _mm512_setzero_ps(); + let mut acc2 = _mm512_setzero_ps(); + let mut acc3 = _mm512_setzero_ps(); + + for outer in 0..outer_iters { + macro_rules! step { + ($c:expr, $acc:expr) => {{ + let chunk_ptr = doc.add($c * bytes_per_chunk); + let lo0 = *chunk_ptr as u32; + let lo1 = *chunk_ptr.add(1) as u32; + let lo2 = *chunk_ptr.add(2) as u32; + let lo3 = *chunk_ptr.add(3) as u32; + let hi0 = *chunk_ptr.add(4) as u32; + let hi1 = *chunk_ptr.add(5) as u32; + let hi2 = *chunk_ptr.add(6) as u32; + let hi3 = *chunk_ptr.add(7) as u32; + let chunk_lo = (lo0 << 24) | (lo1 << 16) | (lo2 << 8) | lo3; + let chunk_hi = (hi0 << 24) | (hi1 << 16) | (hi2 << 8) | hi3; + let lo_zmm = _mm512_set1_epi32(chunk_lo as i32); + let hi_zmm = _mm512_set1_epi32(chunk_hi as i32); + // Blend mask 0xFF00 (bits 8-15 set): _mm512_mask_blend_epi32 + // takes lane i from `hi_zmm` where bit i is set, else from + // `lo_zmm` — so lanes 0-7 <- chunk_lo, lanes 8-15 <- chunk_hi. + // Pairs with `shifts` = [28,24,20,16,12,8,4,0] x2: lanes 0-7 + // extract chunk_lo's 8 nibbles (codes 0-7), lanes 8-15 extract + // chunk_hi's (codes 8-15), most-significant nibble first. + let combined = _mm512_mask_blend_epi32(0xFF00u16, lo_zmm, hi_zmm); + let codes = _mm512_and_si512(_mm512_srlv_epi32(combined, shifts), mask_f); + let codes_f = _mm512_cvtepi32_ps(codes); + let d_base = $c * 16; + let q_vec = _mm512_loadu_ps(q.as_ptr().add(d_base)); + $acc = _mm512_fmadd_ps(codes_f, q_vec, $acc); + }}; + } + let c0 = outer * 4; + step!(c0, acc0); + step!(c0 + 1, acc1); + step!(c0 + 2, acc2); + step!(c0 + 3, acc3); } - let c0 = outer * 4; - step!(c0, acc0); - step!(c0 + 1, acc1); - step!(c0 + 2, acc2); - step!(c0 + 3, acc3); - } - let s01 = _mm512_add_ps(acc0, acc1); - let s23 = _mm512_add_ps(acc2, acc3); - let total = _mm512_add_ps(s01, s23); - let raw = _mm512_reduce_add_ps(total); - top.maybe_insert(raw * scale, di); + let s01 = _mm512_add_ps(acc0, acc1); + let s23 = _mm512_add_ps(acc2, acc3); + let total = _mm512_add_ps(s01, s23); + let raw = _mm512_reduce_add_ps(total); + top.maybe_insert(raw * scale, di); + } } } diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index 4cf98a02..dd6066e6 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -307,25 +307,34 @@ unsafe fn sign_scan_collect_avx512vpop( scores: &mut [u32], ) { use std::arch::x86_64::*; - debug_assert_eq!(qpv % 8, 0); - let lanes = qpv / 8; - let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes); - #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout - for l in 0..lanes { - q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i)); - } - #[allow(clippy::needless_range_loop)] // indexed access is clearer / matches the kernel layout - for di in 0..n { - let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; - let mut acc_zmm = _mm512_setzero_si512(); + // SAFETY: mirrors `bitmap_scan_collect_avx512vpop` — the caller + // (`sign_scan_collect`) gates dispatch on `qpv.is_multiple_of(8)`, + // `q.len() == qpv`, and `bitmaps.len() == n * qpv`, bounding all raw loads. + // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + debug_assert_eq!(qpv % 8, 0); + let lanes = qpv / 8; + let mut q_zmms: Vec<__m512i> = Vec::with_capacity(lanes); + #[allow(clippy::needless_range_loop)] + // indexed access is clearer / matches the kernel layout for l in 0..lanes { - let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); - let xor_zmm = _mm512_xor_si512(d_zmm, q_zmms[l]); - let pop_zmm = _mm512_popcnt_epi64(xor_zmm); - acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm); + q_zmms.push(_mm512_loadu_si512(q.as_ptr().add(l * 8) as *const __m512i)); + } + #[allow(clippy::needless_range_loop)] + // indexed access is clearer / matches the kernel layout + for di in 0..n { + let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; + let mut acc_zmm = _mm512_setzero_si512(); + for l in 0..lanes { + let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + let xor_zmm = _mm512_xor_si512(d_zmm, q_zmms[l]); + let pop_zmm = _mm512_popcnt_epi64(xor_zmm); + acc_zmm = _mm512_add_epi64(acc_zmm, pop_zmm); + } + let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm); + scores[di] = acc_sum as u32; } - let acc_sum: i64 = _mm512_reduce_add_epi64(acc_zmm); - scores[di] = acc_sum as u32; } } @@ -380,63 +389,71 @@ unsafe fn sign_scan_collect_batched_avx512vpop( scores: &mut [u32], ) { use std::arch::x86_64::*; - debug_assert_eq!(qpv % 8, 0); - debug_assert_eq!(q_batch.len(), batch * qpv); - debug_assert_eq!(scores.len(), batch * n); - let lanes = qpv / 8; - const CHUNK: usize = BATCHED_AVX512_CHUNK; - - let mut q_zmms: Vec<__m512i> = Vec::with_capacity(batch * lanes); - for bi in 0..batch { - for l in 0..lanes { - q_zmms.push(_mm512_loadu_si512( - q_batch.as_ptr().add(bi * qpv + l * 8) as *const __m512i - )); + // SAFETY: mirrors `bitmap_scan_collect_batched_avx512vpop` — the caller + // (`sign_scan_collect_batched`) gates dispatch on `qpv.is_multiple_of(8)`, + // `q_batch.len() == batch * qpv`, `bitmaps.len() == n * qpv`, and + // `scores.len() == batch * n`, bounding all raw loads and `scores[…]` writes. + // AVX-512 F/VPOPCNTDQ confirmed by `#[target_feature]` + runtime detection. + // The explicit block is required by `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + debug_assert_eq!(qpv % 8, 0); + debug_assert_eq!(q_batch.len(), batch * qpv); + debug_assert_eq!(scores.len(), batch * n); + let lanes = qpv / 8; + const CHUNK: usize = BATCHED_AVX512_CHUNK; + + let mut q_zmms: Vec<__m512i> = Vec::with_capacity(batch * lanes); + for bi in 0..batch { + for l in 0..lanes { + q_zmms.push(_mm512_loadu_si512( + q_batch.as_ptr().add(bi * qpv + l * 8) as *const __m512i + )); + } } - } - // Hot path: CHUNK-sized groups; const-bounded inner bi loop so - // LLVM unrolls and promotes the accs array to ZMM registers. - let mut chunk_start = 0usize; - while chunk_start + CHUNK <= batch { - for di in 0..n { - let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK]; - let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; - for l in 0..lanes { - let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + // Hot path: CHUNK-sized groups; const-bounded inner bi loop so + // LLVM unrolls and promotes the accs array to ZMM registers. + let mut chunk_start = 0usize; + while chunk_start + CHUNK <= batch { + for di in 0..n { + let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK]; + let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; + for l in 0..lanes { + let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + for bi in 0..CHUNK { + let q_zmm = q_zmms[(chunk_start + bi) * lanes + l]; + let xor_zmm = _mm512_xor_si512(d_zmm, q_zmm); + let pop_zmm = _mm512_popcnt_epi64(xor_zmm); + accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm); + } + } for bi in 0..CHUNK { - let q_zmm = q_zmms[(chunk_start + bi) * lanes + l]; - let xor_zmm = _mm512_xor_si512(d_zmm, q_zmm); - let pop_zmm = _mm512_popcnt_epi64(xor_zmm); - accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm); + let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]); + scores[(chunk_start + bi) * n + di] = acc_sum as u32; } } - for bi in 0..CHUNK { - let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]); - scores[(chunk_start + bi) * n + di] = acc_sum as u32; - } + chunk_start += CHUNK; } - chunk_start += CHUNK; - } - // Tail. - let tail = batch - chunk_start; - if tail > 0 { - for di in 0..n { - let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK]; - let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; - for l in 0..lanes { - let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + // Tail. + let tail = batch - chunk_start; + if tail > 0 { + for di in 0..n { + let mut accs: [__m512i; CHUNK] = [_mm512_setzero_si512(); CHUNK]; + let doc_ptr = bitmaps.as_ptr().add(di * qpv) as *const __m512i; + for l in 0..lanes { + let d_zmm = _mm512_loadu_si512(doc_ptr.add(l)); + for bi in 0..tail { + let q_zmm = q_zmms[(chunk_start + bi) * lanes + l]; + let xor_zmm = _mm512_xor_si512(d_zmm, q_zmm); + let pop_zmm = _mm512_popcnt_epi64(xor_zmm); + accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm); + } + } for bi in 0..tail { - let q_zmm = q_zmms[(chunk_start + bi) * lanes + l]; - let xor_zmm = _mm512_xor_si512(d_zmm, q_zmm); - let pop_zmm = _mm512_popcnt_epi64(xor_zmm); - accs[bi] = _mm512_add_epi64(accs[bi], pop_zmm); + let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]); + scores[(chunk_start + bi) * n + di] = acc_sum as u32; } } - for bi in 0..tail { - let acc_sum: i64 = _mm512_reduce_add_epi64(accs[bi]); - scores[(chunk_start + bi) * n + di] = acc_sum as u32; - } } } } diff --git a/src/util.rs b/src/util.rs index 933dc0c5..047444fa 100644 --- a/src/util.rs +++ b/src/util.rs @@ -204,21 +204,30 @@ fn xor_popcount_scalar(doc: &[u64], q: &[u64]) -> u32 { #[inline] unsafe fn and_popcount_neon(doc: &[u64], q: &[u64]) -> u32 { use std::arch::aarch64::*; - let qpv = doc.len(); - let dptr = doc.as_ptr() as *const u8; - let qptr = q.as_ptr() as *const u8; - let mut acc = 0u32; - let mut w = 0usize; - while w + 2 <= qpv { - let dv = vld1q_u8(dptr.add(w * 8)); - let qv = vld1q_u8(qptr.add(w * 8)); - acc += vaddvq_u8(vcntq_u8(vandq_u8(dv, qv))) as u32; - w += 2; - } - if w < qpv { - acc += (doc[w] & q[w]).count_ones(); + // SAFETY: NEON is part of the aarch64 baseline ABI — these intrinsics + // are unconditionally available on aarch64. The `vld1q_u8` loads read + // 16 bytes starting at `dptr/qptr + w*8`; `w + 2 <= qpv` guarantees + // both offsets are within the slice (each u64 is 8 bytes, so 2×u64 = 16 + // bytes). The trailing scalar path reads `doc[w]`/`q[w]` with a safe + // slice index. The explicit block is required by + // `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + let qpv = doc.len(); + let dptr = doc.as_ptr() as *const u8; + let qptr = q.as_ptr() as *const u8; + let mut acc = 0u32; + let mut w = 0usize; + while w + 2 <= qpv { + let dv = vld1q_u8(dptr.add(w * 8)); + let qv = vld1q_u8(qptr.add(w * 8)); + acc += vaddvq_u8(vcntq_u8(vandq_u8(dv, qv))) as u32; + w += 2; + } + if w < qpv { + acc += (doc[w] & q[w]).count_ones(); + } + acc } - acc } /// NEON XOR-popcount (sign-bitmap Hamming); see [`and_popcount_neon`]. @@ -226,21 +235,27 @@ unsafe fn and_popcount_neon(doc: &[u64], q: &[u64]) -> u32 { #[inline] unsafe fn xor_popcount_neon(doc: &[u64], q: &[u64]) -> u32 { use std::arch::aarch64::*; - let qpv = doc.len(); - let dptr = doc.as_ptr() as *const u8; - let qptr = q.as_ptr() as *const u8; - let mut acc = 0u32; - let mut w = 0usize; - while w + 2 <= qpv { - let dv = vld1q_u8(dptr.add(w * 8)); - let qv = vld1q_u8(qptr.add(w * 8)); - acc += vaddvq_u8(vcntq_u8(veorq_u8(dv, qv))) as u32; - w += 2; - } - if w < qpv { - acc += (doc[w] ^ q[w]).count_ones(); + // SAFETY: same contract as `and_popcount_neon` — NEON baseline ABI, + // `vld1q_u8` loads bounded by `w + 2 <= qpv`, trailing word via safe + // index. The explicit block is required by + // `#![deny(unsafe_op_in_unsafe_fn)]`. + unsafe { + let qpv = doc.len(); + let dptr = doc.as_ptr() as *const u8; + let qptr = q.as_ptr() as *const u8; + let mut acc = 0u32; + let mut w = 0usize; + while w + 2 <= qpv { + let dv = vld1q_u8(dptr.add(w * 8)); + let qv = vld1q_u8(qptr.add(w * 8)); + acc += vaddvq_u8(vcntq_u8(veorq_u8(dv, qv))) as u32; + w += 2; + } + if w < qpv { + acc += (doc[w] ^ q[w]).count_ones(); + } + acc } - acc } /// WASM `simd128` AND-popcount: 16 bytes (2×`u64`) per `u8x16_popcnt`, From a5de3e79e8da2891cc130814531a481c6002260a Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Mon, 25 May 2026 22:04:22 -0500 Subject: [PATCH 2/7] ci: add bounded cargo-fuzz smoke (PR + weekly sweep) Adds .github/workflows/fuzz.yml: a 60s/target smoke over load_rank, load_rankquant and fastscan_b2 on every pull request / push to main, plus a weekly full sweep over all seven targets. Surfaces loader / write->load round-trip / FastScan-kernel regressions in CI between manual campaigns (THREAT-FUZZ-002). cargo-fuzz is version-pinned (0.13.1) and every action is SHA-pinned; read-only token; the matrix target is passed via env (no run: injection surface, per THREAT-CICD-001). Signed-off-by: Nelson Spence --- .github/workflows/fuzz.yml | 98 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 .github/workflows/fuzz.yml diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml new file mode 100644 index 00000000..99d775bd --- /dev/null +++ b/.github/workflows/fuzz.yml @@ -0,0 +1,98 @@ +name: fuzz + +# Bounded cargo-fuzz smoke. The seven targets in fuzz/ are normally exercised +# in manual campaigns; this adds CI cadence so a regression that reintroduces a +# loader panic / OOM, breaks the write->load round-trip, or destabilises the +# FastScan kernel surfaces in CI rather than only at the next manual run +# (THREAT-FUZZ-002 in THREAT_MODEL.md). +# +# * pull_request / push(main): a SHORT smoke (60s/target) over the +# highest-value targets — fast enough to run on every change. +# * schedule (weekly) / workflow_dispatch: a LONGER sweep (300s/target) +# across ALL seven targets. +# +# This runs UNATTENDED on a cron schedule, so every third-party action is +# SHA-pinned (not the mutable @vN tags ci.yml uses on human-triggered push/PR) +# and cargo-fuzz is version-pinned — a fuzz smoke must not itself become a +# supply-chain hole. Read-only token; the only `run:` interpolation is the +# matrix target name, passed through `env:` (never inlined into the shell) so +# there is no template-injection surface (THREAT-CICD-001). + +on: + pull_request: + push: + branches: [main] + schedule: + - cron: "0 5 * * 4" # 05:00 UTC every Thursday (clear of audit/scorecard Mon + codeql Wed) + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: fuzz-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + # Short per-change smoke over the highest-value targets: two loaders plus the + # FastScan b=2 kernel (the one unsafe-heavy scan path the loader targets do + # not reach). + smoke: + name: fuzz smoke (60s) + if: github.event_name == 'pull_request' || github.event_name == 'push' + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + target: [load_rank, load_rankquant, fastscan_b2] + steps: + - uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # nightly; channel via toolchain: below + with: + toolchain: nightly + - name: Install cargo-fuzz (pinned) + run: cargo install cargo-fuzz --version 0.13.1 --locked + - name: Smoke + env: + TARGET: ${{ matrix.target }} + run: cargo +nightly fuzz run "$TARGET" -- -max_total_time=60 -rss_limit_mb=4096 + + # Weekly full sweep over all seven targets at a larger time budget. + weekly: + name: fuzz weekly (300s) + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + timeout-minutes: 45 + strategy: + fail-fast: false + matrix: + target: + - load_rank + - load_rankquant + - load_bitmap + - load_sign_bitmap + - roundtrip_rankquant + - search_rankquant + - fastscan_b2 + steps: + - uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # nightly; channel via toolchain: below + with: + toolchain: nightly + - name: Install cargo-fuzz (pinned) + run: cargo install cargo-fuzz --version 0.13.1 --locked + - name: Fuzz + env: + TARGET: ${{ matrix.target }} + run: cargo +nightly fuzz run "$TARGET" -- -max_total_time=300 -rss_limit_mb=4096 From 73d0638583c61006344f548eeef04857df64d0ac Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Mon, 25 May 2026 22:04:27 -0500 Subject: [PATCH 3/7] fix: reject rank >= d in rank_to_bucket (fail-loud) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rank_to_bucket clamped the computed bucket with `b.min(n_buckets - 1)`, silently accepting rank >= d. A valid rank is a position in [0, d), so it now asserts `rank < d` and drops the clamp — matching the fail-loud contract of the sibling primitives pack_buckets / bucket_centre (#26). Valid rank vectors (a permutation of [0, d)) are unaffected. The Python bindings gain matching guards on rank_to_bucket and bucket_ranks so the new assert surfaces as a clean ValueError rather than a PanicException, with red-team tests for both. Also refreshes the stale pack_buckets binding comment (the core now asserts in-range, no longer masks). Closes #28. Signed-off-by: Nelson Spence --- ordvec-python/src/lib.rs | 27 ++++++++++++++++-- ordvec-python/tests/test_redteam_fuzz.py | 14 +++++++++ src/rank.rs | 36 ++++++++++++++++++++++-- 3 files changed, 72 insertions(+), 5 deletions(-) diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index 301980ec..922f3a3c 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -1084,6 +1084,14 @@ fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> PyResult { if d == 0 { return Err(pyo3::exceptions::PyValueError::new_err("d must be > 0")); } + // The core `rank_to_bucket` now asserts `rank < d` (fail-loud, matching the + // other bucket primitives); surface that as a clean `ValueError` rather + // than letting the assert escape as a `PanicException`. + if rank as usize >= d { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "rank ({rank}) must be < d ({d})" + ))); + } Ok(ordvec_core::rank::rank_to_bucket(rank, d, bits)) } @@ -1110,6 +1118,17 @@ fn bucket_ranks<'py>( if slice.is_empty() { return Ok(Vec::::new().into_pyarray(py)); } + // `bucket_ranks` treats the input as a rank vector: each entry indexes into + // `[0, len)`, and the core `rank_to_bucket` now asserts `rank < len`. Reject + // an out-of-range entry here with a clean `ValueError` rather than letting + // that assert surface as a `PanicException`. A valid rank vector (a + // permutation of `[0, len)`) never trips this. + let d = slice.len(); + if let Some(&bad) = slice.iter().find(|&&r| r as usize >= d) { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "rank ({bad}) must be < d ({d})" + ))); + } Ok(ordvec_core::rank::bucket_ranks(slice, bits).into_pyarray(py)) } @@ -1135,9 +1154,11 @@ fn pack_buckets<'py>( slice.len() ))); } - // Reject out-of-range bucket codes rather than silently masking them: the - // core packs `b & ((1 << bits) - 1)`, so a value with high bits set would be - // truncated to a different bucket. The bucket alphabet is [0, 1 << bits). + // Reject out-of-range bucket codes here so the caller gets a clean + // `ValueError`: the core `pack_buckets` now *asserts* every code is in + // `[0, 1 << bits)` (it fails loud rather than masking), so an unchecked + // out-of-range value would otherwise escape as a `PanicException`. The + // bucket alphabet is [0, 1 << bits). let max_code = (1u16 << bits) - 1; if let Some(&bad) = slice.iter().find(|&&b| b as u16 > max_code) { return Err(pyo3::exceptions::PyValueError::new_err(format!( diff --git a/ordvec-python/tests/test_redteam_fuzz.py b/ordvec-python/tests/test_redteam_fuzz.py index 9ec151ce..a1216c38 100644 --- a/ordvec-python/tests/test_redteam_fuzz.py +++ b/ordvec-python/tests/test_redteam_fuzz.py @@ -617,6 +617,20 @@ def test_rank_to_bucket_d_zero_value_error(): rank_to_bucket(0, 0, 2) +def test_rank_to_bucket_rank_ge_d_value_error(): + # The core asserts rank < d (fail-loud, like the other bucket primitives); + # the binding surfaces it as a clean ValueError, not a PanicException. + with pytest.raises(ValueError, match="must be < d"): + rank_to_bucket(8, 8, 2) + + +def test_bucket_ranks_out_of_range_value_error(): + # An entry >= len() would trip the core's `rank < d` assert; the binding + # rejects it as a ValueError rather than letting it surface as a panic. + with pytest.raises(ValueError, match="must be < d"): + bucket_ranks(np.array([0, 5, 2, 3], dtype=np.uint16), 2) + + @pytest.mark.parametrize("bits", [8, 255]) def test_bucket_centre_bits_above_7_value_error(bits): with pytest.raises(ValueError, match="bits"): diff --git a/src/rank.rs b/src/rank.rs index 5f4077fa..e35535df 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -64,6 +64,14 @@ pub fn rank_transform_into(v: &[f32], out: &mut [u16]) { /// Bucket a single rank into one of `1 << bits` equal-width bins on /// `[0, d)`. Returns a value in `[0, 1 << bits)`. +/// +/// # Panics +/// Panics if `bits > 7`, if `d == 0`, or if `rank >= d`. The `rank < d` +/// guard fails loud in *every* build — like the sibling [`pack_buckets`] and +/// [`bucket_centre`] checks — rather than silently clamping an out-of-range +/// rank into the top bucket. Internal callers feed ranks straight from +/// [`rank_transform`] (a permutation of `[0, d)`), so it never trips on the +/// hot path. #[inline] pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 { // `bits` is a `u8`, so a caller could pass e.g. 8 or 255. `1u32 << bits` @@ -73,17 +81,32 @@ pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 { // zero. Guard both up front so the failure is loud in every build. assert!(bits <= 7, "bits too large"); assert!(d > 0, "d must be positive"); + // A valid rank is a position in `[0, d)`. Reject `rank >= d` loudly instead + // of silently clamping the quotient back into range: the rest of the public + // bucket API ([`pack_buckets`] / [`bucket_centre`]) fails loud on an + // out-of-domain argument, so a direct caller that miscomputes a rank should + // hear about it rather than receive a plausible-but-wrong bucket. + assert!((rank as usize) < d, "rank ({rank}) must be < d ({d})"); let n_buckets = 1u32 << bits; // u64 math: `d` is a `usize` and reaches this from the Python binding as a // free argument, so `d as u32` could truncate a `d >= 2^32` (e.g. to 0, // which would divide by zero and panic). rank ≤ u16::MAX and n_buckets ≤ // 128, so the product fits u64 comfortably; over the realistic d ≤ u16::MAX // domain this is bit-identical to the previous u32 form. - let b = (rank as u64 * n_buckets as u64) / d as u64; - b.min(n_buckets as u64 - 1) as u8 + // + // With `rank < d` guaranteed above, `rank * n_buckets / d < n_buckets` + // (integer division floors), so the quotient already lands in + // `[0, n_buckets)` and fits the returned `u8` without a clamp. + ((rank as u64 * n_buckets as u64) / d as u64) as u8 } /// Bucket every entry of a full rank vector. +/// +/// # Panics +/// Panics if `bits > 7`, or — via [`rank_to_bucket`] — if any entry is +/// `>= ranks.len()`. A valid rank vector is a permutation of +/// `[0, ranks.len())`, so a well-formed input never trips the latter; empty +/// input returns empty without invoking the per-entry guard. pub fn bucket_ranks(ranks: &[u16], bits: u8) -> Vec { let d = ranks.len(); ranks.iter().map(|&r| rank_to_bucket(r, d, bits)).collect() @@ -567,6 +590,15 @@ mod tests { assert!(rank_to_bucket(u16::MAX, huge_d, 2) < 4); } + #[test] + #[should_panic(expected = "must be < d")] + fn rank_to_bucket_rejects_rank_ge_d() { + // A valid rank lives in `[0, d)`; `rank == d` is out of range. It used + // to clamp silently to the top bucket — now it fails loud in release + // too, matching pack_buckets / bucket_centre. + let _ = rank_to_bucket(8, 8, 2); + } + #[test] fn pack_unpack_round_trip_bits2() { let buckets: Vec = (0..16).map(|i| (i % 4) as u8).collect(); From 4ac782232dbb4727f5f550bb2ecbef9e2283519d Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Mon, 25 May 2026 22:04:33 -0500 Subject: [PATCH 4/7] docs: mark SUPPLY-002 / SIMD-001 / FUZZ-002 mitigated Threat model + RELEASING now reflect the in-place controls: - SUPPLY-002: GitHub immutable releases enabled + main branch protection (PR review required, force-pushes and deletions blocked), closing the release-tag/asset mutability surface the registries already close. - SIMD-001: unsafe_op_in_unsafe_fn now denied crate-wide. - FUZZ-002: fuzz.yml CI smoke + weekly sweep. Corrects the workflow count (12 -> 13) and adds the CHANGELOG entries. Signed-off-by: Nelson Spence --- CHANGELOG.md | 18 +++++++++++ RELEASING.md | 14 +++++---- THREAT_MODEL.md | 84 +++++++++++++++++++++++++++---------------------- 3 files changed, 73 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be58468d..bfa5ce55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **CI fuzz smoke** (`.github/workflows/fuzz.yml`): a bounded cargo-fuzz run on + every pull request / push to `main` (60s each over `load_rank`, + `load_rankquant`, and `fastscan_b2`) plus a weekly full sweep over all seven + targets, so a loader, write→load round-trip, or FastScan-kernel regression + surfaces in CI between manual campaigns (THREAT-FUZZ-002). cargo-fuzz is + version-pinned and the actions are SHA-pinned. + ### Changed +- **`#![deny(unsafe_op_in_unsafe_fn)]` is now enforced crate-wide** (previously + only in `fastscan.rs`): every unsafe operation in the `bitmap`, `sign_bitmap`, + `quant_kernels`, and `util` (NEON) SIMD kernels now sits in an explicit + `unsafe {}` block, keeping the unsafe surface visible to future edits + (THREAT-SIMD-001). +- **`rank::rank_to_bucket` rejects `rank >= d`** — it now panics (and the Python + binding raises `ValueError`) instead of silently clamping the result into + range, matching the fail-loud contract of `pack_buckets` / `bucket_centre`. + Valid rank vectors (a permutation of `[0, d)`) are unaffected. - **Python bindings (`ordvec-python`):** raised the floor to **Python 3.10** and **numpy 2.0**; the abi3 wheel target moves to `abi3-py310`. Python 3.9 reached end-of-life (October 2025) and pytest's CVE-2025-71176 fix dropped 3.9 support. diff --git a/RELEASING.md b/RELEASING.md index 605dca36..5166dbb3 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -39,13 +39,15 @@ Trusted Publishing step. > These two settings are the supply-chain backstop the workflow code cannot > express on its own (THREAT-SUPPLY-001 in [THREAT_MODEL.md](THREAT_MODEL.md)). -### Recommended (open) +### Tag and branch protection -- A **`v*` tag-protection ruleset** (block update + deletion) and a basic - `main` ruleset, so a release tag cannot be force-moved and `main` cannot be - force-pushed/deleted (THREAT-SUPPLY-002). Registries are already immutable - (crates.io is yank-only; PyPI burns a version on delete), so this closes the - remaining GitHub-side mutability surface. +- **Immutable releases** is enabled, so a published release's `v*` tag cannot be + force-moved or deleted and its assets cannot be replaced after publication. + This closes the GitHub-side mutability surface the registries already close on + their end (crates.io is yank-only; PyPI burns a version on delete). +- **`main` is a protected branch** — pull-request review is required and + force-pushes and deletions are blocked, so the branch a release dispatches + from cannot be rewritten (THREAT-SUPPLY-002). ## Checklist diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md index 3786f416..f2890fe2 100644 --- a/THREAT_MODEL.md +++ b/THREAT_MODEL.md @@ -69,7 +69,7 @@ absence of a second maintainer is itself a tracked supply-chain residual | **Compute kernels** | `fastscan.rs`, `quant_kernels.rs`, `bitmap.rs`, `sign_bitmap.rs` | Trust established after format validation | | **Index API** | `rank.rs`, `quant.rs`, `bitmap.rs`, `sign_bitmap.rs` | Caller-controlled query embeddings | | **Python FFI** | `ordvec-python` (PyO3 / maturin) | Python ↔ Rust boundary; NumPy buffers | -| **CI / supply chain** | 12 GitHub Actions workflows; `Cargo.lock`; crates.io + PyPI | GitHub OIDC, crates.io, PyPI trust chains | +| **CI / supply chain** | 13 GitHub Actions workflows; `Cargo.lock`; crates.io + PyPI | GitHub OIDC, crates.io, PyPI trust chains | The `fuzz/` directory holds **seven** cargo-fuzz targets: `load_rank`, `load_rankquant`, `load_bitmap`, `load_sign_bitmap` (deserialization); @@ -161,7 +161,7 @@ to this kernel. ### 3.2 Risks -**THREAT-SIMD-001 (P1, mitigated this cycle; crate-wide rollout tracked): +**THREAT-SIMD-001 (P1, mitigated this cycle): Unsafe-kernel invariant preservation under future refactors.** `scan_b2_fastscan_avx512` safety depends on caller-established invariants — `packed_fs.len() == n_blocks * pairs * 32` (formed via `checked_mul`, overflow @@ -172,11 +172,13 @@ by construction. A future refactor calling the inner function directly could bypass the asserts. *Mitigations:* the runtime asserts + the type wrapper are the primary boundary; the scalar-vs-SIMD equivalence test (`fastscan_b2_top10_matches_avx512_kernel`) guards behavior; and -**`#![deny(unsafe_op_in_unsafe_fn)]` is now enforced in `fastscan.rs`**, so -every unsafe operation in the kernel sits in an explicit `unsafe {}` block and -stays visible to future edits. *Open:* roll the lint out crate-wide to the -other SIMD modules (`bitmap.rs`, `sign_bitmap.rs`, `quant_kernels.rs`, -`util.rs` NEON) — tracked as a follow-up. +**`#![deny(unsafe_op_in_unsafe_fn)]` is now enforced crate-wide** (at the crate +root in `lib.rs`), so every unsafe operation in every SIMD kernel — +`fastscan.rs`, `bitmap.rs`, `sign_bitmap.rs`, `quant_kernels.rs`, and the +`util.rs` NEON popcount — sits in an explicit `unsafe {}` block and stays +visible to future edits. (The lone exception, `horizontal_sum_avx2`, is +register-only with no memory access, so its intrinsics are safe under the +`#[target_feature]` gate and an explicit block would be `unused_unsafe`.) **THREAT-SIMD-002 (P4, deployment note): Microarchitectural side channels in co-tenancy.** `ordvec` does not claim protection against microarchitectural @@ -237,7 +239,7 @@ applications must validate paths before calling"). ### 5.1 Existing controls (verified) -**Workflow code (all 12 workflows):** third-party actions pinned by commit +**Workflow code (all 13 workflows):** third-party actions pinned by commit SHA; `persist-credentials: false` on every checkout; `permissions: contents: read` default. **Release workflows** (`release-crate.yml`, `release-python.yml`) are `workflow_dispatch`-only (no tag/push trigger), run a `require-ci-green` @@ -270,17 +272,22 @@ passkeys on the maintainer account; recruiting a **second owner/maintainer** deployment **wait timer** worthwhile (a second party able to cancel a bad release during the window). See [`RELEASING.md`](RELEASING.md). -**THREAT-SUPPLY-002 (P3): Release immutability and tag integrity.** Published -artifacts are **immutable by registry design** — crates.io is yank-only (a -published version's bytes can never be overwritten) and PyPI burns a version on -delete (no different artifact may be re-uploaded under the same version). So -post-publish "silent replacement" of a version is not possible on either -registry, and consumers can verify artifacts against the SLSA / PEP 740 -provenance above. *Residual (GitHub-side):* `changelog.yml` cuts tagged GitHub -Releases, but the repo currently has **no tag-protection ruleset and no `main` -ruleset**, so a tag could be force-moved or a release asset replaced. -*Mitigation:* add a `v*` **tag ruleset** (block update + deletion) and a basic -`main` ruleset; optionally enable GitHub immutable releases. +**THREAT-SUPPLY-002 (mitigated): Release immutability and tag integrity.** +Published artifacts are **immutable by registry design** — crates.io is +yank-only (a published version's bytes can never be overwritten) and PyPI burns +a version on delete (no different artifact may be re-uploaded under the same +version). So post-publish "silent replacement" of a version is not possible on +either registry, and consumers can verify artifacts against the SLSA / PEP 740 +provenance above. The GitHub-side mutability surface is now closed too: +`changelog.yml` cuts tagged GitHub Releases, and **GitHub immutable releases is +enabled**, so a published release's `v*` tag cannot be force-moved or deleted +and its assets cannot be replaced after publication; the **`main` branch is +protected** (pull-request review required, force-pushes and deletions blocked) +and is the **only deployment branch** permitted for the `pypi` / `crates-io` +release environments. *Residual:* draft / non-release tags are not covered by +release immutability, and — as with the registries — these GitHub controls +ultimately trust the single maintainer account; that residual folds into +THREAT-SUPPLY-001. **THREAT-SUPPLY-003 (P3): Typosquatting adjacent names.** Namespace-adjacent crate/package names (`ord-vec`, `ordvecs`, `order-vec`) could be registered to @@ -358,11 +365,15 @@ single-rate compute path, and (new) the FastScan kernel. non-AVX-512 CI runners it exercises the scalar reference kernel; under Intel SDE it exercises the AVX-512 kernel. -**THREAT-FUZZ-002 (P3): No CI-bound fuzzing for continuous regression.** Fuzzing -is run manually; there is no CI gate. A bounded weekly smoke job (e.g. -`-runs=50000` on `load_rank`, `load_rankquant`, and `fastscan_b2`) would catch -regressions between manual runs. (Low overhead; weighed against maintenance -budget.) +**THREAT-FUZZ-002 (mitigated this cycle): CI-bound fuzzing for continuous +regression.** A `fuzz.yml` workflow now runs a bounded smoke on every pull +request and push to `main` (`-max_total_time=60` over `load_rank`, +`load_rankquant`, and `fastscan_b2`) plus a weekly full sweep +(`-max_total_time=300` over all seven targets), so a regression that +reintroduces a loader panic / OOM, breaks the write→load round-trip, or +destabilises the FastScan kernel surfaces in CI rather than only at the next +manual campaign. cargo-fuzz is version-pinned and the actions are SHA-pinned, +matching the repo's scheduled-workflow hardening. *Note on `load_sign_bitmap`:* all bit patterns are structurally valid for sign bitmaps (no per-row invariant), so that target is correctly scoped to parser @@ -386,16 +397,16 @@ blast radius of a compromised dependency separately. | ID | Category | Owner | Description | Likelihood | Impact | Status / priority | |---|---|---|---|---|---|---| -| THREAT-SIMD-001 | Memory safety | Library | Unsafe-kernel invariant bypass on refactor | Medium | High | **P1** — lint enforced in `fastscan.rs`; crate-wide rollout tracked | +| THREAT-SIMD-001 | Memory safety | Library | Unsafe-kernel invariant bypass on refactor | Medium | High | **Mitigated** — `unsafe_op_in_unsafe_fn` denied crate-wide + type wrapper + equivalence test | | THREAT-FFI-001 | FFI | Binding | Concurrent input mutation during released-GIL call | Medium | Medium | **P2** — documented contract | | THREAT-FFI-002 | FFI | Binding | Unsanitized path forwarding | Medium | Medium | **P2** — documented contract | | THREAT-SUPPLY-001 | Supply chain | Config | Release config / single-owner | Low | Critical | **Mitigated** (reviewer + main-only); residual = account compromise / 2nd owner | -| THREAT-SUPPLY-002 | Supply chain | Config | Release immutability / tag integrity | Low | High | **P3** — registries immutable; add tag ruleset | +| THREAT-SUPPLY-002 | Supply chain | Config | Release immutability / tag integrity | Low | High | **Mitigated** — registries immutable; GitHub immutable releases on + `main` protected | | THREAT-SUPPLY-003 | Supply chain | Config | Typosquatting adjacent names | Medium | Medium | P3 | | THREAT-QUERY-001 | Resource | Deployment | Batch / `k` exhaustion in serving | Medium | Medium | **P2** — deployment docs | | THREAT-QUERY-002 | Resource | Deployment | Panic on contract violation (Rust servers) | Low | Medium | P3 | | THREAT-FUZZ-001 | Fuzzing | Library | FastScan path unfuzzed | Medium | High | **Closed** (`fastscan_b2` added) | -| THREAT-FUZZ-002 | Fuzzing | Library | No CI-bound fuzzing | Medium | Medium | P3 | +| THREAT-FUZZ-002 | Fuzzing | Library | No CI-bound fuzzing | Medium | Medium | **Mitigated** — `fuzz.yml` PR smoke + weekly sweep | | THREAT-DESER-001 | Deserialization | Library | TOCTOU on shared mounts | Very Low | Low | P4 | | THREAT-DESER-002 | Provenance | Deployment | Malicious-but-valid index | Medium | High | P3 (docs — `INDEX_PROVENANCE.md`) | | THREAT-CICD-001 | CI/CD | Library | Workflow injection via PR metadata | Low | High | P3 — mitigated by `zizmor` | @@ -409,19 +420,18 @@ blast radius of a compromised dependency separately. ## 11. Open mitigations -**Done this cycle:** `#![deny(unsafe_op_in_unsafe_fn)]` in `fastscan.rs` -(SIMD-001); `fastscan_b2` fuzz target (FUZZ-001); release-environment reviewers -+ main-only deployment (SUPPLY-001); [`docs/INDEX_PROVENANCE.md`](docs/INDEX_PROVENANCE.md) -(DESER-002); [`RELEASING.md`](RELEASING.md) (SUPPLY-001). +**Done this cycle:** `#![deny(unsafe_op_in_unsafe_fn)]` enforced **crate-wide** +across all SIMD modules (SIMD-001); the `fastscan_b2` fuzz target (FUZZ-001) +plus a CI `fuzz.yml` — PR smoke + weekly sweep (FUZZ-002); the `rank_to_bucket` +primitive made fail-loud (`rank < d`) to match the rest of the bucket API, with +matching binding guards; release-environment reviewers + main-only deployment +(SUPPLY-001); **GitHub immutable releases enabled + `main` branch protection** +(SUPPLY-002); [`docs/INDEX_PROVENANCE.md`](docs/INDEX_PROVENANCE.md) (DESER-002); +[`RELEASING.md`](RELEASING.md) (SUPPLY-001). **Open, low cost:** -1. Add a `v*` tag-protection ruleset (+ basic `main` ruleset) and optionally - enable GitHub immutable releases (THREAT-SUPPLY-002). -2. Roll `#![deny(unsafe_op_in_unsafe_fn)]` out crate-wide across the remaining - SIMD modules (THREAT-SIMD-001). -3. Add a bounded weekly CI fuzz smoke job (THREAT-FUZZ-002). -4. Document recommended `nq` / `k` / corpus bounds for single-process serving +1. Document recommended `nq` / `k` / corpus bounds for single-process serving in the Rust and Python API docs (THREAT-QUERY-001). **Later (not release blockers):** a second maintainer/owner (then a release From 14f57153c3f937b90f1405b22a072f6a74d175cd Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Mon, 25 May 2026 22:07:58 -0500 Subject: [PATCH 5/7] ci: install cargo-fuzz without --locked (nightly compat) cargo-fuzz 0.13.1's bundled Cargo.lock pins rustix 0.36.x, which no longer compiles on current nightly (rustc_layout_scalar_valid_range_start is now a reserved attribute), so 'cargo install --locked' failed the three fuzz smoke jobs. Drop --locked; the tool stays version-pinned (0.13.1) and its build deps resolve to nightly-compatible versions. Verified locally on nightly. Signed-off-by: Nelson Spence --- .github/workflows/fuzz.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml index 99d775bd..2c3b8e59 100644 --- a/.github/workflows/fuzz.yml +++ b/.github/workflows/fuzz.yml @@ -56,8 +56,12 @@ jobs: - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # nightly; channel via toolchain: below with: toolchain: nightly - - name: Install cargo-fuzz (pinned) - run: cargo install cargo-fuzz --version 0.13.1 --locked + - name: Install cargo-fuzz (version-pinned) + # NB: no `--locked` — cargo-fuzz 0.13.1's bundled Cargo.lock pins an old + # rustix (0.36.x) that no longer compiles on current nightly. The tool + # itself stays version-pinned; its build deps resolve to compatible + # versions. + run: cargo install cargo-fuzz --version 0.13.1 - name: Smoke env: TARGET: ${{ matrix.target }} @@ -90,8 +94,12 @@ jobs: - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # nightly; channel via toolchain: below with: toolchain: nightly - - name: Install cargo-fuzz (pinned) - run: cargo install cargo-fuzz --version 0.13.1 --locked + - name: Install cargo-fuzz (version-pinned) + # NB: no `--locked` — cargo-fuzz 0.13.1's bundled Cargo.lock pins an old + # rustix (0.36.x) that no longer compiles on current nightly. The tool + # itself stays version-pinned; its build deps resolve to compatible + # versions. + run: cargo install cargo-fuzz --version 0.13.1 - name: Fuzz env: TARGET: ${{ matrix.target }} From 169abe3ee4e08e1d40a628fea8a40cbdfd4c86cd Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 26 May 2026 08:23:04 -0500 Subject: [PATCH 6/7] fix: validate bits up front in bucket_ranks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bucket_ranks delegated the bits<=7 check to the per-entry rank_to_bucket call, so bucket_ranks(&[], bits>7) skipped it and silently returned an empty vec instead of failing loud. Assert bits<=7 up front so an invalid width is rejected for empty input too — matching the Python binding (which checks bits before its empty short-circuit) and making the documented panic contract accurate. Addresses a PR-review finding (copilot). Signed-off-by: Nelson Spence --- src/rank.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/rank.rs b/src/rank.rs index e35535df..cdcb22ff 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -103,11 +103,16 @@ pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 { /// Bucket every entry of a full rank vector. /// /// # Panics -/// Panics if `bits > 7`, or — via [`rank_to_bucket`] — if any entry is -/// `>= ranks.len()`. A valid rank vector is a permutation of -/// `[0, ranks.len())`, so a well-formed input never trips the latter; empty -/// input returns empty without invoking the per-entry guard. +/// Panics if `bits > 7` (validated up front, so this holds for empty input +/// too), or — via [`rank_to_bucket`] — if any entry is `>= ranks.len()`. A +/// valid rank vector is a permutation of `[0, ranks.len())`, so well-formed +/// input never trips the per-entry guard. pub fn bucket_ranks(ranks: &[u16], bits: u8) -> Vec { + // Validate `bits` up front so an invalid width fails loud even for empty + // input — an empty `ranks` skips the per-entry `rank_to_bucket` check and + // would otherwise silently return an empty vec. Mirrors the Python binding, + // which checks `bits` before its empty short-circuit. + assert!(bits <= 7, "bits too large"); let d = ranks.len(); ranks.iter().map(|&r| rank_to_bucket(r, d, bits)).collect() } @@ -599,6 +604,14 @@ mod tests { let _ = rank_to_bucket(8, 8, 2); } + #[test] + #[should_panic(expected = "bits too large")] + fn bucket_ranks_rejects_bits_above_7_even_when_empty() { + // `bits` is validated up front, so an invalid width fails loud even on + // empty input — which never reaches the per-entry rank_to_bucket guard. + let _ = bucket_ranks(&[], 8); + } + #[test] fn pack_unpack_round_trip_bits2() { let buckets: Vec = (0..16).map(|i| (i % 4) as u8).collect(); From 246209a6561d75a2a58a5c2d8f358b9b09b3346f Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 26 May 2026 08:23:04 -0500 Subject: [PATCH 7/7] docs: drop stale ci.yml claim from fuzz.yml comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fuzz.yml header said actions are SHA-pinned 'unlike the mutable @vN tags ci.yml uses' — but ci.yml is now fully SHA-pinned, so the comparison was wrong. Drop it; the comment states fuzz.yml's own pinning without an inaccurate baseline. Addresses a PR-review finding (copilot). Signed-off-by: Nelson Spence --- .github/workflows/fuzz.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml index 2c3b8e59..7403a165 100644 --- a/.github/workflows/fuzz.yml +++ b/.github/workflows/fuzz.yml @@ -12,11 +12,10 @@ name: fuzz # across ALL seven targets. # # This runs UNATTENDED on a cron schedule, so every third-party action is -# SHA-pinned (not the mutable @vN tags ci.yml uses on human-triggered push/PR) -# and cargo-fuzz is version-pinned — a fuzz smoke must not itself become a -# supply-chain hole. Read-only token; the only `run:` interpolation is the -# matrix target name, passed through `env:` (never inlined into the shell) so -# there is no template-injection surface (THREAT-CICD-001). +# SHA-pinned and cargo-fuzz is version-pinned — a fuzz smoke must not itself +# become a supply-chain hole. Read-only token; the only `run:` interpolation is +# the matrix target name, passed through `env:` (never inlined into the shell) +# so there is no template-injection surface (THREAT-CICD-001). on: pull_request: