Project-Navi · Navi Bot (project-navi-bot) · Jun 15, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
@@ -433,6 +433,9 @@ jobs:
         if: steps.sde.outputs.sde-available == 'true'
         env:
           CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER: ${{ steps.sde.outputs.sde-path }} -spr --
+          # Cause any AVX-512 test that would silently skip on a non-AVX-512 host
+          # to panic loudly here, so this job genuinely enforces the kernels.
+          ORDVEC_REQUIRE_AVX512: "1"
         run: |
           set -euo pipefail
           cargo test

@@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Performance
+
+- **AVX-512 VPOPCNTDQ scan kernels now cover every `dim` (a multiple of 64), not
+  just multiples of 512 bits.** Previously the `SignBitmap` and `Bitmap` scan
+  kernels took the AVX-512 path only when the per-vector 64-bit word count was a
+  multiple of 8 (`dim` a multiple of 512), silently falling back to the scalar
+  loop otherwise — so common embedding widths like **768 (BGE) and 384
+  (bge-small / MiniLM)** ran the entire stage-1 candidate scan on the scalar
+  path. The kernels now process the trailing `(dim / 64) % 8` words with a masked load
+  (`_mm512_maskz_loadu_epi64`), so any supported `dim` uses VPOPCNTDQ. Measured
+  **~4× faster** stage-1 scan at dim=768 on a Zen5 / AVX-512 host (609 → 153
+  µs/query, n=100k; see `examples/bge_kernel_bench`); 1024/1536 unchanged.
+  Results are byte-identical to the scalar path — parity tests cover qpv tail
+  residues 0..7 plus 384/512/768/1024/1536 for all six SignBitmap/Bitmap scan
+  kernels. This is stage-1 scan-kernel throughput, not a whole-pipeline figure.
+
+### Added
+
+- `avx512vpop_supported()` (`#[doc(hidden)]`) — reports whether the AVX-512
+  VPOPCNTDQ scan kernels are active on the current CPU. The scan dispatch reads
+  only this predicate (no per-dimension gate).
+
 ### Fixed
 
 - **`ordvec-manifest` crate and wheel now ship license text.** Both declared

@@ -0,0 +1,56 @@
+// Stage-1 SignBitmap scan-kernel A/B for BGE-style dims.
+// Times `score_all_batched_flat` (the per-query dense Hamming scan, the
+// stage-1 candidate-gen kernel) at a given dim. On origin/main, dim=768
+// (qpv=12) takes the SCALAR fallback; on the avx512-tail branch it takes
+// AVX-512 VPOPCNTDQ with a masked tail. Same public call, same inputs.
+//
+//   cargo run --release --example bge_kernel_bench -- <dim> <n> <batch> <reps>
+use ordvec::SignBitmap;
+use rand::{RngExt, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+use std::time::Instant;
+
+/// Returns the upper median of `v`: the element at index `len / 2` after an
+/// ascending sort.
+///
+/// For an even number of samples this is the larger of the two middle values;
+/// no averaging is performed. An empty input yields `f64::NAN` rather than an
+/// out-of-bounds panic, and ordering uses `f64::total_cmp`, so a NaN sample
+/// cannot make the comparator panic.
+fn median(mut v: Vec<f64>) -> f64 {
+    v.sort_unstable_by(f64::total_cmp);
+    v.get(v.len() / 2).copied().unwrap_or(f64::NAN)
+}
+
+/// Runs the scan benchmark and prints one summary line.
+///
+/// Positional arguments are `<dim> <n> <batch> <reps>`; a missing or
+/// unparseable value falls back to its default (768, 100_000, 256, 40).
+/// `batch` and `reps` must be at least 1, otherwise there is nothing to time
+/// or to summarise, and the process exits with status 2.
+fn main() {
+    let args: Vec<String> = std::env::args().collect();
+    let arg_or = |pos: usize, default: usize| -> usize {
+        args.get(pos).and_then(|s| s.parse().ok()).unwrap_or(default)
+    };
+    let dim = arg_or(1, 768);
+    let n = arg_or(2, 100_000);
+    let batch = arg_or(3, 256);
+    let reps = arg_or(4, 40);
+
+    // Zero reps would leave no samples to index below; zero batch would make
+    // the per-query division meaningless. Reject both up front.
+    if batch == 0 || reps == 0 {
+        eprintln!("bge_kernel_bench: <batch> and <reps> must be at least 1");
+        std::process::exit(2);
+    }
+
+    // Fixed seed so every run generates the same corpus and queries.
+    let mut rng = ChaCha8Rng::seed_from_u64(42);
+    let corpus: Vec<f32> = (0..n * dim).map(|_| rng.random_range(-1.0..1.0)).collect();
+    let mut idx = SignBitmap::new(dim);
+    idx.add(&corpus);
+    let queries: Vec<f32> = (0..batch * dim)
+        .map(|_| rng.random_range(-1.0..1.0))
+        .collect();
+
+    // Warmup.
+    for _ in 0..3 {
+        std::hint::black_box(idx.score_all_batched_flat(&queries));
+    }
+
+    // Each sample is one full batched scan, normalised to microseconds per query.
+    let mut samples = Vec::with_capacity(reps);
+    for _ in 0..reps {
+        let t = Instant::now();
+        let s = idx.score_all_batched_flat(&queries);
+        let us = t.elapsed().as_secs_f64() * 1e6 / batch as f64;
+        std::hint::black_box(&s);
+        samples.push(us);
+    }
+
+    // Sort once and read both statistics from the same ordering; `median`
+    // takes ownership of the vector, so no copies are needed.
+    samples.sort_by(f64::total_cmp);
+    let p10 = samples[samples.len() / 10];
+    let med = median(samples);
+    println!(
+        "dim={dim} n={n} batch={batch} reps={reps} qpv={} -> scan median {med:.2} us/query (p10 {p10:.2})",
+        dim / 64,
+    );
+}