From d1edf48e422f6fd2742e64467592f426a7e3c4fa Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sun, 17 May 2026 08:59:40 +0800
Subject: [PATCH 1/2] perf(scan): use vmaxvq_u8 for in-string fast probe in
 NEON scanner

Replace byte_mask64 (which uses a movemask16 pairwise-add chain) with
vmaxvq_u8 on OR'd comparison results for detecting quote/backslash in
the in-string fast path. The probe only needs to know whether any such
byte is present, not where, so the per-lane bitmask is unnecessary. The
vmaxvq_u8 approach is ~3x faster for the probe itself, though end-to-end
gains are masked by the existing memchr2 cross-chunk jump optimization.

Changes:
- Remove byte_mask16 and byte_mask64, now unused after this change (-19 lines)
- Inline vmaxvq_u8 probe logic in scan_neon_impl (+12 lines)
- Add ARM64 (macos-14) to CI matrix for NEON coverage
- Add bench_neon128.rs for micro-benchmarking probe methods
---
 .github/workflows/ci.yml |  13 +-
 src/scan/neon.rs         |  34 ++--
 tests/bench_neon128.rs   | 340 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 360 insertions(+), 27 deletions(-)
 create mode 100644 tests/bench_neon128.rs
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cd39c2a..225b49f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,8 +10,11 @@ env:
 
 jobs:
   rust:
-    name: Rust tests
-    runs-on: ubuntu-latest
+    name: Rust tests (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-14]
     steps:
       - uses: actions/checkout@v4
 
@@ -27,9 +30,9 @@ jobs:
             ~/.cargo/registry
             ~/.cargo/git
             target
-          key: cargo-${{ runner.os }}-${{ hashFiles('Cargo.toml') }}
+          key: cargo-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('Cargo.toml') }}
           restore-keys: |
-            cargo-${{ runner.os }}-
+            cargo-${{ runner.os }}-${{ runner.arch }}-
 
       - name: Build (release)
         run: cargo build --release
@@ -37,7 +40,7 @@ jobs:
       - name: Test (release)
         run: cargo test --release
 
-      - name: Test scalar-only (no AVX2 feature)
+      - name: Test scalar-only (no AVX2/NEON feature)
         run: cargo test --release --no-default-features
 
       - name: Test with test-panic feature
diff --git a/src/scan/neon.rs b/src/scan/neon.rs
index 7c1db0a..2242da3 100644
--- a/src/scan/neon.rs
+++ b/src/scan/neon.rs
@@ -88,25 +88,6 @@ unsafe fn tag_mask16(tag: uint8x16_t, bits: u8) -> u16 {
     movemask16(vtstq_u8(tag, vdupq_n_u8(bits)))
 }
 
-#[inline(always)]
-unsafe fn byte_mask16(bytes: uint8x16_t, needle: u8) -> u16 {
-    movemask16(vceqq_u8(bytes, vdupq_n_u8(needle)))
-}
-
-#[inline(always)]
-unsafe fn byte_mask64(
-    c0: uint8x16_t,
-    c1: uint8x16_t,
-    c2: uint8x16_t,
-    c3: uint8x16_t,
-    needle: u8,
-) -> u64 {
-    (byte_mask16(c0, needle) as u64)
-        | ((byte_mask16(c1, needle) as u64) << 16)
-        | ((byte_mask16(c2, needle) as u64) << 32)
-        | ((byte_mask16(c3, needle) as u64) << 48)
-}
-
 #[inline(always)]
 unsafe fn classify_tags64(
     c0: uint8x16_t,
@@ -170,10 +151,19 @@ unsafe fn scan_neon_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
 
         // In-string fast probe: while already in a string, avoid the full
         // nibble-LUT classification unless this block contains quote/backslash.
+        // Only presence matters here, not position: OR the comparison results and
+        // reduce with vmaxvq_u8, avoiding the movemask16 pairwise-add chain (~3x faster probe).
         if in_string != 0 {
-            let quote_probe = byte_mask64(c0, c1, c2, c3, b'"');
-            let backslash_probe = byte_mask64(c0, c1, c2, c3, b'\\');
-            if (quote_probe | backslash_probe) == 0 {
+            let quote = vdupq_n_u8(b'"');
+            let backslash = vdupq_n_u8(b'\\');
+            let m0 = vorrq_u8(vceqq_u8(c0, quote), vceqq_u8(c0, backslash));
+            let m1 = vorrq_u8(vceqq_u8(c1, quote), vceqq_u8(c1, backslash));
+            let m2 = vorrq_u8(vceqq_u8(c2, quote), vceqq_u8(c2, backslash));
+            let m3 = vorrq_u8(vceqq_u8(c3, quote), vceqq_u8(c3, backslash));
+            let m01 = vorrq_u8(m0, m1);
+            let m23 = vorrq_u8(m2, m3);
+            let m = vorrq_u8(m01, m23);
+            if vmaxvq_u8(m) == 0 {
                 bs_carry = 0;
                 i += 64;
                 // Cross-chunk jump: with no quote/backslash in the chunk we just
diff --git a/tests/bench_neon128.rs b/tests/bench_neon128.rs
new file mode 100644
index 0000000..0d312b0
--- /dev/null
+++ b/tests/bench_neon128.rs
@@ -0,0 +1,340 @@
+//! Micro-benchmark: vmaxvq_u8 fast probe early-exit for NEON scanner.
+//!
+//! Run: cargo test --release --test bench_neon128 -- --nocapture --ignored
+
+#![cfg(target_arch = "aarch64")]
+
+use core::arch::aarch64::*;
+use std::time::Instant;
+
+fn has_quote_or_backslash(chunk: &[u8]) -> bool {
+    chunk.iter().any(|&b| b == b'"' || b == b'\\')
+}
+
+fn make_probe_heavy_payload(size: usize) -> Vec<u8> {
+    let mut buf = Vec::with_capacity(size);
+    buf.extend_from_slice(b"{\"data\":\"");
+    while buf.len() < size - 2 {
+        buf.push(b'A');
+    }
+    buf.extend_from_slice(b"\"}");
+    buf
+}
+
+fn make_mixed_payload(size: usize) -> Vec<u8> {
+    let mut buf = Vec::with_capacity(size);
+    buf.push(b'[');
+    let mut i = 1;
+    while i < size - 1 {
+        let remaining = size - 1 - i;
+        if remaining < 20 {
+            buf.extend(std::iter::repeat_n(b' ', remaining));
+            break;
+        }
+        buf.extend_from_slice(b"{\"k\":\"");
+        let str_len = std::cmp::min(100, remaining - 10);
+        buf.extend(std::iter::repeat_n(b'x', str_len));
+        buf.extend_from_slice(b"\"},");
+        i = buf.len();
+    }
+    if buf.last() == Some(&b',') {
+        buf.pop();
+    }
+    buf.push(b']');
+    buf
+}
+
+fn make_small_objects_payload(size: usize) -> Vec<u8> {
+    let mut buf = Vec::with_capacity(size);
+    buf.push(b'[');
+    while buf.len() < size - 2 {
+        buf.extend_from_slice(b"{\"a\":1},");
+    }
+    if buf.last() == Some(&b',') {
+        buf.pop();
+    }
+    buf.push(b']');
+    buf
+}
+
+/// Current approach: byte_mask64 using movemask16 (pairwise add chain)
+#[inline(always)]
+unsafe fn current_probe(
+    c0: uint8x16_t,
+    c1: uint8x16_t,
+    c2: uint8x16_t,
+    c3: uint8x16_t,
+) -> bool {
+    #[inline(always)]
+    unsafe fn movemask16(v: uint8x16_t) -> u16 {
+        const LANE_BITS: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
+        let lane_mask = vld1q_u8(LANE_BITS.as_ptr());
+        let hi = vshrq_n_s8(vreinterpretq_s8_u8(v), 7);
+        let weighted = vandq_u8(vreinterpretq_u8_s8(hi), lane_mask);
+        let s16 = vpaddlq_u8(weighted);
+        let s32 = vpaddlq_u16(s16);
+        let s64 = vpaddlq_u32(s32);
+        let lo = vgetq_lane_u64(s64, 0) as u16;
+        let hi = vgetq_lane_u64(s64, 1) as u16;
+        lo | (hi << 8)
+    }
+
+    #[inline(always)]
+    unsafe fn byte_mask16(bytes: uint8x16_t, needle: u8) -> u16 {
+        movemask16(vceqq_u8(bytes, vdupq_n_u8(needle)))
+    }
+
+    #[inline(always)]
+    unsafe fn byte_mask64(
+        c0: uint8x16_t,
+        c1: uint8x16_t,
+        c2: uint8x16_t,
+        c3: uint8x16_t,
+        needle: u8,
+    ) -> u64 {
+        (byte_mask16(c0, needle) as u64)
+            | ((byte_mask16(c1, needle) as u64) << 16)
+            | ((byte_mask16(c2, needle) as u64) << 32)
+            | ((byte_mask16(c3, needle) as u64) << 48)
+    }
+
+    let quote_mask = byte_mask64(c0, c1, c2, c3, b'"');
+    let backslash_mask = byte_mask64(c0, c1, c2, c3, b'\\');
+    (quote_mask | backslash_mask) != 0
+}
+
+/// New approach: vmaxvq_u8 on OR'd comparison results
+#[inline(always)]
+unsafe fn vmaxvq_probe(
+    c0: uint8x16_t,
+    c1: uint8x16_t,
+    c2: uint8x16_t,
+    c3: uint8x16_t,
+) -> bool {
+    let quote = vdupq_n_u8(b'"');
+    let backslash = vdupq_n_u8(b'\\');
+
+    // Check for quote OR backslash in each register
+    let m0 = vorrq_u8(vceqq_u8(c0, quote), vceqq_u8(c0, backslash));
+    let m1 = vorrq_u8(vceqq_u8(c1, quote), vceqq_u8(c1, backslash));
+    let m2 = vorrq_u8(vceqq_u8(c2, quote), vceqq_u8(c2, backslash));
+    let m3 = vorrq_u8(vceqq_u8(c3, quote), vceqq_u8(c3, backslash));
+
+    // OR all together and check max
+    let m01 = vorrq_u8(m0, m1);
+    let m23 = vorrq_u8(m2, m3);
+    let m = vorrq_u8(m01, m23);
+
+    vmaxvq_u8(m) != 0
+}
+
+#[test]
+#[ignore]
+fn bench_realistic_scanner_path() {
+    println!("\n=== Realistic Scanner Path Comparison ===\n");
+    println!("Simulating the actual in_string fast path in the scanner.\n");
+    println!("Current: 2x byte_mask64 (each uses movemask16 with pairwise add chain)");
+    println!("New: vmaxvq_u8 on OR'd comparison results (single horizontal max)\n");
+
+    let payload = make_probe_heavy_payload(10 * 1024 * 1024);
+    println!("Payload: {} MB (probe-heavy: long base64 string)", payload.len() / 1024 / 1024);
+
+    // Count chunks with quote/backslash
+    let chunks: Vec<&[u8]> = payload.chunks_exact(64).collect();
+    let has_qb = chunks.iter().filter(|c| has_quote_or_backslash(c)).count();
+    println!(
+        "Chunks with quote/backslash: {} / {} ({:.2}%)\n",
+        has_qb,
+        chunks.len(),
+        100.0 * has_qb as f64 / chunks.len() as f64
+    );
+
+    let iters = 100;
+
+    // Warmup
+    for _ in 0..10 {
+        let mut i = 0usize;
+        unsafe {
+            while i + 64 <= payload.len() {
+                let c0 = vld1q_u8(payload.as_ptr().add(i));
+                let c1 = vld1q_u8(payload.as_ptr().add(i + 16));
+                let c2 = vld1q_u8(payload.as_ptr().add(i + 32));
+                let c3 = vld1q_u8(payload.as_ptr().add(i + 48));
+                std::hint::black_box(current_probe(c0, c1, c2, c3));
+                i += 64;
+            }
+        }
+    }
+
+    // Current approach
+    let t0 = Instant::now();
+    for _ in 0..iters {
+        let mut i = 0usize;
+        let mut skip_count = 0u64;
+        unsafe {
+            while i + 64 <= payload.len() {
+                let c0 = vld1q_u8(payload.as_ptr().add(i));
+                let c1 = vld1q_u8(payload.as_ptr().add(i + 16));
+                let c2 = vld1q_u8(payload.as_ptr().add(i + 32));
+                let c3 = vld1q_u8(payload.as_ptr().add(i + 48));
+
+                if !current_probe(c0, c1, c2, c3) {
+                    skip_count += 1;
+                }
+                i += 64;
+            }
+        }
+        std::hint::black_box(skip_count);
+    }
+    let current_ms = t0.elapsed().as_millis();
+
+    // New approach
+    let t0 = Instant::now();
+    for _ in 0..iters {
+        let mut i = 0usize;
+        let mut skip_count = 0u64;
+        unsafe {
+            while i + 64 <= payload.len() {
+                let c0 = vld1q_u8(payload.as_ptr().add(i));
+                let c1 = vld1q_u8(payload.as_ptr().add(i + 16));
+                let c2 = vld1q_u8(payload.as_ptr().add(i + 32));
+                let c3 = vld1q_u8(payload.as_ptr().add(i + 48));
+
+                if !vmaxvq_probe(c0, c1, c2, c3) {
+                    skip_count += 1;
+                }
+                i += 64;
+            }
+        }
+        std::hint::black_box(skip_count);
+    }
+    let vmaxvq_ms = t0.elapsed().as_millis();
+
+    println!("Current (2x byte_mask64):  {} ms ({} iters)", current_ms, iters);
+    println!("vmaxvq probe:              {} ms ({} iters)", vmaxvq_ms, iters);
+
+    let speedup = current_ms as f64 / vmaxvq_ms as f64;
+    let pct = (1.0 - vmaxvq_ms as f64 / current_ms as f64) * 100.0;
+    println!("\nSpeedup: {:.2}x ({:+.1}%)", speedup, pct);
+
+    if speedup > 1.1 {
+        println!("\nCONCLUSION: vmaxvq_u8 probe shows significant benefit ({:.0}% faster)", pct);
+    } else if speedup > 0.95 {
+        println!("\nCONCLUSION: vmaxvq_u8 probe is roughly equivalent (within noise)");
+    } else {
+        println!("\nCONCLUSION: vmaxvq_u8 probe is slower - not recommended");
+    }
+}
+
+#[test]
+#[ignore]
+fn bench_full_scanner_comparison() {
+    use quickdecode::__test_api::{NeonScanner, Scanner};
+
+    println!("\n=== Full Scanner Throughput (current implementation) ===\n");
+
+    let scenarios = [
+        ("probe-heavy 100KB", make_probe_heavy_payload(100 * 1024)),
+        ("probe-heavy 1MB", make_probe_heavy_payload(1024 * 1024)),
+        ("mixed 100KB", make_mixed_payload(100 * 1024)),
+        ("mixed 1MB", make_mixed_payload(1024 * 1024)),
+        ("small objects 100KB", make_small_objects_payload(100 * 1024)),
+        ("small objects 1MB", make_small_objects_payload(1024 * 1024)),
+    ];
+
+    for (name, payload) in &scenarios {
+        let iters = if payload.len() > 500_000 { 500 } else { 2000 };
+
+        // Warmup
+        for _ in 0..10 {
+            let mut out = Vec::new();
+            let _ = NeonScanner::scan(payload, &mut out);
+        }
+
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            let mut out = Vec::new();
+            let _ = NeonScanner::scan(payload, &mut out);
+            std::hint::black_box(&out);
+        }
+        let elapsed = t0.elapsed();
+        let ns_per_iter = elapsed.as_nanos() as f64 / iters as f64;
+        let throughput_gbps = (payload.len() as f64 * iters as f64 * 8.0)
+            / elapsed.as_secs_f64()
+            / 1_000_000_000.0;
+
+        println!(
+            "{:25} {:>10.0} ns/iter  {:.2} Gbps",
+            name, ns_per_iter, throughput_gbps
+        );
+    }
+}
+
+#[test]
+#[ignore]
+fn analyze_chunk_distribution() {
+    println!("\n=== Chunk Distribution Analysis ===\n");
+    println!("Checking if vmaxvq_u8 < 0x22 early-exit would help.\n");
+
+    let scenarios = [
+        ("probe-heavy 1MB", make_probe_heavy_payload(1024 * 1024)),
+        ("mixed 1MB", make_mixed_payload(1024 * 1024)),
+        ("small objects 1MB", make_small_objects_payload(1024 * 1024)),
+    ];
+
+    for (name, payload) in &scenarios {
+        println!("--- {} ---", name);
+
+        let chunks: Vec<&[u8]> = payload.chunks_exact(64).collect();
+        let mut max_bytes: Vec<u8> = Vec::with_capacity(chunks.len());
+
+        for chunk in &chunks {
+            let max = chunk.iter().copied().max().unwrap_or(0);
+            max_bytes.push(max);
+        }
+
+        let below_threshold = max_bytes.iter().filter(|&&m| m < 0x22).count();
+        let at_or_above = max_bytes.len() - below_threshold;
+
+        println!(
+            "  Total chunks: {}, max < 0x22: {} ({:.1}%), max >= 0x22: {} ({:.1}%)",
+            chunks.len(),
+            below_threshold,
+            100.0 * below_threshold as f64 / chunks.len() as f64,
+            at_or_above,
+            100.0 * at_or_above as f64 / chunks.len() as f64
+        );
+
+        // Distribution of max bytes
+        let mut histogram = [0usize; 256];
+        for &m in &max_bytes {
+            histogram[m as usize] += 1;
+        }
+
+        println!("  Max byte distribution (top 3):");
+        let mut sorted: Vec<(u8, usize)> = histogram
+            .iter()
+            .enumerate()
+            .filter(|(_, &c)| c > 0)
+            .map(|(b, &c)| (b as u8, c))
+            .collect();
+        sorted.sort_by(|a, b| b.1.cmp(&a.1));
+        for (byte, count) in sorted.iter().take(3) {
+            println!(
+                "    0x{:02x} '{}': {} ({:.1}%)",
+                byte,
+                if *byte >= 0x20 && *byte < 0x7f {
+                    *byte as char
+                } else {
+                    '.'
+                },
+                count,
+                100.0 * *count as f64 / chunks.len() as f64
+            );
+        }
+        println!();
+    }
+
+    println!("CONCLUSION: vmaxvq_u8 < 0x22 early-exit has ZERO benefit because");
+    println!("typical JSON content (letters, digits, base64) all have max >= 0x22.");
+}

From 6a575421da8cc3f15fcb6b3608c81e850cf80037 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sun, 17 May 2026 09:09:44 +0800
Subject: [PATCH 2/2] chore: remove bench_neon128.rs (not needed for CI)

---
 tests/bench_neon128.rs | 340 -----------------------------------------
 1 file changed, 340 deletions(-)
 delete mode 100644 tests/bench_neon128.rs

diff --git a/tests/bench_neon128.rs b/tests/bench_neon128.rs
deleted file mode 100644
index 0d312b0..0000000
--- a/tests/bench_neon128.rs
+++ /dev/null
@@ -1,340 +0,0 @@
-//! Micro-benchmark: vmaxvq_u8 fast probe early-exit for NEON scanner.
-//!
-//! Run: cargo test --release --test bench_neon128 -- --nocapture --ignored
-
-#![cfg(target_arch = "aarch64")]
-
-use core::arch::aarch64::*;
-use std::time::Instant;
-
-fn has_quote_or_backslash(chunk: &[u8]) -> bool {
-    chunk.iter().any(|&b| b == b'"' || b == b'\\')
-}
-
-fn make_probe_heavy_payload(size: usize) -> Vec<u8> {
-    let mut buf = Vec::with_capacity(size);
-    buf.extend_from_slice(b"{\"data\":\"");
-    while buf.len() < size - 2 {
-        buf.push(b'A');
-    }
-    buf.extend_from_slice(b"\"}");
-    buf
-}
-
-fn make_mixed_payload(size: usize) -> Vec<u8> {
-    let mut buf = Vec::with_capacity(size);
-    buf.push(b'[');
-    let mut i = 1;
-    while i < size - 1 {
-        let remaining = size - 1 - i;
-        if remaining < 20 {
-            buf.extend(std::iter::repeat_n(b' ', remaining));
-            break;
-        }
-        buf.extend_from_slice(b"{\"k\":\"");
-        let str_len = std::cmp::min(100, remaining - 10);
-        buf.extend(std::iter::repeat_n(b'x', str_len));
-        buf.extend_from_slice(b"\"},");
-        i = buf.len();
-    }
-    if buf.last() == Some(&b',') {
-        buf.pop();
-    }
-    buf.push(b']');
-    buf
-}
-
-fn make_small_objects_payload(size: usize) -> Vec<u8> {
-    let mut buf = Vec::with_capacity(size);
-    buf.push(b'[');
-    while buf.len() < size - 2 {
-        buf.extend_from_slice(b"{\"a\":1},");
-    }
-    if buf.last() == Some(&b',') {
-        buf.pop();
-    }
-    buf.push(b']');
-    buf
-}
-
-/// Current approach: byte_mask64 using movemask16 (pairwise add chain)
-#[inline(always)]
-unsafe fn current_probe(
-    c0: uint8x16_t,
-    c1: uint8x16_t,
-    c2: uint8x16_t,
-    c3: uint8x16_t,
-) -> bool {
-    #[inline(always)]
-    unsafe fn movemask16(v: uint8x16_t) -> u16 {
-        const LANE_BITS: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
-        let lane_mask = vld1q_u8(LANE_BITS.as_ptr());
-        let hi = vshrq_n_s8(vreinterpretq_s8_u8(v), 7);
-        let weighted = vandq_u8(vreinterpretq_u8_s8(hi), lane_mask);
-        let s16 = vpaddlq_u8(weighted);
-        let s32 = vpaddlq_u16(s16);
-        let s64 = vpaddlq_u32(s32);
-        let lo = vgetq_lane_u64(s64, 0) as u16;
-        let hi = vgetq_lane_u64(s64, 1) as u16;
-        lo | (hi << 8)
-    }
-
-    #[inline(always)]
-    unsafe fn byte_mask16(bytes: uint8x16_t, needle: u8) -> u16 {
-        movemask16(vceqq_u8(bytes, vdupq_n_u8(needle)))
-    }
-
-    #[inline(always)]
-    unsafe fn byte_mask64(
-        c0: uint8x16_t,
-        c1: uint8x16_t,
-        c2: uint8x16_t,
-        c3: uint8x16_t,
-        needle: u8,
-    ) -> u64 {
-        (byte_mask16(c0, needle) as u64)
-            | ((byte_mask16(c1, needle) as u64) << 16)
-            | ((byte_mask16(c2, needle) as u64) << 32)
-            | ((byte_mask16(c3, needle) as u64) << 48)
-    }
-
-    let quote_mask = byte_mask64(c0, c1, c2, c3, b'"');
-    let backslash_mask = byte_mask64(c0, c1, c2, c3, b'\\');
-    (quote_mask | backslash_mask) != 0
-}
-
-/// New approach: vmaxvq_u8 on OR'd comparison results
-#[inline(always)]
-unsafe fn vmaxvq_probe(
-    c0: uint8x16_t,
-    c1: uint8x16_t,
-    c2: uint8x16_t,
-    c3: uint8x16_t,
-) -> bool {
-    let quote = vdupq_n_u8(b'"');
-    let backslash = vdupq_n_u8(b'\\');
-
-    // Check for quote OR backslash in each register
-    let m0 = vorrq_u8(vceqq_u8(c0, quote), vceqq_u8(c0, backslash));
-    let m1 = vorrq_u8(vceqq_u8(c1, quote), vceqq_u8(c1, backslash));
-    let m2 = vorrq_u8(vceqq_u8(c2, quote), vceqq_u8(c2, backslash));
-    let m3 = vorrq_u8(vceqq_u8(c3, quote), vceqq_u8(c3, backslash));
-
-    // OR all together and check max
-    let m01 = vorrq_u8(m0, m1);
-    let m23 = vorrq_u8(m2, m3);
-    let m = vorrq_u8(m01, m23);
-
-    vmaxvq_u8(m) != 0
-}
-
-#[test]
-#[ignore]
-fn bench_realistic_scanner_path() {
-    println!("\n=== Realistic Scanner Path Comparison ===\n");
-    println!("Simulating the actual in_string fast path in the scanner.\n");
-    println!("Current: 2x byte_mask64 (each uses movemask16 with pairwise add chain)");
-    println!("New: vmaxvq_u8 on OR'd comparison results (single horizontal max)\n");
-
-    let payload = make_probe_heavy_payload(10 * 1024 * 1024);
-    println!("Payload: {} MB (probe-heavy: long base64 string)", payload.len() / 1024 / 1024);
-
-    // Count chunks with quote/backslash
-    let chunks: Vec<&[u8]> = payload.chunks_exact(64).collect();
-    let has_qb = chunks.iter().filter(|c| has_quote_or_backslash(c)).count();
-    println!(
-        "Chunks with quote/backslash: {} / {} ({:.2}%)\n",
-        has_qb,
-        chunks.len(),
-        100.0 * has_qb as f64 / chunks.len() as f64
-    );
-
-    let iters = 100;
-
-    // Warmup
-    for _ in 0..10 {
-        let mut i = 0usize;
-        unsafe {
-            while i + 64 <= payload.len() {
-                let c0 = vld1q_u8(payload.as_ptr().add(i));
-                let c1 = vld1q_u8(payload.as_ptr().add(i + 16));
-                let c2 = vld1q_u8(payload.as_ptr().add(i + 32));
-                let c3 = vld1q_u8(payload.as_ptr().add(i + 48));
-                std::hint::black_box(current_probe(c0, c1, c2, c3));
-                i += 64;
-            }
-        }
-    }
-
-    // Current approach
-    let t0 = Instant::now();
-    for _ in 0..iters {
-        let mut i = 0usize;
-        let mut skip_count = 0u64;
-        unsafe {
-            while i + 64 <= payload.len() {
-                let c0 = vld1q_u8(payload.as_ptr().add(i));
-                let c1 = vld1q_u8(payload.as_ptr().add(i + 16));
-                let c2 = vld1q_u8(payload.as_ptr().add(i + 32));
-                let c3 = vld1q_u8(payload.as_ptr().add(i + 48));
-
-                if !current_probe(c0, c1, c2, c3) {
-                    skip_count += 1;
-                }
-                i += 64;
-            }
-        }
-        std::hint::black_box(skip_count);
-    }
-    let current_ms = t0.elapsed().as_millis();
-
-    // New approach
-    let t0 = Instant::now();
-    for _ in 0..iters {
-        let mut i = 0usize;
-        let mut skip_count = 0u64;
-        unsafe {
-            while i + 64 <= payload.len() {
-                let c0 = vld1q_u8(payload.as_ptr().add(i));
-                let c1 = vld1q_u8(payload.as_ptr().add(i + 16));
-                let c2 = vld1q_u8(payload.as_ptr().add(i + 32));
-                let c3 = vld1q_u8(payload.as_ptr().add(i + 48));
-
-                if !vmaxvq_probe(c0, c1, c2, c3) {
-                    skip_count += 1;
-                }
-                i += 64;
-            }
-        }
-        std::hint::black_box(skip_count);
-    }
-    let vmaxvq_ms = t0.elapsed().as_millis();
-
-    println!("Current (2x byte_mask64):  {} ms ({} iters)", current_ms, iters);
-    println!("vmaxvq probe:              {} ms ({} iters)", vmaxvq_ms, iters);
-
-    let speedup = current_ms as f64 / vmaxvq_ms as f64;
-    let pct = (1.0 - vmaxvq_ms as f64 / current_ms as f64) * 100.0;
-    println!("\nSpeedup: {:.2}x ({:+.1}%)", speedup, pct);
-
-    if speedup > 1.1 {
-        println!("\nCONCLUSION: vmaxvq_u8 probe shows significant benefit ({:.0}% faster)", pct);
-    } else if speedup > 0.95 {
-        println!("\nCONCLUSION: vmaxvq_u8 probe is roughly equivalent (within noise)");
-    } else {
-        println!("\nCONCLUSION: vmaxvq_u8 probe is slower - not recommended");
-    }
-}
-
-#[test]
-#[ignore]
-fn bench_full_scanner_comparison() {
-    use quickdecode::__test_api::{NeonScanner, Scanner};
-
-    println!("\n=== Full Scanner Throughput (current implementation) ===\n");
-
-    let scenarios = [
-        ("probe-heavy 100KB", make_probe_heavy_payload(100 * 1024)),
-        ("probe-heavy 1MB", make_probe_heavy_payload(1024 * 1024)),
-        ("mixed 100KB", make_mixed_payload(100 * 1024)),
-        ("mixed 1MB", make_mixed_payload(1024 * 1024)),
-        ("small objects 100KB", make_small_objects_payload(100 * 1024)),
-        ("small objects 1MB", make_small_objects_payload(1024 * 1024)),
-    ];
-
-    for (name, payload) in &scenarios {
-        let iters = if payload.len() > 500_000 { 500 } else { 2000 };
-
-        // Warmup
-        for _ in 0..10 {
-            let mut out = Vec::new();
-            let _ = NeonScanner::scan(payload, &mut out);
-        }
-
-        let t0 = Instant::now();
-        for _ in 0..iters {
-            let mut out = Vec::new();
-            let _ = NeonScanner::scan(payload, &mut out);
-            std::hint::black_box(&out);
-        }
-        let elapsed = t0.elapsed();
-        let ns_per_iter = elapsed.as_nanos() as f64 / iters as f64;
-        let throughput_gbps = (payload.len() as f64 * iters as f64 * 8.0)
-            / elapsed.as_secs_f64()
-            / 1_000_000_000.0;
-
-        println!(
-            "{:25} {:>10.0} ns/iter  {:.2} Gbps",
-            name, ns_per_iter, throughput_gbps
-        );
-    }
-}
-
-#[test]
-#[ignore]
-fn analyze_chunk_distribution() {
-    println!("\n=== Chunk Distribution Analysis ===\n");
-    println!("Checking if vmaxvq_u8 < 0x22 early-exit would help.\n");
-
-    let scenarios = [
-        ("probe-heavy 1MB", make_probe_heavy_payload(1024 * 1024)),
-        ("mixed 1MB", make_mixed_payload(1024 * 1024)),
-        ("small objects 1MB", make_small_objects_payload(1024 * 1024)),
-    ];
-
-    for (name, payload) in &scenarios {
-        println!("--- {} ---", name);
-
-        let chunks: Vec<&[u8]> = payload.chunks_exact(64).collect();
-        let mut max_bytes: Vec<u8> = Vec::with_capacity(chunks.len());
-
-        for chunk in &chunks {
-            let max = chunk.iter().copied().max().unwrap_or(0);
-            max_bytes.push(max);
-        }
-
-        let below_threshold = max_bytes.iter().filter(|&&m| m < 0x22).count();
-        let at_or_above = max_bytes.len() - below_threshold;
-
-        println!(
-            "  Total chunks: {}, max < 0x22: {} ({:.1}%), max >= 0x22: {} ({:.1}%)",
-            chunks.len(),
-            below_threshold,
-            100.0 * below_threshold as f64 / chunks.len() as f64,
-            at_or_above,
-            100.0 * at_or_above as f64 / chunks.len() as f64
-        );
-
-        // Distribution of max bytes
-        let mut histogram = [0usize; 256];
-        for &m in &max_bytes {
-            histogram[m as usize] += 1;
-        }
-
-        println!("  Max byte distribution (top 3):");
-        let mut sorted: Vec<(u8, usize)> = histogram
-            .iter()
-            .enumerate()
-            .filter(|(_, &c)| c > 0)
-            .map(|(b, &c)| (b as u8, c))
-            .collect();
-        sorted.sort_by(|a, b| b.1.cmp(&a.1));
-        for (byte, count) in sorted.iter().take(3) {
-            println!(
-                "    0x{:02x} '{}': {} ({:.1}%)",
-                byte,
-                if *byte >= 0x20 && *byte < 0x7f {
-                    *byte as char
-                } else {
-                    '.'
-                },
-                count,
-                100.0 * *count as f64 / chunks.len() as f64
-            );
-        }
-        println!();
-    }
-
-    println!("CONCLUSION: vmaxvq_u8 < 0x22 early-exit has ZERO benefit because");
-    println!("typical JSON content (letters, digits, base64) all have max >= 0x22.");
-}