api7 · membphis · May 16, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the
 
 ## Status
 
-Initial implementation complete: scalar + AVX2/PCLMUL structural scanner, root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson but tuning is pending — see `Roadmap / Deferred` below.
+Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson but tuning is pending — see `Roadmap / Deferred` below.
 
 ## Building
 
@@ -99,6 +99,15 @@ with similar throughput. Memory retention for `quickdecode` is essentially
 flat in payload size (a few KB for the reusable buffers), where `cjson`
 and `simdjson` retain ~1× the input size as live Lua-table state.
 
+ARM64 (Apple M4, NEON/PMULL scanner, same workload):
+
+| Size | cjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson |
+|---:|---:|---:|---:|---:|
+|   2 KB | 254,738 | 654,108 | 392,711 | 2.6× / 1.5× |
+| 100 KB |  15,281 | 108,932 |  99,701 | 7.1× / 6.5× |
+|   1 MB |   1,523 |  11,905 |  11,876 | 7.8× / 7.8× |
+|  10 MB |     153 |   1,218 |   1,222 | 8.0× / 8.0× |
+
 See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
 memory numbers, an "encode round-trip" row (passthrough emit via
 `memcpy`), the pure-decode (no-access) comparison, and the exact
@@ -112,7 +121,6 @@ make bench       # quickdecode vs cjson
 
 Items intentionally pushed out of the first implementation. Each will be picked up individually.
 
-- **ARM64 NEON scanner backend** — first version ships with scalar + AVX2 backends only. NEON backend (for Apple Silicon / Graviton / 鲲鹏) is deferred.
 - **SmallVec fast path for small documents (< 4 KB)** — avoid heap allocation for `indices` on tiny inputs.
 - **SIMD-accelerated backslash search** in the `decode_string` fast path.
 - **`lexical` fast float parser** if `<f64>::from_str` benchmarks as a bottleneck.
@@ -124,7 +132,7 @@ Items intentionally pushed out of the first implementation. Each will be picked
 - **Adaptive `out.reserve` in scanners** — `out.reserve(buf.len() / 6)` is calibrated for object-heavy JSON. On string-heavy multimodal payloads (one big content array, mostly base64) the actual emit rate is <1 structural per 1 KB, so we over-reserve by 100x+. Mainly a memory hygiene concern (mmap'd pages stay lazily faulted), <5% throughput effect.
 - **AVX-512 scanner backend** — 64-byte → 128-byte chunks. On the 1 MB string-heavy bench, profile shows scan throughput is L3-bandwidth-bound, so realistic win is ~1.5–1.8×, not a clean 2×; larger wins need fixtures that fit in L1/L2. Needs `avx512bw` + `vpclmulqdq` (Sapphire Rapids, Zen 4+).
 - **`cargo fmt --check` not enforced** — `make lint` runs clippy only. The codebase uses intentional manual column alignment in struct definitions and compact single-line literals that default rustfmt would reflow. Skip rather than reformat until a project-wide style decision is made.
-- **`validate_brackets` fusion into scan emit loop** — surfaced by profiling: on structurally-dense workloads `validate_brackets` is 65% of parse time (second linear pass over emitted indices). Folding bracket pairing into the scan emit loop via an inline depth stack eliminates that pass. No effect on the current string-heavy bench (0.3% there); a win for config / JSONL / table-shape JSON.
+- **`validate_brackets` fusion in SIMD scanners** — fused into `ScalarScanner` via `scan_and_validate`; AVX2 and NEON scanners still run the two-pass emit + `validate_brackets` design. Folding bracket pairing into the SIMD emit loops would require carrying a depth stack across chunks (the inline `emit_bits` loop currently has no such state). <1% effect on string-heavy workloads; worth revisiting only if profiling on structurally-dense input flags it.
 - **`memchr2` cross-chunk jump for very long string interiors** — the AVX2 in-string fast probe (issue #5) drops per-chunk cost from ~25 to ~10 ops but still pays ALU work for every 64-byte chunk in a string. A `memchr2(b'"', b'\\')` jump can approach memory bandwidth on multi-MB single-string payloads. Deferred until a workload that benefits clearly emerges; needs careful `bs_carry` reasoning across the jump.
 - **Stateful O(N) iterator FFI** — current `qd.pairs` and the `__newindex`
   materialization path walk the object cursor from the start on every step,

diff --git a/src/lib.rs b/src/lib.rs
@@ -14,4 +14,6 @@ pub mod __test_api {
     pub use crate::scan::{Scanner, ScalarScanner};
     #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
     pub use crate::scan::avx2::Avx2Scanner;
+    #[cfg(target_arch = "aarch64")]
+    pub use crate::scan::neon::NeonScanner;
 }
diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs
@@ -44,7 +44,7 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
 
         let backslash = byte_mask(chunk_lo, chunk_hi, b'\\');
         let quote     = byte_mask(chunk_lo, chunk_hi, b'"');
-        let escaped   = find_escape_mask_with_carry(backslash, &mut bs_carry);
+        let escaped   = super::find_escape_mask_with_carry(backslash, &mut bs_carry);
         let real_quote = quote & !escaped;
 
         let (inside, new_in_string) = inside_string_mask(real_quote, in_string);
@@ -54,7 +54,7 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
         // Exclude structural chars inside strings; re-add real quotes.
         let final_mask = (struct_mask & !inside) | real_quote;
 
-        emit_bits(final_mask, i as u32, out);
+        super::emit_bits(final_mask, i as u32, out);
 
         i += 64;
     }
@@ -106,15 +106,6 @@ unsafe fn structural_mask_chunk(lo: __m256i, hi: __m256i) -> u64 {
     (mask_lo as u32 as u64) | ((mask_hi as u32 as u64) << 32)
 }
 
-#[inline(always)]
-fn emit_bits(mut mask: u64, base: u32, out: &mut Vec<u32>) {
-    while mask != 0 {
-        let tz = mask.trailing_zeros();
-        out.push(base + tz);
-        mask &= mask - 1; // clear lowest bit
-    }
-}
-
 /// Build a u64 mask where bit i is 1 if byte i in (lo|hi) equals `"` OR `\`.
 /// Used by the in-string fast-probe to detect pure string-interior chunks
 /// in ~10 vector ops (4 cmpeq + 2 or + 2 movemask + shift/or), avoiding
@@ -141,60 +132,6 @@ unsafe fn byte_mask(lo: __m256i, hi: __m256i, c: u8) -> u64 {
     mlo | (mhi << 32)
 }
 
-/// Compute escape mask + new carry. Pure bit-twiddling, no SIMD intrinsics.
-/// `prev_carry` is 1 iff the previous chunk ended such that the FIRST byte of
-/// the current chunk is "escaped" (preceded by an odd-length run of backslashes
-/// that ends at byte 0 of this chunk).
-#[inline(always)]
-fn find_escape_mask_with_carry(bs: u64, prev_carry: &mut u64) -> u64 {
-    let pc = *prev_carry;
-
-    // Identify run starts: positions where bs[i] is set AND bs[i-1] is not.
-    // Bit 0's "i-1" is the prev-chunk carry. If prev_carry is 1, bit 0
-    // continues a previous run (not a new start). If 0, bit 0 is a new start
-    // iff bs bit 0 is set.
-    let starts = bs & !((bs << 1) | pc);
-
-    let even_bits: u64 = 0x5555_5555_5555_5555;
-    let odd_bits:  u64 = 0xAAAA_AAAA_AAAA_AAAA;
-    let even_starts = starts & even_bits;
-    let odd_starts  = starts & odd_bits;
-
-    // Carry-adding: each start propagates 1-bits through the run via the bs mask.
-    let even_carries = bs.wrapping_add(even_starts);
-    let odd_carries  = bs.wrapping_add(odd_starts);
-
-    let even_carry_ends = even_carries & !bs;
-    let odd_carry_ends  = odd_carries  & !bs;
-
-    // Bytes that follow odd-length runs are escaped.
-    // Even-start, odd-length runs end at an odd position.
-    // Odd-start, odd-length runs end at an even position.
-    let escaped_from_runs = (even_carry_ends & odd_bits) | (odd_carry_ends & even_bits);
-
-    // If carry-in is 1, bit 0 is also escaped (the prev-chunk run ended exactly
-    // at the boundary with odd parity).
-    let escaped = escaped_from_runs | pc;
-
-    // Compute the new carry: it's 1 iff the chunk ends mid-run AND the run's
-    // length (combined with any continuation from prev_carry) is odd at the
-    // boundary.
-    //
-    // Count trailing backslashes in bs (consecutive 1-bits ending at bit 63):
-    let trailing_bs = (!bs).leading_zeros();
-
-    let new_carry = if bs == u64::MAX {
-        // Whole chunk is backslashes — parity flips by 64 (even).
-        pc
-    } else {
-        // The trailing run is isolated in this chunk.
-        (trailing_bs as u64) & 1
-    };
-
-    *prev_carry = new_carry;
-    escaped
-}
-
 /// Given the chunk's real-quote mask and the prior chunk's "ended-in-string"
 /// state, return (inside_string_mask, new_in_string_state).
 /// `prev_in_string` is 0 or 1.

diff --git a/src/scan/mod.rs b/src/scan/mod.rs
@@ -1,6 +1,8 @@
 pub(crate) mod scalar;
 #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
 pub(crate) mod avx2;
+#[cfg(target_arch = "aarch64")]
+pub(crate) mod neon;
 
 use once_cell::sync::OnceCell;
 
@@ -29,11 +31,64 @@ pub(crate) fn scan(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
                 return <avx2::Avx2Scanner as Scanner>::scan;
             }
         }
+        #[cfg(target_arch = "aarch64")]
+        {
+            if std::arch::is_aarch64_feature_detected!("aes") {
+                return <neon::NeonScanner as Scanner>::scan;
+            }
+        }
         <ScalarScanner as Scanner>::scan
     });
     f(buf, out)
 }
 
+/// Compute escape mask + new carry. Pure bit-twiddling, no SIMD intrinsics.
+/// `bs` has bit i set iff byte i of the chunk is a backslash. `prev_carry` is
+/// 1 iff the previous chunk ended in an odd-length backslash run, i.e. byte 0
+/// of this chunk is escaped; it is updated in place for the next chunk.
+/// Returns a mask of the bytes that directly follow an odd-length run.
+#[inline(always)]
+pub(crate) fn find_escape_mask_with_carry(bs: u64, prev_carry: &mut u64) -> u64 {
+    let pc = *prev_carry;
+
+    // An escaped byte 0 is never itself an escaping backslash: drop it so a
+    // run continuing past it (e.g. `\` | `\\"`) restarts its parity at bit 1.
+    let bs = bs & !pc;
+
+    // Run starts: positions where bs[i] is set AND bs[i-1] is not.
+    let starts = bs & !(bs << 1);
+
+    let even_bits: u64 = 0x5555_5555_5555_5555;
+    let odd_bits:  u64 = 0xAAAA_AAAA_AAAA_AAAA;
+
+    // Adding a start bit ripples a carry through its run, leaving a 1 just
+    // past the run's end; runs whose start was not added are left untouched.
+    let even_carry_ends = bs.wrapping_add(starts & even_bits) & !bs;
+    let odd_carry_ends  = bs.wrapping_add(starts & odd_bits) & !bs;
+
+    // Odd-length runs end on the opposite parity from where they started.
+    let escaped_from_runs = (even_carry_ends & odd_bits) | (odd_carry_ends & even_bits);
+    let escaped = escaped_from_runs | pc;
+
+    // Next carry = parity of the backslash run touching bit 63. Masking bit 0
+    // above makes this hold for an all-backslash chunk too (63 vs 64 ones).
+    let trailing_bs = (!bs).leading_zeros();
+    let new_carry = (trailing_bs as u64) & 1;
+
+    *prev_carry = new_carry;
+    escaped
+}
+
+/// Push `base + i` onto `out` for every set bit `i` of `mask`, lowest bit first.
+#[inline(always)]
+pub(crate) fn emit_bits(mut mask: u64, base: u32, out: &mut Vec<u32>) {
+    while mask != 0 {
+        // `mask != 0` here, so `mask - 1` below cannot underflow.
+        out.push(base + mask.trailing_zeros());
+        mask &= mask - 1; // clear the lowest set bit
+    }
+}
+
 /// Walk a sequence of already-emitted structural offsets and verify that
 /// `{`/`}` and `[`/`]` are properly paired. String quotes toggle an
 /// `in_string` flag and are otherwise skipped. This pass trusts the emit