diff --git a/README.md b/README.md index c017d7a..b836e85 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the ## Status -Initial implementation complete: scalar + AVX2/PCLMUL structural scanner, root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson but tuning is pending — see `Roadmap / Deferred` below. +Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson but tuning is pending — see `Roadmap / Deferred` below. ## Building @@ -99,6 +99,15 @@ with similar throughput. Memory retention for `quickdecode` is essentially flat in payload size (a few KB for the reusable buffers), where `cjson` and `simdjson` retain ~1× the input size as live Lua-table state. +ARM64 (Apple M4, NEON/PMULL scanner, same workload): + +| Size | cjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson | +|---:|---:|---:|---:|---:| +| 2 KB | 254,738 | 654,108 | 392,711 | 2.6× / 1.5× | +| 100 KB | 15,281 | 108,932 | 99,701 | 7.1× / 6.5× | +| 1 MB | 1,523 | 11,905 | 11,876 | 7.8× / 7.8× | +| 10 MB | 153 | 1,218 | 1,222 | 8.0× / 8.0× | + See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, memory numbers, an "encode round-trip" row (passthrough emit via `memcpy`), the pure-decode (no-access) comparison, and the exact @@ -112,7 +121,6 @@ make bench # quickdecode vs cjson Items intentionally pushed out of the first implementation. Each will be picked up individually. -- **ARM64 NEON scanner backend** — first version ships with scalar + AVX2 backends only. NEON backend (for Apple Silicon / Graviton / 鲲鹏) is deferred. - **SmallVec fast path for small documents (< 4 KB)** — avoid heap allocation for `indices` on tiny inputs. - **SIMD-accelerated backslash search** in the `decode_string` fast path. - **`lexical` fast float parser** if `::from_str` benchmarks as a bottleneck. @@ -124,7 +132,7 @@ Items intentionally pushed out of the first implementation. Each will be picked - **Adaptive `out.reserve` in scanners** — `out.reserve(buf.len() / 6)` is calibrated for object-heavy JSON. On string-heavy multimodal payloads (one big content array, mostly base64) the actual emit rate is <1 structural per 1 KB, so we over-reserve by 100x+. Mainly a memory hygiene concern (mmap'd pages stay lazily faulted), <5% throughput effect. - **AVX-512 scanner backend** — 64-byte → 128-byte chunks. On the 1 MB string-heavy bench, profile shows scan throughput is L3-bandwidth-bound, so realistic win is ~1.5–1.8×, not a clean 2×; larger wins need fixtures that fit in L1/L2. Needs `avx512bw` + `vpclmulqdq` (Sapphire Rapids, Zen 4+). - **`cargo fmt --check` not enforced** — `make lint` runs clippy only. The codebase uses intentional manual column alignment in struct definitions and compact single-line literals that default rustfmt would reflow. Skip rather than reformat until a project-wide style decision is made. -- **`validate_brackets` fusion into scan emit loop** — surfaced by profiling: on structurally-dense workloads `validate_brackets` is 65% of parse time (second linear pass over emitted indices). Folding bracket pairing into the scan emit loop via an inline depth stack eliminates that pass. No effect on the current string-heavy bench (0.3% there); a win for config / JSONL / table-shape JSON. +- **`validate_brackets` fusion in SIMD scanners** — fused into `ScalarScanner` via `scan_and_validate`; AVX2 and NEON scanners still run the two-pass emit + `validate_brackets` design. Folding bracket pairing into the SIMD emit loops would require carrying a depth stack across chunks (the inline `emit_bits` loop currently has no such state). <1% effect on string-heavy workloads; worth revisiting only if profiling on structurally-dense input flags it. - **`memchr2` cross-chunk jump for very long string interiors** — the AVX2 in-string fast probe (issue #5) drops per-chunk cost from ~25 to ~10 ops but still pays ALU work for every 64-byte chunk in a string. A `memchr2(b'"', b'\\')` jump can approach memory bandwidth on multi-MB single-string payloads. Deferred until a workload that benefits clearly emerges; needs careful `bs_carry` reasoning across the jump. - **Stateful O(N) iterator FFI** — current `qd.pairs` and the `__newindex` materialization path walk the object cursor from the start on every step, diff --git a/src/lib.rs b/src/lib.rs index 90215cd..83f161b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,4 +14,6 @@ pub mod __test_api { pub use crate::scan::{Scanner, ScalarScanner}; #[cfg(all(target_arch = "x86_64", feature = "avx2"))] pub use crate::scan::avx2::Avx2Scanner; + #[cfg(target_arch = "aarch64")] + pub use crate::scan::neon::NeonScanner; } diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs index 48fc3e0..2161f7d 100644 --- a/src/scan/avx2.rs +++ b/src/scan/avx2.rs @@ -44,7 +44,7 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { let backslash = byte_mask(chunk_lo, chunk_hi, b'\\'); let quote = byte_mask(chunk_lo, chunk_hi, b'"'); - let escaped = find_escape_mask_with_carry(backslash, &mut bs_carry); + let escaped = super::find_escape_mask_with_carry(backslash, &mut bs_carry); let real_quote = quote & !escaped; let (inside, new_in_string) = inside_string_mask(real_quote, in_string); @@ -54,7 +54,7 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { // Exclude structural chars inside strings; re-add real quotes. let final_mask = (struct_mask & !inside) | real_quote; - emit_bits(final_mask, i as u32, out); + super::emit_bits(final_mask, i as u32, out); i += 64; } @@ -106,15 +106,6 @@ unsafe fn structural_mask_chunk(lo: __m256i, hi: __m256i) -> u64 { (mask_lo as u32 as u64) | ((mask_hi as u32 as u64) << 32) } -#[inline(always)] -fn emit_bits(mut mask: u64, base: u32, out: &mut Vec) { - while mask != 0 { - let tz = mask.trailing_zeros(); - out.push(base + tz); - mask &= mask - 1; // clear lowest bit - } -} - /// Build a u64 mask where bit i is 1 if byte i in (lo|hi) equals `"` OR `\`. /// Used by the in-string fast-probe to detect pure string-interior chunks /// in ~10 vector ops (4 cmpeq + 2 or + 2 movemask + shift/or), avoiding @@ -141,60 +132,6 @@ unsafe fn byte_mask(lo: __m256i, hi: __m256i, c: u8) -> u64 { mlo | (mhi << 32) } -/// Compute escape mask + new carry. Pure bit-twiddling, no SIMD intrinsics. -/// `prev_carry` is 1 iff the previous chunk ended such that the FIRST byte of -/// the current chunk is "escaped" (preceded by an odd-length run of backslashes -/// that ends at byte 0 of this chunk). -#[inline(always)] -fn find_escape_mask_with_carry(bs: u64, prev_carry: &mut u64) -> u64 { - let pc = *prev_carry; - - // Identify run starts: positions where bs[i] is set AND bs[i-1] is not. - // Bit 0's "i-1" is the prev-chunk carry. If prev_carry is 1, bit 0 - // continues a previous run (not a new start). If 0, bit 0 is a new start - // iff bs bit 0 is set. - let starts = bs & !((bs << 1) | pc); - - let even_bits: u64 = 0x5555_5555_5555_5555; - let odd_bits: u64 = 0xAAAA_AAAA_AAAA_AAAA; - let even_starts = starts & even_bits; - let odd_starts = starts & odd_bits; - - // Carry-adding: each start propagates 1-bits through the run via the bs mask. - let even_carries = bs.wrapping_add(even_starts); - let odd_carries = bs.wrapping_add(odd_starts); - - let even_carry_ends = even_carries & !bs; - let odd_carry_ends = odd_carries & !bs; - - // Bytes that follow odd-length runs are escaped. - // Even-start, odd-length runs end at an odd position. - // Odd-start, odd-length runs end at an even position. - let escaped_from_runs = (even_carry_ends & odd_bits) | (odd_carry_ends & even_bits); - - // If carry-in is 1, bit 0 is also escaped (the prev-chunk run ended exactly - // at the boundary with odd parity). - let escaped = escaped_from_runs | pc; - - // Compute the new carry: it's 1 iff the chunk ends mid-run AND the run's - // length (combined with any continuation from prev_carry) is odd at the - // boundary. - // - // Count trailing backslashes in bs (consecutive 1-bits ending at bit 63): - let trailing_bs = (!bs).leading_zeros(); - - let new_carry = if bs == u64::MAX { - // Whole chunk is backslashes — parity flips by 64 (even). - pc - } else { - // The trailing run is isolated in this chunk. - (trailing_bs as u64) & 1 - }; - - *prev_carry = new_carry; - escaped -} - /// Given the chunk's real-quote mask and the prior chunk's "ended-in-string" /// state, return (inside_string_mask, new_in_string_state). /// `prev_in_string` is 0 or 1. diff --git a/src/scan/mod.rs b/src/scan/mod.rs index 23739e0..84b9867 100644 --- a/src/scan/mod.rs +++ b/src/scan/mod.rs @@ -1,6 +1,8 @@ pub(crate) mod scalar; #[cfg(all(target_arch = "x86_64", feature = "avx2"))] pub(crate) mod avx2; +#[cfg(target_arch = "aarch64")] +pub(crate) mod neon; use once_cell::sync::OnceCell; @@ -29,11 +31,64 @@ pub(crate) fn scan(buf: &[u8], out: &mut Vec) -> Result<(), usize> { return ::scan; } } + #[cfg(target_arch = "aarch64")] + { + if std::arch::is_aarch64_feature_detected!("aes") { + return ::scan; + } + } ::scan }); f(buf, out) } +/// Compute escape mask + new carry. Pure bit-twiddling, no SIMD intrinsics. +/// `prev_carry` is 1 iff the previous chunk ended such that the FIRST byte of +/// the current chunk is "escaped" (preceded by an odd-length run of backslashes +/// that ends at byte 0 of this chunk). +#[inline(always)] +pub(crate) fn find_escape_mask_with_carry(bs: u64, prev_carry: &mut u64) -> u64 { + let pc = *prev_carry; + + // Identify run starts: positions where bs[i] is set AND bs[i-1] is not. + let starts = bs & !((bs << 1) | pc); + + let even_bits: u64 = 0x5555_5555_5555_5555; + let odd_bits: u64 = 0xAAAA_AAAA_AAAA_AAAA; + let even_starts = starts & even_bits; + let odd_starts = starts & odd_bits; + + let even_carries = bs.wrapping_add(even_starts); + let odd_carries = bs.wrapping_add(odd_starts); + + let even_carry_ends = even_carries & !bs; + let odd_carry_ends = odd_carries & !bs; + + let escaped_from_runs = (even_carry_ends & odd_bits) | (odd_carry_ends & even_bits); + let escaped = escaped_from_runs | pc; + + let trailing_bs = (!bs).leading_zeros(); + + let new_carry = if bs == u64::MAX { + pc + } else { + (trailing_bs as u64) & 1 + }; + + *prev_carry = new_carry; + escaped +} + +/// Emit all set-bit positions in `mask` (relative to `base`) into `out`. +#[inline(always)] +pub(crate) fn emit_bits(mut mask: u64, base: u32, out: &mut Vec) { + while mask != 0 { + let tz = mask.trailing_zeros(); + out.push(base + tz); + mask &= mask - 1; + } +} + /// Walk a sequence of already-emitted structural offsets and verify that /// `{`/`}` and `[`/`]` are properly paired. String quotes toggle an /// `in_string` flag and are otherwise skipped. This pass trusts the emit diff --git a/src/scan/neon.rs b/src/scan/neon.rs new file mode 100644 index 0000000..568f0f0 --- /dev/null +++ b/src/scan/neon.rs @@ -0,0 +1,271 @@ +#![cfg(target_arch = "aarch64")] + +use core::arch::aarch64::*; +use super::Scanner; + +pub struct NeonScanner; + +impl Scanner for NeonScanner { + fn scan(buf: &[u8], out: &mut Vec) -> Result<(), usize> { + if buf.is_empty() { return Ok(()); } + out.reserve(buf.len() / 6); + // SAFETY: caller (dispatcher in mod.rs) verified `aes` feature is + // present at runtime via `is_aarch64_feature_detected!("aes")`. + unsafe { scan_neon_impl(buf, out) } + } +} + +/// Simulate `_mm_movemask_epi8` for a 128-bit NEON register. +/// Returns a u16 where bit i is the high bit of lane i. +/// The input lanes are expected to be 0xFF (match) or 0x00 (no match). +#[inline(always)] +unsafe fn movemask16(v: uint8x16_t) -> u16 { + // Weight each byte by its bit position within its half-register. + // Lanes 0..7 use weights 1,2,4,8,16,32,64,128 (low byte of result). + // Lanes 8..15 use the same weights but are pairsum'd into the high byte. + const LANE_BITS: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128]; + let lane_mask = vld1q_u8(LANE_BITS.as_ptr()); + // Extract high bit of each lane (all-FF → all-FF, all-00 → all-00). + let hi = vshrq_n_s8(vreinterpretq_s8_u8(v), 7); + let weighted = vandq_u8(vreinterpretq_u8_s8(hi), lane_mask); + // Pairwise sum to condense 16 bytes → 8 u16 → 4 u32 → 2 u64. + let s16 = vpaddlq_u8(weighted); + let s32 = vpaddlq_u16(s16); + let s64 = vpaddlq_u32(s32); + let lo = vgetq_lane_u64(s64, 0) as u16; + let hi = vgetq_lane_u64(s64, 1) as u16; + lo | (hi << 8) +} + +/// Build a u64 mask where bit i is set if byte i (across c0..c3) equals `byte`. +#[inline(always)] +unsafe fn byte_mask64(c0: uint8x16_t, c1: uint8x16_t, c2: uint8x16_t, c3: uint8x16_t, byte: u8) -> u64 { + let v = vdupq_n_u8(byte); + let m0 = movemask16(vceqq_u8(c0, v)) as u64; + let m1 = movemask16(vceqq_u8(c1, v)) as u64; + let m2 = movemask16(vceqq_u8(c2, v)) as u64; + let m3 = movemask16(vceqq_u8(c3, v)) as u64; + m0 | (m1 << 16) | (m2 << 32) | (m3 << 48) +} + +/// Build a u64 mask where bit i is set if byte i is one of: { } [ ] : , " +#[inline(always)] +unsafe fn structural_mask64(c0: uint8x16_t, c1: uint8x16_t, c2: uint8x16_t, c3: uint8x16_t) -> u64 { + let chars: [u8; 7] = [b'{', b'}', b'[', b']', b':', b',', b'"']; + let mut m0: u16 = 0; + let mut m1: u16 = 0; + let mut m2: u16 = 0; + let mut m3: u16 = 0; + for c in chars { + let v = vdupq_n_u8(c); + m0 |= movemask16(vceqq_u8(c0, v)); + m1 |= movemask16(vceqq_u8(c1, v)); + m2 |= movemask16(vceqq_u8(c2, v)); + m3 |= movemask16(vceqq_u8(c3, v)); + } + (m0 as u64) | ((m1 as u64) << 16) | ((m2 as u64) << 32) | ((m3 as u64) << 48) +} + +/// Prefix-XOR via PMULL (carry-less multiply by all-ones) to produce an +/// inside-string mask from the real-quote positions. +/// Returns `(inside_mask, new_in_string_state)` where state is 0 or 1. +#[target_feature(enable = "neon,aes")] +unsafe fn inside_string_neon(real_quote: u64, prev_in_string: u64) -> (u64, u64) { + // vmull_p64(a, u64::MAX) = prefix XOR of bits in `a`. + let result = vmull_p64(real_quote, u64::MAX); + // Extract low 64 bits of the 128-bit poly result. + let result_v: uint64x2_t = vreinterpretq_u64_p128(result); + let mut mask = vgetq_lane_u64(result_v, 0); + if prev_in_string != 0 { + mask = !mask; + } + let new_state = (mask >> 63) & 1; + (mask, new_state) +} + +#[target_feature(enable = "neon,aes")] +unsafe fn scan_neon_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { + let mut i = 0usize; + let mut bs_carry: u64 = 0; + let mut in_string: u64 = 0; + + while i + 64 <= buf.len() { + let c0 = vld1q_u8(buf.as_ptr().add(i)); + let c1 = vld1q_u8(buf.as_ptr().add(i + 16)); + let c2 = vld1q_u8(buf.as_ptr().add(i + 32)); + let c3 = vld1q_u8(buf.as_ptr().add(i + 48)); + + let backslash = byte_mask64(c0, c1, c2, c3, b'\\'); + let quote = byte_mask64(c0, c1, c2, c3, b'"'); + + // In-string fast probe: skip the escape/prefix-XOR path entirely when + // we are already inside a string and there are no quotes or backslashes. + if in_string != 0 && (backslash | quote) == 0 { + bs_carry = 0; + i += 64; + continue; + } + + let escaped = super::find_escape_mask_with_carry(backslash, &mut bs_carry); + let real_quote = quote & !escaped; + let (inside, new_in_string) = inside_string_neon(real_quote, in_string); + in_string = new_in_string; + + let struct_mask = structural_mask64(c0, c1, c2, c3); + let final_mask = (struct_mask & !inside) | real_quote; + super::emit_bits(final_mask, i as u32, out); + i += 64; + } + + // Tail (<64 bytes): hand off to scalar emit, carrying in_string / bs_carry state. + if i < buf.len() { + let scalar_start = if in_string != 0 && bs_carry != 0 { i + 1 } else { i }; + super::scalar::scan_emit_resume(buf, scalar_start, in_string != 0, out)?; + } else if in_string != 0 { + return Err(buf.len()); + } + + super::validate_brackets(buf, out) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scan::{Scanner, scalar::ScalarScanner}; + + fn host_supports_neon_aes() -> bool { + std::arch::is_aarch64_feature_detected!("aes") + } + + fn parity(input: &[u8]) { + let mut a = Vec::new(); + let mut b = Vec::new(); + let ra = ScalarScanner::scan(input, &mut a); + let rb = NeonScanner::scan(input, &mut b); + assert_eq!(ra, rb, "result mismatch on {:?}", std::str::from_utf8(input).unwrap_or("(non-utf8)")); + assert_eq!(a, b, "indices mismatch on {:?}", std::str::from_utf8(input).unwrap_or("(non-utf8)")); + } + + #[test] + fn no_strings_matches_scalar() { + if !host_supports_neon_aes() { return; } + parity(b"{}"); + parity(b"[]"); + parity(b"[{}]"); + parity(b"[[[]]]"); + parity(b"[1,2,3,4,5,6,7,8,9,0]"); + parity(b"[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]"); + } + + #[test] + fn within_chunk_strings_match_scalar() { + if !host_supports_neon_aes() { return; } + parity(b"{\"a\":\"hello\"}"); + parity(b"{\"a\":\"he\\nlo\"}"); + parity(b"{\"a\":\"he\\\"lo\"}"); + parity(b"[\"x\",\"y\",\"z\"]"); + } + + #[test] + fn chunked_path_with_string() { + if !host_supports_neon_aes() { return; } + let mut buf = Vec::with_capacity(64); + buf.extend_from_slice(b"{\"k\":\""); + buf.resize(62, b'a'); + buf.extend_from_slice(b"\"}"); + assert_eq!(buf.len(), 64); + parity(&buf); + } + + #[test] + fn chunked_path_with_escapes() { + if !host_supports_neon_aes() { return; } + let mut buf = Vec::with_capacity(64); + buf.extend_from_slice(b"{\"k\":\"aa\\\"bb\\\\cc"); + while buf.len() < 62 { buf.push(b'x'); } + buf.push(b'"'); + buf.push(b'}'); + assert_eq!(buf.len(), 64); + parity(&buf); + } + + #[test] + fn long_string_engages_skip_fastpath() { + if !host_supports_neon_aes() { return; } + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"k\":\""); + buf.resize(buf.len() + 1_048_576, b'a'); + buf.extend_from_slice(b"\"}"); + while buf.len() % 64 != 0 { buf.push(b' '); } + parity(&buf); + } + + #[test] + fn backslash_at_chunk_boundary() { + if !host_supports_neon_aes() { return; } + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"key\":\""); + while buf.len() < 63 { buf.push(b'x'); } + buf.push(b'\\'); + buf.push(b'"'); + buf.push(b'y'); + buf.push(b'"'); + buf.push(b'}'); + parity(&buf); + } + + #[test] + fn unaligned_tail_parity() { + if !host_supports_neon_aes() { return; } + for tail_len in [1usize, 5, 17, 33, 63] { + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"key\":\""); + while buf.len() < 60 { buf.push(b'x'); } + buf.extend_from_slice(b"abc\"}"); + let target = 64 + tail_len; + while buf.len() < target { buf.push(b' '); } + assert_eq!(buf.len(), target, "test setup"); + parity(&buf); + } + } + + #[test] + fn string_crosses_neon_boundary() { + if !host_supports_neon_aes() { return; } + let mut buf = Vec::new(); + buf.extend_from_slice(b"{\"k\":\""); + while buf.len() < 80 { buf.push(b'a'); } + buf.push(b'"'); + buf.push(b'}'); + parity(&buf); + } + + #[test] + fn pclmul_inside_string_correct() { + if !host_supports_neon_aes() { return; } + let mut buf = Vec::with_capacity(64); + buf.extend_from_slice(b"{\"a\":\"foo\",\"b\":\"bar\"}"); + while buf.len() < 64 { buf.push(b' '); } + assert_eq!(buf.len(), 64); + parity(&buf); + + let mut buf2 = Vec::with_capacity(64); + buf2.extend_from_slice(b"[\"a\",\"b\",\"c\",\"d\",\"e\"]"); + while buf2.len() < 64 { buf2.push(b' '); } + parity(&buf2); + + let mut buf3 = Vec::with_capacity(64); + buf3.extend_from_slice(b"{\"a\":\"\\\\\\\\\\\"\"}"); + while buf3.len() < 64 { buf3.push(b' '); } + parity(&buf3); + } + + #[test] + fn invalid_bracket_detected() { + if !host_supports_neon_aes() { return; } + // Mismatch detected in scalar tail (short input) + assert!(NeonScanner::scan(b"{]", &mut Vec::new()).is_err()); + assert!(NeonScanner::scan(b"[}", &mut Vec::new()).is_err()); + assert!(NeonScanner::scan(b"{\"a\":\"foo\"", &mut Vec::new()).is_err()); + } +} diff --git a/src/scan/scalar.rs b/src/scan/scalar.rs index 1608d78..634a3f0 100644 --- a/src/scan/scalar.rs +++ b/src/scan/scalar.rs @@ -4,12 +4,40 @@ pub struct ScalarScanner; impl Scanner for ScalarScanner { fn scan(buf: &[u8], out: &mut Vec) -> Result<(), usize> { - out.reserve(buf.len() / 6); - scan_emit_resume(buf, 0, false, out)?; - super::validate_brackets(buf, out) + scan_and_validate(buf, out) } } +/// Single-pass: emit structural offsets AND validate bracket pairing inline. +/// Replaces the two-pass `scan_emit_resume` + `validate_brackets` sequence. +pub(crate) fn scan_and_validate(buf: &[u8], out: &mut Vec) -> Result<(), usize> { + out.reserve(buf.len() / 6); + let mut i = 0usize; + let mut in_str = false; + let mut stack: Vec = Vec::with_capacity(32); + while i < buf.len() { + let b = buf[i]; + if in_str { + if b == b'\\' { i += 2; continue; } + if b == b'"' { in_str = false; out.push(i as u32); } + i += 1; + continue; + } + match b { + b'"' => { in_str = true; out.push(i as u32); } + b'{' | b'[' => { stack.push(b); out.push(i as u32); } + b'}' => { out.push(i as u32); if stack.pop() != Some(b'{') { return Err(i); } } + b']' => { out.push(i as u32); if stack.pop() != Some(b'[') { return Err(i); } } + b':' | b',' => { out.push(i as u32); } + _ => {} + } + i += 1; + } + if in_str { return Err(buf.len()); } + if !stack.is_empty() { return Err(buf.len()); } + Ok(()) +} + /// Emit structural-character offsets for `buf[start..]`, continuing from a /// given in-string state. Does NOT validate bracket pairing; the caller is /// responsible for running `validate_brackets` over the emitted offsets. diff --git a/tests/scanner_crosscheck.proptest-regressions b/tests/scanner_crosscheck.proptest-regressions new file mode 100644 index 0000000..a3d67ab --- /dev/null +++ b/tests/scanner_crosscheck.proptest-regressions @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc c6c0fe08741f776c4161661e285e9889c0e3ef6fa4b832484adcae4f6d264c25 # shrinks to input = "}{" diff --git a/tests/scanner_crosscheck.rs b/tests/scanner_crosscheck.rs index 209ac5a..f27b737 100644 --- a/tests/scanner_crosscheck.rs +++ b/tests/scanner_crosscheck.rs @@ -18,15 +18,15 @@ proptest! { let mut b = Vec::new(); let ra = ScalarScanner::scan(input.as_bytes(), &mut a); let rb = Avx2Scanner::scan(input.as_bytes(), &mut b); - // Both paths run the same scan_emit_resume + validate_brackets - // pipeline, so Result equality is required: same Ok/Err verdict - // AND same error offset when Err. + // Both scanners must agree on Ok vs Err (and on the error offset). prop_assert_eq!(&ra, &rb, "scan results differ for {:?}", input); - // Indices are produced entirely by scan_emit_resume (which walks - // through end-of-buffer before any Err) and are not modified by - // validate_brackets, so both `a` and `b` reflect the full emit - // regardless of whether the final result was Ok or Err. - prop_assert_eq!(&a, &b, "indices differ for {:?}", input); + // On success, indices must be identical. On error, the partial + // emit may differ: the fused scalar (scan_and_validate) aborts at + // the first bracket mismatch, while AVX2 emits all structural + // chars before validate_brackets runs. Only compare on Ok. + if ra.is_ok() { + prop_assert_eq!(&a, &b, "indices differ for {:?}", input); + } } } @@ -55,4 +55,63 @@ fn valid_jsonish() -> impl Strategy { } #[cfg(not(all(target_arch = "x86_64", feature = "avx2")))] -#[test] fn skip() {} +#[test] fn skip_avx2() {} + +// ── NEON cross-check ────────────────────────────────────────────────────────── + +#[cfg(target_arch = "aarch64")] +use proptest::prelude::*; + +#[cfg(target_arch = "aarch64")] +use quickdecode::__test_api::{Scanner, ScalarScanner, NeonScanner}; + +#[cfg(target_arch = "aarch64")] +proptest! { + #![proptest_config(ProptestConfig::with_cases(2000))] + + #[test] + fn scalar_neon_bit_identical(input in neon_valid_jsonish()) { + if !std::arch::is_aarch64_feature_detected!("aes") { + return Ok(()); + } + let mut a = Vec::new(); + let mut b = Vec::new(); + let ra = ScalarScanner::scan(input.as_bytes(), &mut a); + let rb = NeonScanner::scan(input.as_bytes(), &mut b); + // Both scanners must agree on Ok vs Err (and on the error offset). + prop_assert_eq!(&ra, &rb, "scan results differ for {:?}", input); + // On success, indices must be identical. On error, the partial + // emit may differ between fused-scalar and two-pass NEON because + // the fused path stops at the first bracket error while NEON emits + // all structural chars before validating; only check on Ok. + if ra.is_ok() { + prop_assert_eq!(&a, &b, "indices differ for {:?}", input); + } + } +} + +#[cfg(target_arch = "aarch64")] +fn neon_valid_jsonish() -> impl Strategy { + proptest::collection::vec( + prop_oneof![ + Just("{".to_string()), + Just("}".to_string()), + Just("[".to_string()), + Just("]".to_string()), + Just(",".to_string()), + Just(":".to_string()), + Just("\"a\"".to_string()), + Just("\"\\\\\"".to_string()), + Just("\"\\\"\"".to_string()), + Just("\"\\u00e9\"".to_string()), + Just("\"中文\"".to_string()), + Just("123".to_string()), + Just("\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"".to_string()), + Just("\"\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\"".to_string()), + ], + 0..200, + ).prop_map(|v| v.concat()) +} + +#[cfg(not(target_arch = "aarch64"))] +#[test] fn skip_neon() {}