diff --git a/README.md b/README.md index 94adb75..9054aef 100644 --- a/README.md +++ b/README.md @@ -103,10 +103,10 @@ ARM64 (Apple M4, NEON/PMULL scanner, same workload): | Size | cjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson | |---:|---:|---:|---:|---:| -| 2 KB | 254,738 | 654,108 | 392,711 | 2.6× / 1.5× | -| 100 KB | 15,281 | 108,932 | 99,701 | 7.1× / 6.5× | -| 1 MB | 1,523 | 11,905 | 11,876 | 7.8× / 7.8× | -| 10 MB | 153 | 1,218 | 1,222 | 8.0× / 8.0× | +| 2 KB | 237,124 | 705,000 | 390,000 | 3.0× / 1.6× | +| 100 KB | 14,667 | 232,000 | 208,000 | 15.8× / 14.2× | +| 1 MB | 1,494 | 33,700 | 33,000 | 22.6× / 22.1× | +| 10 MB | 150 | 3,376 | 3,454 | 22.5× / 23.0× | See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, memory numbers, an "encode round-trip" row (passthrough emit via diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs index 2161f7d..91d7584 100644 --- a/src/scan/avx2.rs +++ b/src/scan/avx2.rs @@ -38,6 +38,22 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { if interesting == 0 { bs_carry = 0; i += 64; + // Cross-chunk jump: no quote/backslash means in_string polarity + // cannot flip and no escape can start, so jump straight to the + // 64B-aligned chunk containing the next interesting byte. + // The 4 KB remaining-buffer threshold suppresses the memchr2 + // call entirely on small payloads (≤4 KB total), where the per- + // call overhead exceeds the in-string probe loop it would + // replace. On larger payloads only the last 4 KB forgoes the + // jump — negligible against MB-scale gains. 
+ if i + 4096 <= buf.len() { + let scan_end = buf.len() - (buf.len() % 64); + let jump = match memchr::memchr2(b'"', b'\\', &buf[i..scan_end]) { + Some(rel) => rel & !63, + None => scan_end - i, + }; + i += jump; + } continue; } } diff --git a/src/scan/neon.rs b/src/scan/neon.rs index 91d3f43..7c1db0a 100644 --- a/src/scan/neon.rs +++ b/src/scan/neon.rs @@ -176,6 +176,24 @@ unsafe fn scan_neon_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { if (quote_probe | backslash_probe) == 0 { bs_carry = 0; i += 64; + // Cross-chunk jump: with no quote/backslash in the chunk we just + // skipped, in_string polarity cannot flip and no escape can start, + // so we can use memchr2 to skip ahead to the 64B-aligned chunk + // containing the next interesting byte. Bounded by the last full + // 64B chunk; the <64B tail is handled by the scalar resume path. + // The 4 KB remaining-buffer threshold suppresses the memchr2 + // call entirely on small payloads (≤4 KB total), where the per- + // call overhead exceeds the in-string probe loop it would + // replace. On larger payloads only the last 4 KB forgoes the + // jump — negligible against MB-scale gains. + if i + 4096 <= buf.len() { + let scan_end = buf.len() - (buf.len() % 64); + let jump = match memchr::memchr2(b'"', b'\\', &buf[i..scan_end]) { + Some(rel) => rel & !63, + None => scan_end - i, + }; + i += jump; + } continue; } }