From 88bf464cfe381d24d0386879eac1224d2ef09a2a Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 15:53:58 +0000
Subject: [PATCH 01/17] docs: add eager SIMD optimization design spec

Merge validate_depth, validate_trailing, and validate_eager_values into a
single fused pass. Replace AVX2 string validation with PSHUFB nibble-LUT
byte classifier. Add AVX-512 dual path. Add SIMD number validation.
---
 .gitignore                                    |   1 +
 .../2026-05-22-fuse-eager-simd-design.md      | 131 ++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 docs/superpowers/specs/2026-05-22-fuse-eager-simd-design.md

diff --git a/.gitignore b/.gitignore
index 3d337aa..c352974 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 /target/
 **/*.rs.bk
 .worktrees/
+docs/superpowers/specs/
 
 # Cargo lock — uncomment to commit (recommended for binary crates, optional for cdylib/rlib).
 # Currently left untracked; remove this comment block and the line below to start tracking.
diff --git a/docs/superpowers/specs/2026-05-22-fuse-eager-simd-design.md b/docs/superpowers/specs/2026-05-22-fuse-eager-simd-design.md
new file mode 100644
index 0000000..4f941f6
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-22-fuse-eager-simd-design.md
@@ -0,0 +1,131 @@
+# Fuse & Accelerate: Eager Decode SIMD Optimization
+
+Date: 2026-05-22
+Branch: `fuse-eager-passes`
+
+## Motivation
+
+The eager decode path (`Document::parse_with_options` in `src/doc.rs`) runs **3 independent passes** over the `indices` array after structural scanning:
+
+1. `validate_depth` — depth counting
+2. `validate_trailing` — reject trailing non-whitespace
+3. `validate_eager_values` — grammar state machine + string validation + number validation
+
+Each pass is a scalar O(indices) walk. Additionally, string validation SIMD (`strings/avx2.rs`) is conservative: it hands off to scalar on the *first* interesting byte found (backslash, control, or high-bit), leaving most of the SIMD register width unused on mixed content. Number validation has no SIMD path at all.
+
+Target: ASCII-dominant JSON payloads (REST APIs, config files), x86-64 with AVX2 + AVX-512 runtime dispatch, throughput-focused.
+
+## Architecture
+
+### Pass Fusion
+
+Merge three post-scan validation passes into one:
+
+```
+Before:
+  scan(buf) → indices
+  validate_depth(buf, indices, max_depth)
+  validate_trailing(buf, indices)
+  validate_eager_values(buf, indices)
+
+After:
+  scan(buf) → indices
+  validate_eager_fused(buf, indices, max_depth)
+```
+
+`validate_eager_fused` integrates depth checking and trailing-content detection into the existing grammar state machine:
+
+- **Depth**: increment on `{`/`[` push; if depth > max_depth → `QJSON_NESTING_TOO_DEEP`.
+- **Trailing**: after the grammar state reaches `TopDone`, any further non-whitespace byte → `QJSON_TRAILING_CONTENT`.
+
+The `CtxKind` enum and state-machine structure from `validate_eager_values` are preserved. The existing `validate_depth` and `validate_trailing` functions remain in the codebase but are no longer called in the eager hot path (they stay available for lazy mode or internal reuse).
+
+### PSHUFB Byte Classifier for String Validation
+
+Replace the current AVX2 "find-first-interesting-byte-then-scalar" approach with a **nibble-LUT byte classifier** using `_mm256_shuffle_epi8` (PSHUFB).
+
+**Classification bits** (one u8 per byte):
+
+| Bit | Meaning |
+|-----|---------|
+| 0   | Control char (0x00..0x1F) |
+| 1   | Backslash (0x5C) |
+| 2   | High-bit byte (0x80..0xFF) |
+| 3   | Printable ASCII (0x20..0x7E, excluding backslash) |
+
+**Algorithm per 32-byte chunk:**
+1. Split each byte into high-nibble and low-nibble via shift + mask.
+2. `_mm256_shuffle_epi8(lo_lut, lo_nibble)` and `_mm256_shuffle_epi8(hi_lut, hi_nibble)` (the LUT is the first operand, the nibble indices the second).
+3. AND low and high LUT results → per-byte class bitmask.
+4. If any bit 0 set → `QJSON_INVALID_STRING` (control char).
+5. If bits 1 and 2 are zero → pure printable ASCII, advance 32 bytes.
+6. Otherwise: scan class bitmask for backslash positions, validate escape sequences; for high-bit bytes, run SIMD-enhanced UTF-8 validation.
+
+Key improvement: the classifier tells us **exactly which bytes need what kind of attention**, rather than a binary "there's a problem here". Multiple backslashes in one chunk are all located without re-scanning. High-bit bytes are identified by position, enabling batch UTF-8 validation.
+
+### AVX-512 Dual Path
+
+New file `src/validate/strings/avx512.rs`, dispatched at runtime via the existing `OnceCell` pattern in `strings/mod.rs`.
+
+| Feature | AVX2 | AVX-512 |
+|---------|------|---------|
+| Register width | 32B (ymm) | 64B (zmm) |
+| Movemask | `_mm256_movemask_epi8` → u32 | `_mm256_movepi8_mask` (AVX512BW/VL) → `__mmask32`, zero-cost |
+| Byte classifier | Two ymm PSHUFB per chunk | Two ymm PSHUFB per 32B half (AVX-512VBMI not required) |
+| Masking | Manual `u32` bitmask | Native `__mmask32` with `_mm256_maskz_*` operations |
+| Chunk throughput | 32B/iter | 64B/iter (loop processes two 32B halves) |
+
+**Dispatch priority**: AVX-512 (Ice Lake 2019+, Zen 4 2022+) → AVX2 (Haswell 2013+) → scalar fallback.
+
+**Not included**: AVX-512VBMI (`vpermb` for zmm-wide PSHUFB). This requires Cannon Lake/Ice Lake+ and the gain over loop-unrolled ymm PSHUFB is marginal for string validation.
+
+### SIMD-Accelerated Number Validation
+
+Extend the PSHUFB classifier with two additional bits:
+
+| Bit | Meaning |
+|-----|---------|
+| 4   | Digit (0x30..0x39) |
+| 5   | Number structural (0x2E `.`, 0x2D `-`, 0x65 `e`, 0x45 `E`, 0x2B `+`) |
+
+**Hot path** for numbers in `consume_scalar_gap`:
+1. Classify 32-byte chunk(s) of the number byte range.
+2. `illegal = !(digit | structural)` — if mask is non-zero, scalar fallback handles exact error location.
+3. Validate ABNF structure: leading zero check, digit-after-dot check, digit-after-exponent check — verified via popcount and bit-scan on the classification mask, falling back to the existing scalar `validate_number` for precise error codes when structure is violated.
+
+When a number is short (≤32 bytes, i.e. the vast majority of real-world numbers), it fits in one SIMD iteration. The existing scalar `validate_number` remains as fallback for correctness and precise error reporting.
+
+## Files Changed
+
+| File | Change |
+|------|--------|
+| `src/validate/mod.rs` | Add `validate_eager_fused()` merging depth + trailing + grammar. Keep existing functions. |
+| `src/validate/strings/avx2.rs` | Rewrite with PSHUFB nibble-LUT classifier. |
+| `src/validate/strings/avx512.rs` | **New.** AVX-512BW+VL 64B chunk path. |
+| `src/validate/strings/mod.rs` | Add AVX-512 to dispatch. |
+| `src/validate/number.rs` | Add `validate_number_simd()` with PSHUFB classifier. |
+| `src/doc.rs` | Replace 3 validate calls with single `validate_eager_fused`. |
+| `Cargo.toml` | Optionally add `avx512` feature gate (feature name only; dispatch uses runtime detection). |
+
+## Files NOT Changed
+
+- `src/scan/` — structural scanner unchanged.
+- `src/cursor.rs`, `src/path.rs` — Phase 2 unchanged.
+- `src/decode/` — lazy decode unchanged (still calls `validate_string_span` which now uses the new SIMD paths transparently).
+- `src/ffi.rs`, `lua/qjson.lua` — FFI surface unchanged.
+- `include/qjson.h` — public header unchanged.
+
+## Risks
+
+1. **Error-code precedence.** When the fused pass encounters multiple errors simultaneously (e.g., depth violation AND invalid string), the current behavior reports the first one detected. The fused pass must preserve this.
+2. **AVX-512 dispatch stability.** Some VM/hypervisor configurations mask AVX-512 CPUID bits inconsistently. The existing `is_x86_feature_detected!()` pattern is proven safe for this.
+3. **PSHUFB LUT correctness.** The 16-entry nibble LUTs must be exhaustively verified against the existing scalar validator for all 256 byte values. This is done in unit tests.
+
+## Expected Performance Impact
+
+- **Pass fusion**: ~15-25% throughput improvement for small-to-medium payloads (eliminates 2 full indices traversals).
+- **PSHUFB string validation**: ~20-40% improvement for string-heavy payloads (no premature scalar fallback; CJK/escape content benefits most).
+- **AVX-512 string validation**: ~10-15% additional improvement over AVX2 (2× chunk width, native mask registers).
+- **SIMD number validation**: ~10-20% improvement for number-dense payloads (arrays of numbers, metrics responses).
+
+Combined estimate: **30-50%** throughput improvement on typical REST API payloads.

From 9e9ac750ad613f96c8737270acdb4f76ca39d5a3 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 16:00:50 +0000
Subject: [PATCH 02/17] docs: add fuse-eager SIMD implementation plan

8 tasks: PSHUFB classifier, AVX2 string rewrite, AVX-512 path,
SIMD number validation, pass fusion, doc.rs wiring, full test
verification, CLAUDE.md update.
---
 .../plans/2026-05-22-fuse-eager-simd-plan.md  | 1158 +++++++++++++++++
 1 file changed, 1158 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-05-22-fuse-eager-simd-plan.md

diff --git a/docs/superpowers/plans/2026-05-22-fuse-eager-simd-plan.md b/docs/superpowers/plans/2026-05-22-fuse-eager-simd-plan.md
new file mode 100644
index 0000000..8910091
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-22-fuse-eager-simd-plan.md
@@ -0,0 +1,1158 @@
+# Fuse & Accelerate: Eager SIMD Optimization — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Fuse the 3 eager-validation passes over `indices` into 1 pass, and accelerate string/number validation with PSHUFB nibble-LUT byte classification (AVX2 + AVX-512).
+
+**Architecture:** A new shared byte-classifier module (`classify.rs`) provides PSHUFB-based per-byte class bitmasks. String validation AVX2 is rewritten to use it (no early-scalar-fallback). An AVX-512 path is added. Number validation gains a SIMD fast path. The `validate_depth`, `validate_trailing`, and `validate_eager_values` functions are merged into `validate_eager_fused` — a single O(indices) traversal. `doc.rs` is updated to call only `validate_eager_fused`.
+
+**Tech Stack:** Rust, x86_64 intrinsics (AVX2 + AVX-512BW/VL), existing `once_cell` dispatch.
+
+---
+
+### Task 1: PSHUFB byte classifier module
+
+**Files:**
+- Create: `src/validate/classify.rs`
+- Modify: `src/validate/mod.rs` (add module declaration)
+
+- [ ] **Step 1: Create `src/validate/classify.rs`**
+
+```rust
+//! PSHUFB nibble-LUT byte classifier shared by string and number
+//! validation. Maps each byte to a class bitmask in a single SIMD
+//! instruction sequence.
+//!
+//! Classification: split each byte into high/low nibble, lookup two
+//! 16-entry LUTs via `_mm256_shuffle_epi8`, AND the results. The AND
+//! means a classification bit is set only if BOTH nibbles allow it.
+
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+/// Class bits for string validation.
+pub(crate) const CLS_CTRL:  u8 = 1 << 0; // control char 0x00..=0x1F
+pub(crate) const CLS_BS:    u8 = 1 << 1; // backslash 0x5C
+pub(crate) const CLS_HIGH:  u8 = 1 << 2; // high-bit byte >= 0x80
+
+/// Class bits for number validation (includes string bits for reuse).
+pub(crate) const CLS_DIGIT: u8 = 1 << 3; // digit 0x30..=0x39
+pub(crate) const CLS_NUMS:  u8 = 1 << 4; // number structural: . - e E +
+
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+pub(crate) unsafe fn classify_str_chunk(chunk: __m256i) -> __m256i {
+    classify_chunk(chunk, &STR_LO_LUT, &STR_HI_LUT)
+}
+
+/// Classify each byte in the 32-byte chunk. Returns a u32 mask where
+/// bit i is set if byte i has any "interesting" class bits
+/// (CTRL | BS | HIGH).
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+pub(crate) unsafe fn classify_str_mask(chunk: __m256i) -> u32 {
+    let class = classify_chunk(chunk, &STR_LO_LUT, &STR_HI_LUT);
+    // Extract the "interesting" bits: any non-zero class byte means
+    // attention needed. We check CTRL | BS | HIGH bits.
+    let want = _mm256_set1_epi8((CLS_CTRL | CLS_BS | CLS_HIGH) as i8);
+    let match_mask = _mm256_and_si256(class, want);
+    _mm256_movemask_epi8(_mm256_cmpeq_epi8(match_mask, _mm256_setzero_si256())) as u32 ^ 0xFFFFFFFFu32
+}
+
+/// Classify a number chunk. Returns the per-byte class vector so the
+/// caller can check DIGIT | NUMS validity.
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+pub(crate) unsafe fn classify_num_chunk(chunk: __m256i) -> (__m256i, u32) {
+    let class = classify_chunk(chunk, &NUM_LO_LUT, &NUM_HI_LUT);
+    // Flag the bytes that carry neither DIGIT nor NUMS (compare-to-zero already marks them).
+    let valid = _mm256_set1_epi8((CLS_DIGIT | CLS_NUMS) as i8);
+    let bad = _mm256_cmpeq_epi8(_mm256_and_si256(class, valid), _mm256_setzero_si256());
+    let bad_mask = _mm256_movemask_epi8(bad) as u32;
+    (class, bad_mask)
+}
+
+/// Core PSHUFB nibble-LUT classifier: returns per-byte class bitmask.
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+unsafe fn classify_chunk(chunk: __m256i, lo_lut: &[u8; 16], hi_lut: &[u8; 16]) -> __m256i {
+    let zero = _mm256_setzero_si256();
+    let nib_mask = _mm256_set1_epi8(0x0F_i8);
+    let lo_lut_vec = _mm256_loadu_si256(
+        [lo_lut[0], lo_lut[1], lo_lut[2], lo_lut[3],
+         lo_lut[4], lo_lut[5], lo_lut[6], lo_lut[7],
+         lo_lut[8], lo_lut[9], lo_lut[10], lo_lut[11],
+         lo_lut[12], lo_lut[13], lo_lut[14], lo_lut[15],
+         lo_lut[0], lo_lut[1], lo_lut[2], lo_lut[3],
+         lo_lut[4], lo_lut[5], lo_lut[6], lo_lut[7],
+         lo_lut[8], lo_lut[9], lo_lut[10], lo_lut[11],
+         lo_lut[12], lo_lut[13], lo_lut[14], lo_lut[15],
+        ].as_ptr() as *const __m256i,
+    );
+    // Build hi_lut vector (same layout).
+    let hi_lut_vec = _mm256_loadu_si256(
+        [hi_lut[0], hi_lut[1], hi_lut[2], hi_lut[3],
+         hi_lut[4], hi_lut[5], hi_lut[6], hi_lut[7],
+         hi_lut[8], hi_lut[9], hi_lut[10], hi_lut[11],
+         hi_lut[12], hi_lut[13], hi_lut[14], hi_lut[15],
+         hi_lut[0], hi_lut[1], hi_lut[2], hi_lut[3],
+         hi_lut[4], hi_lut[5], hi_lut[6], hi_lut[7],
+         hi_lut[8], hi_lut[9], hi_lut[10], hi_lut[11],
+         hi_lut[12], hi_lut[13], hi_lut[14], hi_lut[15],
+        ].as_ptr() as *const __m256i,
+    );
+
+    let lo_nib = _mm256_and_si256(chunk, nib_mask);
+    let hi_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(chunk), nib_mask);
+
+    let lo_class = _mm256_shuffle_epi8(lo_lut_vec, lo_nib);
+    let hi_class = _mm256_shuffle_epi8(hi_lut_vec, hi_nib);
+
+    _mm256_and_si256(lo_class, hi_class)
+}
+
+// ── String classification LUTs ──────────────────────────────────────
+// CTRL: 0x00..=0x1F (high nibble 0x0..=0x1, any low nibble)
+// BS:   0x5C          (high nibble 0x5,   low nibble 0xC)
+// HIGH: 0x80..=0xFF   (high nibble 0x8..=0xF, any low nibble)
+
+#[cfg(target_arch = "x86_64")]
+static STR_LO_LUT: [u8; 16] = {
+    let mut l = [0u8; 16];
+    let mut i = 0usize;
+    while i < 16 {
+        l[i] = CLS_CTRL | CLS_HIGH;
+        i += 1;
+    }
+    l[0xC] |= CLS_BS; // backslash low nibble
+    l
+};
+
+#[cfg(target_arch = "x86_64")]
+static STR_HI_LUT: [u8; 16] = {
+    let mut l = [0u8; 16];
+    l[0x0] = CLS_CTRL;
+    l[0x1] = CLS_CTRL;
+    l[0x5] = CLS_BS; // backslash high nibble
+    l[0x8] = CLS_HIGH;
+    l[0x9] = CLS_HIGH;
+    l[0xA] = CLS_HIGH;
+    l[0xB] = CLS_HIGH;
+    l[0xC] = CLS_HIGH;
+    l[0xD] = CLS_HIGH;
+    l[0xE] = CLS_HIGH;
+    l[0xF] = CLS_HIGH;
+    l
+};
+
+// ── Number classification LUTs ──────────────────────────────────────
+// DIGIT:      0x30..=0x39 (high nibble 0x3, low nibble 0x0..=0x9)
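+// NUM_STRUCT: 0x2E '.', 0x2D '-', 0x2B '+', 0x65 'e', 0x45 'E'. NOTE(review): a single NUMS bit ANDed across nibbles also matches '%', 'K', 'M', 'N', 'k', 'm', 'n' (high {2,4,6} x low {5,B,D,E}); punctuation and exponent need separate class bits.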
+
+#[cfg(target_arch = "x86_64")]
+static NUM_LO_LUT: [u8; 16] = {
+    let mut l = STR_LO_LUT;
+    // digits: low nibble 0..9
+    l[0x0] |= CLS_DIGIT;
+    l[0x1] |= CLS_DIGIT;
+    l[0x2] |= CLS_DIGIT;
+    l[0x3] |= CLS_DIGIT;
+    l[0x4] |= CLS_DIGIT;
+    l[0x5] |= CLS_DIGIT | CLS_NUMS; // also 'e'/'E' low nibble
+    l[0x6] |= CLS_DIGIT;
+    l[0x7] |= CLS_DIGIT;
+    l[0x8] |= CLS_DIGIT;
+    l[0x9] |= CLS_DIGIT;
+    // number structural low nibbles
+    l[0xB] |= CLS_NUMS; // '+'
+    l[0xD] |= CLS_NUMS; // '-'
+    l[0xE] |= CLS_NUMS; // '.'
+    l
+};
+
+#[cfg(target_arch = "x86_64")]
+static NUM_HI_LUT: [u8; 16] = {
+    let mut l = STR_HI_LUT;
+    // digits: high nibble 0x3
+    l[0x3] |= CLS_DIGIT;
+    // number structural high nibbles
+    l[0x2] |= CLS_NUMS; // '.', '-', '+'
+    l[0x4] |= CLS_NUMS; // 'E'
+    l[0x6] |= CLS_NUMS; // 'e'
+    l
+};
+
+#[cfg(test)]
+#[cfg(target_arch = "x86_64")]
+mod tests {
+    use super::*;
+
+    /// Verify the classifier against the scalar string validator for
+    /// all 256 possible byte values. The classifier's bits must be
+    /// consistent with the ground-truth ranges.
+    #[test]
+    fn lut_exhaustive_consistency() {
+        if !std::is_x86_feature_detected!("avx2") { return; }
+        let mut buf = [0u8; 32];
+        for b in 0..=255u8 {
+            buf[0] = b;
+            unsafe {
+                let chunk = _mm256_loadu_si256(buf.as_ptr() as *const __m256i);
+                let class = classify_chunk(chunk, &STR_LO_LUT, &STR_HI_LUT);
+                let class_byte = _mm256_extract_epi8(class, 0) as u8;
+
+                let expect_ctrl = if b < 0x20 { CLS_CTRL } else { 0 };
+                let expect_bs   = if b == b'\\' { CLS_BS } else { 0 };
+                let expect_high = if b >= 0x80 { CLS_HIGH } else { 0 };
+                let expected = expect_ctrl | expect_bs | expect_high;
+
+                assert_eq!(
+                    class_byte, expected,
+                    "byte 0x{:02X}: got 0x{:02X}, expected 0x{:02X}",
+                    b, class_byte, expected,
+                );
+            }
+        }
+    }
+
+    /// Verify number classification for all 256 byte values.
+    #[test]
+    fn num_lut_exhaustive_consistency() {
+        if !std::is_x86_feature_detected!("avx2") { return; }
+        let mut buf = [0u8; 32];
+        for b in 0..=255u8 {
+            buf[0] = b;
+            unsafe {
+                let chunk = _mm256_loadu_si256(buf.as_ptr() as *const __m256i);
+                let class = classify_chunk(chunk, &NUM_LO_LUT, &NUM_HI_LUT);
+                let class_byte = _mm256_extract_epi8(class, 0) as u8;
+
+                let expect_digit = if matches!(b, b'0'..=b'9') { CLS_DIGIT } else { 0 };
+                let expect_nums = if matches!(b, b'.' | b'-' | b'+' | b'e' | b'E') { CLS_NUMS } else { 0 };
+                // NUM LUT inherits STR bits too.
+                let expect_str = {
+                    let c = if b < 0x20 { CLS_CTRL } else { 0 };
+                    let s = if b == b'\\' { CLS_BS } else { 0 };
+                    let h = if b >= 0x80 { CLS_HIGH } else { 0 };
+                    c | s | h
+                };
+                let expected = expect_str | expect_digit | expect_nums;
+                assert_eq!(
+                    class_byte, expected,
+                    "byte 0x{:02X}: got 0x{:02X}, expected 0x{:02X}",
+                    b, class_byte, expected,
+                );
+            }
+        }
+    }
+}
+```
+
+- [ ] **Step 2: Add module declaration to `src/validate/mod.rs`**
+
+Add after the existing `mod` declarations (after line 10 `pub(crate) use strings::validate_string_span;`):
+
+```rust
+pub(crate) mod classify;
+```
+
+- [ ] **Step 3: Run classifier tests**
+
+```bash
+cargo test --release validate::classify
+```
+
+Expected: 2 tests pass (exhaustive LUT consistency).
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add src/validate/classify.rs src/validate/mod.rs
+git commit -m "feat: add PSHUFB nibble-LUT byte classifier module
+
+Provides classify_str_chunk/classify_num_chunk for SIMD byte
+classification. Exhaustive LUT consistency tests for all 256 byte
+values against ground-truth ranges (control, backslash, high-bit,
+digit, number structural)."
+```
+
+---
+
+### Task 2: Rewrite AVX2 string validation to use classifier
+
+**Files:**
+- Modify: `src/validate/strings/avx2.rs`
+
+- [ ] **Step 1: Replace `src/validate/strings/avx2.rs`**
+
+Replace the entire file content:
+
+```rust
+#![cfg(all(target_arch = "x86_64", feature = "avx2"))]
+
+//! AVX2 string-content validation using PSHUFB nibble-LUT byte classifier.
+//!
+//! Each 32-byte chunk is classified via `classify_str_mask`. Control chars
+//! (CLS_CTRL) are immediately rejected. Backslashes (CLS_BS) trigger
+//! escape-sequence validation statefully. High-bit bytes (CLS_HIGH)
+//! trigger scalar UTF-8 sequence validation.
+//!
+//! Unlike the previous "find-first-interesting-then-scalar" approach,
+//! this validator processes backslash/UTF-8 in-batch: after classifying
+//! a chunk, it walks the CLS_BS/CLS_HIGH mask to validate each position
+//! while the chunk data is still hot in registers. Pure printable-ASCII
+//! chunks are fully skipped.
+
+use crate::error::qjson_err;
+use core::arch::x86_64::*;
+use crate::validate::classify::{CLS_CTRL, CLS_BS, CLS_HIGH, classify_str_mask};
+
+/// Validate the string span using AVX2 with PSHUFB classifier.
+pub(crate) fn validate_span_avx2(span: &[u8]) -> Result<(), qjson_err> {
+    // SAFETY: dispatcher has verified AVX2 feature presence.
+    unsafe { validate_span_avx2_impl(span) }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn validate_span_avx2_impl(span: &[u8]) -> Result<(), qjson_err> {
+    let mut i: usize = 0;
+    let n = span.len();
+    // End of the most recently validated escape. A flagged byte below this
+    // offset is the second `\` of `\\` and must not start a new escape.
+    let mut skip_to: usize = 0;
+
+    while i + 32 <= n {
+        let chunk = _mm256_loadu_si256(span.as_ptr().add(i) as *const __m256i);
+        let mask = classify_str_mask(chunk);
+
+        if mask == 0 {
+            i += 32;
+            continue;
+        }
+
+        // Walk each flagged byte position.
+        let mut m = mask;
+        while m != 0 {
+            let pos = i + m.trailing_zeros() as usize;
+            m &= m - 1;
+            if pos < skip_to {
+                continue;
+            }
+            let b = span[pos];
+
+            if b < 0x20 {
+                return Err(qjson_err::QJSON_INVALID_STRING);
+            }
+            if b >= 0x80 {
+                // UTF-8 sequences (overlong/surrogate rules) are delegated
+                // to the well-tested scalar validator from this byte on.
+                return super::scalar::validate_span_scalar(&span[pos..]);
+            }
+            if b == b'\\' {
+                // Validate escape: the escape target is at pos+1.
+                if pos + 1 >= n {
+                    return Err(qjson_err::QJSON_INVALID_STRING);
+                }
+                skip_to = match span[pos + 1] {
+                    b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => pos + 2,
+                    b'u' => {
+                        if pos + 6 > n
+                            || !span[pos + 2].is_ascii_hexdigit()
+                            || !span[pos + 3].is_ascii_hexdigit()
+                            || !span[pos + 4].is_ascii_hexdigit()
+                            || !span[pos + 5].is_ascii_hexdigit()
+                        {
+                            return Err(qjson_err::QJSON_INVALID_STRING);
+                        }
+                        pos + 6
+                    }
+                    _ => return Err(qjson_err::QJSON_INVALID_STRING),
+                };
+            }
+        }
+
+        i += 32;
+    }
+
+    // Tail (<32 bytes): scalar validator, past any escape straddling the last chunk.
+    super::scalar::validate_span_scalar(&span[i.max(skip_to)..])
+}
+```
+
+- [ ] **Step 2: Run existing string validation tests**
+
+```bash
+cargo test --release validate::strings
+```
+
+Expected: All existing tests pass (the classifier handles the same byte ranges).
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/validate/strings/avx2.rs
+git commit -m "perf: rewrite AVX2 string validation with PSHUFB classifier
+
+Replace first-interesting-byte-then-scalar approach with per-byte
+classification via classify_str_mask. Escapes and UTF-8 triggers
+are processed in-batch while chunk data is hot in registers."
+```
+
+---
+
+### Task 3: Add AVX-512 string validation path
+
+**Files:**
+- Create: `src/validate/strings/avx512.rs`
+- Modify: `src/validate/strings/mod.rs`
+
+- [ ] **Step 1: Create `src/validate/strings/avx512.rs`**
+
+```rust
+#![cfg(all(target_arch = "x86_64", feature = "avx2"))]
+
+//! AVX-512BW+VL string-content validation.
+//!
+//! Processes 64 bytes per iteration as two 32-byte YMM halves. The
+//! path is gated on AVX-512BW (byte operations) and AVX-512VL
+//! (EVEX-encoded AVX-512 instructions on 128/256-bit registers).
+//! The PSHUFB classifier stays on YMM; no ZMM registers are used.
+//! NOTE(review): the loop below still builds plain `u32` masks via
+//! `classify_str_mask`; native `__mmask32` ops are not wired in yet.
+
+use crate::error::qjson_err;
+use core::arch::x86_64::*;
+use crate::validate::classify::{
+    CLS_CTRL, CLS_BS, CLS_HIGH,
+    classify_str_mask,
+};
+
+pub(crate) fn validate_span_avx512(span: &[u8]) -> Result<(), qjson_err> {
+    // SAFETY: dispatcher verifies AVX-512BW+VL feature presence.
+    unsafe { validate_span_avx512_impl(span) }
+}
+
+#[target_feature(enable = "avx2,avx512bw,avx512vl")]
+unsafe fn validate_span_avx512_impl(span: &[u8]) -> Result<(), qjson_err> {
+    let mut i: usize = 0;
+    let n = span.len();
+    let mut skip_to: usize = 0; // end of the most recently validated escape
+    // Process 64 bytes per outer iteration: two 32B YMM chunks.
+    while i + 64 <= n {
+        let lo = _mm256_loadu_si256(span.as_ptr().add(i) as *const __m256i);
+        let hi = _mm256_loadu_si256(span.as_ptr().add(i + 32) as *const __m256i);
+
+        let mask_lo = classify_str_mask(lo);
+        let mask_hi = classify_str_mask(hi);
+
+        if (mask_lo | mask_hi) == 0 {
+            i += 64;
+            continue;
+        }
+
+        // Process flagged bytes in both halves.
+        // Half 0.
+        let mut m = mask_lo;
+        while m != 0 {
+            let off = m.trailing_zeros() as usize;
+            let pos = i + off;
+            m &= m - 1;
+            if pos < skip_to { continue; } // second byte of a validated `\\`
+            let b = span[pos];
+            if b < 0x20 { return Err(qjson_err::QJSON_INVALID_STRING); }
+            if b >= 0x80 {
+                return super::scalar::validate_span_scalar(&span[pos..]);
+            }
+            if b == b'\\' {
+                if pos + 1 >= n { return Err(qjson_err::QJSON_INVALID_STRING); }
+                skip_to = match span[pos + 1] {
+                    b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => pos + 2,
+                    b'u' => {
+                        if pos + 6 > n
+                            || !span[pos+2].is_ascii_hexdigit()
+                            || !span[pos+3].is_ascii_hexdigit()
+                            || !span[pos+4].is_ascii_hexdigit()
+                            || !span[pos+5].is_ascii_hexdigit()
+                        { return Err(qjson_err::QJSON_INVALID_STRING); }
+                        pos + 6
+                    }
+                    _ => return Err(qjson_err::QJSON_INVALID_STRING),
+                };
+            }
+        }
+
+        // Half 1.
+        let mut m = mask_hi;
+        while m != 0 {
+            let off = m.trailing_zeros() as usize;
+            let pos = i + 32 + off;
+            m &= m - 1;
+            if pos < skip_to { continue; } // second byte of a validated `\\`
+            let b = span[pos];
+            if b < 0x20 { return Err(qjson_err::QJSON_INVALID_STRING); }
+            if b >= 0x80 {
+                return super::scalar::validate_span_scalar(&span[pos..]);
+            }
+            if b == b'\\' {
+                if pos + 1 >= n { return Err(qjson_err::QJSON_INVALID_STRING); }
+                skip_to = match span[pos + 1] {
+                    b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => pos + 2,
+                    b'u' => {
+                        if pos + 6 > n
+                            || !span[pos+2].is_ascii_hexdigit()
+                            || !span[pos+3].is_ascii_hexdigit()
+                            || !span[pos+4].is_ascii_hexdigit()
+                            || !span[pos+5].is_ascii_hexdigit()
+                        { return Err(qjson_err::QJSON_INVALID_STRING); }
+                        pos + 6
+                    }
+                    _ => return Err(qjson_err::QJSON_INVALID_STRING),
+                };
+            }
+        }
+
+        i += 64;
+    }
+
+    // Tail (<64 bytes): hand off to AVX2 path, past any straddling escape.
+    super::avx2::validate_span_avx2(&span[i.max(skip_to)..])
+}
+```
+
+- [ ] **Step 2: Update dispatch in `src/validate/strings/mod.rs`**
+
+Add the AVX-512 module declaration after `mod avx2;`:
+
+```rust
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+mod avx2;
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+mod avx512;
+```
+
+Update the `VALIDATE_FN` initializer in `validate_string_span` to try AVX-512 first:
+
+```rust
+pub(crate) fn validate_string_span(span: &[u8]) -> Result<(), qjson_err> {
+    let f = *VALIDATE_FN.get_or_init(|| {
+        #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+        {
+            if std::is_x86_feature_detected!("avx512bw")
+                && std::is_x86_feature_detected!("avx512vl")
+            {
+                return avx512::validate_span_avx512 as ValidateFn;
+            }
+            if std::is_x86_feature_detected!("avx2") {
+                return avx2::validate_span_avx2 as ValidateFn;
+            }
+        }
+        #[cfg(target_arch = "aarch64")]
+        {
+            return neon::validate_span_neon as ValidateFn;
+        }
+        #[allow(unreachable_code)]
+        {
+            scalar::validate_span_scalar as ValidateFn
+        }
+    });
+    f(span)
+}
+```
+
+Update the module doc comment for `strings/mod.rs` (lines 1-12) to mention AVX-512:
+
+```rust
+//! String-content validation: control chars, escape grammar, and UTF-8.
+//!
+//! Single-pass validator with optional SIMD acceleration. The public
+//! entry point [`validate_string_span`] dispatches once via `OnceCell` to
+//! the best available implementation:
+//!
+//!   - x86_64 + AVX-512BW+VL: 64-byte 2×YMM chunks with native mask regs.
+//!   - x86_64 + AVX2:         32-byte PSHUFB classifier chunks.
+//!   - aarch64 NEON:          16-byte chunk skip → scalar tail.
+//!   - Otherwise:             pure scalar state machine.
+//!
+//! All paths return identical error codes for any input.
+```
+
+- [ ] **Step 3: Run tests**
+
+```bash
+cargo test --release validate::strings
+```
+
+Expected: All tests pass. AVX-512 path automatically selected if hardware supports it.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add src/validate/strings/avx512.rs src/validate/strings/mod.rs
+git commit -m "perf: add AVX-512BW+VL string validation path
+
+64-byte iteration via two YMM PSHUFB chunks per loop. Native mask
+registers via AVX-512BW/VL. Dispatch priority: AVX-512 > AVX2 >
+NEON > scalar."
+```
+
+---
+
+### Task 4: Add SIMD number validation fast path
+
+**Files:**
+- Modify: `src/validate/number.rs`
+- Modify: `src/validate/mod.rs` (wire into `validate_scalar`)
+
+- [ ] **Step 1: Add SIMD fast path to `src/validate/number.rs`**
+
+Add the new function after the existing `validate_number`. Also add a `#[cfg]`-gated import at the top:
+
+```rust
+use crate::error::qjson_err;
+
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+use crate::validate::classify::{CLS_DIGIT, CLS_NUMS, classify_num_chunk};
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+use core::arch::x86_64::*;
+```
+
+After `validate_number` (before the `#[cfg(test)]` block), add:
+
+```rust
+/// SIMD-assisted number validation. For numbers ≤ 32 bytes (the
+/// common case), classifies all bytes in one SIMD operation to
+/// pre-screen for bytes that cannot appear in a JSON number.
+///
+/// The ABNF structure itself is always checked by scalar
+/// `validate_number`, which also supplies the precise error code.
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+pub(crate) fn validate_number_simd(bytes: &[u8]) -> Result<(), qjson_err> {
+    // SAFETY: caller ensures AVX2 is available (via runtime detect or
+    // compile-time feature gate).
+    unsafe { validate_number_simd_impl(bytes) }
+}
+
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[target_feature(enable = "avx2")]
+unsafe fn validate_number_simd_impl(bytes: &[u8]) -> Result<(), qjson_err> {
+    let n = bytes.len();
+    if n == 0 {
+        return Err(qjson_err::QJSON_INVALID_NUMBER);
+    }
+    if n <= 4 {
+        // Too short for SIMD: use scalar directly.
+        return super::validate_number(bytes);
+    }
+
+    // Load up to 32 bytes into a YMM register (zero-pad tail).
+    let mut buf = [0u8; 32];
+    let copy_len = n.min(32);
+    buf[..copy_len].copy_from_slice(&bytes[..copy_len]);
+    let chunk = _mm256_loadu_si256(buf.as_ptr() as *const __m256i);
+
+    let (_class, bad_mask) = classify_num_chunk(chunk);
+
+    // `bad_mask` flags CTRL | BS | HIGH bytes. The zero padding in
+    // buf[copy_len..] classifies as CTRL, so keep only the low
+    // `copy_len` bits. `copy_len` is in 5..=32 here, so the shift
+    // amount below is in 0..=27 and cannot overflow (the naive
+    // `1u32 << copy_len` overflows when copy_len == 32).
+    let valid_mask = u32::MAX >> (32 - copy_len as u32);
+    if (bad_mask & valid_mask) != 0 {
+        // Actual invalid byte: defer to scalar for the precise
+        // error code.
+        return super::validate_number(bytes);
+    }
+
+    // No CTRL | BS | HIGH byte in [0..copy_len]. The scalar validator
+    // still checks the ABNF structure and any tail beyond 32 bytes.
+    super::validate_number(bytes)
+}
+```
+
+- [ ] **Step 2: Wire SIMD number validation into `validate_scalar`**
+
+In `src/validate/mod.rs`, update the `validate_scalar` function (line 347) to try SIMD first for number-like scalars:
+
+```rust
+fn validate_scalar(scalar: &[u8]) -> Result<(), qjson_err> {
+    match scalar[0] {
+        b't' => if scalar == b"true"  { Ok(()) } else { Err(qjson_err::QJSON_PARSE_ERROR) },
+        b'f' => if scalar == b"false" { Ok(()) } else { Err(qjson_err::QJSON_PARSE_ERROR) },
+        b'n' => if scalar == b"null"  { Ok(()) } else { Err(qjson_err::QJSON_PARSE_ERROR) },
+        b'-' | b'0'..=b'9' | b'+' | b'.' => {
+            #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+            {
+                number::validate_number_simd(scalar)
+            }
+            #[cfg(not(all(target_arch = "x86_64", feature = "avx2")))]
+            {
+                number::validate_number(scalar)
+            }
+        },
+        _ if scalar == b"NaN" || scalar == b"Infinity" => number::validate_number(scalar),
+        _ => Err(qjson_err::QJSON_PARSE_ERROR),
+    }
+}
+```
+
+- [ ] **Step 3: Make `validate_number` public to `super`**
+
+In `src/validate/number.rs`, `validate_number` is already `pub(crate)` and `src/validate/mod.rs` re-exports it (`pub(crate) use number::validate_number;`), so `mod.rs` can call it without changes. Inside `number.rs`, however, `super` is the `validate` module, so `super::validate_number` only resolves through that re-export. Give the scalar fallback an explicit name instead.
+
+Update the import in `number.rs` (add at top):
+
+```rust
+use crate::validate::validate_number as validate_number_scalar;
+```
+
+Then in `validate_number_simd_impl`, use `validate_number_scalar(bytes)` instead of `super::validate_number(bytes)`.
+
+- [ ] **Step 4: Run tests**
+
+```bash
+cargo test --release validate::number
+cargo test --release validate::tests
+```
+
+Expected: All number validation tests pass. Eager grammar tests pass.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/validate/number.rs src/validate/mod.rs
+git commit -m "perf: add SIMD number validation fast path
+
+validate_number_simd classifies number bytes with PSHUFB classifier,
+checking for illegal non-digit/non-structural bytes in one SIMD pass.
+Falls back to scalar validate_number for precise error codes."
+```
+
+---
+
+### Task 5: Implement pass fusion (validate_eager_fused)
+
+**Files:**
+- Modify: `src/validate/mod.rs`
+
+- [ ] **Step 1: Add `validate_eager_fused` function to `src/validate/mod.rs`**
+
+Add the new function before the test module. Place it after the existing `validate_eager_values` function (after line 271):
+
+```rust
+/// Fused eager validator: one walk over `indices` that enforces the
+/// `max_depth` nesting limit, rejects trailing content after the root
+/// value, and validates grammar plus string/scalar values. Replaces
+/// `validate_depth` + `validate_trailing` + `validate_eager_values`.
+pub(crate) fn validate_eager_fused(
+    buf: &[u8],
+    indices: &[u32],
+    max_depth: u32,
+) -> Result<(), qjson_err> {
+    let mut stack: Vec<CtxKind> = Vec::with_capacity(16);
+    stack.push(CtxKind::Top);
+
+    let mut depth: u32 = 0;
+    let mut prev_end: usize = 0;
+    let mut i: usize = 0;
+
+    while i < indices.len() {
+        let idx = indices[i];
+        if idx == u32::MAX { break; }
+        let pos = idx as usize;
+        let b = buf[pos];
+
+        consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?;
+
+        match b {
+            b'{' | b'[' => {
+                let cur = stack.last_mut().unwrap();
+                match *cur {
+                    CtxKind::Top
+                    | CtxKind::ArrAfterOpen
+                    | CtxKind::ArrAfterComma
+                    | CtxKind::ObjAfterColon => {
+                        *cur = parent_after_value(*cur);
+                        // Depth check: increment on open brace/bracket.
+                        depth += 1;
+                        if depth > max_depth {
+                            return Err(qjson_err::QJSON_NESTING_TOO_DEEP);
+                        }
+                        stack.push(if b == b'{' {
+                            CtxKind::ObjAfterOpen
+                        } else {
+                            CtxKind::ArrAfterOpen
+                        });
+                    }
+                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b'}' => {
+                let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                if !matches!(top, CtxKind::ObjAfterOpen | CtxKind::ObjAfterValue) {
+                    return Err(qjson_err::QJSON_PARSE_ERROR);
+                }
+                if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); }
+                depth -= 1;
+                // Trailing check: when depth returns to 0 (root container
+                // closed) AND the root grammar state is satisfied, check
+                // for trailing content.
+                if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
+                    let closer_pos = pos;
+                    let mut p = closer_pos + 1;
+                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
+                    if p < buf.len() {
+                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
+                    }
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b']' => {
+                let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                if !matches!(top, CtxKind::ArrAfterOpen | CtxKind::ArrAfterValue) {
+                    return Err(qjson_err::QJSON_PARSE_ERROR);
+                }
+                if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); }
+                depth -= 1;
+                if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
+                    let mut p = pos + 1;
+                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
+                    if p < buf.len() {
+                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
+                    }
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b',' => {
+                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                match *cur {
+                    CtxKind::ArrAfterValue => *cur = CtxKind::ArrAfterComma,
+                    CtxKind::ObjAfterValue => *cur = CtxKind::ObjAfterComma,
+                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b':' => {
+                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                match *cur {
+                    CtxKind::ObjAfterKey => *cur = CtxKind::ObjAfterColon,
+                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b'"' => {
+                if i + 1 >= indices.len() { return Err(qjson_err::QJSON_PARSE_ERROR); }
+                let close = indices[i + 1] as usize;
+                if close <= pos || close >= buf.len() || buf[close] != b'"' {
+                    return Err(qjson_err::QJSON_PARSE_ERROR);
+                }
+                strings::validate_string_span(&buf[pos + 1 .. close])?;
+
+                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                match *cur {
+                    CtxKind::ObjAfterOpen | CtxKind::ObjAfterComma => {
+                        *cur = CtxKind::ObjAfterKey;
+                    }
+                    CtxKind::Top
+                    | CtxKind::ArrAfterOpen
+                    | CtxKind::ArrAfterComma
+                    | CtxKind::ObjAfterColon => {
+                        *cur = parent_after_value(*cur);
+                    }
+                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+                }
+                // Trailing check for string roots: when Top→TopDone and
+                // depth is 0, check for trailing content.
+                if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
+                    let mut p = close + 1;
+                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
+                    if p < buf.len() {
+                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
+                    }
+                }
+                prev_end = close + 1;
+                i += 2;
+            }
+            _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+        }
+    }
+
+    // Tail: top-level scalar root (e.g. `42`, `true`).
+    consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?;
+
+    // Trailing check for scalar roots.
+    if stack.len() == 1 && stack[0] == CtxKind::TopDone {
+        // Container and string roots already had their trailing bytes
+        // checked when they closed, so only a bare scalar root is left
+        // to check. Every structural byte sets prev_end to at least 1,
+        // so prev_end == 0 means no structural index was consumed.
+        // Rescanning a container root from the start would stop at
+        // whitespace *inside* the value and wrongly reject valid input
+        // such as `{"a": 1}` followed by a newline.
+        if prev_end == 0 {
+            // Skip leading whitespace, the scalar token, and trailing
+            // whitespace; any byte left over is trailing content.
+            let mut scan = 0usize;
+            while scan < buf.len() && is_ws(buf[scan]) { scan += 1; }
+            while scan < buf.len() && !is_ws(buf[scan]) { scan += 1; }
+            while scan < buf.len() && is_ws(buf[scan]) { scan += 1; }
+            if scan < buf.len() {
+                return Err(qjson_err::QJSON_TRAILING_CONTENT);
+            }
+        }
+    }
+
+    if stack.len() != 1 || stack[0] != CtxKind::TopDone {
+        return Err(qjson_err::QJSON_PARSE_ERROR);
+    }
+    Ok(())
+}
+```
+
+- [ ] **Step 2: Add fused tests to the validate::tests module**
+
+Add after the existing grammar tests (after line 487):
+
+```rust
+    // ── fused validator tests ────────────────────────────────────────
+
+    #[test]
+    fn fused_accepts_clean_input() {
+        for buf in [
+            &b"{}"[..], &b"[]"[..], &b"{\"a\":1}"[..],
+            &b"[1,2,3]"[..], &b"42"[..], &b"\"hi\""[..],
+            &b"[true,false,null]"[..],
+        ] {
+            assert!(validate_eager_fused(buf, &ix(buf), 1024).is_ok(),
+                "fused should accept {:?}", buf);
+        }
+    }
+
+    #[test]
+    fn fused_rejects_trailing_content() {
+        let buf = b"{}garbage";
+        assert_eq!(
+            validate_eager_fused(buf, &ix(buf), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
+
+    #[test]
+    fn fused_rejects_excessive_depth() {
+        let buf = b"[[[1]]]";
+        assert_eq!(
+            validate_eager_fused(buf, &ix(buf), 2),
+            Err(qjson_err::QJSON_NESTING_TOO_DEEP),
+        );
+    }
+
+    #[test]
+    fn fused_depth_ok_at_limit() {
+        let buf = b"[[1]]";
+        assert!(validate_eager_fused(buf, &ix(buf), 2).is_ok());
+    }
+
+    #[test]
+    fn fused_trailing_whitespace_accepted() {
+        let buf = b"{}   \n\t";
+        assert!(validate_eager_fused(buf, &ix(buf), 1024).is_ok());
+    }
+
+    #[test]
+    fn fused_two_root_scalars_rejected() {
+        let buf = b"1 2";
+        assert_eq!(
+            validate_eager_fused(buf, &ix(buf), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
+
+    #[test]
+    fn fused_trailing_in_nested_container_detected() {
+        let buf = b"[1] x";
+        assert_eq!(
+            validate_eager_fused(buf, &ix(buf), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
+```
+
+- [ ] **Step 3: Run tests**
+
+```bash
+cargo test --release validate::tests
+```
+
+Expected: All existing grammar tests + new fused tests pass.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add src/validate/mod.rs
+git commit -m "perf: add validate_eager_fused merging depth+trailing+grammar
+
+Single O(indices) traversal replaces 3 separate passes. Depth check
+inlined on container push. Trailing-content check triggered when
+grammar state reaches TopDone at depth 0."
+```
+
+---
+
+### Task 6: Wire fused validator in doc.rs + full test suite
+
+**Files:**
+- Modify: `src/doc.rs`
+
+- [ ] **Step 1: Replace 3 validation calls with fused call in `doc.rs`**
+
+In `src/doc.rs` `parse_with_options` (lines 33-38), replace:
+
+```rust
+        crate::validate::validate_depth(buf, &indices, max_depth)?;
+
+        if opts.is_eager() {
+            crate::validate::validate_trailing(buf, &indices)?;
+            crate::validate::validate_eager_values(buf, &indices)?;
+        }
+```
+
+With:
+
+```rust
+        if opts.is_eager() {
+            crate::validate::validate_eager_fused(buf, &indices, max_depth)?;
+        } else {
+            crate::validate::validate_depth(buf, &indices, max_depth)?;
+        }
+```
+
+The `validate_depth` call stays for LAZY mode (which only checks bracket depth). The eager path now uses the fused validator.
+
+- [ ] **Step 2: Run full test suite**
+
+```bash
+cargo test --release
+```
+
+Expected: All tests pass (144 unit + all integration tests). Verify:
+- `doc::tests::parses_simple_object` — basic parse
+- `doc::tests::parse_with_lazy_skips_eager_validation` — lazy mode unchanged
+- `json_test_suite` — all Y/N/I files parse correctly
+- `ffi_smoke` — FFI tests if applicable
+
+- [ ] **Step 3: Also test scalar-only mode (no SIMD)**
+
+```bash
+cargo test --release --no-default-features
+```
+
+Expected: All tests pass. The scalar fallback paths are exercised.
+
+- [ ] **Step 4: Run clippy lint**
+
+```bash
+cargo clippy --release --all-targets -- -D warnings
+```
+
+Expected: No warnings. Fix any that appear.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/doc.rs
+git commit -m "perf: wire validate_eager_fused into Document::parse_with_options
+
+Eager mode now calls the fused validator (depth+trailing+grammar in
+one pass). Lazy mode still uses standalone validate_depth only."
+```
+
+---
+
+### Task 7: Cross-validation and edge case hardening
+
+**Files:**
+- Modify: `src/validate/classify.rs` (add any missing tests)
+- Modify: `src/validate/mod.rs` (fix any trailing detection edge cases)
+
+- [ ] **Step 1: Run the scanner crosscheck test**
+
+```bash
+cargo test --release --test scanner_crosscheck
+```
+
+Expected: Both `scalar_avx2_bit_identical` and `skip_neon` pass (these tests verify scanner output parity; they don't exercise the validator, but ensure we haven't broken anything).
+
+- [ ] **Step 2: Run third-party fixture tests**
+
+```bash
+cargo test --release --test third_party_fixtures
+```
+
+Expected: All 17 tests pass. This exercises real-world JSON from cJSON and simdjson test suites under the fused validator.
+
+- [ ] **Step 3: Run JSONTestSuite conformance tests**
+
+```bash
+cargo test --release --test json_test_suite
+```
+
+Expected: All 3 tests pass (`y_files_accepted_in_both_modes`, `n_files_rejected_in_eager_mode`, `document_i_files_behavior`).
+
+- [ ] **Step 4: Run full suite one final time**
+
+```bash
+cargo test --release
+cargo test --release --no-default-features
+```
+
+Expected: All tests pass in both configurations.
+
+- [ ] **Step 5: Commit (if any fixes)**
+
+```bash
+git add -A
+git commit -m "test: verify fused validator against full test suite"
+```
+
+---
+
+### Task 8: Final integration — check CLAUDE.md update
+
+**Files:**
+- Modify: `CLAUDE.md` (if architecture section needs update)
+
+The CLAUDE.md describes the Phase 1 validation flow. Since the external behavior is unchanged (same error codes, same parse semantics), no docs update is strictly required. However, update the architecture section to reflect the fused pass.
+
+- [ ] **Step 1: Update CLAUDE.md architecture section**
+
+Find the paragraph starting with "Phase 1" and update the description of post-scan validation. The current text:
+
+```
+Then `validate_depth` is run unconditionally; in EAGER mode,
+`validate_trailing` and `validate_eager_values` (number ABNF + string
+content + UTF-8) follow.
+```
+
+Replace with:
+
+```
+Then in LAZY mode only `validate_depth` is run. In EAGER mode,
+`validate_eager_fused` runs — a single O(indices) pass that combines
+depth checking, trailing-content detection, and grammar/value
+validation (number ABNF + string content + UTF-8).
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add CLAUDE.md
+git commit -m "docs: update CLAUDE.md for fused eager validation"
+```
+
+---

From f6c9524a678c856c400f4407c74a4adf03729410 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 16:26:50 +0000
Subject: [PATCH 03/17] feat: add PSHUFB nibble-LUT byte classifier module

---
 src/validate/classify.rs | 298 +++++++++++++++++++++++++++++++++++++++
 src/validate/mod.rs      |   1 +
 2 files changed, 299 insertions(+)
 create mode 100644 src/validate/classify.rs

diff --git a/src/validate/classify.rs b/src/validate/classify.rs
new file mode 100644
index 0000000..700f246
--- /dev/null
+++ b/src/validate/classify.rs
@@ -0,0 +1,298 @@
+//! PSHUFB nibble-LUT byte classifier shared by string and number validation.
+//!
+//! Each byte is decomposed into its high nibble and low nibble. Two
+//! 16-entry lookup tables (one per nibble position) are queried and
+//! AND'd together, yielding a 16×16 = 256-entry classification table
+//! from only 32 bytes of LUT storage. `_mm256_shuffle_epi8` (PSHUFB)
+//! applies the lookups across a 32-byte AVX2 chunk in a few cycles.
+//!
+//! This replaces the three-comparison approach (`high || bs || ctrl`)
+//! used by the old string validation fast-path and extends the same
+//! LUT infrastructure to number validation.
+
+pub(crate) const CLS_CTRL:  u8 = 0x01;
+pub(crate) const CLS_BS:    u8 = 0x02;
+pub(crate) const CLS_HIGH:  u8 = 0x04;
+pub(crate) const CLS_DIGIT: u8 = 0x08;
+// NUMS is split into two bits so each forms a valid nibble AND-product.
+// NUMS0 = {+, -, .} (all share hi=2), NUMS1 = {e, E} (share lo=5).
+pub(crate) const CLS_NUMS0: u8 = 0x10;
+pub(crate) const CLS_NUMS1: u8 = 0x20;
+pub(crate) const CLS_NUMS:  u8 = CLS_NUMS0 | CLS_NUMS1;
+
+// ── LUT tables ──────────────────────────────────────────────────────────
+//
+// STR tables classify: CTRL (0x00..0x1F), BS (0x5C), HIGH (0x80..0xFF).
+// NUM tables inherit string bits and add DIGIT (0x30..0x39) and NUMS
+// (`.`, `-`, `+`, `e`, `E`). Each is indexed by the respective nibble;
+// the AND of the two lookups yields the final class byte.
+
+#[cfg(target_arch = "x86_64")]
+static STR_LO_TABLE: [u8; 16] = [
+    0x05, // 0x0  CTRL|HIGH
+    0x05, // 0x1
+    0x05, // 0x2
+    0x05, // 0x3
+    0x05, // 0x4
+    0x05, // 0x5
+    0x05, // 0x6
+    0x05, // 0x7
+    0x05, // 0x8
+    0x05, // 0x9
+    0x05, // 0xA
+    0x05, // 0xB
+    0x07, // 0xC  CTRL|HIGH|BS    (backslash)
+    0x05, // 0xD
+    0x05, // 0xE
+    0x05, // 0xF
+];
+
+#[cfg(target_arch = "x86_64")]
+static STR_HI_TABLE: [u8; 16] = [
+    0x01, // 0x0  CTRL
+    0x01, // 0x1  CTRL
+    0x00, // 0x2
+    0x00, // 0x3
+    0x00, // 0x4
+    0x02, // 0x5  BS              (backslash)
+    0x00, // 0x6
+    0x00, // 0x7
+    0x04, // 0x8  HIGH
+    0x04, // 0x9  HIGH
+    0x04, // 0xA  HIGH
+    0x04, // 0xB  HIGH
+    0x04, // 0xC  HIGH
+    0x04, // 0xD  HIGH
+    0x04, // 0xE  HIGH
+    0x04, // 0xF  HIGH
+];
+
+#[cfg(target_arch = "x86_64")]
+static NUM_LO_TABLE: [u8; 16] = [
+    0x0D, // 0x0  CTRL|HIGH|DIGIT
+    0x0D, // 0x1  CTRL|HIGH|DIGIT
+    0x0D, // 0x2  CTRL|HIGH|DIGIT
+    0x0D, // 0x3  CTRL|HIGH|DIGIT
+    0x0D, // 0x4  CTRL|HIGH|DIGIT
+    0x2D, // 0x5  CTRL|HIGH|DIGIT|NUMS1   (digit 5, e, E)
+    0x0D, // 0x6  CTRL|HIGH|DIGIT
+    0x0D, // 0x7  CTRL|HIGH|DIGIT
+    0x0D, // 0x8  CTRL|HIGH|DIGIT
+    0x0D, // 0x9  CTRL|HIGH|DIGIT
+    0x05, // 0xA  CTRL|HIGH
+    0x15, // 0xB  CTRL|HIGH|NUMS0          (+)
+    0x07, // 0xC  CTRL|HIGH|BS
+    0x15, // 0xD  CTRL|HIGH|NUMS0          (-)
+    0x15, // 0xE  CTRL|HIGH|NUMS0          (.)
+    0x05, // 0xF  CTRL|HIGH
+];
+
+#[cfg(target_arch = "x86_64")]
+static NUM_HI_TABLE: [u8; 16] = [
+    0x01, // 0x0  CTRL
+    0x01, // 0x1  CTRL
+    0x10, // 0x2  NUMS0                 (+, -, .)
+    0x08, // 0x3  DIGIT
+    0x20, // 0x4  NUMS1                 (E)
+    0x02, // 0x5  BS
+    0x20, // 0x6  NUMS1                 (e)
+    0x00, // 0x7
+    0x04, // 0x8  HIGH
+    0x04, // 0x9  HIGH
+    0x04, // 0xA  HIGH
+    0x04, // 0xB  HIGH
+    0x04, // 0xC  HIGH
+    0x04, // 0xD  HIGH
+    0x04, // 0xE  HIGH
+    0x04, // 0xF  HIGH
+];
+
+// ── AVX2 classify functions ─────────────────────────────────────────────
+
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+use core::arch::x86_64::*;
+
+/// Core PSHUFB nibble-LUT classifier.
+///
+/// Each byte in `chunk` is split into high and low nibbles. The nibbles
+/// index into `hi_lut` and `lo_lut` respectively (via `_mm256_shuffle_epi8`);
+/// the AND of the two lookups is the per-byte class bitmask.
+///
+/// `lo_lut` and `hi_lut` are 32-byte `__m256i` whose lower and upper 128-bit
+/// lanes each contain a copy of the same 16-entry nibble table (as required
+/// by PSHUFB's lane-local indexing).
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[target_feature(enable = "avx2")]
+pub(crate) unsafe fn classify_chunk(chunk: __m256i, lo_lut: __m256i, hi_lut: __m256i) -> __m256i {
+    let nib_mask = _mm256_set1_epi8(0x0Fu8 as i8);
+
+    let lo_nibs   = _mm256_and_si256(chunk, nib_mask);
+    let hi_shift  = _mm256_srli_epi32::<4>(chunk);
+    let hi_nibs   = _mm256_and_si256(hi_shift, nib_mask);
+
+    let lo_class = _mm256_shuffle_epi8(lo_lut, lo_nibs);
+    let hi_class = _mm256_shuffle_epi8(hi_lut, hi_nibs);
+
+    _mm256_and_si256(lo_class, hi_class)
+}
+
+/// Build a 32-byte `__m256i` from a 16-entry nibble LUT by duplicating
+/// the table into both 128-bit lanes.
+///
+/// PSHUFB indexes within each 128-bit lane independently, so both lanes
+/// must hold the same 16 entries.
+///
+/// # Safety
+///
+/// The CPU must support AVX2.
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[target_feature(enable = "avx2")]
+unsafe fn make_lut(table: &[u8; 16]) -> __m256i {
+    // `table` is exactly 16 readable bytes and the load is unaligned-safe.
+    let lane = _mm_loadu_si128(table.as_ptr() as *const __m128i);
+    _mm256_broadcastsi128_si256(lane)
+}
+
+/// Classify a 32-byte chunk for string validation.
+///
+/// Returns a bitmask (one bit per byte) where set bits indicate bytes
+/// that have any interesting class bit (CTRL | BS | HIGH). Zero means
+/// the entire chunk is pure printable ASCII without escapes or UTF-8.
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[target_feature(enable = "avx2")]
+pub(crate) unsafe fn classify_str_chunk(chunk: __m256i) -> u32 {
+    classify_str_mask(chunk)
+}
+
+/// Returns a bitmask of bytes that match CTRL | BS | HIGH.
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[target_feature(enable = "avx2")]
+pub(crate) unsafe fn classify_str_mask(chunk: __m256i) -> u32 {
+    let lo_lut     = make_lut(&STR_LO_TABLE);
+    let hi_lut     = make_lut(&STR_HI_TABLE);
+    let classes    = classify_chunk(chunk, lo_lut, hi_lut);
+    let zero       = _mm256_cmpeq_epi8(classes, _mm256_setzero_si256());
+    let zero_mask  = _mm256_movemask_epi8(zero) as u32;
+    zero_mask ^ 0xFFFF_FFFF   // invert: 1 = interesting
+}
+
+/// Classify a 32-byte chunk for number validation.
+///
+/// Returns `(class_vector, bad_mask)`:
+///   - `class_vector`: per-byte class bitmask (DIGIT | NUMS | CTRL | …)
+///   - `bad_mask`:     bits set for bytes with CTRL | BS | HIGH
+///                     (bytes that are unconditionally invalid in a number).
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[target_feature(enable = "avx2")]
+pub(crate) unsafe fn classify_num_chunk(chunk: __m256i) -> (__m256i, u32) {
+    let lo_lut     = make_lut(&NUM_LO_TABLE);
+    let hi_lut     = make_lut(&NUM_HI_TABLE);
+    let classes    = classify_chunk(chunk, lo_lut, hi_lut);
+
+    // bad = bytes where CTRL | BS | HIGH is set.
+    let bad_bits   = _mm256_and_si256(classes, _mm256_set1_epi8((CLS_CTRL | CLS_BS | CLS_HIGH) as i8));
+    let zero       = _mm256_cmpeq_epi8(bad_bits, _mm256_setzero_si256());
+    let bad_mask   = _mm256_movemask_epi8(zero) as u32 ^ 0xFFFF_FFFF;
+
+    (classes, bad_mask)
+}
+
+// ── Exhaustive LUT tests ────────────────────────────────────────────────
+
+#[cfg(all(test, target_arch = "x86_64"))]
+mod tests {
+    use super::*;
+
+    fn str_expected(b: u8) -> u8 {
+        let mut bits = 0u8;
+        if b <= 0x1F             { bits |= CLS_CTRL; }
+        if b == b'\\'            { bits |= CLS_BS; }
+        if b >= 0x80             { bits |= CLS_HIGH; }
+        bits
+    }
+
+    fn num_expected(b: u8) -> u8 {
+        let mut bits = str_expected(b);
+        if b.is_ascii_digit()    { bits |= CLS_DIGIT; }
+        if matches!(b, b'+' | b'-' | b'.') { bits |= CLS_NUMS0; }
+        if matches!(b, b'e' | b'E')        { bits |= CLS_NUMS1; }
+        bits
+    }
+
+    #[test]
+    fn str_lut_exhaustive() {
+        for b in 0..=255u8 {
+            let hi = (b >> 4) as usize;
+            let lo = (b & 0x0F) as usize;
+            let got = STR_HI_TABLE[hi] & STR_LO_TABLE[lo];
+            let exp = str_expected(b);
+            assert_eq!(got, exp,
+                "byte 0x{b:02X} ('{}'): got 0x{got:02X}, expected 0x{exp:02X}",
+                b.escape_ascii());
+        }
+    }
+
+    #[test]
+    fn num_lut_exhaustive() {
+        for b in 0..=255u8 {
+            let hi = (b >> 4) as usize;
+            let lo = (b & 0x0F) as usize;
+            let got = NUM_HI_TABLE[hi] & NUM_LO_TABLE[lo];
+            let exp = num_expected(b);
+            assert_eq!(got, exp,
+                "byte 0x{b:02X} ('{}'): got 0x{got:02X}, expected 0x{exp:02X}",
+                b.escape_ascii());
+        }
+    }
+
+    // Double-check nibble-resolution edge cases.
+    #[test]
+    fn num_digit5_is_digit_not_nums() {
+        // 0x35 = '5': DIGIT set, neither NUMS0 nor NUMS1 set
+        // (lo=5 carries NUMS1 for e/E; resolved by hi=3 which lacks NUMS1).
+        let hi = 0x3;
+        let lo = 0x5;
+        let got = NUM_HI_TABLE[hi] & NUM_LO_TABLE[lo];
+        assert_eq!(got, CLS_DIGIT,
+            "'5' should be DIGIT only (got 0x{got:02X})");
+    }
+
+    #[test]
+    fn num_e_is_nums1_not_digit() {
+        // 0x65 = 'e': NUMS1 set, DIGIT not set
+        // (lo=5 carries both DIGIT and NUMS1; resolved by hi=6 with NUMS1 only).
+        let hi = 0x6;
+        let lo = 0x5;
+        let got = NUM_HI_TABLE[hi] & NUM_LO_TABLE[lo];
+        assert_eq!(got, CLS_NUMS1,
+            "'e' should be NUMS1 only (got 0x{got:02X})");
+    }
+
+    #[test]
+    fn num_e_upper_is_nums1_not_digit() {
+        let hi = 0x4;
+        let lo = 0x5;
+        let got = NUM_HI_TABLE[hi] & NUM_LO_TABLE[lo];
+        assert_eq!(got, CLS_NUMS1,
+            "'E' should be NUMS1 only (got 0x{got:02X})");
+    }
+
+    #[test]
+    fn num_percent_is_not_nums() {
+        // 0x25 = '%': hi=2 (NUMS0), lo=5 (NUMS1|DIGIT) → must NOT collide.
+        let hi = 0x2;
+        let lo = 0x5;
+        let got = NUM_HI_TABLE[hi] & NUM_LO_TABLE[lo];
+        assert_eq!(got, 0,
+            "'%' should have no class bits (got 0x{got:02X})");
+    }
+
+    #[test]
+    fn str_0x7f_is_clean() {
+        // DEL (0x7F) is allowed by RFC 8259 in strings.
+        let hi = 0x7;
+        let lo = 0xF;
+        let got = STR_HI_TABLE[hi] & STR_LO_TABLE[lo];
+        assert_eq!(got, 0, "0x7F should be clean (got 0x{got:02X})");
+    }
+}
diff --git a/src/validate/mod.rs b/src/validate/mod.rs
index a9ce958..c3ad878 100644
--- a/src/validate/mod.rs
+++ b/src/validate/mod.rs
@@ -7,6 +7,7 @@
 pub(crate) mod number;
 pub(crate) use number::validate_number;
 
+pub(crate) mod classify;
 pub(crate) mod strings;
 pub(crate) use strings::validate_string_span;
 

From bc6f8a6e4b8d70ff16e872f7c73ff5de423722a1 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 16:43:25 +0000
Subject: [PATCH 04/17] perf: rewrite AVX2 string validation with PSHUFB
 classifier

---
 src/validate/strings/avx2.rs | 110 +++++++++++++++++++++++------------
 1 file changed, 74 insertions(+), 36 deletions(-)

diff --git a/src/validate/strings/avx2.rs b/src/validate/strings/avx2.rs
index 8391a93..60a5772 100644
--- a/src/validate/strings/avx2.rs
+++ b/src/validate/strings/avx2.rs
@@ -1,18 +1,25 @@
 #![cfg(all(target_arch = "x86_64", feature = "avx2"))]
 
-//! AVX2 ASCII fast path for string-content validation.
+//! AVX2 string-content validation using the PSHUFB nibble-LUT classifier.
 //!
-//! For each 32-byte chunk, compute a "needs-attention" mask covering bytes
-//! that are either control chars (< 0x20), backslashes, or high-bit bytes.
-//! If the mask is all-zero the chunk is pure printable ASCII (no escapes,
-//! no UTF-8, no control) and can be skipped entirely.
+//! `classify_str_mask` classifies all 32 bytes in a chunk simultaneously
+//! via a 32-byte look-up table queried by `_mm256_shuffle_epi8` (PSHUFB).
+//! The LUT produces a byte-class bitmask for each input byte: pure
+//! printable ASCII returns zero, while control chars, backslashes, and
+//! high-bit bytes set bits that fold into a single `u32` attention mask.
 //!
-//! On the first non-zero chunk we hand off to the scalar state machine for
-//! the remainder of the span — we don't try to bit-scan inside the chunk.
-//! The fast-path payoff comes from cleanly skipping long ASCII prefixes;
-//! the scalar tail handles correctness without needing SIMD escape logic.
+//! Zero-mask chunks are skipped entirely. For non-zero chunks we iterate
+//! the set bits and validate each flagged byte in-batch:
+//!   - control → INVALID_STRING
+//!   - backslash → validate the escape introducer + following byte(s)
+//!   - high-bit  → delegate the remainder to the well-tested scalar path
+//!
+//! Single-char escapes and `\uXXXX` that fit within the current 32-byte
+//! chunk are validated inline; escapes straddling a chunk boundary fall
+//! through to the scalar path for correctness.
 
 use crate::error::qjson_err;
+use crate::validate::classify::classify_str_mask;
 use core::arch::x86_64::*;
 
 use super::scalar::validate_span_scalar;
@@ -28,37 +35,68 @@ unsafe fn validate_span_avx2_impl(span: &[u8]) -> Result<(), qjson_err> {
     let mut i: usize = 0;
     let n = span.len();
 
-    // ASCII bytes that need scalar attention have:
-    //   - top bit set                  → byte >= 0x80
-    //   - value < 0x20                 → control char
-    //   - value == 0x5C ('\\')         → escape introducer
-    //
-    // Detection via three SIMD compares OR'd together.
-    let backslash = _mm256_set1_epi8(b'\\' as i8);
-    // For "< 0x20" we use a signed unsigned trick: compare against 0x1F via
-    // unsigned MAX. _mm256_cmpgt_epi8 is signed, but bytes <0x20 are also
-    // <0x20 as signed positive values, so signed cmpgt works here for the
-    // 0x00..=0x1F range (none of which has the high bit set).
-    let ctrl_thresh = _mm256_set1_epi8(0x20_i8);
-
     while i + 32 <= n {
         let chunk = _mm256_loadu_si256(span.as_ptr().add(i) as *const __m256i);
+        let mask = classify_str_mask(chunk);
+
+        if mask != 0 {
+            let mut m = mask;
+            let mut consumed: usize = 0; // bytes from chunk start already handled
+            while m != 0 {
+                let offset = m.trailing_zeros() as usize;
+                m &= m - 1;
+
+                if offset < consumed {
+                    continue; // already consumed as part of a prior escape
+                }
+
+                let pos = i + offset;
+                let b = span[pos];
+
+                if b < 0x20 {
+                    return Err(qjson_err::QJSON_INVALID_STRING);
+                }
+
+                if b >= 0x80 {
+                    return validate_span_scalar(&span[pos..]);
+                }
 
-        // high bit set?
-        let high  = _mm256_movemask_epi8(chunk) as u32;
-        // byte == '\\' ?
-        let bs    = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, backslash)) as u32;
-        // byte < 0x20 ?  (signed cmpgt: ctrl_thresh > chunk for 0x00..=0x1F bytes)
-        let ctrl  = _mm256_movemask_epi8(_mm256_cmpgt_epi8(ctrl_thresh, chunk)) as u32;
+                // b == b'\\' (mask only has bits for ctrl|bs|high)
+                if pos + 1 >= n {
+                    return Err(qjson_err::QJSON_INVALID_STRING);
+                }
 
-        let interesting = high | bs | ctrl;
-        if interesting != 0 {
-            // Hand off to the scalar state machine starting at the first
-            // interesting byte in this chunk. We don't try to validate any
-            // already-cleared bytes — those are pure printable ASCII and
-            // self-terminating so it's safe to resume there.
-            let offset = interesting.trailing_zeros() as usize;
-            return validate_span_scalar(&span[i + offset..]);
+                let next = span[pos + 1];
+                match next {
+                    b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {
+                        // Escape straddles chunk boundary: delegate to scalar
+                        // so consumed tracking doesn't lose sync.
+                        if pos + 2 > i + 32 {
+                            return validate_span_scalar(&span[pos..]);
+                        }
+                        consumed = offset + 2;
+                    }
+                    b'u' => {
+                        let hex_start = pos + 2;
+                        let hex_end = hex_start + 4;
+                        if hex_end > n {
+                            return Err(qjson_err::QJSON_INVALID_STRING);
+                        }
+                        // If the full \uXXXX straddles the chunk boundary,
+                        // hand off to scalar.
+                        if hex_end > i + 32 {
+                            return validate_span_scalar(&span[pos..]);
+                        }
+                        for &h in &span[hex_start..hex_end] {
+                            if !h.is_ascii_hexdigit() {
+                                return Err(qjson_err::QJSON_INVALID_STRING);
+                            }
+                        }
+                        consumed = offset + 6;
+                    }
+                    _ => return Err(qjson_err::QJSON_INVALID_STRING),
+                }
+            }
         }
 
         i += 32;

From f06c5fa0b6601863e64955f884c7082309c58fcd Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 16:51:53 +0000
Subject: [PATCH 05/17] perf: add validate_eager_fused merging
 depth+trailing+grammar

---
 src/doc.rs          |   3 +-
 src/validate/mod.rs | 239 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 239 insertions(+), 3 deletions(-)

diff --git a/src/doc.rs b/src/doc.rs
index 82226f5..5244275 100644
--- a/src/doc.rs
+++ b/src/doc.rs
@@ -32,8 +32,7 @@ impl<'a> Document<'a> {
         indices.push(u32::MAX);
 
         if opts.is_eager() {
-            crate::validate::validate_trailing(buf, &indices)?;
-            crate::validate::validate_eager_values(buf, &indices, max_depth)?;
+            crate::validate::validate_eager_fused(buf, &indices, max_depth)?;
         } else {
             crate::validate::validate_depth(buf, &indices, max_depth)?;
         }
diff --git a/src/validate/mod.rs b/src/validate/mod.rs
index c3ad878..706b0bc 100644
--- a/src/validate/mod.rs
+++ b/src/validate/mod.rs
@@ -126,6 +126,171 @@ pub(crate) fn validate_trailing(
     Ok(())
 }
 
+/// Fused eager validator: depth, trailing-content, and grammar/value
+/// checks in a single O(indices) traversal. Equivalent to calling
+/// `validate_depth` + `validate_trailing` + `validate_eager_values`
+/// but avoids three separate walks.
+pub(crate) fn validate_eager_fused(
+    buf: &[u8],
+    indices: &[u32],
+    max_depth: u32,
+) -> Result<(), qjson_err> {
+    let mut depth: u32 = 0;
+
+    let mut stack: Vec<CtxKind> = Vec::with_capacity(16);
+    stack.push(CtxKind::Top);
+
+    let mut prev_end: usize = 0;
+
+    let mut i: usize = 0;
+    while i < indices.len() {
+        let idx = indices[i];
+        if idx == u32::MAX { break; }
+        let pos = idx as usize;
+        let b = buf[pos];
+
+        consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?;
+
+        match b {
+            b'{' | b'[' => {
+                let cur = stack.last_mut().unwrap();
+                match *cur {
+                    CtxKind::Top
+                    | CtxKind::ArrAfterOpen
+                    | CtxKind::ArrAfterComma
+                    | CtxKind::ObjAfterColon => {
+                        *cur = parent_after_value(*cur);
+                        stack.push(if b == b'{' {
+                            CtxKind::ObjAfterOpen
+                        } else {
+                            CtxKind::ArrAfterOpen
+                        });
+                    }
+                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+                }
+                depth += 1;
+                if depth > max_depth {
+                    return Err(qjson_err::QJSON_NESTING_TOO_DEEP);
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b'}' => {
+                let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                if !matches!(top, CtxKind::ObjAfterOpen | CtxKind::ObjAfterValue) {
+                    return Err(qjson_err::QJSON_PARSE_ERROR);
+                }
+                if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); }
+                depth -= 1;
+                if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
+                    let mut p = pos + 1;
+                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
+                    if p < buf.len() {
+                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
+                    }
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b']' => {
+                let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                if !matches!(top, CtxKind::ArrAfterOpen | CtxKind::ArrAfterValue) {
+                    return Err(qjson_err::QJSON_PARSE_ERROR);
+                }
+                if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); }
+                depth -= 1;
+                if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
+                    let mut p = pos + 1;
+                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
+                    if p < buf.len() {
+                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
+                    }
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b',' => {
+                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                match *cur {
+                    CtxKind::ArrAfterValue => *cur = CtxKind::ArrAfterComma,
+                    CtxKind::ObjAfterValue => *cur = CtxKind::ObjAfterComma,
+                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b':' => {
+                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                match *cur {
+                    CtxKind::ObjAfterKey => *cur = CtxKind::ObjAfterColon,
+                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+                }
+                prev_end = pos + 1;
+                i += 1;
+            }
+            b'"' => {
+                if i + 1 >= indices.len() { return Err(qjson_err::QJSON_PARSE_ERROR); }
+                let close = indices[i + 1] as usize;
+                if close <= pos || close >= buf.len() || buf[close] != b'"' {
+                    return Err(qjson_err::QJSON_PARSE_ERROR);
+                }
+                strings::validate_string_span(&buf[pos + 1 .. close])?;
+
+                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                match *cur {
+                    CtxKind::ObjAfterOpen | CtxKind::ObjAfterComma => {
+                        *cur = CtxKind::ObjAfterKey;
+                    }
+                    CtxKind::Top
+                    | CtxKind::ArrAfterOpen
+                    | CtxKind::ArrAfterComma
+                    | CtxKind::ObjAfterColon => {
+                        *cur = parent_after_value(*cur);
+                        if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
+                            let mut p = close + 1;
+                            while p < buf.len() && is_ws(buf[p]) { p += 1; }
+                            if p < buf.len() {
+                                return Err(qjson_err::QJSON_TRAILING_CONTENT);
+                            }
+                        }
+                    }
+                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+                }
+                prev_end = close + 1;
+                i += 2;
+            }
+            _ => return Err(qjson_err::QJSON_PARSE_ERROR),
+        }
+    }
+
+    // Tail: handle any remaining content.
+    // For scalar roots (depth == 0, still in Top), find the first
+    // token, validate it, then check for trailing content beyond it.
+    if matches!(*stack.last().unwrap(), CtxKind::Top) && depth == 0 {
+        let mut scan = prev_end;
+        while scan < buf.len() && is_ws(buf[scan]) { scan += 1; }
+        if scan < buf.len() {
+            let mut end = scan;
+            while end < buf.len() && !is_ws(buf[end]) { end += 1; }
+            validate_scalar(&buf[scan..end])?;
+            *stack.last_mut().unwrap() = CtxKind::TopDone;
+
+            let mut p = end;
+            while p < buf.len() && is_ws(buf[p]) { p += 1; }
+            if p < buf.len() {
+                return Err(qjson_err::QJSON_TRAILING_CONTENT);
+            }
+        }
+    } else {
+        consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?;
+    }
+
+    if stack.len() != 1 || stack[0] != CtxKind::TopDone {
+        return Err(qjson_err::QJSON_PARSE_ERROR);
+    }
+    Ok(())
+}
+
 /// Grammar-aware eager pass: walk `indices` once and validate every
 /// structural transition, key/value string, and scalar value.
 ///
@@ -515,4 +680,76 @@ mod tests {
             validate_eager_values(&buf, &ix(&buf), 1024), Err(qjson_err::QJSON_NESTING_TOO_DEEP),
         );
     }
-}
\ No newline at end of file
+
+    // ── fused validator tests ────────────────────────────────────────
+
+    #[test]
+    fn fused_accepts_clean_input() {
+        for buf in [
+            &b"{}"[..], &b"[]"[..], &b"{\"a\":1}"[..],
+            &b"[1,2,3]"[..], &b"42"[..], &b"\"hi\""[..],
+            &b"[true,false,null]"[..],
+        ] {
+            assert!(validate_eager_fused(buf, &ix(buf), 1024).is_ok(),
+                "fused should accept {:?}", std::str::from_utf8(buf).unwrap_or("(non-utf8)"));
+        }
+    }
+
+    #[test]
+    fn fused_rejects_trailing_content() {
+        assert_eq!(
+            validate_eager_fused(b"{}garbage", &ix(b"{}garbage"), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
+
+    #[test]
+    fn fused_rejects_excessive_depth() {
+        assert_eq!(
+            validate_eager_fused(b"[[[1]]]", &ix(b"[[[1]]]"), 2),
+            Err(qjson_err::QJSON_NESTING_TOO_DEEP),
+        );
+    }
+
+    #[test]
+    fn fused_depth_ok_at_limit() {
+        assert!(validate_eager_fused(b"[[1]]", &ix(b"[[1]]"), 2).is_ok());
+    }
+
+    #[test]
+    fn fused_trailing_whitespace_accepted() {
+        assert!(validate_eager_fused(b"{}   \n\t", &ix(b"{}   \n\t"), 1024).is_ok());
+    }
+
+    #[test]
+    fn fused_two_root_scalars_rejected() {
+        assert_eq!(
+            validate_eager_fused(b"1 2", &ix(b"1 2"), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
+
+    #[test]
+    fn fused_trailing_in_nested_container_detected() {
+        assert_eq!(
+            validate_eager_fused(b"[1] x", &ix(b"[1] x"), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
+
+    #[test]
+    fn fused_grammar_rejects_missing_colon() {
+        assert_eq!(
+            validate_eager_fused(b"{\"a\"}", &ix(b"{\"a\"}"), 1024),
+            Err(qjson_err::QJSON_PARSE_ERROR),
+        );
+    }
+
+    #[test]
+    fn fused_grammar_rejects_trailing_garbage_inside_object() {
+        assert_eq!(
+            validate_eager_fused(b"{\"a\":\"a\" 123}", &ix(b"{\"a\":\"a\" 123}"), 1024),
+            Err(qjson_err::QJSON_PARSE_ERROR),
+        );
+    }
+}

From af73d5b3ee0c943b0a4068dc8e165afb29f80db0 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 17:31:04 +0000
Subject: [PATCH 06/17] chore: fix clippy dead_code and doc warnings

Add #[allow(dead_code)] to classify.rs module and to validate_trailing /
validate_eager_values (kept for tests and planned future use). Fix
overindented doc list item.
---
 src/validate/classify.rs | 8 ++++++--
 src/validate/mod.rs      | 2 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/validate/classify.rs b/src/validate/classify.rs
index 700f246..c75d2db 100644
--- a/src/validate/classify.rs
+++ b/src/validate/classify.rs
@@ -9,6 +9,11 @@
 //! This replaces the three-comparison approach (`high || bs || ctrl`)
 //! used by the old string validation fast-path and extends the same
 //! LUT infrastructure to number validation.
+//!
+//! Some items (number LUTs, constants) are kept for planned number
+//! validation SIMD path.
+
+#![allow(dead_code)]
 
 pub(crate) const CLS_CTRL:  u8 = 0x01;
 pub(crate) const CLS_BS:    u8 = 0x02;
@@ -180,8 +185,7 @@ pub(crate) unsafe fn classify_str_mask(chunk: __m256i) -> u32 {
 ///
 /// Returns `(class_vector, bad_mask)`:
 ///   - `class_vector`: per-byte class bitmask (DIGIT | NUMS | CTRL | …)
-///   - `bad_mask`:     bits set for bytes with CTRL | BS | HIGH
-///                     (bytes that are unconditionally invalid in a number).
+///   - `bad_mask`:     bits set for bytes with CTRL | BS | HIGH (unconditionally invalid in a number).
 #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
 #[target_feature(enable = "avx2")]
 pub(crate) unsafe fn classify_num_chunk(chunk: __m256i) -> (__m256i, u32) {
diff --git a/src/validate/mod.rs b/src/validate/mod.rs
index 706b0bc..ef7d2dc 100644
--- a/src/validate/mod.rs
+++ b/src/validate/mod.rs
@@ -49,6 +49,7 @@ pub(crate) fn validate_depth(
 /// bracket where nesting depth returns to zero — that is the actual root
 /// end, regardless of how many additional structural chars the buffer has.
 /// For scalar roots (no opening bracket), we scan the raw bytes.
+#[allow(dead_code)]
 pub(crate) fn validate_trailing(
     buf: &[u8],
     indices: &[u32],
@@ -306,6 +307,7 @@ pub(crate) fn validate_eager_fused(
 /// `validate_number` or matched against the three literal keywords;
 /// the error-code precedence matches the previous heuristic-based
 /// `check_gap` so existing tests keep their current error codes.
+#[allow(dead_code)]
 pub(crate) fn validate_eager_values(
     buf: &[u8],
     indices: &[u32],

From c62b28d05b5e96da60e29115132b7fd18ae56543 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 17:38:48 +0000
Subject: [PATCH 07/17] chore: fix clippy warnings from rebase (collapsible_if,
 same_item_push, approx_constant)

---
 src/decode/number.rs |  8 +++-----
 src/validate/mod.rs  | 23 ++++++++++-------------
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/src/decode/number.rs b/src/decode/number.rs
index 74839ff..e7a561f 100644
--- a/src/decode/number.rs
+++ b/src/decode/number.rs
@@ -48,10 +48,8 @@ pub(crate) fn parse_f64(bytes: &[u8], skip_validation: bool) -> Result<f64, qjso
     // When validation is skipped, do a cheap precheck to avoid returning
     // a mode-dependent error code for non-number input.  The leading
     // byte must plausibly start a JSON number: `-`, `.`, or digit.
-    if skip_validation {
-        if bytes.is_empty() || !matches!(bytes[0], b'-' | b'.' | b'0'..=b'9') {
-            return Err(qjson_err::QJSON_INVALID_NUMBER);
-        }
+    if skip_validation && (bytes.is_empty() || !matches!(bytes[0], b'-' | b'.' | b'0'..=b'9')) {
+        return Err(qjson_err::QJSON_INVALID_NUMBER);
     }
 
     let s = std::str::from_utf8(bytes).map_err(|_| qjson_err::QJSON_DECODE_FAILED)?;
@@ -121,7 +119,7 @@ mod tests {
 
     #[test]
     fn f64_skip_validation_valid_input() {
-        assert_eq!(parse_f64(b"3.14", true).unwrap(), 3.14);
+        assert_eq!(parse_f64(b"3.15", true).unwrap(), 3.15);
     }
 
     #[test]
diff --git a/src/validate/mod.rs b/src/validate/mod.rs
index ef7d2dc..69abeca 100644
--- a/src/validate/mod.rs
+++ b/src/validate/mod.rs
@@ -662,24 +662,21 @@ mod tests {
 
     #[test]
     fn grammar_accepts_at_max_depth() {
-        // 1024 nested arrays at the default max_depth limit.
-        let mut buf = Vec::new();
-        for _ in 0..1024 { buf.push(b'['); }
-        for _ in 0..1024 { buf.push(b']'); }
-        assert!(
-            validate_eager_values(&buf, &ix(&buf), 1024).is_ok(),
-            "should accept exactly at max_depth"
-        );
+        let buf = [b'['].repeat(1024).into_iter()
+            .chain([b']'].repeat(1024))
+            .collect::<Vec<u8>>();
+        let indices = ix(&buf);
+        assert!(validate_eager_values(&buf, &indices, 1024).is_ok());
     }
 
     #[test]
     fn grammar_rejects_over_max_depth() {
-        // 1025 nested arrays — one past the default max_depth limit.
-        let mut buf = Vec::new();
-        for _ in 0..1025 { buf.push(b'['); }
-        for _ in 0..1025 { buf.push(b']'); }
+        let buf = [b'['].repeat(1025).into_iter()
+            .chain([b']'].repeat(1025))
+            .collect::<Vec<u8>>();
+        let indices = ix(&buf);
         assert_eq!(
-            validate_eager_values(&buf, &ix(&buf), 1024), Err(qjson_err::QJSON_NESTING_TOO_DEEP),
+            validate_eager_values(&buf, &indices, 1024), Err(qjson_err::QJSON_NESTING_TOO_DEEP),
         );
     }
 

From 5d0bc0f6acb6197175a5b8f690d54062f517f0a4 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 17:40:27 +0000
Subject: [PATCH 08/17] chore: remove docs/superpowers and gitignore entry

---
 .gitignore                                    |    1 -
 .../plans/2026-05-22-fuse-eager-simd-plan.md  | 1158 -----------------
 .../2026-05-22-fuse-eager-simd-design.md      |  131 --
 3 files changed, 1290 deletions(-)
 delete mode 100644 docs/superpowers/plans/2026-05-22-fuse-eager-simd-plan.md
 delete mode 100644 docs/superpowers/specs/2026-05-22-fuse-eager-simd-design.md

diff --git a/.gitignore b/.gitignore
index c352974..3d337aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 /target/
 **/*.rs.bk
 .worktrees/
-docs/superpowers/specs/
 
 # Cargo lock — uncomment to commit (recommended for binary crates, optional for cdylib/rlib).
 # Currently left untracked; remove this comment block and the line below to start tracking.
diff --git a/docs/superpowers/plans/2026-05-22-fuse-eager-simd-plan.md b/docs/superpowers/plans/2026-05-22-fuse-eager-simd-plan.md
deleted file mode 100644
index 8910091..0000000
--- a/docs/superpowers/plans/2026-05-22-fuse-eager-simd-plan.md
+++ /dev/null
@@ -1,1158 +0,0 @@
-# Fuse & Accelerate: Eager SIMD Optimization — Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Fuse the 4 eager-validation passes over `indices` into 1 pass, and accelerate string/number validation with PSHUFB nibble-LUT byte classification (AVX2 + AVX-512).
-
-**Architecture:** A new shared byte-classifier module (`classify.rs`) provides PSHUFB-based per-byte class bitmasks. String validation AVX2 is rewritten to use it (no early-scalar-fallback). An AVX-512 path is added. Number validation gains a SIMD fast path. The `validate_depth`, `validate_trailing`, and `validate_eager_values` functions are merged into `validate_eager_fused` — a single O(indices) traversal. `doc.rs` is updated to call only `validate_eager_fused`.
-
-**Tech Stack:** Rust, x86_64 intrinsics (AVX2 + AVX-512BW/VL), existing `once_cell` dispatch.
-
----
-
-### Task 1: PSHUFB byte classifier module
-
-**Files:**
-- Create: `src/validate/classify.rs`
-- Modify: `src/validate/mod.rs` (add module declaration)
-
-- [ ] **Step 1: Create `src/validate/classify.rs`**
-
-```rust
-//! PSHUFB nibble-LUT byte classifier shared by string and number
-//! validation. Maps each byte to a class bitmask in a single SIMD
-//! instruction sequence.
-//!
-//! Classification: split each byte into high/low nibble, lookup two
-//! 16-entry LUTs via `_mm256_shuffle_epi8`, AND the results. The AND
-//! means a classification bit is set only if BOTH nibbles allow it.
-
-#[cfg(target_arch = "x86_64")]
-use core::arch::x86_64::*;
-
-/// Class bits for string validation.
-pub(crate) const CLS_CTRL:  u8 = 1 << 0; // control char 0x00..=0x1F
-pub(crate) const CLS_BS:    u8 = 1 << 1; // backslash 0x5C
-pub(crate) const CLS_HIGH:  u8 = 1 << 2; // high-bit byte >= 0x80
-
-/// Class bits for number validation (includes string bits for reuse).
-pub(crate) const CLS_DIGIT: u8 = 1 << 3; // digit 0x30..=0x39
-pub(crate) const CLS_NUMS:  u8 = 1 << 4; // number structural: . - e E +
-
-#[cfg(target_arch = "x86_64")]
-#[inline(always)]
-pub(crate) unsafe fn classify_str_chunk(chunk: __m256i) -> u32 {
-    classify_chunk(chunk, &STR_LO_LUT, &STR_HI_LUT)
-}
-
-/// Classify each byte in the 32-byte chunk. Returns a u32 mask where
-/// bit i is set if byte i has any "interesting" class bits
-/// (CTRL | BS | HIGH).
-#[cfg(target_arch = "x86_64")]
-#[inline(always)]
-pub(crate) unsafe fn classify_str_mask(chunk: __m256i) -> u32 {
-    let class = classify_chunk(chunk, &STR_LO_LUT, &STR_HI_LUT);
-    // Extract the "interesting" bits: any non-zero class byte means
-    // attention needed. We check CTRL | BS | HIGH bits.
-    let want = _mm256_set1_epi8((CLS_CTRL | CLS_BS | CLS_HIGH) as i8);
-    let match_mask = _mm256_and_si256(class, want);
-    _mm256_movemask_epi8(_mm256_cmpeq_epi8(match_mask, _mm256_setzero_si256())) as u32 ^ 0xFFFFFFFFu32
-}
-
-/// Classify a number chunk. Returns the per-byte class vector so the
-/// caller can check DIGIT | NUMS validity.
-#[cfg(target_arch = "x86_64")]
-#[inline(always)]
-pub(crate) unsafe fn classify_num_chunk(chunk: __m256i) -> (__m256i, u32) {
-    let class = classify_chunk(chunk, &NUM_LO_LUT, &NUM_HI_LUT);
-    // Check which bytes are NOT (DIGIT | NUMS).
-    let valid = _mm256_set1_epi8((CLS_DIGIT | CLS_NUMS) as i8);
-    let ok = _mm256_cmpeq_epi8(_mm256_and_si256(class, valid), _mm256_setzero_si256());
-    let bad_mask = _mm256_movemask_epi8(ok) as u32 ^ 0xFFFFFFFFu32;
-    (class, bad_mask)
-}
-
-/// Core PSHUFB nibble-LUT classifier: returns per-byte class bitmask.
-#[cfg(target_arch = "x86_64")]
-#[inline(always)]
-unsafe fn classify_chunk(chunk: __m256i, lo_lut: &[u8; 16], hi_lut: &[u8; 16]) -> __m256i {
-    let zero = _mm256_setzero_si256();
-    let nib_mask = _mm256_set1_epi8(0x0F_i8);
-    let lo_lut_vec = _mm256_loadu_si256(
-        [lo_lut[0], lo_lut[1], lo_lut[2], lo_lut[3],
-         lo_lut[4], lo_lut[5], lo_lut[6], lo_lut[7],
-         lo_lut[8], lo_lut[9], lo_lut[10], lo_lut[11],
-         lo_lut[12], lo_lut[13], lo_lut[14], lo_lut[15],
-         lo_lut[0], lo_lut[1], lo_lut[2], lo_lut[3],
-         lo_lut[4], lo_lut[5], lo_lut[6], lo_lut[7],
-         lo_lut[8], lo_lut[9], lo_lut[10], lo_lut[11],
-         lo_lut[12], lo_lut[13], lo_lut[14], lo_lut[15],
-        ].as_ptr() as *const __m256i,
-    );
-    // Build hi_lut vector (same layout).
-    let hi_lut_vec = _mm256_loadu_si256(
-        [hi_lut[0], hi_lut[1], hi_lut[2], hi_lut[3],
-         hi_lut[4], hi_lut[5], hi_lut[6], hi_lut[7],
-         hi_lut[8], hi_lut[9], hi_lut[10], hi_lut[11],
-         hi_lut[12], hi_lut[13], hi_lut[14], hi_lut[15],
-         hi_lut[0], hi_lut[1], hi_lut[2], hi_lut[3],
-         hi_lut[4], hi_lut[5], hi_lut[6], hi_lut[7],
-         hi_lut[8], hi_lut[9], hi_lut[10], hi_lut[11],
-         hi_lut[12], hi_lut[13], hi_lut[14], hi_lut[15],
-        ].as_ptr() as *const __m256i,
-    );
-
-    let lo_nib = _mm256_and_si256(chunk, nib_mask);
-    let hi_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(chunk), nib_mask);
-
-    let lo_class = _mm256_shuffle_epi8(lo_lut_vec, lo_nib);
-    let hi_class = _mm256_shuffle_epi8(hi_lut_vec, hi_nib);
-
-    _mm256_and_si256(lo_class, hi_class)
-}
-
-// ── String classification LUTs ──────────────────────────────────────
-// CTRL: 0x00..=0x1F (high nibble 0x0..=0x1, any low nibble)
-// BS:   0x5C          (high nibble 0x5,   low nibble 0xC)
-// HIGH: 0x80..=0xFF   (high nibble 0x8..=0xF, any low nibble)
-
-#[cfg(target_arch = "x86_64")]
-static STR_LO_LUT: [u8; 16] = {
-    let mut l = [0u8; 16];
-    let mut i = 0usize;
-    while i < 16 {
-        l[i] = CLS_CTRL | CLS_HIGH;
-        i += 1;
-    }
-    l[0xC] |= CLS_BS; // backslash low nibble
-    l
-};
-
-#[cfg(target_arch = "x86_64")]
-static STR_HI_LUT: [u8; 16] = {
-    let mut l = [0u8; 16];
-    l[0x0] = CLS_CTRL;
-    l[0x1] = CLS_CTRL;
-    l[0x5] = CLS_BS; // backslash high nibble
-    l[0x8] = CLS_HIGH;
-    l[0x9] = CLS_HIGH;
-    l[0xA] = CLS_HIGH;
-    l[0xB] = CLS_HIGH;
-    l[0xC] = CLS_HIGH;
-    l[0xD] = CLS_HIGH;
-    l[0xE] = CLS_HIGH;
-    l[0xF] = CLS_HIGH;
-    l
-};
-
-// ── Number classification LUTs ──────────────────────────────────────
-// DIGIT:      0x30..=0x39 (high nibble 0x3, low nibble 0x0..=0x9)
-// NUM_STRUCT: 0x2E '.', 0x2D '-', 0x2B '+', 0x65 'e', 0x45 'E'
-
-#[cfg(target_arch = "x86_64")]
-static NUM_LO_LUT: [u8; 16] = {
-    let mut l = STR_LO_LUT;
-    // digits: low nibble 0..9
-    l[0x0] |= CLS_DIGIT;
-    l[0x1] |= CLS_DIGIT;
-    l[0x2] |= CLS_DIGIT;
-    l[0x3] |= CLS_DIGIT;
-    l[0x4] |= CLS_DIGIT;
-    l[0x5] |= CLS_DIGIT | CLS_NUMS; // also 'e'/'E' low nibble
-    l[0x6] |= CLS_DIGIT;
-    l[0x7] |= CLS_DIGIT;
-    l[0x8] |= CLS_DIGIT;
-    l[0x9] |= CLS_DIGIT;
-    // number structural low nibbles
-    l[0xB] |= CLS_NUMS; // '+'
-    l[0xD] |= CLS_NUMS; // '-'
-    l[0xE] |= CLS_NUMS; // '.'
-    l
-};
-
-#[cfg(target_arch = "x86_64")]
-static NUM_HI_LUT: [u8; 16] = {
-    let mut l = STR_HI_LUT;
-    // digits: high nibble 0x3
-    l[0x3] |= CLS_DIGIT;
-    // number structural high nibbles
-    l[0x2] |= CLS_NUMS; // '.', '-', '+'
-    l[0x4] |= CLS_NUMS; // 'E'
-    l[0x6] |= CLS_NUMS; // 'e'
-    l
-};
-
-#[cfg(test)]
-#[cfg(target_arch = "x86_64")]
-mod tests {
-    use super::*;
-
-    /// Verify the classifier against the scalar string validator for
-    /// all 256 possible byte values. The classifier's bits must be
-    /// consistent with the ground-truth ranges.
-    #[test]
-    fn lut_exhaustive_consistency() {
-        if !std::is_x86_feature_detected!("avx2") { return; }
-        let mut buf = [0u8; 32];
-        for b in 0..=255u8 {
-            buf[0] = b;
-            unsafe {
-                let chunk = _mm256_loadu_si256(buf.as_ptr() as *const __m256i);
-                let class = classify_chunk(chunk, &STR_LO_LUT, &STR_HI_LUT);
-                let class_byte = _mm256_extract_epi8(class, 0) as u8;
-
-                let expect_ctrl = if b < 0x20 { CLS_CTRL } else { 0 };
-                let expect_bs   = if b == b'\\' { CLS_BS } else { 0 };
-                let expect_high = if b >= 0x80 { CLS_HIGH } else { 0 };
-                let expected = expect_ctrl | expect_bs | expect_high;
-
-                assert_eq!(
-                    class_byte, expected,
-                    "byte 0x{:02X}: got 0x{:02X}, expected 0x{:02X}",
-                    b, class_byte, expected,
-                );
-            }
-        }
-    }
-
-    /// Verify number classification for all 256 byte values.
-    #[test]
-    fn num_lut_exhaustive_consistency() {
-        if !std::is_x86_feature_detected!("avx2") { return; }
-        let mut buf = [0u8; 32];
-        for b in 0..=255u8 {
-            buf[0] = b;
-            unsafe {
-                let chunk = _mm256_loadu_si256(buf.as_ptr() as *const __m256i);
-                let class = classify_chunk(chunk, &NUM_LO_LUT, &NUM_HI_LUT);
-                let class_byte = _mm256_extract_epi8(class, 0) as u8;
-
-                let expect_digit = if matches!(b, b'0'..=b'9') { CLS_DIGIT } else { 0 };
-                let expect_nums = if matches!(b, b'.' | b'-' | b'+' | b'e' | b'E') { CLS_NUMS } else { 0 };
-                // NUM LUT inherits STR bits too.
-                let expect_str = {
-                    let c = if b < 0x20 { CLS_CTRL } else { 0 };
-                    let s = if b == b'\\' { CLS_BS } else { 0 };
-                    let h = if b >= 0x80 { CLS_HIGH } else { 0 };
-                    c | s | h
-                };
-                let expected = expect_str | expect_digit | expect_nums;
-                assert_eq!(
-                    class_byte, expected,
-                    "byte 0x{:02X}: got 0x{:02X}, expected 0x{:02X}",
-                    b, class_byte, expected,
-                );
-            }
-        }
-    }
-}
-```
-
-- [ ] **Step 2: Add module declaration to `src/validate/mod.rs`**
-
-Add after the existing `mod` declarations (after line 10 `pub(crate) use strings::validate_string_span;`):
-
-```rust
-pub(crate) mod classify;
-```
-
-- [ ] **Step 3: Run classifier tests**
-
-```bash
-cargo test --release validate::classify
-```
-
-Expected: 2 tests pass (exhaustive LUT consistency).
-
-- [ ] **Step 4: Commit**
-
-```bash
-git add src/validate/classify.rs src/validate/mod.rs
-git commit -m "feat: add PSHUFB nibble-LUT byte classifier module
-
-Provides classify_str_chunk/classify_num_chunk for SIMD byte
-classification. Exhaustive LUT consistency tests for all 256 byte
-values against ground-truth ranges (control, backslash, high-bit,
-digit, number structural)."
-```
-
----
-
-### Task 2: Rewrite AVX2 string validation to use classifier
-
-**Files:**
-- Modify: `src/validate/strings/avx2.rs`
-
-- [ ] **Step 1: Replace `src/validate/strings/avx2.rs`**
-
-Replace the entire file content:
-
-```rust
-#![cfg(all(target_arch = "x86_64", feature = "avx2"))]
-
-//! AVX2 string-content validation using PSHUFB nibble-LUT byte classifier.
-//!
-//! Each 32-byte chunk is classified via `classify_str_mask`. Control chars
-//! (CLS_CTRL) are immediately rejected. Backslashes (CLS_BS) trigger
-//! escape-sequence validation statefully. High-bit bytes (CLS_HIGH)
-//! trigger scalar UTF-8 sequence validation.
-//!
-//! Unlike the previous "find-first-interesting-then-scalar" approach,
-//! this validator processes backslash/UTF-8 in-batch: after classifying
-//! a chunk, it walks the CLS_BS/CLS_HIGH mask to validate each position
-//! while the chunk data is still hot in registers. Pure printable-ASCII
-//! chunks are fully skipped.
-
-use crate::error::qjson_err;
-use core::arch::x86_64::*;
-use crate::validate::classify::{CLS_CTRL, CLS_BS, CLS_HIGH, classify_str_mask};
-
-/// Validate the string span using AVX2 with PSHUFB classifier.
-pub(crate) fn validate_span_avx2(span: &[u8]) -> Result<(), qjson_err> {
-    // SAFETY: dispatcher has verified AVX2 feature presence.
-    unsafe { validate_span_avx2_impl(span) }
-}
-
-#[target_feature(enable = "avx2")]
-unsafe fn validate_span_avx2_impl(span: &[u8]) -> Result<(), qjson_err> {
-    let mut i: usize = 0;
-    let n = span.len();
-
-    while i + 32 <= n {
-        let chunk = _mm256_loadu_si256(span.as_ptr().add(i) as *const __m256i);
-        let mask = classify_str_mask(chunk);
-
-        if mask == 0 {
-            i += 32;
-            continue;
-        }
-
-        // Walk each flagged byte position.
-        let mut m = mask;
-        while m != 0 {
-            let off = m.trailing_zeros() as usize;
-            let pos = i + off;
-            let b = span[pos];
-
-            if b < 0x20 {
-                return Err(qjson_err::QJSON_INVALID_STRING);
-            }
-            if b == b'\\' {
-                // Validate escape: the escape target is at pos+1.
-                if pos + 1 >= n {
-                    return Err(qjson_err::QJSON_INVALID_STRING);
-                }
-                match span[pos + 1] {
-                    b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {
-                        // Standard escape: consume both bytes.
-                        // Continue scanning this chunk after the escape.
-                    }
-                    b'u' => {
-                        if pos + 6 > n
-                            || !span[pos + 2].is_ascii_hexdigit()
-                            || !span[pos + 3].is_ascii_hexdigit()
-                            || !span[pos + 4].is_ascii_hexdigit()
-                            || !span[pos + 5].is_ascii_hexdigit()
-                        {
-                            return Err(qjson_err::QJSON_INVALID_STRING);
-                        }
-                    }
-                    _ => return Err(qjson_err::QJSON_INVALID_STRING),
-                }
-            }
-            if b >= 0x80 {
-                // For high-bit bytes detected in a chunk, hand off to the
-                // scalar UTF-8 validator. Since UTF-8 sequences can be up
-                // to 4 bytes long and have complex overlong/surrogate
-                // constraints, we delegate to the well-tested scalar path.
-                return super::scalar::validate_span_scalar(&span[pos..]);
-            }
-
-            m &= m - 1;
-        }
-
-        i += 32;
-    }
-
-    // Tail (<32 bytes): scalar validator.
-    super::scalar::validate_span_scalar(&span[i..])
-}
-```
-
-- [ ] **Step 2: Run existing string validation tests**
-
-```bash
-cargo test --release validate::strings
-```
-
-Expected: All existing tests pass (the classifier handles the same byte ranges).
-
-- [ ] **Step 3: Commit**
-
-```bash
-git add src/validate/strings/avx2.rs
-git commit -m "perf: rewrite AVX2 string validation with PSHUFB classifier
-
-Replace first-interesting-byte-then-scalar approach with per-byte
-classification via classify_str_mask. Escapes and UTF-8 triggers
-are processed in-batch while chunk data is hot in registers."
-```
-
----
-
-### Task 3: Add AVX-512 string validation path
-
-**Files:**
-- Create: `src/validate/strings/avx512.rs`
-- Modify: `src/validate/strings/mod.rs`
-
-- [ ] **Step 1: Create `src/validate/strings/avx512.rs`**
-
-```rust
-#![cfg(all(target_arch = "x86_64", feature = "avx2"))]
-
-//! AVX-512BW+VL string-content validation.
-//!
-//! Uses 64-byte ZMM registers via two 32-byte YMM halves, since we
-//! require AVX-512BW (byte operations) and AVX-512VL (512-bit ops
-//! on YMM registers via EVEX encoding). The PSHUFB classifier still
-//! uses YMM since AVX-512VBMI (zmm-wide shuffle) is not assumed.
-//! Native mask registers (_k*_, `__mmask32`) replace manual `u32`
-//! bitmask operations for zero-cost conditional ops.
-
-use crate::error::qjson_err;
-use core::arch::x86_64::*;
-use crate::validate::classify::{
-    CLS_CTRL, CLS_BS, CLS_HIGH,
-    classify_str_mask,
-};
-
-pub(crate) fn validate_span_avx512(span: &[u8]) -> Result<(), qjson_err> {
-    // SAFETY: dispatcher verifies AVX-512BW+VL feature presence.
-    unsafe { validate_span_avx512_impl(span) }
-}
-
-#[target_feature(enable = "avx2,avx512bw,avx512vl")]
-unsafe fn validate_span_avx512_impl(span: &[u8]) -> Result<(), qjson_err> {
-    let mut i: usize = 0;
-    let n = span.len();
-
-    // Process 64 bytes per outer iteration: two 32B YMM chunks.
-    while i + 64 <= n {
-        let lo = _mm256_loadu_si256(span.as_ptr().add(i)       as *const __m256i);
-        let hi = _mm256_loadu_si256(span.as_ptr().add(i + 32)  as *const __m256i);
-
-        let mask_lo = classify_str_mask(lo);
-        let mask_hi = classify_str_mask(hi);
-
-        if (mask_lo | mask_hi) == 0 {
-            i += 64;
-            continue;
-        }
-
-        // Process flagged bytes in both halves.
-        // Half 0.
-        let mut m = mask_lo;
-        while m != 0 {
-            let off = m.trailing_zeros() as usize;
-            let pos = i + off;
-            let b = span[pos];
-            if b < 0x20 {
-                return Err(qjson_err::QJSON_INVALID_STRING);
-            }
-            if b == b'\\' {
-                if pos + 1 >= n { return Err(qjson_err::QJSON_INVALID_STRING); }
-                match span[pos + 1] {
-                    b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {}
-                    b'u' => {
-                        if pos + 6 > n
-                            || !span[pos+2].is_ascii_hexdigit()
-                            || !span[pos+3].is_ascii_hexdigit()
-                            || !span[pos+4].is_ascii_hexdigit()
-                            || !span[pos+5].is_ascii_hexdigit()
-                        { return Err(qjson_err::QJSON_INVALID_STRING); }
-                    }
-                    _ => return Err(qjson_err::QJSON_INVALID_STRING),
-                }
-            }
-            if b >= 0x80 {
-                return super::scalar::validate_span_scalar(&span[pos..]);
-            }
-            m &= m - 1;
-        }
-
-        // Half 1.
-        let mut m = mask_hi;
-        while m != 0 {
-            let off = m.trailing_zeros() as usize;
-            let pos = i + 32 + off;
-            let b = span[pos];
-            if b < 0x20 {
-                return Err(qjson_err::QJSON_INVALID_STRING);
-            }
-            if b == b'\\' {
-                if pos + 1 >= n { return Err(qjson_err::QJSON_INVALID_STRING); }
-                match span[pos + 1] {
-                    b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {}
-                    b'u' => {
-                        if pos + 6 > n
-                            || !span[pos+2].is_ascii_hexdigit()
-                            || !span[pos+3].is_ascii_hexdigit()
-                            || !span[pos+4].is_ascii_hexdigit()
-                            || !span[pos+5].is_ascii_hexdigit()
-                        { return Err(qjson_err::QJSON_INVALID_STRING); }
-                    }
-                    _ => return Err(qjson_err::QJSON_INVALID_STRING),
-                }
-            }
-            if b >= 0x80 {
-                return super::scalar::validate_span_scalar(&span[pos..]);
-            }
-            m &= m - 1;
-        }
-
-        i += 64;
-    }
-
-    // Tail (<64 bytes): hand off to AVX2 path.
-    super::avx2::validate_span_avx2(&span[i..])
-}
-```
-
-- [ ] **Step 2: Update dispatch in `src/validate/strings/mod.rs`**
-
-Add the AVX-512 module declaration after `mod avx2;`:
-
-```rust
-#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
-mod avx2;
-#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
-mod avx512;
-```
-
-Update the `VALIDATE_FN` initializer in `validate_string_span` to try AVX-512 first:
-
-```rust
-pub(crate) fn validate_string_span(span: &[u8]) -> Result<(), qjson_err> {
-    let f = *VALIDATE_FN.get_or_init(|| {
-        #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
-        {
-            if std::is_x86_feature_detected!("avx512bw")
-                && std::is_x86_feature_detected!("avx512vl")
-            {
-                return avx512::validate_span_avx512 as ValidateFn;
-            }
-            if std::is_x86_feature_detected!("avx2") {
-                return avx2::validate_span_avx2 as ValidateFn;
-            }
-        }
-        #[cfg(target_arch = "aarch64")]
-        {
-            return neon::validate_span_neon as ValidateFn;
-        }
-        #[allow(unreachable_code)]
-        {
-            scalar::validate_span_scalar as ValidateFn
-        }
-    });
-    f(span)
-}
-```
-
-Update the module doc comment for `strings/mod.rs` (lines 1-12) to mention AVX-512:
-
-```rust
-//! String-content validation: control chars, escape grammar, and UTF-8.
-//!
-//! Single-pass validator with optional SIMD acceleration. The public
-//! entry point [`validate_string_span`] dispatches once via `OnceCell` to
-//! the best available implementation:
-//!
-//!   - x86_64 + AVX-512BW+VL: 64-byte 2×YMM chunks with native mask regs.
-//!   - x86_64 + AVX2:         32-byte PSHUFB classifier chunks.
-//!   - aarch64 NEON:          16-byte chunk skip → scalar tail.
-//!   - Otherwise:             pure scalar state machine.
-//!
-//! All paths return identical error codes for any input.
-```
-
-- [ ] **Step 3: Run tests**
-
-```bash
-cargo test --release validate::strings
-```
-
-Expected: All tests pass. AVX-512 path automatically selected if hardware supports it.
-
-- [ ] **Step 4: Commit**
-
-```bash
-git add src/validate/strings/avx512.rs src/validate/strings/mod.rs
-git commit -m "perf: add AVX-512BW+VL string validation path
-
-64-byte iteration via two YMM PSHUFB chunks per loop. Native mask
-registers via AVX-512BW/VL. Dispatch priority: AVX-512 > AVX2 >
-NEON > scalar."
-```
-
----
-
-### Task 4: Add SIMD number validation fast path
-
-**Files:**
-- Modify: `src/validate/number.rs`
-- Modify: `src/validate/mod.rs` (wire into `validate_scalar`)
-
-- [ ] **Step 1: Add SIMD fast path to `src/validate/number.rs`**
-
-Add the new function after the existing `validate_number`. Also add a `#[cfg]`-gated import at the top:
-
-```rust
-use crate::error::qjson_err;
-
-#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
-use crate::validate::classify::{CLS_DIGIT, CLS_NUMS, classify_num_chunk};
-#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
-use core::arch::x86_64::*;
-```
-
-After `validate_number` (before the `#[cfg(test)]` block), add:
-
-```rust
-/// SIMD-assisted number validation. For numbers ≤ 32 bytes (the
-/// common case), classifies all bytes in one SIMD operation and
-/// validates ABNF structure via the class mask.
-///
-/// Falls back to scalar `validate_number` for precise error reporting
-/// when the SIMD path cannot conclusively validate.
-#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
-pub(crate) fn validate_number_simd(bytes: &[u8]) -> Result<(), qjson_err> {
-    // SAFETY: caller ensures AVX2 is available (via runtime detect or
-    // compile-time feature gate).
-    unsafe { validate_number_simd_impl(bytes) }
-}
-
-#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
-#[target_feature(enable = "avx2")]
-unsafe fn validate_number_simd_impl(bytes: &[u8]) -> Result<(), qjson_err> {
-    let n = bytes.len();
-    if n == 0 {
-        return Err(qjson_err::QJSON_INVALID_NUMBER);
-    }
-    if n <= 4 {
-        // Too short for SIMD: use scalar directly.
-        return super::validate_number(bytes);
-    }
-
-    // Load up to 32 bytes into a YMM register (zero-pad tail).
-    let mut buf = [0u8; 32];
-    let copy_len = n.min(32);
-    buf[..copy_len].copy_from_slice(&bytes[..copy_len]);
-    let chunk = _mm256_loadu_si256(buf.as_ptr() as *const __m256i);
-
-    let (class, bad_mask) = classify_num_chunk(chunk);
-
-    // Check for bytes that are neither DIGIT nor NUM_STRUCT.
-    if bad_mask != 0 {
-        // Check if the bad byte is beyond the actual number length
-        // (zero-padding in buf[copy_len..] should be 0).
-        let trailing_zero_mask = (1u32 << copy_len).wrapping_sub(1);
-        if (bad_mask & trailing_zero_mask) != 0 {
-            // Actual invalid byte: fall through to scalar for precise
-            // error code.
-            return super::validate_number(bytes);
-        }
-    }
-
-    // All bytes in [0..copy_len] are DIGIT or NUM_STRUCT.
-    // Fall back to scalar for the tail if >32 bytes.
-    super::validate_number(bytes)
-}
-```
-
-- [ ] **Step 2: Wire SIMD number validation into `consume_scalar_gap`**
-
-In `src/validate/mod.rs`, update the `validate_scalar` function (line 347) to try SIMD first for number-like scalars:
-
-```rust
-fn validate_scalar(scalar: &[u8]) -> Result<(), qjson_err> {
-    match scalar[0] {
-        b't' => if scalar == b"true"  { Ok(()) } else { Err(qjson_err::QJSON_PARSE_ERROR) },
-        b'f' => if scalar == b"false" { Ok(()) } else { Err(qjson_err::QJSON_PARSE_ERROR) },
-        b'n' => if scalar == b"null"  { Ok(()) } else { Err(qjson_err::QJSON_PARSE_ERROR) },
-        b'-' | b'0'..=b'9' | b'+' | b'.' => {
-            #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
-            {
-                number::validate_number_simd(scalar)
-            }
-            #[cfg(not(all(target_arch = "x86_64", feature = "avx2")))]
-            {
-                number::validate_number(scalar)
-            }
-        },
-        _ if scalar == b"NaN" || scalar == b"Infinity" => number::validate_number(scalar),
-        _ => Err(qjson_err::QJSON_PARSE_ERROR),
-    }
-}
-```
-
-- [ ] **Step 3: Make `validate_number` public to `super`**
-
-In `src/validate/number.rs`, ensure `validate_number` is accessible from `mod.rs`. This is already the case since it's `pub(crate)`. The `validate_number_simd` fallback calls `super::validate_number` from `number.rs` — but `super` in `number.rs` is the `validate` module. Let's use the correct path.
-
-Update the import in `number.rs` (add at top):
-
-```rust
-use crate::validate::validate_number as validate_number_scalar;
-```
-
-Then in `validate_number_simd_impl`, use `validate_number_scalar(bytes)` instead of `super::validate_number(bytes)`.
-
-- [ ] **Step 4: Run tests**
-
-```bash
-cargo test --release validate::number
-cargo test --release validate::mod
-```
-
-Expected: All number validation tests pass. Eager grammar tests pass.
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add src/validate/number.rs src/validate/mod.rs
-git commit -m "perf: add SIMD number validation fast path
-
-validate_number_simd classifies number bytes with PSHUFB classifier,
-checking for illegal non-digit/non-structural bytes in one SIMD pass.
-Falls back to scalar validate_number for precise error codes."
-```
-
----
-
-### Task 5: Implement pass fusion (validate_eager_fused)
-
-**Files:**
-- Modify: `src/validate/mod.rs`
-
-- [ ] **Step 1: Add `validate_eager_fused` function to `src/validate/mod.rs`**
-
-Add the new function before the test module. Place it after the existing `validate_eager_values` function (after line 271):
-
-```rust
-/// Fused eager validator: combines depth limit checking, trailing-content
-/// detection, and grammar/value validation into a single walk over `indices`.
-///
-/// Replaces `validate_depth` + `validate_trailing` + `validate_eager_values`.
-pub(crate) fn validate_eager_fused(
-    buf: &[u8],
-    indices: &[u32],
-    max_depth: u32,
-) -> Result<(), qjson_err> {
-    let mut stack: Vec<CtxKind> = Vec::with_capacity(16);
-    stack.push(CtxKind::Top);
-
-    let mut depth: u32 = 0;
-    let mut prev_end: usize = 0;
-    let mut i: usize = 0;
-
-    while i < indices.len() {
-        let idx = indices[i];
-        if idx == u32::MAX { break; }
-        let pos = idx as usize;
-        let b = buf[pos];
-
-        consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?;
-
-        match b {
-            b'{' | b'[' => {
-                let cur = stack.last_mut().unwrap();
-                match *cur {
-                    CtxKind::Top
-                    | CtxKind::ArrAfterOpen
-                    | CtxKind::ArrAfterComma
-                    | CtxKind::ObjAfterColon => {
-                        *cur = parent_after_value(*cur);
-                        // Depth check: increment on open brace/bracket.
-                        depth += 1;
-                        if depth > max_depth {
-                            return Err(qjson_err::QJSON_NESTING_TOO_DEEP);
-                        }
-                        stack.push(if b == b'{' {
-                            CtxKind::ObjAfterOpen
-                        } else {
-                            CtxKind::ArrAfterOpen
-                        });
-                    }
-                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
-                }
-                prev_end = pos + 1;
-                i += 1;
-            }
-            b'}' => {
-                let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
-                if !matches!(top, CtxKind::ObjAfterOpen | CtxKind::ObjAfterValue) {
-                    return Err(qjson_err::QJSON_PARSE_ERROR);
-                }
-                if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); }
-                depth -= 1;
-                // Trailing check: when depth returns to 0 (root container
-                // closed) AND the root grammar state is satisfied, check
-                // for trailing content.
-                if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
-                    let closer_pos = pos;
-                    let mut p = closer_pos + 1;
-                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
-                    if p < buf.len() {
-                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
-                    }
-                }
-                prev_end = pos + 1;
-                i += 1;
-            }
-            b']' => {
-                let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
-                if !matches!(top, CtxKind::ArrAfterOpen | CtxKind::ArrAfterValue) {
-                    return Err(qjson_err::QJSON_PARSE_ERROR);
-                }
-                if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); }
-                depth -= 1;
-                if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
-                    let mut p = pos + 1;
-                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
-                    if p < buf.len() {
-                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
-                    }
-                }
-                prev_end = pos + 1;
-                i += 1;
-            }
-            b',' => {
-                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
-                match *cur {
-                    CtxKind::ArrAfterValue => *cur = CtxKind::ArrAfterComma,
-                    CtxKind::ObjAfterValue => *cur = CtxKind::ObjAfterComma,
-                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
-                }
-                prev_end = pos + 1;
-                i += 1;
-            }
-            b':' => {
-                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
-                match *cur {
-                    CtxKind::ObjAfterKey => *cur = CtxKind::ObjAfterColon,
-                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
-                }
-                prev_end = pos + 1;
-                i += 1;
-            }
-            b'"' => {
-                if i + 1 >= indices.len() { return Err(qjson_err::QJSON_PARSE_ERROR); }
-                let close = indices[i + 1] as usize;
-                if close <= pos || close >= buf.len() || buf[close] != b'"' {
-                    return Err(qjson_err::QJSON_PARSE_ERROR);
-                }
-                strings::validate_string_span(&buf[pos + 1 .. close])?;
-
-                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
-                match *cur {
-                    CtxKind::ObjAfterOpen | CtxKind::ObjAfterComma => {
-                        *cur = CtxKind::ObjAfterKey;
-                    }
-                    CtxKind::Top
-                    | CtxKind::ArrAfterOpen
-                    | CtxKind::ArrAfterComma
-                    | CtxKind::ObjAfterColon => {
-                        *cur = parent_after_value(*cur);
-                    }
-                    _ => return Err(qjson_err::QJSON_PARSE_ERROR),
-                }
-                // Trailing check for string roots: when Top→TopDone and
-                // depth is 0, check for trailing content.
-                if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
-                    let mut p = close + 1;
-                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
-                    if p < buf.len() {
-                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
-                    }
-                }
-                prev_end = close + 1;
-                i += 2;
-            }
-            _ => return Err(qjson_err::QJSON_PARSE_ERROR),
-        }
-    }
-
-    // Tail: top-level scalar root (e.g. `42`, `true`).
-    consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?;
-
-    // Trailing check for scalar roots.
-    if stack.len() == 1 && stack[0] == CtxKind::TopDone {
-        let mut p = prev_end;
-        // If prev_end was set to the end of a scalar, check for whitespace
-        // then non-whitespace.
-        if p < buf.len() {
-            // prev_end is already past the scalar. Check the remaining buffer.
-            // For scalar roots, the consume_scalar_gap at line 263 walks to
-            // buf.len(), but may have consumed only the gap. The trailing
-            // bytes past the value end are the issue.
-            // Re-find the end of the root value from the beginning.
-            let mut scan = 0usize;
-            while scan < buf.len() && is_ws(buf[scan]) { scan += 1; }
-            let val_start = scan;
-            while scan < buf.len() && !is_ws(buf[scan]) { scan += 1; }
-            while scan < buf.len() && is_ws(buf[scan]) { scan += 1; }
-            if scan < buf.len() {
-                return Err(qjson_err::QJSON_TRAILING_CONTENT);
-            }
-        }
-    }
-
-    if stack.len() != 1 || stack[0] != CtxKind::TopDone {
-        return Err(qjson_err::QJSON_PARSE_ERROR);
-    }
-    Ok(())
-}
-```
-
-- [ ] **Step 2: Add fused tests to the validate::tests module**
-
-Add after the existing grammar tests (after line 487):
-
-```rust
-    // ── fused validator tests ────────────────────────────────────────
-
-    #[test]
-    fn fused_accepts_clean_input() {
-        for buf in [
-            &b"{}"[..], &b"[]"[..], &b"{\"a\":1}"[..],
-            &b"[1,2,3]"[..], &b"42"[..], &b"\"hi\""[..],
-            &b"[true,false,null]"[..],
-        ] {
-            assert!(validate_eager_fused(buf, &ix(buf), 1024).is_ok(),
-                "fused should accept {:?}", buf);
-        }
-    }
-
-    #[test]
-    fn fused_rejects_trailing_content() {
-        let buf = b"{}garbage";
-        assert_eq!(
-            validate_eager_fused(buf, &ix(buf), 1024),
-            Err(qjson_err::QJSON_TRAILING_CONTENT),
-        );
-    }
-
-    #[test]
-    fn fused_rejects_excessive_depth() {
-        let buf = b"[[[1]]]";
-        assert_eq!(
-            validate_eager_fused(buf, &ix(buf), 2),
-            Err(qjson_err::QJSON_NESTING_TOO_DEEP),
-        );
-    }
-
-    #[test]
-    fn fused_depth_ok_at_limit() {
-        let buf = b"[[1]]";
-        assert!(validate_eager_fused(buf, &ix(buf), 2).is_ok());
-    }
-
-    #[test]
-    fn fused_trailing_whitespace_accepted() {
-        let buf = b"{}   \n\t";
-        assert!(validate_eager_fused(buf, &ix(buf), 1024).is_ok());
-    }
-
-    #[test]
-    fn fused_two_root_scalars_rejected() {
-        let buf = b"1 2";
-        assert_eq!(
-            validate_eager_fused(buf, &ix(buf), 1024),
-            Err(qjson_err::QJSON_TRAILING_CONTENT),
-        );
-    }
-
-    #[test]
-    fn fused_trailing_in_nested_container_detected() {
-        let buf = b"[1] x";
-        assert_eq!(
-            validate_eager_fused(buf, &ix(buf), 1024),
-            Err(qjson_err::QJSON_TRAILING_CONTENT),
-        );
-    }
-```
-
-- [ ] **Step 3: Run tests**
-
-```bash
-cargo test --release validate::mod
-```
-
-Expected: All existing grammar tests + new fused tests pass.
-
-- [ ] **Step 4: Commit**
-
-```bash
-git add src/validate/mod.rs
-git commit -m "perf: add validate_eager_fused merging depth+trailing+grammar
-
-Single O(indices) traversal replaces 3 separate passes. Depth check
-inlined on container push. Trailing-content check triggered when
-grammar state reaches TopDone at depth 0."
-```
-
----
-
-### Task 6: Wire fused validator in doc.rs + full test suite
-
-**Files:**
-- Modify: `src/doc.rs`
-
-- [ ] **Step 1: Replace 3 validation calls with fused call in `doc.rs`**
-
-In `src/doc.rs` `parse_with_options` (lines 33-38), replace:
-
-```rust
-        crate::validate::validate_depth(buf, &indices, max_depth)?;
-
-        if opts.is_eager() {
-            crate::validate::validate_trailing(buf, &indices)?;
-            crate::validate::validate_eager_values(buf, &indices)?;
-        }
-```
-
-With:
-
-```rust
-        if opts.is_eager() {
-            crate::validate::validate_eager_fused(buf, &indices, max_depth)?;
-        } else {
-            crate::validate::validate_depth(buf, &indices, max_depth)?;
-        }
-```
-
-The `validate_depth` call stays for LAZY mode (which only checks bracket depth). The eager path now uses the fused validator.
-
-- [ ] **Step 2: Run full test suite**
-
-```bash
-cargo test --release
-```
-
-Expected: All tests pass (144 unit + all integration tests). Verify:
-- `doc::tests::parses_simple_object` — basic parse
-- `doc::tests::parse_with_lazy_skips_eager_validation` — lazy mode unchanged
-- `json_test_suite` — all Y/N/I files parse correctly
-- `ffi_smoke` — FFI tests if applicable
-
-- [ ] **Step 3: Also test scalar-only mode (no SIMD)**
-
-```bash
-cargo test --release --no-default-features
-```
-
-Expected: All tests pass. The scalar fallback paths are exercised.
-
-- [ ] **Step 4: Run clippy lint**
-
-```bash
-cargo clippy --release --all-targets -- -D warnings
-```
-
-Expected: No warnings. Fix any that appear.
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add src/doc.rs
-git commit -m "perf: wire validate_eager_fused into Document::parse_with_options
-
-Eager mode now calls the fused validator (depth+trailing+grammar in
-one pass). Lazy mode still uses standalone validate_depth only."
-```
-
----
-
-### Task 7: Cross-validation and edge case hardening
-
-**Files:**
-- Modify: `src/validate/classify.rs` (add any missing tests)
-- Modify: `src/validate/mod.rs` (fix any trailing detection edge cases)
-
-- [ ] **Step 1: Run the scanner crosscheck test**
-
-```bash
-cargo test --release --test scanner_crosscheck
-```
-
-Expected: Both `scalar_avx2_bit_identical` and `skip_neon` pass (these tests verify scanner output parity; they don't exercise the validator, but ensure we haven't broken anything).
-
-- [ ] **Step 2: Run third-party fixture tests**
-
-```bash
-cargo test --release --test third_party_fixtures
-```
-
-Expected: All 17 tests pass. This exercises real-world JSON from cJSON and simdjson test suites under the fused validator.
-
-- [ ] **Step 3: Run JSONTestSuite conformance tests**
-
-```bash
-cargo test --release --test json_test_suite
-```
-
-Expected: All 3 tests pass (`y_files_accepted_in_both_modes`, `n_files_rejected_in_eager_mode`, `document_i_files_behavior`).
-
-- [ ] **Step 4: Run full suite one final time**
-
-```bash
-cargo test --release
-cargo test --release --no-default-features
-```
-
-Expected: All tests pass in both configurations.
-
-- [ ] **Step 5: Commit (if any fixes)**
-
-```bash
-git add -A
-git commit -m "test: verify fused validator against full test suite"
-```
-
----
-
-### Task 8: Final integration — check CLAUDE.md update
-
-**Files:**
-- Modify: `CLAUDE.md` (if architecture section needs update)
-
-The CLAUDE.md describes the Phase 1 validation flow. Since the external behavior is unchanged (same error codes, same parse semantics), no docs update is strictly required. However, update the architecture section to reflect the fused pass.
-
-- [ ] **Step 1: Update CLAUDE.md architecture section**
-
-Find the paragraph starting with "Phase 1" and update the description of post-scan validation. The current text:
-
-```
-Then `validate_depth` is run unconditionally; in EAGER mode,
-`validate_trailing` and `validate_eager_values` (number ABNF + string
-content + UTF-8) follow.
-```
-
-Replace with:
-
-```
-Then in LAZY mode only `validate_depth` is run. In EAGER mode,
-`validate_eager_fused` runs — a single O(indices) pass that combines
-depth checking, trailing-content detection, and grammar/value
-validation (number ABNF + string content + UTF-8).
-```
-
-- [ ] **Step 2: Commit**
-
-```bash
-git add CLAUDE.md
-git commit -m "docs: update CLAUDE.md for fused eager validation"
-```
-
----
diff --git a/docs/superpowers/specs/2026-05-22-fuse-eager-simd-design.md b/docs/superpowers/specs/2026-05-22-fuse-eager-simd-design.md
deleted file mode 100644
index 4f941f6..0000000
--- a/docs/superpowers/specs/2026-05-22-fuse-eager-simd-design.md
+++ /dev/null
@@ -1,131 +0,0 @@
-# Fuse & Accelerate: Eager Decode SIMD Optimization
-
-Date: 2026-05-22
-Branch: `fuse-eager-passes`
-
-## Motivation
-
-The eager decode path (`Document::parse_with_options` in `src/doc.rs`) runs **4 independent passes** over the `indices` array after structural scanning:
-
-1. `validate_depth` — depth counting
-2. `validate_trailing` — reject trailing non-whitespace
-3. `validate_eager_values` — grammar state machine + string validation + number validation
-
-Each pass is a scalar O(indices) walk. Additionally, string validation SIMD (`strings/avx2.rs`) is conservative: it hands off to scalar on the *first* interesting byte found (backslash, control, or high-bit), leaving most of the SIMD register width unused on mixed content. Number validation has no SIMD path at all.
-
-Target: ASCII-dominant JSON payloads (REST APIs, config files), x86-64 with AVX2 + AVX-512 runtime dispatch, throughput-focused.
-
-## Architecture
-
-### Pass Fusion
-
-Merge three post-scan validation passes into one:
-
-```
-Before:
-  scan(buf) → indices
-  validate_depth(buf, indices, max_depth)
-  validate_trailing(buf, indices)
-  validate_eager_values(buf, indices)
-
-After:
-  scan(buf) → indices
-  validate_eager_fused(buf, indices, max_depth)
-```
-
-`validate_eager_fused` integrates depth checking and trailing-content detection into the existing grammar state machine:
-
-- **Depth**: increment on `{`/`[` push; if depth > max_depth → `QJSON_NESTING_TOO_DEEP`.
-- **Trailing**: after the grammar state reaches `TopDone`, any further non-whitespace byte → `QJSON_TRAILING_CONTENT`.
-
-The `CtxKind` enum and state-machine structure from `validate_eager_values` are preserved. The existing `validate_depth` and `validate_trailing` functions remain in the codebase but are no longer called in the eager hot path (they stay available for lazy mode or internal reuse).
-
-### PSHUFB Byte Classifier for String Validation
-
-Replace the current AVX2 "find-first-interesting-byte-then-scalar" approach with a **nibble-LUT byte classifier** using `_mm256_shuffle_epi8` (PSHUFB).
-
-**Classification bits** (one u8 per byte):
-
-| Bit | Meaning |
-|-----|---------|
-| 0   | Control char (0x00..0x1F) |
-| 1   | Backslash (0x5C) |
-| 2   | High-bit byte (0x80..0xFF) |
-| 3   | Printable ASCII (0x20..0x7E, excluding backslash) |
-
-**Algorithm per 32-byte chunk:**
-1. Split each byte into high-nibble and low-nibble via shift + mask.
-2. `_mm256_shuffle_epi8(lo_nibble, lo_lut)` and `_mm256_shuffle_epi8(hi_nibble, hi_lut)`.
-3. AND low and high LUT results → per-byte class bitmask.
-4. If any bit 0 set → `QJSON_INVALID_STRING` (control char).
-5. If bits 1 and 2 are zero → pure printable ASCII, advance 32 bytes.
-6. Otherwise: scan class bitmask for backslash positions, validate escape sequences; for high-bit bytes, run SIMD-enhanced UTF-8 validation.
-
-Key improvement: the classifier tells us **exactly which bytes need what kind of attention**, rather than a binary "there's a problem here". Multiple backslashes in one chunk are all located without re-scanning. High-bit bytes are identified by position, enabling batch UTF-8 validation.
-
-### AVX-512 Dual Path
-
-New file `src/validate/strings/avx512.rs`, dispatched at runtime via the existing `OnceCell` pattern in `strings/mod.rs`.
-
-| Feature | AVX2 | AVX-512 |
-|---------|------|---------|
-| Register width | 32B (ymm) | 64B (zmm) |
-| Movemask | `_mm256_movemask_epi8` → u32 | `_mm256_movepi8_mask` (AVX512BW/VL) → `__mmask32`, zero-cost |
-| Byte classifier | Two ymm PSHUFB per chunk | Two ymm PSHUFB per 32B half (AVX-512VBMI not required) |
-| Masking | Manual `u32` bitmask | Native `__mmask32` with `_mm256_maskz_*` operations |
-| Chunk throughput | 32B/iter | 64B/iter (loop processes two 32B halves) |
-
-**Dispatch priority**: AVX-512 (Ice Lake 2019+, Zen 4 2022+) → AVX2 (Haswell 2013+) → scalar fallback.
-
-**Not included**: AVX-512VBMI (`vpermb` for zmm-wide PSHUFB). This requires Cannon Lake/Ice Lake+ and the gain over loop-unrolled ymm PSHUFB is marginal for string validation.
-
-### SIMD-Accelerated Number Validation
-
-Extend the PSHUFB classifier with two additional bits:
-
-| Bit | Meaning |
-|-----|---------|
-| 4   | Digit (0x30..0x39) |
-| 5   | Number structural (0x2E `.`, 0x2D `-`, 0x65 `e`, 0x45 `E`, 0x2B `+`) |
-
-**Hot path** for numbers in `consume_scalar_gap`:
-1. Classify 32-byte chunk(s) of the number byte range.
-2. `illegal = !(digit | structural)` — if mask is non-zero, scalar fallback handles exact error location.
-3. Validate ABNF structure: leading zero check, digit-after-dot check, digit-after-exponent check — verified via popcount and bit-scan on the classification mask, falling back to the existing scalar `validate_number` for precise error codes when structure is violated.
-
-When a number is short (≤32 bytes, i.e. the vast majority of real-world numbers), it fits in one SIMD iteration. The existing scalar `validate_number` remains as fallback for correctness and precise error reporting.
-
-## Files Changed
-
-| File | Change |
-|------|--------|
-| `src/validate/mod.rs` | Add `validate_eager_fused()` merging depth + trailing + grammar. Keep existing functions. |
-| `src/validate/strings/avx2.rs` | Rewrite with PSHUFB nibble-LUT classifier. |
-| `src/validate/strings/avx512.rs` | **New.** AVX-512BW+VL 64B chunk path. |
-| `src/validate/strings/mod.rs` | Add AVX-512 to dispatch. |
-| `src/validate/number.rs` | Add `validate_number_simd()` with PSHUFB classifier. |
-| `src/doc.rs` | Replace 3 validate calls with single `validate_eager_fused`. |
-| `Cargo.toml` | Optionally add `avx512` feature gate (feature name only; dispatch uses runtime detection). |
-
-## Files NOT Changed
-
-- `src/scan/` — structural scanner unchanged.
-- `src/cursor.rs`, `src/path.rs` — Phase 2 unchanged.
-- `src/decode/` — lazy decode unchanged (still calls `validate_string_span` which now uses the new SIMD paths transparently).
-- `src/ffi.rs`, `lua/qjson.lua` — FFI surface unchanged.
-- `include/qjson.h` — public header unchanged.
-
-## Risks
-
-1. **Error-code precedence.** When fused pass encounters multiple errors simultaneously (e.g., depth violation AND invalid string), current behavior picks the first detected. The fused pass must preserve this.
-2. **AVX-512 dispatch stability.** Some VM/hypervisor configurations mask AVX-512 CPUID bits inconsistently. The existing `is_x86_feature_detected!()` pattern is proven safe for this.
-3. **PSHUFB LUT correctness.** The 16-entry nibble LUTs must be exhaustively verified against the existing scalar validator for all 256 byte values. This is done in unit tests.
-
-## Expected Performance Impact
-
-- **Pass fusion**: ~15-25% throughput improvement for small-to-medium payloads (eliminates 2 full indices traversals).
-- **PSHUFB string validation**: ~20-40% improvement for string-heavy payloads (no premature scalar fallback; CJK/escape content benefits most).
-- **AVX-512 string validation**: ~10-15% additional improvement over AVX2 (2× chunk width, native mask registers).
-- **SIMD number validation**: ~10-20% improvement for number-dense payloads (arrays of numbers, metrics responses).
-
-Combined estimate: **30-50%** throughput improvement on typical REST API payloads.

From 4634a3b6046c5a3189b567f80c700ef00c8a829a Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 17:41:38 +0000
Subject: [PATCH 09/17] docs: update README, benchmarks, CLAUDE.md for fused
 eager + PSHUFB

- README: update Status section with fused validation + PSHUFB description
- benchmarks.md: add eager validation micro-benchmark section (13-15% improvement), update observation #3
- CLAUDE.md: update Phase 1 architecture and Layout sections
---
 CLAUDE.md          | 11 ++++++++---
 README.md          |  2 +-
 docs/benchmarks.md | 18 ++++++++++++++++--
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 01885c7..2eca0cc 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -45,10 +45,14 @@ cargo test --features test-panic --release
 
 ### Two-phase parse
 
-**Phase 1** (`src/scan/`, called from `Document::parse_with_options`): a structural scanner walks the input once and writes the byte offset of every non-string-interior `{ } [ ] : , "` into `doc.indices`. Then `validate_depth` is run unconditionally; in EAGER mode, `validate_trailing` and `validate_eager_values` (number ABNF + string content + UTF-8) follow. In LAZY mode, value-level checks are skipped and rely on the lazy decode path at field-access time. A `u32::MAX` sentinel is appended. The scanner is selected at first use via `OnceCell` in `src/scan/mod.rs`:
+**Phase 1** (`src/scan/`, called from `Document::parse_with_options`): a structural scanner walks the input once and writes the byte offset of every non-string-interior `{ } [ ] : , "` into `doc.indices`. In LAZY mode, only `validate_depth` is run. In EAGER mode, `validate_eager_fused` runs — a single O(indices) pass that combines depth checking, trailing-content detection, and grammar/value validation (number ABNF + string content + UTF-8). String validation uses a PSHUFB nibble-LUT byte classifier (`src/validate/classify.rs`) for per-byte class bitmasks in ~3 SIMD ops per 32-byte chunk. A `u32::MAX` sentinel is appended. The scanner and string validator are selected at first use via `OnceCell`:
 
-- `Avx2Scanner` (gated by the `avx2` cargo feature, default-on) when both `avx2` and `pclmulqdq` are detected at runtime.
-- `ScalarScanner` otherwise.
+- **Scanner** (`src/scan/mod.rs`):
+  - `Avx2Scanner` (gated by the `avx2` cargo feature, default-on) when both `avx2` and `pclmulqdq` are detected at runtime.
+  - `ScalarScanner` otherwise.
+- **String validator** (`src/validate/strings/mod.rs`):
+  - AVX2 PSHUFB classifier when `avx2` is detected.
+  - Scalar state machine otherwise.
 
 Validation level depends on `qjson_options.mode`. **EAGER** (default): a post-scan pass walks `indices` and validates RFC 8259 number ABNF, string content (no unescaped control chars), and UTF-8 — parse fails on any value-level violation. **LAZY** (opt-in): bracket/quote balance + max-depth only; value-level errors surface when the offending field is accessed (lua-cjson-equivalent behavior). Trailing-content rejection and value-level validation are eager-only; max-depth (default 1024, configurable up to 4096) is enforced in both modes.
 
@@ -72,6 +76,7 @@ src/
   cursor.rs       Cursor + path resolution + skip-cache walk
   path.rs         zero-alloc path-string iterator
   decode/         lazy string / number decode
+  validate/       post-scan validators: validate_eager_fused, depth, strings, numbers
   scan/           ScalarScanner, Avx2Scanner, runtime dispatch
   skip_cache.rs   Phase 2 sibling-skip cache
   error.rs        qjson_err + qjson_type enums (must stay in sync with include/qjson.h and lua/qjson.lua)
diff --git a/README.md b/README.md
index 59d7738..536d21c 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the
 
 ## Status
 
-Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson.
+Scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Eager validation uses a fused single-pass grammar state machine with a PSHUFB nibble-LUT byte classifier for string validation. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson.
 
 ## Building
 
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index fe6f09f..ea3f530 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -146,8 +146,9 @@ key into the Lua table heap.
    multimodal bodies.** The benchmark touches the top-level request fields and
    one `content` field per message; the payload size comes from image data
    inside each message.
-3. **Speedup remains high at 10 MB.** The eager-decode optimization
-   keeps `qjson.parse` throughput scaling well even at the 10 MB level,
+3. **Speedup remains high at 10 MB.** The eager decode deduplication
+   (skip re-validation when eagerly validated) and fused eager validation
+   passes keep `qjson.parse` throughput scaling well even at the 10 MB level,
    maintaining ~38× over cjson and ~5× over simdjson.
 4. **`qjson.decode + qjson.encode (unmodified)` is the headline number for
    passthrough workloads** — e.g. an LLM gateway re-emitting the original
@@ -164,6 +165,19 @@ key into the Lua table heap.
    savings remain dramatic because `cjson` must materialize every nested
    object and string into the Lua heap.
 
+## Eager validation micro-benchmark (Rust)
+
+The eager validation path was optimized by fusing three separate post-scan
+passes (`validate_depth`, `validate_trailing`, `validate_eager_values`) into a
+single `validate_eager_fused` traversal, and replacing the AVX2 string validator
+with a PSHUFB nibble-LUT byte classifier. On 1 MB payloads (10-run avg, AMD
+EPYC Rome Zen 2):
+
+| Payload | Before | After | Improvement |
+|---------|--------|-------|-------------|
+| GitHub-style REST API (pure ASCII) | 1,688 ± 97 µs | 1,462 ± 39 µs | **13.4%** |
+| Escape-heavy (`\n` `\t` `\\` `\uXXXX`) | 912 ± 77 µs | 776 ± 30 µs | **14.9%** |
+
 ## When to pick which
 
 - **Read most/all fields** → `cjson`.

From 4d52f6b8eb8351a3cae1df310ad5517377fe69a5 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 17:43:19 +0000
Subject: [PATCH 10/17] docs: add Rust micro-benchmark reference to README

Note 13-15% eager validation improvement from fused pass + PSHUFB
classifier. Clarify that Lua bench numbers already include this.
---
 README.md          | 5 +++++
 docs/benchmarks.md | 5 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 536d21c..b401b1d 100644
--- a/README.md
+++ b/README.md
@@ -117,6 +117,11 @@ with similar throughput. Memory retention for `qjson` is essentially
 flat in payload size (a few KB for the reusable buffers), while `cjson`
 and `simdjson` retain more Lua heap because they materialize the table tree.
 
+The eager validation path (fused single-pass grammar + PSHUFB string
+classifier) cuts run time by **13–15%** on 1 MB payloads when measured
+at the Rust level. See [`docs/benchmarks.md`](docs/benchmarks.md)
+for the micro-benchmark data and the full size ladder.
+
 See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
 memory numbers, an "encode round-trip" row (passthrough emit via
 `memcpy`), exact environment, and the reproduction command. `make bench`
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index ea3f530..5054581 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -170,8 +170,9 @@ key into the Lua table heap.
 The eager validation path was optimized by fusing three separate post-scan
 passes (`validate_depth`, `validate_trailing`, `validate_eager_values`) into a
 single `validate_eager_fused` traversal, and replacing the AVX2 string validator
-with a PSHUFB nibble-LUT byte classifier. On 1 MB payloads (10-run avg, AMD
-EPYC Rome Zen 2):
+with a PSHUFB nibble-LUT byte classifier. The Lua bench numbers above already
+include this improvement. On 1 MB payloads measured at the Rust level (10-run
+avg, AMD EPYC Rome Zen 2):
 
 | Payload | Before | After | Improvement |
 |---------|--------|-------|-------------|

From 38a6be3898cc43e25fae689c2e85e568ba7d8b9d Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 17:46:51 +0000
Subject: [PATCH 11/17] docs: update benchmark data from current make bench run

Update README and benchmarks.md tables with actual throughput,
speedup, and memory delta numbers from make bench on current
branch (includes fused validation + decode deduplication).
---
 README.md          | 10 +++---
 docs/benchmarks.md | 76 +++++++++++++++++++++++-----------------------
 2 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index b401b1d..d83c6d7 100644
--- a/README.md
+++ b/README.md
@@ -105,11 +105,11 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload):
 
 | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson |
 |---:|---:|---:|---:|---:|---:|
-|   2 KB |  94,075 | 108,108 | 127,214 | 120,398 |  1.4× /  1.3× |
-|  60 KB |   9,041 |  83,043 | 123,487 | 214,500 | 13.7× / 23.7× |
-| 100 KB |   5,302 |  32,248 | 109,649 | 102,564 | 20.7× / 19.3× |
-|   1 MB |     517 |   3,538 |  16,520 |  16,988 | 32.0× / 32.9× |
-|  10 MB |      50 |     402 |   1,899 |   1,918 | 38.0× / 38.4× |
+|   2 KB |  90,851 | 108,762 | 127,966 | 142,361 |  1.4× /  1.6× |
+|  60 KB |   8,941 |  81,050 | 117,151 | 203,252 | 13.1× / 22.7× |
+| 100 KB |   5,346 |  44,366 | 122,249 | 130,208 | 22.9× / 24.4× |
+|   1 MB |     513 |   3,610 |  14,124 |  15,167 | 27.5× / 29.6× |
+|  10 MB |      50 |     389 |   1,576 |   1,599 | 31.5× / 32.0× |
 
 `qjson.parse` wins because it skips building a Lua table for the parts you
 never read; `qjson.decode + t.field` adds a cjson-shaped table proxy on top
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 5054581..f748120 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -80,33 +80,33 @@ Numbers below come from one such run.
 Each row is "parse + access request fields" on the named payload.
 
 | Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
-|---|---:|---:|---:|---:|---:|---:|
-| small      |   2.1 KB |  94,075 | 108,108 | 127,214 | 120,398 | 203,666 |
-| medium     |  60.4 KB |   9,041 |  83,043 | 123,487 | 214,500 | 214,408 |
-| github-100k |   100 KB |   2,238 |   2,047 |   6,010 |   5,994 |   6,701 |
-| 100k       |   100 KB |   5,302 |  32,248 | 109,649 | 102,564 | 114,548 |
-| 200k       |   200 KB |   2,659 |  19,040 |  90,090 |  92,251 | 106,383 |
-| 500k       |   500 KB |   1,052 |   7,062 |  34,722 |  35,336 |  37,453 |
-| 1m         |  1.00 MB |     517 |   3,538 |  16,520 |  16,988 |  17,261 |
-| 2m         |  2.00 MB |     258 |   2,026 |   9,021 |   8,580 |   9,033 |
-| 5m         |  5.00 MB |     102 |     663 |   2,982 |   3,728 |   3,829 |
-| 10m        | 10.00 MB |      50 |     402 |   1,899 |   1,918 |   1,925 |
-| interleaved (100k/200k/500k/1m, cycled) | — | 1,141 | 9,544 | 34,043 | 33,611 | 32,752 |
+|---|---|---:|---:|---:|---:|---:|---:|
+| small      |   2.1 KB |  90,851 | 108,762 | 127,966 | 142,361 | 224,427 |
+| medium     |  60.4 KB |   8,941 |  81,050 | 117,151 | 203,252 | 197,707 |
+| github-100k |   100 KB |   2,284 |   2,090 |   6,272 |   6,305 |   6,931 |
+| 100k       |   100 KB |   5,346 |  44,366 | 122,249 | 130,208 | 146,628 |
+| 200k       |   200 KB |   2,677 |  20,425 |  71,839 |  44,444 |  62,422 |
+| 500k       |   500 KB |   1,064 |   7,307 |  29,070 |  29,028 |  34,364 |
+| 1m         |  1.00 MB |     513 |   3,610 |  14,124 |  15,167 |  14,940 |
+| 2m         |  2.00 MB |     257 |   2,018 |   7,686 |   7,761 |   7,981 |
+| 5m         |  5.00 MB |     103 |     790 |   2,628 |   3,337 |   3,367 |
+| 10m        | 10.00 MB |      50 |     389 |   1,576 |   1,599 |   1,642 |
+| interleaved (100k/200k/500k/1m, cycled) | — | 1,149 | 9,208 | 28,627 | 27,624 | 25,275 |
 
 ### Speed-up vs. baselines
 
 | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson |
-|---|---:|---:|---:|---:|
-| small  |  1.4× |  1.2× |  1.3× |  1.1× |
-| medium | 13.7× |  1.5× | 23.7× |  2.6× |
-| github-100k | 2.7× |  2.9× | 2.7× |  2.9× |
-| 100k   | 20.7× |  3.4× | 19.3× |  3.2× |
-| 200k   | 33.9× |  4.7× | 34.7× |  4.8× |
-| 500k   | 33.0× |  4.9× | 33.6× |  5.0× |
-| 1m     | 32.0× |  4.7× | 32.9× |  4.8× |
-| 2m     | 35.0× |  4.5× | 33.3× |  4.2× |
-| 5m     | 29.2× |  4.5× | 36.5× |  5.6× |
-| 10m    | 38.0× |  4.7× | 38.4× |  4.8× |
+|---|---|---:|---:|---:|---:|
+| small  |  1.4× |  1.2× |  1.6× |  1.3× |
+| medium | 13.1× |  1.4× | 22.7× |  2.5× |
+| github-100k | 2.7× |  3.0× | 2.8× |  3.0× |
+| 100k   | 22.9× |  2.8× | 24.4× |  2.9× |
+| 200k   | 26.8× |  3.5× | 16.6× |  2.2× |
+| 500k   | 27.3× |  4.0× | 27.3× |  4.0× |
+| 1m     | 27.5× |  3.9× | 29.6× |  4.2× |
+| 2m     | 29.9× |  3.8× | 30.2× |  3.8× |
+| 5m     | 25.5× |  3.3× | 32.4× |  4.2× |
+| 10m    | 31.5× |  4.1× | 32.0× |  4.1× |
 
 ## Results — memory delta (KB retained after 5 rounds)
 
@@ -115,18 +115,18 @@ the timing rounds without forcing a final collection, so short-lived garbage
 from the last round may still be included.
 
 | Scenario | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
-|---|---:|---:|---:|---:|---:|
-| small      | +15,493 | +15,500 | +4,066 | +15,116 | +11,140 |
-| medium     |  +1,955 |  +2,660 |   +333 |  +1,114 |  +1,120 |
-| github-100k | +12,018 | +3,527 |    +14 |    +536 |    +230 |
-| 100k       |    +485 |   +748 |    +67 |    +692 |    +229 |
-| 200k       |    +392 |   +523 |    +34 |    +346 |    +112 |
-| 500k       |    +577 |   +630 |    +14 |    +139 |     +45 |
-| 1m         |  +1,082 | +1,121 |    +10 |    +104 |     +34 |
-| 2m         |  +1,155 | +1,248 |    +14 |    +208 |     +45 |
-| 5m         |  +1,316 | +1,538 |    +14 |    +400 |     +45 |
-| 10m        |  +1,583 | +2,014 |    +14 |    +708 |     +45 |
-| interleaved | +3,356 | +4,404 |   +268 |  +2,771 |    +897 |
+|---|---|---:|---:|---:|---:|---:|
+| small      | +15,493 | +15,497 | +4,069 | +15,223 | +11,140 |
+| medium     |  +1,954 |  +2,660 |   +334 |    +537 |  +1,120 |
+| github-100k | +11,911 | +4,124 |    +14 |    +566 |    +230 |
+| 100k       |    +484 |    +748 |    +67 |    +710 |    +229 |
+| 200k       |    +392 |    +523 |    +34 |    +346 |    +117 |
+| 500k       |    +577 |    +630 |    +14 |    +139 |     +45 |
+| 1m         |  +1,082 |  +1,121 |    +10 |    +104 |     +34 |
+| 2m         |  +1,155 |  +1,248 |    +14 |    +208 |     +45 |
+| 5m         |  +1,316 |  +1,538 |    +14 |    +400 |     +45 |
+| 10m        |  +1,583 |  +2,014 |    +14 |    +722 |     +45 |
+| interleaved | +3,356 | +4,405 |   +268 |  +2,778 |    +898 |
 
 `qjson.parse` retention is essentially constant across payload size: the only
 GC-rooted state is the reusable `indices: Vec<u32>` and `scratch` buffers.
@@ -139,8 +139,8 @@ key into the Lua table heap.
 
 1. **`qjson` is fastest once payloads move beyond tiny inputs.**
    The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and
-   larger multimodal payloads show roughly 14–38× higher throughput than
-   `cjson` and roughly 3–5× higher throughput than `lua-resty-simdjson`
+   larger multimodal payloads show roughly 13–31× higher throughput than
+   `cjson` and roughly 2–4× higher throughput than `lua-resty-simdjson`
    for request-field access.
 2. **Reading every `messages[*].content` is still access-light for large
    multimodal bodies.** The benchmark touches the top-level request fields and
@@ -149,7 +149,7 @@ key into the Lua table heap.
 3. **Speedup remains high at 10 MB.** The eager decode deduplication
    (skip re-validation when eagerly validated) and fused eager validation
    passes keep `qjson.parse` throughput scaling well even at the 10 MB level,
-   maintaining ~38× over cjson and ~5× over simdjson.
+   maintaining ~32× over cjson and ~4× over simdjson.
 4. **`qjson.decode + qjson.encode (unmodified)` is the headline number for
    passthrough workloads** — e.g. an LLM gateway re-emitting the original
    JSON after light-touch inspection. The substring fast path means

From 24ff77dc5ea0b5f87c88ad05c0128a765eb6d4b6 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 17:54:51 +0000
Subject: [PATCH 12/17] docs: refresh bench data with 10-round mean values

- Increase ROUNDS from 5 to 10 for noise reduction
- Switch from median to mean ops/s across rounds
- Update all 3 tables (throughput, speedup, memory) with fresh
  make bench data on current branch
---
 README.md             | 14 ++++----
 benches/lua_bench.lua |  2 +-
 docs/benchmarks.md    | 74 +++++++++++++++++++++----------------------
 3 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index d83c6d7..0d03af6 100644
--- a/README.md
+++ b/README.md
@@ -100,16 +100,16 @@ LD_LIBRARY_PATH="$PWD/target/release" \
 
 `qjson` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal
 chat-completion payloads, "parse + access model, temperature, and all
-messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1,
-AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload):
+messages[*].content paths" workload (mean ops/s under OpenResty LuaJIT 2.1,
+AMD EPYC Rome (Zen 2, 4 vCPUs); 10 rounds, deterministic payload):
 
 | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson |
 |---:|---:|---:|---:|---:|---:|
-|   2 KB |  90,851 | 108,762 | 127,966 | 142,361 |  1.4× /  1.6× |
-|  60 KB |   8,941 |  81,050 | 117,151 | 203,252 | 13.1× / 22.7× |
-| 100 KB |   5,346 |  44,366 | 122,249 | 130,208 | 22.9× / 24.4× |
-|   1 MB |     513 |   3,610 |  14,124 |  15,167 | 27.5× / 29.6× |
-|  10 MB |      50 |     389 |   1,576 |   1,599 | 31.5× / 32.0× |
+|   2 KB | 100,127 | 109,588 | 130,867 | 105,038 |  1.3× /  1.0× |
+|  60 KB |   8,701 |  77,936 | 135,700 | 177,650 | 15.6× / 20.4× |
+| 100 KB |   4,985 |  32,232 | 130,621 | 125,348 | 26.2× / 25.1× |
+|   1 MB |     498 |   3,697 |  15,831 |  15,784 | 31.8× / 31.7× |
+|  10 MB |      50 |     383 |   1,473 |   1,548 | 29.5× / 31.0× |
 
 `qjson.parse` wins because it skips building a Lua table for the parts you
 never read; `qjson.decode + t.field` adds a cjson-shaped table proxy on top
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 30a3977..96bdd72 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -140,7 +140,7 @@ local function make_payload(target_bytes)
         .. table.concat(messages, ",") .. ']}'
 end
 
-local ROUNDS = 5
+local ROUNDS = 10
 
 local function bench(name, iters, fn)
     -- Warmup pass: lets JIT compile hot traces and any one-time pools fill
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index f748120..89dddfc 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -30,8 +30,8 @@ The harness lives at `benches/lua_bench.lua`. For each scenario:
    traces and the `qjson` `indices` / `scratch` buffers grow to their
    working size. Warmup is excluded from timing and the memory delta.
 2. `collectgarbage("collect")` baseline.
-3. 5 rounds × N iterations of the workload; report the **median** ops/s
-   across rounds (mean + range also reported in the raw output).
+3. 10 rounds × N iterations of the workload (warmup excluded); report the
+   **mean** ops/s across rounds (median + range also shown in output).
 4. Final `collectgarbage("count")` to capture the post-run memory delta in
    KB. The harness does not force a final collection after timing, so
    short-lived garbage from the last round may still be included.
@@ -81,32 +81,32 @@ Each row is "parse + access request fields" on the named payload.
 
 | Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
 |---|---|---:|---:|---:|---:|---:|---:|
-| small      |   2.1 KB |  90,851 | 108,762 | 127,966 | 142,361 | 224,427 |
-| medium     |  60.4 KB |   8,941 |  81,050 | 117,151 | 203,252 | 197,707 |
-| github-100k |   100 KB |   2,284 |   2,090 |   6,272 |   6,305 |   6,931 |
-| 100k       |   100 KB |   5,346 |  44,366 | 122,249 | 130,208 | 146,628 |
-| 200k       |   200 KB |   2,677 |  20,425 |  71,839 |  44,444 |  62,422 |
-| 500k       |   500 KB |   1,064 |   7,307 |  29,070 |  29,028 |  34,364 |
-| 1m         |  1.00 MB |     513 |   3,610 |  14,124 |  15,167 |  14,940 |
-| 2m         |  2.00 MB |     257 |   2,018 |   7,686 |   7,761 |   7,981 |
-| 5m         |  5.00 MB |     103 |     790 |   2,628 |   3,337 |   3,367 |
-| 10m        | 10.00 MB |      50 |     389 |   1,576 |   1,599 |   1,642 |
-| interleaved (100k/200k/500k/1m, cycled) | — | 1,149 | 9,208 | 28,627 | 27,624 | 25,275 |
+| small      |   2.1 KB | 100,127 | 109,588 | 130,867 | 105,038 | 210,886 |
+| medium     |  60.4 KB |   8,701 |  77,936 | 135,700 | 177,650 | 164,142 |
+| github-100k |   100 KB |   2,106 |   2,247 |   5,964 |   5,900 |   6,321 |
+| 100k       |   100 KB |   4,985 |  32,232 | 130,621 | 125,348 | 145,613 |
+| 200k       |   200 KB |   2,504 |  18,630 |  71,441 |  47,214 |  47,481 |
+| 500k       |   500 KB |   1,013 |   8,005 |  34,562 |  33,646 |  34,683 |
+| 1m         |  1.00 MB |     498 |   3,697 |  15,831 |  15,784 |  16,277 |
+| 2m         |  2.00 MB |     248 |   1,860 |   6,723 |   7,722 |   8,003 |
+| 5m         |  5.00 MB |     100 |     643 |   3,141 |   3,153 |   3,171 |
+| 10m        | 10.00 MB |      50 |     383 |   1,473 |   1,548 |   1,551 |
+| interleaved (100k/200k/500k/1m, cycled) | — | 1,136 | 9,088 | 28,963 | 30,565 | 31,006 |
 
 ### Speed-up vs. baselines
 
 | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson |
 |---|---|---:|---:|---:|---:|
-| small  |  1.4× |  1.2× |  1.6× |  1.3× |
-| medium | 13.1× |  1.4× | 22.7× |  2.5× |
-| github-100k | 2.7× |  3.0× | 2.8× |  3.0× |
-| 100k   | 22.9× |  2.8× | 24.4× |  2.9× |
-| 200k   | 26.8× |  3.5× | 16.6× |  2.2× |
-| 500k   | 27.3× |  4.0× | 27.3× |  4.0× |
-| 1m     | 27.5× |  3.9× | 29.6× |  4.2× |
-| 2m     | 29.9× |  3.8× | 30.2× |  3.8× |
-| 5m     | 25.5× |  3.3× | 32.4× |  4.2× |
-| 10m    | 31.5× |  4.1× | 32.0× |  4.1× |
+| small  |  1.3× |  1.2× |  1.0× |  1.0× |
+| medium | 15.6× |  1.7× | 20.4× |  2.3× |
+| github-100k | 2.8× |  2.7× | 2.8× |  2.6× |
+| 100k   | 26.2× |  4.1× | 25.1× |  3.9× |
+| 200k   | 28.5× |  3.8× | 18.9× |  2.5× |
+| 500k   | 34.1× |  4.3× | 33.2× |  4.2× |
+| 1m     | 31.8× |  4.3× | 31.7× |  4.3× |
+| 2m     | 27.1× |  3.6× | 31.1× |  4.2× |
+| 5m     | 31.4× |  4.9× | 31.5× |  4.9× |
+| 10m    | 29.5× |  3.8× | 31.0× |  4.0× |
 
 ## Results — memory delta (KB retained after 5 rounds)
 
@@ -116,17 +116,17 @@ from the last round may still be included.
 
 | Scenario | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
 |---|---|---:|---:|---:|---:|---:|
-| small      | +15,493 | +15,497 | +4,069 | +15,223 | +11,140 |
-| medium     |  +1,954 |  +2,660 |   +334 |    +537 |  +1,120 |
-| github-100k | +11,911 | +4,124 |    +14 |    +566 |    +230 |
-| 100k       |    +484 |    +748 |    +67 |    +710 |    +229 |
-| 200k       |    +392 |    +523 |    +34 |    +346 |    +117 |
-| 500k       |    +577 |    +630 |    +14 |    +139 |     +45 |
-| 1m         |  +1,082 |  +1,121 |    +10 |    +104 |     +34 |
-| 2m         |  +1,155 |  +1,248 |    +14 |    +208 |     +45 |
-| 5m         |  +1,316 |  +1,538 |    +14 |    +400 |     +45 |
-| 10m        |  +1,583 |  +2,014 |    +14 |    +722 |     +45 |
-| interleaved | +3,356 | +4,405 |   +268 |  +2,778 |    +898 |
+| small      | -2,359 |  +8,055 |  +8,159 |  +8,643 |  +2,701 |
+| medium     |  +3,850 |  +5,259 |    +124 |  +2,228 |  +2,234 |
+| github-100k | +19,936 | +15,164 |     +32 |  +1,072 |    +452 |
+| 100k       |    +867 |  +1,393 |    +138 |  +1,384 |    +452 |
+| 200k       |    +583 |    +845 |     +67 |    +692 |    +223 |
+| 500k       |    +654 |    +759 |     +27 |    +277 |     +89 |
+| 1m         |  +1,139 |  +1,218 |     +20 |    +208 |     +67 |
+| 2m         |  +1,284 |  +1,472 |     +28 |    +409 |     +89 |
+| 5m         |  +1,607 |  +2,050 |     +27 |    +792 |     +89 |
+| 10m        |  +2,142 |  +3,004 |     +27 |  +1,416 |     +89 |
+| interleaved | +4,888 | +6,983 |    +533 |  +5,533 |  +1,788 |
 
 `qjson.parse` retention is essentially constant across payload size: the only
 GC-rooted state is the reusable `indices: Vec<u32>` and `scratch` buffers.
@@ -139,8 +139,8 @@ key into the Lua table heap.
 
 1. **`qjson` is fastest once payloads move beyond tiny inputs.**
    The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and
-   larger multimodal payloads show roughly 13–31× higher throughput than
-   `cjson` and roughly 2–4× higher throughput than `lua-resty-simdjson`
+   larger multimodal payloads show roughly 16–34× higher throughput than
+   `cjson` and roughly 2–5× higher throughput than `lua-resty-simdjson`
    for request-field access.
 2. **Reading every `messages[*].content` is still access-light for large
    multimodal bodies.** The benchmark touches the top-level request fields and
@@ -149,7 +149,7 @@ key into the Lua table heap.
 3. **Speedup remains high at 10 MB.** The eager decode deduplication
    (skip re-validation when eagerly validated) and fused eager validation
    passes keep `qjson.parse` throughput scaling well even at the 10 MB level,
-   maintaining ~32× over cjson and ~4× over simdjson.
+   maintaining ~30× over cjson and ~4× over simdjson.
 4. **`qjson.decode + qjson.encode (unmodified)` is the headline number for
    passthrough workloads** — e.g. an LLM gateway re-emitting the original
    JSON after light-touch inspection. The substring fast path means

From 9f5f0f1a604018d863a953444181c2ce63d064d8 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 18:04:23 +0000
Subject: [PATCH 13/17] bench: add cjk-100k scenario with CJK+emoji content
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

~100 KB array of objects with Chinese text, emoji, and mixed
ASCII/CJK field names. Stresses PSHUFB byte classifier (high-bit bytes)
and UTF-8 validation path.
Pre: 4,720 ops/s → Post: 5,064 ops/s (+7.3%)
---
 benches/lua_bench.lua | 50 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 96bdd72..d0c2e6e 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -220,6 +220,52 @@ local function default_table_access(t)
     end
 end
 
+-- CJK+emoji payload: array of objects with Chinese text, emoji, and
+-- mixed ASCII/CJK field names. Each object has ~1 KB of string content,
+-- roughly 50% multi-byte UTF-8. Stresses the PSHUFB byte classifier
+-- (high-bit bytes) and UTF-8 validation path.
+local function make_cjk_payload(target_bytes)
+    local items = {}
+    local current = 2  -- "["
+    local n = 1
+    local cjk_text = "这是一段中文测试文本包含各种常用汉字以及标点符号用于模拟真实的中文API返回数据"
+        .. "😀🎉💡✨🚀🌟🔥🎊💯👍❤️🌍📱🎵🏆🍕🎮📚💻🔑🎁"
+    -- Build a pool of short tags from the text
+    local tag_chars = { "标签1", "标签2", "中文", "测试", "数据", "API", "返回", "响应" }
+    while current < target_bytes do
+        local name = string.format("用户%d", n)
+        local bio = "简介：" .. cjk_text
+        local tag = tag_chars[(n - 1) % #tag_chars + 1]
+        local item = string.format(
+            [[{"id":%d,"name":"%s","bio":"%s","tags":["%s","中文","emoji"],"score":%d}]],
+            n, name, bio, tag, n * 13 % 100)
+        if current + #item + 3 > target_bytes then break end
+        if n > 1 then items[#items + 1] = "," end
+        items[#items + 1] = item
+        current = current + #item + 1
+        n = n + 1
+    end
+    return "[" .. table.concat(items) .. "]"
+end
+
+local function cjk_qjson_access(d)
+    local _ = d:get_str("[0].name")
+    local _ = d:get_str("[0].bio")
+    local _ = d:get_str("[0].tags[0]")
+end
+
+local function cjk_table_access(t)
+    local _ = t[1] and t[1].name
+    local _ = t[1] and t[1].bio
+    local _ = t[1] and t[1].tags and t[1].tags[1]
+end
+
+local function cjk_cjson_access(obj)
+    local _ = obj[1] and obj[1].name
+    local _ = obj[1] and obj[1].bio
+    local _ = obj[1] and obj[1].tags and obj[1].tags[1]
+end
+
 -- GitHub issues accessors: array of issues, access first issue's fields
 local function github_cjson_access(obj)
     local _ = obj[1] and obj[1].id
@@ -242,8 +288,10 @@ end
 local scenarios = {
     {name = "small",  iters = 5000, payload = read_file("benches/fixtures/small_api.json")},
     {name = "medium", iters = 500,  payload = read_file("benches/fixtures/medium_resp.json")},
-    {name = "github-100k", iters = 100, payload = make_github_issues_payload(100 * 1024),
+    {name = "github-100k",  iters = 100, payload = make_github_issues_payload(100 * 1024),
      cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access},
+    {name = "cjk-100k",     iters = 100, payload = make_cjk_payload(100 * 1024),
+     cjson_access = cjk_cjson_access, qjson_access = cjk_qjson_access, table_access = cjk_table_access},
     {name = "100k",   iters = 100,  payload = make_payload(100 * 1024)},
     {name = "200k",   iters = 50,   payload = make_payload(200 * 1024)},
     {name = "500k",   iters = 20,   payload = make_payload(500 * 1024)},

From 96c13c952bda7f23ca8aad148448af217709bdec Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 18:06:08 +0000
Subject: [PATCH 14/17] docs: add cjk-100k CJK+emoji benchmark row to tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

100 KB array of objects with Chinese text and emoji. Stresses PSHUFB
UTF-8/high-bit classification. Pre-opt: 4,720 ops/s → Post-opt: 4,605
ops/s. qjson memory delta: +26 KB (cjson: +17 MB).
---
 README.md          | 1 +
 docs/benchmarks.md | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 0d03af6..6e8cd4b 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,7 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 10 rounds, deterministic payload):
 |---:|---:|---:|---:|---:|---:|
 |   2 KB | 100,127 | 109,588 | 130,867 | 105,038 |  1.3× /  1.0× |
 |  60 KB |   8,701 |  77,936 | 135,700 | 177,650 | 15.6× / 20.4× |
+|  100 KB (CJK) |   1,985 |   2,301 |   4,605 |   4,518 |  2.3× /  2.3× |
 | 100 KB |   4,985 |  32,232 | 130,621 | 125,348 | 26.2× / 25.1× |
 |   1 MB |     498 |   3,697 |  15,831 |  15,784 | 31.8× / 31.7× |
 |  10 MB |      50 |     383 |   1,473 |   1,548 | 29.5× / 31.0× |
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 89dddfc..d6241c3 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -84,6 +84,7 @@ Each row is "parse + access request fields" on the named payload.
 | small      |   2.1 KB | 100,127 | 109,588 | 130,867 | 105,038 | 210,886 |
 | medium     |  60.4 KB |   8,701 |  77,936 | 135,700 | 177,650 | 164,142 |
 | github-100k |   100 KB |   2,106 |   2,247 |   5,964 |   5,900 |   6,321 |
+| cjk-100k  |   100 KB |   1,985 |   2,301 |   4,605 |   4,518 |   5,300 |
 | 100k       |   100 KB |   4,985 |  32,232 | 130,621 | 125,348 | 145,613 |
 | 200k       |   200 KB |   2,504 |  18,630 |  71,441 |  47,214 |  47,481 |
 | 500k       |   500 KB |   1,013 |   8,005 |  34,562 |  33,646 |  34,683 |
@@ -100,6 +101,7 @@ Each row is "parse + access request fields" on the named payload.
 | small  |  1.3× |  1.2× |  1.0× |  1.0× |
 | medium | 15.6× |  1.7× | 20.4× |  2.3× |
 | github-100k | 2.8× |  2.7× | 2.8× |  2.6× |
+| cjk-100k  | 2.3× |  2.0× | 2.3× |  2.0× |
 | 100k   | 26.2× |  4.1× | 25.1× |  3.9× |
 | 200k   | 28.5× |  3.8× | 18.9× |  2.5× |
 | 500k   | 34.1× |  4.3× | 33.2× |  4.2× |
@@ -119,6 +121,7 @@ from the last round may still be included.
 | small      | -2,359 |  +8,055 |  +8,159 |  +8,643 |  +2,701 |
 | medium     |  +3,850 |  +5,259 |    +124 |  +2,228 |  +2,234 |
 | github-100k | +19,936 | +15,164 |     +32 |  +1,072 |    +452 |
+| cjk-100k  | +17,405 |  +3,284 |     +26 |  +1,083 |    +446 |
 | 100k       |    +867 |  +1,393 |    +138 |  +1,384 |    +452 |
 | 200k       |    +583 |    +845 |     +67 |    +692 |    +223 |
 | 500k       |    +654 |    +759 |     +27 |    +277 |     +89 |

From 4b74b6c06df5493ca5232711f6e1334820e64a18 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 18:23:58 +0000
Subject: [PATCH 15/17] bench: fix cjk-100k UTF-8 truncation, use whole-body
 repeats

- Replace safe_sub-based truncation with integer multiples of cjk_body
  to avoid splitting multi-byte sequences
- Skip simdjson for cjk scenario (no_simdjson flag)
- Add safe_sub utility (unused now, kept for potential future use)
- Update cjk-100k data: qjson.parse mean 5,018 ops/s (2.3x vs cjson)
---
 README.md             |   2 +-
 benches/lua_bench.lua | 116 +++++++++++++++++++++++++++++++-----------
 docs/benchmarks.md    |   6 +--
 3 files changed, 89 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index 6e8cd4b..d028fa7 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 10 rounds, deterministic payload):
 |---:|---:|---:|---:|---:|---:|
 |   2 KB | 100,127 | 109,588 | 130,867 | 105,038 |  1.3× /  1.0× |
 |  60 KB |   8,701 |  77,936 | 135,700 | 177,650 | 15.6× / 20.4× |
-|  100 KB (CJK) |   1,985 |   2,301 |   4,605 |   4,518 |  2.3× /  2.3× |
+|  100 KB (CJK) |   2,220 |      — |   5,018 |   5,390 |  2.3× /  2.4× |
 | 100 KB |   4,985 |  32,232 | 130,621 | 125,348 | 26.2× / 25.1× |
 |   1 MB |     498 |   3,697 |  15,831 |  15,784 | 31.8× / 31.7× |
 |  10 MB |      50 |     383 |   1,473 |   1,548 | 29.5× / 31.0× |
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index d0c2e6e..b7cb05f 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -220,50 +220,103 @@ local function default_table_access(t)
     end
 end
 
--- CJK+emoji payload: array of objects with Chinese text, emoji, and
--- mixed ASCII/CJK field names. Each object has ~1 KB of string content,
--- roughly 50% multi-byte UTF-8. Stresses the PSHUFB byte classifier
--- (high-bit bytes) and UTF-8 validation path.
+-- Safe UTF-8 truncation: backs up past incomplete multi-byte sequences.
+local function safe_sub(s, len)
+    if #s <= len then return s end
+    local pos = len
+    while pos > 0 and s:byte(pos) >= 0x80 and s:byte(pos) < 0xC0 do pos = pos - 1 end
+    if pos > 0 then
+        local lead = s:byte(pos)
+        local need = 0
+        if lead >= 0xF0 then need = 3
+        elseif lead >= 0xE0 then need = 2
+        elseif lead >= 0xC2 then need = 1
+        end
+        if len - pos < need then pos = pos - 1 end
+        while pos > 0 and s:byte(pos) >= 0x80 and s:byte(pos) < 0xC0 do pos = pos - 1 end
+    end
+    return s:sub(1, pos)
+end
+
+-- CJK GitHub-issues payload: same 20-field structure as github-100k but
+-- with Chinese text and emoji in body/title/labels. Directly comparable
+-- to github-100k — isolates the UTF-8 / high-bit byte impact.
 local function make_cjk_payload(target_bytes)
-    local items = {}
-    local current = 2  -- "["
+    local issues = {}
+    local current = 2
     local n = 1
-    local cjk_text = "这是一段中文测试文本包含各种常用汉字以及标点符号用于模拟真实的中文API返回数据"
+    local cjk_body = "这是一段用于模拟GitHub Issues中文描述的测试文本包含常见的开发术语问题报告功能请求以及Bug修复记录"
         .. "😀🎉💡✨🚀🌟🔥🎊💯👍❤️🌍📱🎵🏆🍕🎮📚💻🔑🎁"
-    -- Build a pool of short tags from the text
-    local tag_chars = { "标签1", "标签2", "中文", "测试", "数据", "API", "返回", "响应" }
+    local cjk_title = "修复用户登录页面在移动端的显示问题并优化响应式布局"
     while current < target_bytes do
-        local name = string.format("用户%d", n)
-        local bio = "简介：" .. cjk_text
-        local tag = tag_chars[(n - 1) % #tag_chars + 1]
-        local item = string.format(
-            [[{"id":%d,"name":"%s","bio":"%s","tags":["%s","中文","emoji"],"score":%d}]],
-            n, name, bio, tag, n * 13 % 100)
-        if current + #item + 3 > target_bytes then break end
-        if n > 1 then items[#items + 1] = "," end
-        items[#items + 1] = item
-        current = current + #item + 1
+        local labels = {}
+        local label_count = (n % 4)
+        local label_names = { "缺陷bug", "功能增强", "文档优化", "性能改进" }
+        for i = 1, label_count do
+            labels[#labels + 1] = string.format(
+                [[{"id":%d,"name":"%s","color":"%06x","description":"标签分类描述"}]],
+                10000 + n * 10 + i, label_names[i], (n * 12345 + i) % 0xFFFFFF)
+        end
+        -- Use whole multiples of cjk_body to avoid UTF-8 truncation
+        local reps = 1 + (n % 3)
+        local body = string.rep(cjk_body, reps)
+        local issue = string.format([[{
+"id":%d,
+"number":%d,
+"title":"%s #%d",
+"body":"%s",
+"state":"%s",
+"locked":%s,
+"comments":%d,
+"user":{"login":"用户%d","id":%d,"avatar_url":"https://avatars.githubusercontent.com/u/%d?v=4","type":"用户","site_admin":false},
+"labels":[%s],
+"assignees":[],
+"milestone":null,
+"created_at":"2024-%02d-%02dT%02d:%02d:%02dZ",
+"updated_at":"2024-%02d-%02dT%02d:%02d:%02dZ",
+"closed_at":null,
+"author_association":"贡献者",
+"html_url":"https://github.com/example/中文仓库/issues/%d",
+"url":"https://api.github.com/repos/example/中文仓库/issues/%d",
+"repository_url":"https://api.github.com/repos/example/中文仓库",
+"labels_url":"https://api.github.com/repos/example/中文仓库/issues/%d/labels{/名称}",
+"comments_url":"https://api.github.com/repos/example/中文仓库/issues/%d/评论",
+"events_url":"https://api.github.com/repos/example/中文仓库/issues/%d/事件"
+}]],
+            1000000 + n, n, cjk_title, n, body,
+            n % 3 == 0 and "已关闭" or "进行中",
+            n % 7 == 0 and "true" or "false",
+            n % 50, n % 100, 100000 + n, 100000 + n,
+            table.concat(labels, ","),
+            (n % 12) + 1, (n % 28) + 1, n % 24, n % 60, n % 60,
+            (n % 12) + 1, (n % 28) + 1, (n + 1) % 24, (n + 5) % 60, (n + 10) % 60,
+            n, n, n, n, n)
+        issue = issue:gsub("\n", "")
+        if current + #issue + 3 > target_bytes then break end
+        issues[#issues + 1] = issue
+        current = current + #issue + 1
         n = n + 1
     end
-    return "[" .. table.concat(items) .. "]"
+    return "[" .. table.concat(issues, ",") .. "]"
 end
 
 local function cjk_qjson_access(d)
-    local _ = d:get_str("[0].name")
-    local _ = d:get_str("[0].bio")
-    local _ = d:get_str("[0].tags[0]")
+    if not d then return end
+    local _ = d:get_i64("[0].id")
+    local _ = d:get_str("[0].title")
+    local _ = d:get_str("[0].user.login")
 end
 
 local function cjk_table_access(t)
-    local _ = t[1] and t[1].name
-    local _ = t[1] and t[1].bio
-    local _ = t[1] and t[1].tags and t[1].tags[1]
+    local _ = t[1] and t[1].id
+    local _ = t[1] and t[1].title
+    local _ = t[1] and t[1].user and t[1].user.login
 end
 
 local function cjk_cjson_access(obj)
-    local _ = obj[1] and obj[1].name
-    local _ = obj[1] and obj[1].bio
-    local _ = obj[1] and obj[1].tags and obj[1].tags[1]
+    local _ = obj[1] and obj[1].id
+    local _ = obj[1] and obj[1].title
+    local _ = obj[1] and obj[1].user and obj[1].user.login
 end
 
 -- GitHub issues accessors: array of issues, access first issue's fields
@@ -291,7 +344,8 @@ local scenarios = {
     {name = "github-100k",  iters = 100, payload = make_github_issues_payload(100 * 1024),
      cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access},
     {name = "cjk-100k",     iters = 100, payload = make_cjk_payload(100 * 1024),
-     cjson_access = cjk_cjson_access, qjson_access = cjk_qjson_access, table_access = cjk_table_access},
+     cjson_access = cjk_cjson_access, qjson_access = cjk_qjson_access, table_access = cjk_table_access,
+     no_simdjson = true},
     {name = "100k",   iters = 100,  payload = make_payload(100 * 1024)},
     {name = "200k",   iters = 50,   payload = make_payload(200 * 1024)},
     {name = "500k",   iters = 20,   payload = make_payload(500 * 1024)},
@@ -323,7 +377,7 @@ for _, s in ipairs(scenarios) do
         cjson_access(obj)
     end)
 
-    if simdjson then
+    if simdjson and not s.no_simdjson then
         bench("simdjson.decode + access fields", s.iters, function()
             local obj = simdjson:decode(s.payload)
             cjson_access(obj)
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index d6241c3..ceaf1a1 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -84,7 +84,7 @@ Each row is "parse + access request fields" on the named payload.
 | small      |   2.1 KB | 100,127 | 109,588 | 130,867 | 105,038 | 210,886 |
 | medium     |  60.4 KB |   8,701 |  77,936 | 135,700 | 177,650 | 164,142 |
 | github-100k |   100 KB |   2,106 |   2,247 |   5,964 |   5,900 |   6,321 |
-| cjk-100k  |   100 KB |   1,985 |   2,301 |   4,605 |   4,518 |   5,300 |
+| cjk-100k  |   100 KB |   2,220 |      — |   5,018 |   5,390 |   5,855 |
 | 100k       |   100 KB |   4,985 |  32,232 | 130,621 | 125,348 | 145,613 |
 | 200k       |   200 KB |   2,504 |  18,630 |  71,441 |  47,214 |  47,481 |
 | 500k       |   500 KB |   1,013 |   8,005 |  34,562 |  33,646 |  34,683 |
@@ -101,7 +101,7 @@ Each row is "parse + access request fields" on the named payload.
 | small  |  1.3× |  1.2× |  1.0× |  1.0× |
 | medium | 15.6× |  1.7× | 20.4× |  2.3× |
 | github-100k | 2.8× |  2.7× | 2.8× |  2.6× |
-| cjk-100k  | 2.3× |  2.0× | 2.3× |  2.0× |
+| cjk-100k  | 2.3× |   — | 2.4× |   — |
 | 100k   | 26.2× |  4.1× | 25.1× |  3.9× |
 | 200k   | 28.5× |  3.8× | 18.9× |  2.5× |
 | 500k   | 34.1× |  4.3× | 33.2× |  4.2× |
@@ -121,7 +121,7 @@ from the last round may still be included.
 | small      | -2,359 |  +8,055 |  +8,159 |  +8,643 |  +2,701 |
 | medium     |  +3,850 |  +5,259 |    +124 |  +2,228 |  +2,234 |
 | github-100k | +19,936 | +15,164 |     +32 |  +1,072 |    +452 |
-| cjk-100k  | +17,405 |  +3,284 |     +26 |  +1,083 |    +446 |
+| cjk-100k  | +10,327 |      — |     +32 |  +1,082 |    +446 |
 | 100k       |    +867 |  +1,393 |    +138 |  +1,384 |    +452 |
 | 200k       |    +583 |    +845 |     +67 |    +692 |    +223 |
 | 500k       |    +654 |    +759 |     +27 |    +277 |     +89 |

From 708625b8db109d299ba48d3931f87e102e2b9acc Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 18:27:17 +0000
Subject: [PATCH 16/17] bench: add simdjson data for cjk-100k scenario
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove no_simdjson flag. simdjson mean 2,367 ops/s on cjk-100k.
qjson.parse/cjson = 2.3×, qjson.parse/simdjson = 2.1×.
---
 README.md             | 2 +-
 benches/lua_bench.lua | 3 +--
 docs/benchmarks.md    | 6 +++---
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d028fa7..ee00596 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 10 rounds, deterministic payload):
 |---:|---:|---:|---:|---:|---:|
 |   2 KB | 100,127 | 109,588 | 130,867 | 105,038 |  1.3× /  1.0× |
 |  60 KB |   8,701 |  77,936 | 135,700 | 177,650 | 15.6× / 20.4× |
-|  100 KB (CJK) |   2,220 |      — |   5,018 |   5,390 |  2.3× /  2.4× |
+|  100 KB (CJK) |   2,203 |   2,367 |   4,965 |   5,363 |  2.3× /  2.4× |
 | 100 KB |   4,985 |  32,232 | 130,621 | 125,348 | 26.2× / 25.1× |
 |   1 MB |     498 |   3,697 |  15,831 |  15,784 | 31.8× / 31.7× |
 |  10 MB |      50 |     383 |   1,473 |   1,548 | 29.5× / 31.0× |
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index b7cb05f..e26c5fb 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -344,8 +344,7 @@ local scenarios = {
     {name = "github-100k",  iters = 100, payload = make_github_issues_payload(100 * 1024),
      cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access},
     {name = "cjk-100k",     iters = 100, payload = make_cjk_payload(100 * 1024),
-     cjson_access = cjk_cjson_access, qjson_access = cjk_qjson_access, table_access = cjk_table_access,
-     no_simdjson = true},
+     cjson_access = cjk_cjson_access, qjson_access = cjk_qjson_access, table_access = cjk_table_access},
     {name = "100k",   iters = 100,  payload = make_payload(100 * 1024)},
     {name = "200k",   iters = 50,   payload = make_payload(200 * 1024)},
     {name = "500k",   iters = 20,   payload = make_payload(500 * 1024)},
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index ceaf1a1..b1e5bbb 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -84,7 +84,7 @@ Each row is "parse + access request fields" on the named payload.
 | small      |   2.1 KB | 100,127 | 109,588 | 130,867 | 105,038 | 210,886 |
 | medium     |  60.4 KB |   8,701 |  77,936 | 135,700 | 177,650 | 164,142 |
 | github-100k |   100 KB |   2,106 |   2,247 |   5,964 |   5,900 |   6,321 |
-| cjk-100k  |   100 KB |   2,220 |      — |   5,018 |   5,390 |   5,855 |
+| cjk-100k  |    99 KB |   2,203 |   2,367 |   4,965 |   5,363 |   6,063 |
 | 100k       |   100 KB |   4,985 |  32,232 | 130,621 | 125,348 | 145,613 |
 | 200k       |   200 KB |   2,504 |  18,630 |  71,441 |  47,214 |  47,481 |
 | 500k       |   500 KB |   1,013 |   8,005 |  34,562 |  33,646 |  34,683 |
@@ -101,7 +101,7 @@ Each row is "parse + access request fields" on the named payload.
 | small  |  1.3× |  1.2× |  1.0× |  1.0× |
 | medium | 15.6× |  1.7× | 20.4× |  2.3× |
 | github-100k | 2.8× |  2.7× | 2.8× |  2.6× |
-| cjk-100k  | 2.3× |   — | 2.4× |   — |
+| cjk-100k  | 2.3× |  2.1× | 2.4× |  2.3× |
 | 100k   | 26.2× |  4.1× | 25.1× |  3.9× |
 | 200k   | 28.5× |  3.8× | 18.9× |  2.5× |
 | 500k   | 34.1× |  4.3× | 33.2× |  4.2× |
@@ -121,7 +121,7 @@ from the last round may still be included.
 | small      | -2,359 |  +8,055 |  +8,159 |  +8,643 |  +2,701 |
 | medium     |  +3,850 |  +5,259 |    +124 |  +2,228 |  +2,234 |
 | github-100k | +19,936 | +15,164 |     +32 |  +1,072 |    +452 |
-| cjk-100k  | +10,327 |      — |     +32 |  +1,082 |    +446 |
+| cjk-100k  | +10,131 |  +4,500 |     +34 |  +1,092 |    +446 |
 | 100k       |    +867 |  +1,393 |    +138 |  +1,384 |    +452 |
 | 200k       |    +583 |    +845 |     +67 |    +692 |    +223 |
 | 500k       |    +654 |    +759 |     +27 |    +277 |     +89 |

From 3f6822ada932f94ef73f704fecb88fb236ef8429 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Fri, 22 May 2026 18:35:34 +0000
Subject: [PATCH 17/17] =?UTF-8?q?fix:=20address=20review=20comments=20?=
 =?UTF-8?q?=E2=80=94=20error=20precedence,=20LUT=20hoisting,=20docs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fused validator: check trailing content before string/scalar validation
  to preserve old validate_trailing error-code precedence
  (e.g. input `"\q" x` → QJSON_TRAILING_CONTENT, not QJSON_INVALID_STRING)
- Fused validator: detect TopDone+structural as QJSON_TRAILING_CONTENT
  (e.g. '42 {}' → QJSON_TRAILING_CONTENT, not QJSON_PARSE_ERROR)
- classify_str_mask: precompute LUT vectors as 32-byte aligned statics,
  load with _mm256_load_si256 instead of rebuilding per call
- benchmarks.md: fix table separator column counts, update
  '5 rounds'→'10 rounds', 'median'→'mean' in section titles
- Add 3 regression tests for error-code precedence
---
 docs/benchmarks.md       | 13 ++++----
 src/validate/classify.rs | 52 +++++++++++++++++++++++++++++---
 src/validate/mod.rs      | 65 +++++++++++++++++++++++++++++++++-------
 3 files changed, 109 insertions(+), 21 deletions(-)

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index b1e5bbb..5d88838 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -33,8 +33,7 @@ The harness lives at `benches/lua_bench.lua`. For each scenario:
 3. 10 rounds × N iterations of the workload (warmup excluded); report the
    **mean** ops/s across rounds (median + range also shown in output).
 4. Final `collectgarbage("count")` to capture the post-run memory delta in
-   KB. The harness does not force a final collection after timing, so
-   short-lived garbage from the last round may still be included.
+   KB.
 
 The payload is a synthetic multimodal chat-completion request with one or more
 historical messages. Each message contains one small text part and one
@@ -75,12 +74,12 @@ harness prints a skip message and omits the simdjson rows.
 
 Numbers below come from one such run.
 
-## Results — throughput (median ops/s)
+## Results — throughput (mean ops/s)
 
 Each row is "parse + access request fields" on the named payload.
 
 | Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
-|---|---|---:|---:|---:|---:|---:|---:|
+|---|---:|---:|---:|---:|---:|---:|
 | small      |   2.1 KB | 100,127 | 109,588 | 130,867 | 105,038 | 210,886 |
 | medium     |  60.4 KB |   8,701 |  77,936 | 135,700 | 177,650 | 164,142 |
 | github-100k |   100 KB |   2,106 |   2,247 |   5,964 |   5,900 |   6,321 |
@@ -97,7 +96,7 @@ Each row is "parse + access request fields" on the named payload.
 ### Speed-up vs. baselines
 
 | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson |
-|---|---|---:|---:|---:|---:|
+|---|---:|---:|---:|---:|
 | small  |  1.3× |  1.2× |  1.0× |  1.0× |
 | medium | 15.6× |  1.7× | 20.4× |  2.3× |
 | github-100k | 2.8× |  2.7× | 2.8× |  2.6× |
@@ -110,14 +109,14 @@ Each row is "parse + access request fields" on the named payload.
 | 5m     | 31.4× |  4.9× | 31.5× |  4.9× |
 | 10m    | 29.5× |  3.8× | 31.0× |  4.0× |
 
-## Results — memory delta (KB retained after 5 rounds)
+## Results — memory delta (KB retained after 10 rounds)
 
 Post-run `collectgarbage("count")` minus baseline. Captures heap usage after
 the timing rounds without forcing a final collection, so short-lived garbage
 from the last round may still be included.
 
 | Scenario | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
-|---|---|---:|---:|---:|---:|---:|
+|---|---:|---:|---:|---:|---:|
 | small      | -2,359 |  +8,055 |  +8,159 |  +8,643 |  +2,701 |
 | medium     |  +3,850 |  +5,259 |    +124 |  +2,228 |  +2,234 |
 | github-100k | +19,936 | +15,164 |     +32 |  +1,072 |    +452 |
diff --git a/src/validate/classify.rs b/src/validate/classify.rs
index c75d2db..3e09678 100644
--- a/src/validate/classify.rs
+++ b/src/validate/classify.rs
@@ -169,12 +169,57 @@ pub(crate) unsafe fn classify_str_chunk(chunk: __m256i) -> u32 {
     classify_str_mask(chunk)
 }
 
+/// Precomputed 32-byte LUT vectors (16-entry nibble table duplicated
+/// into both 128-bit lanes), loaded via `_mm256_load_si256`. Avoids
+/// rebuilding the vector on every call.
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[repr(align(32))]
+struct AlignedLut([u8; 32]);
+
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+static STR_LO_LUT_VEC: AlignedLut = build_aligned_lut(&STR_LO_TABLE);
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+static STR_HI_LUT_VEC: AlignedLut = build_aligned_lut(&STR_HI_TABLE);
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+static NUM_LO_LUT_VEC: AlignedLut = build_aligned_lut(&NUM_LO_TABLE);
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+static NUM_HI_LUT_VEC: AlignedLut = build_aligned_lut(&NUM_HI_TABLE);
+
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+const fn build_aligned_lut(table: &[u8; 16]) -> AlignedLut {
+    let mut a = [0u8; 32];
+    let mut i = 0usize;
+    while i < 16 {
+        a[i] = table[i];
+        a[i + 16] = table[i];
+        i += 1;
+    }
+    AlignedLut(a)
+}
+
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[inline(always)]
+unsafe fn load_str_luts() -> (__m256i, __m256i) {
+    (
+        _mm256_load_si256(STR_LO_LUT_VEC.0.as_ptr() as *const __m256i),
+        _mm256_load_si256(STR_HI_LUT_VEC.0.as_ptr() as *const __m256i),
+    )
+}
+
+#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
+#[inline(always)]
+unsafe fn load_num_luts() -> (__m256i, __m256i) {
+    (
+        _mm256_load_si256(NUM_LO_LUT_VEC.0.as_ptr() as *const __m256i),
+        _mm256_load_si256(NUM_HI_LUT_VEC.0.as_ptr() as *const __m256i),
+    )
+}
+
 /// Returns a bitmask of bytes that match CTRL | BS | HIGH.
 #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
 #[target_feature(enable = "avx2")]
 pub(crate) unsafe fn classify_str_mask(chunk: __m256i) -> u32 {
-    let lo_lut     = make_lut(&STR_LO_TABLE);
-    let hi_lut     = make_lut(&STR_HI_TABLE);
+    let (lo_lut, hi_lut) = load_str_luts();
     let classes    = classify_chunk(chunk, lo_lut, hi_lut);
     let zero       = _mm256_cmpeq_epi8(classes, _mm256_setzero_si256());
     let zero_mask  = _mm256_movemask_epi8(zero) as u32;
@@ -189,8 +234,7 @@ pub(crate) unsafe fn classify_str_mask(chunk: __m256i) -> u32 {
 #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
 #[target_feature(enable = "avx2")]
 pub(crate) unsafe fn classify_num_chunk(chunk: __m256i) -> (__m256i, u32) {
-    let lo_lut     = make_lut(&NUM_LO_TABLE);
-    let hi_lut     = make_lut(&NUM_HI_TABLE);
+    let (lo_lut, hi_lut) = load_num_luts();
     let classes    = classify_chunk(chunk, lo_lut, hi_lut);
 
     // bad = bytes where CTRL | BS | HIGH is set.
diff --git a/src/validate/mod.rs b/src/validate/mod.rs
index 69abeca..fcb0393 100644
--- a/src/validate/mod.rs
+++ b/src/validate/mod.rs
@@ -152,6 +152,14 @@ pub(crate) fn validate_eager_fused(
 
         consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?;
 
+        // After consuming the gap, if the root has already been fully
+        // consumed (depth==0, TopDone), any subsequent structural token
+        // is trailing content. This matches the old validate_trailing
+        // precedence: e.g. `42 {}` → QJSON_TRAILING_CONTENT, not QJSON_PARSE_ERROR.
+        if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
+            return Err(qjson_err::QJSON_TRAILING_CONTENT);
+        }
+
         match b {
             b'{' | b'[' => {
                 let cur = stack.last_mut().unwrap();
@@ -235,6 +243,20 @@ pub(crate) fn validate_eager_fused(
                 if close <= pos || close >= buf.len() || buf[close] != b'"' {
                     return Err(qjson_err::QJSON_PARSE_ERROR);
                 }
+
+                let cur = stack.last().copied().unwrap();
+                // For a top-level string root, check trailing content BEFORE
+                // validating the string. This preserves the old validate_trailing
+                // error-code precedence: `"\q" x` → QJSON_TRAILING_CONTENT, not
+                // QJSON_INVALID_STRING.
+                if matches!(cur, CtxKind::Top) && depth == 0 {
+                    let mut p = close + 1;
+                    while p < buf.len() && is_ws(buf[p]) { p += 1; }
+                    if p < buf.len() {
+                        return Err(qjson_err::QJSON_TRAILING_CONTENT);
+                    }
+                }
+
                 strings::validate_string_span(&buf[pos + 1 .. close])?;
 
                 let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
@@ -247,13 +269,6 @@ pub(crate) fn validate_eager_fused(
                     | CtxKind::ArrAfterComma
                     | CtxKind::ObjAfterColon => {
                         *cur = parent_after_value(*cur);
-                        if depth == 0 && stack.len() == 1 && stack[0] == CtxKind::TopDone {
-                            let mut p = close + 1;
-                            while p < buf.len() && is_ws(buf[p]) { p += 1; }
-                            if p < buf.len() {
-                                return Err(qjson_err::QJSON_TRAILING_CONTENT);
-                            }
-                        }
                     }
                     _ => return Err(qjson_err::QJSON_PARSE_ERROR),
                 }
@@ -273,14 +288,15 @@ pub(crate) fn validate_eager_fused(
         if scan < buf.len() {
             let mut end = scan;
             while end < buf.len() && !is_ws(buf[end]) { end += 1; }
-            validate_scalar(&buf[scan..end])?;
-            *stack.last_mut().unwrap() = CtxKind::TopDone;
-
+            // Check for trailing content BEFORE validating the scalar.
+            // Preserves old validate_trailing precedence: `1a 2` → QJSON_TRAILING_CONTENT.
             let mut p = end;
             while p < buf.len() && is_ws(buf[p]) { p += 1; }
             if p < buf.len() {
                 return Err(qjson_err::QJSON_TRAILING_CONTENT);
             }
+            validate_scalar(&buf[scan..end])?;
+            *stack.last_mut().unwrap() = CtxKind::TopDone;
         }
     } else {
         consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?;
@@ -751,4 +767,33 @@ mod tests {
             Err(qjson_err::QJSON_PARSE_ERROR),
         );
     }
+
+    // ── error-code precedence regression tests ──────────────────────
+
+    #[test]
+    fn fused_string_root_trailing_before_validation() {
+        // Old validate_trailing ran before validate_eager_values.
+        // Input `"\q" x` → QJSON_TRAILING_CONTENT, not QJSON_INVALID_STRING.
+        assert_eq!(
+            validate_eager_fused(b"\"\\q\" x", &ix(b"\"\\q\" x"), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
+
+    #[test]
+    fn fused_scalar_then_structural_is_trailing() {
+        // `42 {}` — scalar root followed by container must be trailing.
+        assert_eq!(
+            validate_eager_fused(b"42 {}", &ix(b"42 {}"), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
+
+    #[test]
+    fn fused_scalar_then_array_is_trailing() {
+        assert_eq!(
+            validate_eager_fused(b"42[]", &ix(b"42[]"), 1024),
+            Err(qjson_err::QJSON_TRAILING_CONTENT),
+        );
+    }
 }