From c7574e37859e12f24f5f95642597e31de9f8a83f Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 20:44:13 +0800
Subject: [PATCH 01/11] perf: Lua encode bulk-copy + Rc cache + AVX2 LUT +
 validation stack

- Replace per-byte encode_string with bulk-copy segments and fast
  escape-free path (zero table allocations for clean strings)
- Replace recursive is_dirty with O(1) _dirty flag propagated via
  parent chain on __newindex
- Eliminate is_array pre-scan by tagging materialized tables with
  __qjson_type marker during materialization
- Replace Vec<u32>::clone in SkipCache with Rc<[u32]> for O(1)
  cache-hit access instead of O(n) vector clone
- Replace 7-cmpeq loop in AVX2 structural_mask_chunk with
  PSHUFB-based nibble LUT classification
- Replace heap-allocated validation state stack with an inline
  [CtxKind; 64] array; nesting deeper than 64 frames spills to a
  heap-allocated Vec fallback
- Replace byte-by-byte float-detection in parse_i64 with
  memchr::memchr3
- Add modified-encode benchmark scenarios (modify top / add field /
  modify nested) to bench harness
- Add correctness tests for modified-encode round-trip
---
 benches/lua_bench.lua         |  77 +++++++++++++++++++++++-
 lua/qjson/table.lua           | 107 +++++++++++++++++++++-------------
 src/cursor.rs                 |  14 +++--
 src/decode/number.rs          |   2 +-
 src/scan/avx2.rs              |  71 +++++++++++++++++-----
 src/skip_cache.rs             |  11 ++--
 src/validate/mod.rs           |  80 +++++++++++++++++++------
 tests/lua/lazy_table_spec.lua |  50 ++++++++++++++++
 8 files changed, 329 insertions(+), 83 deletions(-)
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 30a3977..81639b8 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -220,6 +220,21 @@ local function default_table_access(t)
     end
 end
 
+local function default_table_modify_top(t)
+    t.model = "new-model"
+    t.temperature = 0.0
+end
+
+local function default_table_modify_add(t)
+    t.stream = true
+end
+
+local function default_table_modify_nested(t)
+    if t.messages and qjson.len(t.messages) > 0 then
+        t.messages[1].content = "modified"
+    end
+end
+
 -- GitHub issues accessors: array of issues, access first issue's fields
 local function github_cjson_access(obj)
     local _ = obj[1] and obj[1].id
@@ -239,11 +254,26 @@ local function github_table_access(t)
     local _ = t[1] and t[1].user and t[1].user.login
 end
 
+local function github_table_modify_top(t)
+    t[1].title = "modified title"
+end
+
+local function github_table_modify_add(t)
+    t.extra_field = true
+end
+
+local function github_table_modify_nested(t)
+    if t[1] and t[1].user then
+        t[1].user.login = "modified-user"
+    end
+end
+
 local scenarios = {
     {name = "small",  iters = 5000, payload = read_file("benches/fixtures/small_api.json")},
     {name = "medium", iters = 500,  payload = read_file("benches/fixtures/medium_resp.json")},
     {name = "github-100k", iters = 100, payload = make_github_issues_payload(100 * 1024),
-     cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access},
+     cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access,
+     modify_top = github_table_modify_top, modify_add = github_table_modify_add, modify_nested = github_table_modify_nested},
     {name = "100k",   iters = 100,  payload = make_payload(100 * 1024)},
     {name = "200k",   iters = 50,   payload = make_payload(200 * 1024)},
     {name = "500k",   iters = 20,   payload = make_payload(500 * 1024)},
@@ -269,6 +299,9 @@ for _, s in ipairs(scenarios) do
     local cjson_access = s.cjson_access or default_cjson_access
     local qjson_access = s.qjson_access or default_qjson_access
     local table_access = s.table_access or default_table_access
+    local modify_top = s.modify_top or default_table_modify_top
+    local modify_add = s.modify_add or default_table_modify_add
+    local modify_nested = s.modify_nested or default_table_modify_nested
 
     bench("cjson.decode + access fields", s.iters, function()
         local obj = cjson.decode(s.payload)
@@ -309,6 +342,24 @@ for _, s in ipairs(scenarios) do
         local t = qjson.decode(s.payload)
         local _ = qjson.encode(t)
     end)
+
+    bench("qjson.decode + modify top + encode", s.iters, function()
+        local t = qjson.decode(s.payload)
+        modify_top(t)
+        local _ = qjson.encode(t)
+    end)
+
+    bench("qjson.decode + add field + encode", s.iters, function()
+        local t = qjson.decode(s.payload)
+        modify_add(t)
+        local _ = qjson.encode(t)
+    end)
+
+    bench("qjson.decode + modify nested + encode", s.iters, function()
+        local t = qjson.decode(s.payload)
+        modify_nested(t)
+        local _ = qjson.encode(t)
+    end)
 end
 
 -- Interleaved scenario: cycle through several payloads of different sizes
@@ -386,4 +437,28 @@ do
         local t = qjson.decode(p)
         local _ = qjson.encode(t)
     end)
+
+    next_p = make_cycler(interleaved)
+    bench("qjson.decode + modify top + encode", 400, function()
+        local p = next_p()
+        local t = qjson.decode(p)
+        default_table_modify_top(t)
+        local _ = qjson.encode(t)
+    end)
+
+    next_p = make_cycler(interleaved)
+    bench("qjson.decode + add field + encode", 400, function()
+        local p = next_p()
+        local t = qjson.decode(p)
+        default_table_modify_add(t)
+        local _ = qjson.encode(t)
+    end)
+
+    next_p = make_cycler(interleaved)
+    bench("qjson.decode + modify nested + encode", 400, function()
+        local p = next_p()
+        local t = qjson.decode(p)
+        default_table_modify_nested(t)
+        local _ = qjson.encode(t)
+    end)
 end
diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua
index 86f50d0..754e3f8 100644
--- a/lua/qjson/table.lua
+++ b/lua/qjson/table.lua
@@ -65,6 +65,8 @@ local function wrap_child(parent_view, src_box)
     local own_box = ffi.new("qjson_cursor[1]")
     ffi.copy(own_box, src_box, ffi.sizeof("qjson_cursor"))
     return {
+        _parent  = parent_view,
+        _dirty   = false,
         _doc     = parent_view._doc,
         _cur_box = own_box,        -- keep cdata alive
         _cur     = own_box[0],     -- stable reference into own_box
@@ -252,6 +254,7 @@ end
 -- the dirty check and __newindex can share the list.
 local INTERNAL_KEYS = {
     _doc = true, _cur_box = true, _cur = true, _bs = true, _be = true,
+    _parent = true, _dirty = true,
 }
 
 -- On first write, walk all existing key/value pairs into a plain table,
@@ -260,10 +263,14 @@ local INTERNAL_KEYS = {
 -- Existing rawget-cached entries (e.g. previously returned child proxies)
 -- are preserved so callers' references remain valid.
 LazyObject.__newindex = function(t, k, v)
+    -- Mark dirty from this view up to the root.
+    local cur = t
+    while cur do
+        rawset(cur, "_dirty", true)
+        cur = rawget(cur, "_parent")
+    end
     local contents = materialize_object_contents(t)
     -- Snapshot user-key cache BEFORE nilling internals.
-    -- Use next() for raw iteration: pairs() invokes __pairs on lazy tables,
-    -- walking the full JSON via FFI instead of the Lua-side rawget cache.
     local cache = {}
     local ck, cv = next(t)
     while ck ~= nil do
@@ -272,8 +279,11 @@ LazyObject.__newindex = function(t, k, v)
         end
         ck, cv = next(t, ck)
     end
-    t._doc, t._cur_box, t._cur, t._bs, t._be = nil, nil, nil, nil, nil
+    for _, f in ipairs({"_parent", "_dirty", "_doc", "_cur_box", "_cur", "_bs", "_be"}) do
+        rawset(t, f, nil)
+    end
     setmetatable(t, nil)
+    rawset(t, "__qjson_type", "object")
     for _, kv in ipairs(contents) do
         rawset(t, kv[1], cache[kv[1]] or kv[2])
     end
@@ -284,10 +294,14 @@ end
 -- switch to empty_array_mt (no lazy machinery), then apply the assignment.
 -- Existing rawget-cached entries are preserved so callers' references remain valid.
 LazyArray.__newindex = function(t, k, v)
+    -- Mark dirty from this view up to the root.
+    local cur = t
+    while cur do
+        rawset(cur, "_dirty", true)
+        cur = rawget(cur, "_parent")
+    end
     local contents = materialize_array_contents(t)
     -- Snapshot integer-key cache BEFORE nilling internals.
-    -- Use next() for raw iteration: pairs() would invoke __pairs on lazy arrays,
-    -- walking the full JSON via FFI instead of the Lua-side rawget cache.
     local cache = {}
     local ck, cv = next(t)
     while ck ~= nil do
@@ -296,8 +310,11 @@ LazyArray.__newindex = function(t, k, v)
         end
         ck, cv = next(t, ck)
     end
-    t._doc, t._cur_box, t._cur, t._bs, t._be = nil, nil, nil, nil, nil
+    for _, f in ipairs({"_parent", "_dirty", "_doc", "_cur_box", "_cur", "_bs", "_be"}) do
+        rawset(t, f, nil)
+    end
     setmetatable(t, _M.empty_array_mt)
+    rawset(t, "__qjson_type", "array")
     for i, x in ipairs(contents) do
         rawset(t, i, cache[i] or x)
     end
@@ -328,6 +345,7 @@ function _M.decode(json_str)
         error("qjson: root byte-span failed")
     end
     local view = {
+        _dirty   = false,
         _doc     = doc,
         _cur_box = root_box,   -- keep the box alive; _cur is a stable reference
         _cur     = root_box[0],
@@ -370,23 +388,42 @@ _M.materialize = materialize
 local string_byte = string.byte
 local string_format = string.format
 
--- Minimal JSON string escaper covering the cjson default set.
+-- Short escapes by byte value; nil means none (other bytes < 0x20 get \u00XX below).
+local ESCAPES = {
+    [0x22] = '\\"',
+    [0x5C] = '\\\\',
+    [0x0A] = '\\n',
+    [0x0D] = '\\r',
+    [0x09] = '\\t',
+    [0x08] = '\\b',
+    [0x0C] = '\\f',
+}
+
+-- JSON string escaper with bulk-copy fast path.
+-- Scans for bytes that need escaping; copies clean segments via s:sub.
+-- For strings with no escapes, returns '"' .. s .. '"' with zero table allocations.
 local function encode_string(s)
-    local out = {'"'}
-    for i = 1, #s do
+    local n = #s
+    local last, i = 1, 1
+    local out = nil   -- lazily create table only when escapes found
+    while i <= n do
         local b = string_byte(s, i)
-        if b == 0x22 then out[#out+1] = '\\"'
-        elseif b == 0x5C then out[#out+1] = '\\\\'
-        elseif b == 0x0A then out[#out+1] = '\\n'
-        elseif b == 0x0D then out[#out+1] = '\\r'
-        elseif b == 0x09 then out[#out+1] = '\\t'
-        elseif b == 0x08 then out[#out+1] = '\\b'
-        elseif b == 0x0C then out[#out+1] = '\\f'
-        elseif b < 0x20 then out[#out+1] = string_format('\\u%04x', b)
-        else out[#out+1] = string.char(b)
+        local esc = ESCAPES[b]
+        if esc or b < 0x20 then
+            if not out then out = {'"'} end
+            if i > last then out[#out + 1] = s:sub(last, i - 1) end
+            if esc then
+                out[#out + 1] = esc
+            else
+                out[#out + 1] = string_format('\\u%04x', b)
+            end
+            last = i + 1
         end
+        i = i + 1
     end
-    out[#out+1] = '"'
+    if not out then return '"' .. s .. '"' end
+    if last <= n then out[#out + 1] = s:sub(last, n) end
+    out[#out + 1] = '"'
     return table.concat(out)
 end
 
@@ -400,27 +437,6 @@ local function encode_number(n)
     return string_format("%.14g", n)
 end
 
--- A lazy subtree is "dirty" if any cached descendant has been materialized
--- (no longer carries Lazy* metatable). Non-cached descendants are guaranteed
--- untouched, so we only need to walk the rawget-cached entries.
-local function is_dirty(v)
-    if type(v) ~= "table" then return false end
-    local mt = getmetatable(v)
-    if mt ~= LazyObject and mt ~= LazyArray then
-        return true  -- materialized
-    end
-    -- Use next() for raw table iteration: pairs() would invoke __pairs on
-    -- lazy tables, walking the full JSON via FFI instead of the Lua cache.
-    local k, child = next(v)
-    while k ~= nil do
-        if not INTERNAL_KEYS[k] then
-            if is_dirty(child) then return true end
-        end
-        k, child = next(v, k)
-    end
-    return false
-end
-
 -- Forward declaration so encode_lazy_object_walking, encode_lazy_array_walking,
 -- and encode_array/encode_object can reference encode before its definition is
 -- complete (Lua resolves upvalues at call time, but the slot must be declared first).
@@ -471,7 +487,7 @@ local function encode_lazy_array_walking(t)
 end
 
 local function encode_proxy(t)
-    if not is_dirty(t) then
+    if not t._dirty then
         -- Fast path: no mutations — slice the original buffer bytes.
         return t._doc._hold:sub(t._bs + 1, t._be)
     end
@@ -530,6 +546,15 @@ encode = function(v)
         if mt == LazyObject or mt == LazyArray then
             return encode_proxy(v)
         end
+        if mt == _M.empty_array_mt then
+            return encode_array(v)
+        end
+        if rawget(v, "__qjson_type") == "object" then
+            return encode_object(v)
+        end
+        if rawget(v, "__qjson_type") == "array" then
+            return encode_array(v)
+        end
         if is_array(v) then
             return encode_array(v)
         end
diff --git a/src/cursor.rs b/src/cursor.rs
index bf38d40..82ce885 100644
--- a/src/cursor.rs
+++ b/src/cursor.rs
@@ -1,6 +1,7 @@
 use crate::doc::Document;
 use crate::error::qjson_err;
 use crate::path::{PathIter, PathSeg};
+use std::rc::Rc;
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub(crate) struct Cursor {
@@ -62,9 +63,10 @@ fn walk_children(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result<Cursor, q
 
     if was_cached {
         // Fast path: iterate cached (start, end) pairs. No brace counting.
+        // Rc::clone only bumps a refcount; the child index lists are not copied.
         let slot = cache.slot(slot_n);
-        let starts = slot.child_starts.clone();
-        let ends   = slot.child_ends.clone();
+        let starts = Rc::clone(&slot.child_starts);
+        let ends   = Rc::clone(&slot.child_ends);
         drop(cache);
         return resolve_in_known_children(doc, &starts, &ends, is_obj, seg);
     }
@@ -88,8 +90,8 @@ fn walk_children(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result<Cursor, q
         }
         if p == closer_byte_pos {
             let slot = cache.slot_mut(slot_n);
-            slot.child_starts = starts;
-            slot.child_ends   = ends;
+            slot.child_starts = starts.into();
+            slot.child_ends   = ends.into();
             return Err(qjson_err::QJSON_NOT_FOUND);
         }
     }
@@ -133,8 +135,8 @@ fn walk_children(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result<Cursor, q
     }
 
     let slot = cache.slot_mut(slot_n);
-    slot.child_starts = starts;
-    slot.child_ends   = ends;
+    slot.child_starts = starts.into();
+    slot.child_ends   = ends.into();
 
     match result {
         Some(c) => Ok(c),
diff --git a/src/decode/number.rs b/src/decode/number.rs
index 74839ff..24f8cf7 100644
--- a/src/decode/number.rs
+++ b/src/decode/number.rs
@@ -19,7 +19,7 @@ pub(crate) fn parse_i64(bytes: &[u8], skip_validation: bool) -> Result<i64, qjso
     }
 
     // After ABNF validation, integer-only inputs have no `.`/`e`/`E`.
-    if bytes.iter().any(|&b| b == b'.' || b == b'e' || b == b'E') {
+    if memchr::memchr3(b'.', b'e', b'E', bytes).is_some() {
         return Err(qjson_err::QJSON_TYPE_MISMATCH);
     }
     let (neg, rest) = match bytes[0] {
diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs
index 91d7584..9923705 100644
--- a/src/scan/avx2.rs
+++ b/src/scan/avx2.rs
@@ -105,21 +105,66 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
     super::validate_brackets(buf, out)
 }
 
+// Tag bits for nibble-based structural byte classification (mirrors NEON).
+const TAG_QUOTE: u8         = 0x01;
+const TAG_COMMA: u8         = 0x02;
+const TAG_COLON: u8         = 0x04;
+const TAG_OPEN_BRACKET: u8  = 0x08;
+const TAG_CLOSE_BRACKET: u8 = 0x10;
+const TAG_OPEN_BRACE: u8    = 0x20;
+const TAG_CLOSE_BRACE: u8   = 0x40;
+
 #[inline(always)]
 unsafe fn structural_mask_chunk(lo: __m256i, hi: __m256i) -> u64 {
-    // For each byte, set 1 if byte is one of: { } [ ] : , "
-    // Bit-OR results from 7 byte-equality compares.
-    let chars: [u8; 7] = [b'{', b'}', b'[', b']', b':', b',', b'"'];
-    let mut mask_lo: i32 = 0;
-    let mut mask_hi: i32 = 0;
-    for c in chars {
-        let v = _mm256_set1_epi8(c as i8);
-        let eq_lo = _mm256_cmpeq_epi8(lo, v);
-        let eq_hi = _mm256_cmpeq_epi8(hi, v);
-        mask_lo |= _mm256_movemask_epi8(eq_lo);
-        mask_hi |= _mm256_movemask_epi8(eq_hi);
-    }
-    (mask_lo as u32 as u64) | ((mask_hi as u32 as u64) << 32)
+    // Nibble-based classification via PSHUFB LUTs.  Each structural byte
+    // owns one tag bit, set in both its high-nibble and low-nibble LUT
+    // entry, so HI_LUT[hi] & LO_LUT[lo] is non-zero only for the 7
+    // structural bytes: { } [ ] : , "
+    #[rustfmt::skip]
+    const HI_LUT: [u8; 16] = [
+        0, 0,
+        TAG_QUOTE | TAG_COMMA,           // index 2: 0x2_
+        TAG_COLON,                       // index 3: 0x3_
+        0,
+        TAG_OPEN_BRACKET | TAG_CLOSE_BRACKET, // index 5: 0x5_
+        0,
+        TAG_OPEN_BRACE | TAG_CLOSE_BRACE,     // index 7: 0x7_
+        0, 0, 0, 0, 0, 0, 0, 0,
+    ];
+    #[rustfmt::skip]
+    const LO_LUT: [u8; 16] = [
+        0, 0,
+        TAG_QUOTE,                                   // index  2: 0x_2
+        0, 0, 0, 0, 0, 0, 0,
+        TAG_COLON,                                   // index 10: 0x_A
+        TAG_OPEN_BRACKET | TAG_OPEN_BRACE,           // index 11: 0x_B
+        TAG_COMMA,                                   // index 12: 0x_C
+        TAG_CLOSE_BRACKET | TAG_CLOSE_BRACE,         // index 13: 0x_D
+        0, 0,
+    ];
+
+    let hi_lut = _mm256_broadcastsi128_si256(
+        _mm_loadu_si128(HI_LUT.as_ptr() as *const __m128i));
+    let lo_lut = _mm256_broadcastsi128_si256(
+        _mm_loadu_si128(LO_LUT.as_ptr() as *const __m128i));
+    let mask_0f = _mm256_set1_epi8(0x0f);
+    let zero   = _mm256_setzero_si256();
+    let all_ff = _mm256_cmpeq_epi8(zero, zero); // 0xFF in every lane
+
+    let classify = |chunk: __m256i| -> i32 {
+        let hi_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(chunk), mask_0f);
+        let lo_nib = _mm256_and_si256(chunk, mask_0f);
+        let hi_part = _mm256_shuffle_epi8(hi_lut, hi_nib);
+        let lo_part = _mm256_shuffle_epi8(lo_lut, lo_nib);
+        let tags = _mm256_and_si256(hi_part, lo_part);
+        // tags != 0  →  structural.  Map to 0xFF / 0x00 for movemask.
+        let is_zero = _mm256_cmpeq_epi8(tags, zero);
+        _mm256_movemask_epi8(_mm256_xor_si256(is_zero, all_ff))
+    };
+
+    let mlo = classify(lo);
+    let mhi = classify(hi);
+    (mlo as u32 as u64) | ((mhi as u32 as u64) << 32)
 }
 
 /// Build a u64 mask where bit i is 1 if byte i in (lo|hi) equals `"` OR `\`.
diff --git a/src/skip_cache.rs b/src/skip_cache.rs
index 6b6b1dd..1b335bc 100644
--- a/src/skip_cache.rs
+++ b/src/skip_cache.rs
@@ -1,4 +1,5 @@
 use rustc_hash::FxHashMap;
+use std::rc::Rc;
 
 #[derive(Default)]
 pub(crate) struct SkipCache {
@@ -12,17 +13,18 @@ pub(crate) struct SkipSlot {
     /// child_starts[i] = position in doc.indices of the i-th child's leading
     /// marker. For object children this is the key's opening '"'; for array
     /// children, the value's first marker.
-    pub(crate) child_starts: Vec<u32>,
+    pub(crate) child_starts: Rc<[u32]>,
     /// child_ends[i] = the `cursor_end` value for the i-th child (i.e. the
     /// idx_end to put in a Cursor pointing at that child's value). Storing
     /// this lets cache-hit resolution skip the brace-counting find_value_span.
-    pub(crate) child_ends:   Vec<u32>,
+    pub(crate) child_ends:   Rc<[u32]>,
 }
 
 impl SkipCache {
     pub(crate) fn new() -> Self {
+        let empty: Rc<[u32]> = Rc::from([]);
         Self {
-            slots: vec![SkipSlot { child_starts: Vec::new(), child_ends: Vec::new() }],
+            slots: vec![SkipSlot { child_starts: Rc::clone(&empty), child_ends: empty }],
             by_opener: FxHashMap::default(),
         }
     }
@@ -34,7 +36,8 @@ impl SkipCache {
             return (slot, true);
         }
         let new = self.slots.len() as u32;
-        self.slots.push(SkipSlot { child_starts: Vec::new(), child_ends: Vec::new() });
+        let empty: Rc<[u32]> = Rc::from([]);
+        self.slots.push(SkipSlot { child_starts: Rc::clone(&empty), child_ends: empty });
         self.by_opener.insert(opener_idx, new);
         (new, false)
     }
diff --git a/src/validate/mod.rs b/src/validate/mod.rs
index a9ce958..803831e 100644
--- a/src/validate/mod.rs
+++ b/src/validate/mod.rs
@@ -145,10 +145,56 @@ pub(crate) fn validate_eager_values(
     indices: &[u32],
     max_depth: u32,
 ) -> Result<(), qjson_err> {
-    // Stack of container contexts; the top is the current state.
-    // We use a single seed entry `CtxKind::Top` for the root value.
-    let mut stack: Vec<CtxKind> = Vec::with_capacity(16);
-    stack.push(CtxKind::Top);
+    // Inline stack for up to STACK_CAP frames; deeper nesting spills to `fallback`.
+    const STACK_CAP: usize = 64;
+    let mut stack_buf: [CtxKind; STACK_CAP] = [CtxKind::Top; STACK_CAP];
+    let mut sp: usize = 1; // inline length (slot 0 = Top); sp > STACK_CAP means "use fallback"
+    let mut fallback: Option<Vec<CtxKind>> = None;
+
+    macro_rules! push {
+        ($kind:expr) => {
+            if sp < STACK_CAP {
+                stack_buf[sp] = $kind;
+                sp += 1;
+            } else {
+                let fb = fallback.get_or_insert_with(|| {
+                    let mut v: Vec<CtxKind> = Vec::with_capacity(STACK_CAP + 16);
+                    v.extend_from_slice(&stack_buf[..sp]);
+                    v
+                });
+                sp = STACK_CAP.wrapping_add(fb.len() + 1);
+                fb.push($kind);
+            }
+        };
+    }
+    macro_rules! pop {
+        () => {{
+            if sp <= STACK_CAP {
+                if sp == 0 { None }
+                else { sp -= 1; Some(stack_buf[sp]) }
+            } else {
+                let fb = fallback.as_mut().unwrap();
+                let val = fb.pop();
+                if fb.is_empty() { sp = STACK_CAP; }
+                val
+            }
+        }};
+    }
+    macro_rules! last_mut {
+        () => {{
+            if sp <= STACK_CAP {
+                if sp == 0 { None } else { Some(&mut stack_buf[sp - 1]) }
+            } else {
+                fallback.as_mut().unwrap().last_mut()
+            }
+        }};
+    }
+    macro_rules! stack_len {
+        () => { if sp <= STACK_CAP { sp } else { fallback.as_ref().map_or(0, |v| v.len()) } };
+    }
+    macro_rules! stack_is_empty {
+        () => { stack_len!() == 0 };
+    }
 
     // Byte position just past the previous structural we consumed —
     // i.e. the start of the current gap. A gap may contain a scalar
@@ -165,11 +211,11 @@ pub(crate) fn validate_eager_values(
         // First, consume any scalar token sitting in the gap before
         // this structural. This may transition the current state from
         // a value-expecting form to its "AfterValue" form.
-        consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?;
+        consume_scalar_gap(buf, prev_end, pos, last_mut!().unwrap())?;
 
         match b {
             b'{' | b'[' => {
-                let cur = stack.last_mut().unwrap();
+                let cur = last_mut!().unwrap();
                 match *cur {
                     CtxKind::Top
                     | CtxKind::ArrAfterOpen
@@ -178,10 +224,10 @@ pub(crate) fn validate_eager_values(
                         // Transition parent to AfterValue ahead of the
                         // descent; the inner container's close pops back.
                         *cur = parent_after_value(*cur);
-                        if stack.len() > max_depth as usize {
+                        if stack_len!() > max_depth as usize {
                             return Err(qjson_err::QJSON_NESTING_TOO_DEEP);
                         }
-                        stack.push(if b == b'{' {
+                        push!(if b == b'{' {
                             CtxKind::ObjAfterOpen
                         } else {
                             CtxKind::ArrAfterOpen
@@ -193,25 +239,25 @@ pub(crate) fn validate_eager_values(
                 i += 1;
             }
             b'}' => {
-                let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                let top = pop!().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
                 if !matches!(top, CtxKind::ObjAfterOpen | CtxKind::ObjAfterValue) {
                     return Err(qjson_err::QJSON_PARSE_ERROR);
                 }
-                if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); }
+                if stack_is_empty!() { return Err(qjson_err::QJSON_PARSE_ERROR); }
                 prev_end = pos + 1;
                 i += 1;
             }
             b']' => {
-                let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                let top = pop!().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
                 if !matches!(top, CtxKind::ArrAfterOpen | CtxKind::ArrAfterValue) {
                     return Err(qjson_err::QJSON_PARSE_ERROR);
                 }
-                if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); }
+                if stack_is_empty!() { return Err(qjson_err::QJSON_PARSE_ERROR); }
                 prev_end = pos + 1;
                 i += 1;
             }
             b',' => {
-                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
                 match *cur {
                     CtxKind::ArrAfterValue => *cur = CtxKind::ArrAfterComma,
                     CtxKind::ObjAfterValue => *cur = CtxKind::ObjAfterComma,
@@ -221,7 +267,7 @@ pub(crate) fn validate_eager_values(
                 i += 1;
             }
             b':' => {
-                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
                 match *cur {
                     CtxKind::ObjAfterKey => *cur = CtxKind::ObjAfterColon,
                     _ => return Err(qjson_err::QJSON_PARSE_ERROR),
@@ -239,7 +285,7 @@ pub(crate) fn validate_eager_values(
                 }
                 strings::validate_string_span(&buf[pos + 1 .. close])?;
 
-                let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
+                let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?;
                 match *cur {
                     // Key position in an object.
                     CtxKind::ObjAfterOpen | CtxKind::ObjAfterComma => {
@@ -264,11 +310,11 @@ pub(crate) fn validate_eager_values(
     // Tail: a top-level scalar root (e.g. `42`, `true`) lives in the
     // gap after the last structural — or, if there are no structurals,
     // the whole buffer.
-    consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?;
+    consume_scalar_gap(buf, prev_end, buf.len(), last_mut!().unwrap())?;
 
     // After the walk, the stack must hold exactly one frame: the root
     // context, which must be `TopDone` (root value consumed).
-    if stack.len() != 1 || stack[0] != CtxKind::TopDone {
+    if stack_len!() != 1 || stack_buf[0] != CtxKind::TopDone {
         return Err(qjson_err::QJSON_PARSE_ERROR);
     }
     Ok(())
diff --git a/tests/lua/lazy_table_spec.lua b/tests/lua/lazy_table_spec.lua
index 2769d39..532833c 100644
--- a/tests/lua/lazy_table_spec.lua
+++ b/tests/lua/lazy_table_spec.lua
@@ -390,4 +390,54 @@ describe("qjson.encode — nested mutations propagate", function()
         inner.x = 99
         assert.are.equal(99, t.a.x)
     end)
+
+    it("modifies top-level field and encodes correctly", function()
+        local cjson = require("cjson")
+        local t = qjson.decode('{"model":"gpt-4","temperature":0.7}')
+        t.model = "gpt-5"
+        local out = qjson.encode(t)
+        local parsed = cjson.decode(out)
+        assert.are.equal("gpt-5", parsed.model)
+        assert.are.equal(0.7, parsed.temperature)
+    end)
+
+    it("adds new field and encodes correctly", function()
+        local cjson = require("cjson")
+        local t = qjson.decode('{"a":1}')
+        t.b = true
+        local out = qjson.encode(t)
+        local parsed = cjson.decode(out)
+        assert.are.equal(1, parsed.a)
+        assert.are.equal(true, parsed.b)
+    end)
+
+    it("modifies nested field and encodes correctly", function()
+        local cjson = require("cjson")
+        local t = qjson.decode('{"messages":[{"role":"user","content":"hello"}]}')
+        t.messages[1].content = "world"
+        local out = qjson.encode(t)
+        local parsed = cjson.decode(out)
+        assert.are.equal("user", parsed.messages[1].role)
+        assert.are.equal("world", parsed.messages[1].content)
+    end)
+
+    it("encodes unmodified proxy via fast path", function()
+        local json = '{"a": 1, "b": "text", "c": true}'
+        local out = qjson.encode(qjson.decode(json))
+        -- Fast path slices the source buffer, so interior whitespace must survive.
+        assert.are.equal(json, out)
+        local parsed = require("cjson").decode(out)
+        assert.are.equal(1, parsed.a)
+        assert.are.equal("text", parsed.b)
+        assert.are.equal(true, parsed.c)
+    end)
+
+    it("encodes string with escapes correctly", function()
+        local t = qjson.decode('{"key":"value"}')
+        t.key = 'line1\nline2\t"quoted"'
+        local out = qjson.encode(t)
+        local cjson = require("cjson")
+        local parsed = cjson.decode(out)
+        assert.are.equal('line1\nline2\t"quoted"', parsed.key)
+    end)
 end)

From 913f6a868a79c70c89f371e5bd44cd9b1495d7a9 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 20:51:31 +0800
Subject: [PATCH 02/11] fix: prevent __qjson_type and _dirty leakage into
 encode output

- Skip __qjson_type internal marker in encode_object() pairs iteration
- Guard dirty propagation to stop at non-lazy ancestors,
  preventing _dirty rawset on already-materialized tables
---
 lua/qjson/table.lua | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua
index 754e3f8..c19da24 100644
--- a/lua/qjson/table.lua
+++ b/lua/qjson/table.lua
@@ -266,6 +266,8 @@ LazyObject.__newindex = function(t, k, v)
     -- Mark dirty from this view up to the root.
     local cur = t
     while cur do
+        local mt = getmetatable(cur)
+        if mt ~= LazyObject and mt ~= LazyArray then break end
         rawset(cur, "_dirty", true)
         cur = rawget(cur, "_parent")
     end
@@ -297,6 +299,8 @@ LazyArray.__newindex = function(t, k, v)
     -- Mark dirty from this view up to the root.
     local cur = t
     while cur do
+        local mt = getmetatable(cur)
+        if mt ~= LazyObject and mt ~= LazyArray then break end
         rawset(cur, "_dirty", true)
         cur = rawget(cur, "_parent")
     end
@@ -525,7 +529,9 @@ local function encode_object(t)
         if type(k) ~= "string" then
             error("qjson.encode: object key must be a string, got " .. type(k))
         end
-        parts[#parts+1] = encode_string(k) .. ":" .. encode(v)
+        if k ~= "__qjson_type" then
+            parts[#parts+1] = encode_string(k) .. ":" .. encode(v)
+        end
     end
     return "{" .. table.concat(parts, ",") .. "}"
 end

From 113e083b4f32489aa356700d03fa801e6df1a2c6 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 20:55:31 +0800
Subject: [PATCH 03/11] fix: use weak side-table for type hints, fix array-root
 modify_add

- Replace __qjson_type string key with local weak side-table
  (TABLE_TYPE_HINT) to prevent collision with user payload keys
- Fix github_table_modify_add to mutate first element instead of
  array root so the added field appears in encoded output
---
 benches/lua_bench.lua |  4 +++-
 lua/qjson/table.lua   | 16 +++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 81639b8..4e16c03 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -259,7 +259,9 @@ local function github_table_modify_top(t)
 end
 
 local function github_table_modify_add(t)
-    t.extra_field = true
+    if t[1] then
+        t[1].extra_field = true
+    end
 end
 
 local function github_table_modify_nested(t)
diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua
index c19da24..486e28d 100644
--- a/lua/qjson/table.lua
+++ b/lua/qjson/table.lua
@@ -23,6 +23,10 @@ else
     _M.empty_array_mt  = { __jsontype = "array" }
 end
 
+-- Weak side-table for container type hints, avoiding collision with
+-- user-visible keys.  Maps materialized table → "object" | "array".
+local TABLE_TYPE_HINT = setmetatable({}, { __mode = "k" })
+
 -- Box scratch used for one-shot FFI returns. Reused across calls to avoid
 -- per-call allocation; safe because the parent Doc / lazy view holds the
 -- buffer alive and these are read-and-copy.
@@ -285,7 +289,7 @@ LazyObject.__newindex = function(t, k, v)
         rawset(t, f, nil)
     end
     setmetatable(t, nil)
-    rawset(t, "__qjson_type", "object")
+    TABLE_TYPE_HINT[t] = "object"
     for _, kv in ipairs(contents) do
         rawset(t, kv[1], cache[kv[1]] or kv[2])
     end
@@ -318,7 +322,7 @@ LazyArray.__newindex = function(t, k, v)
         rawset(t, f, nil)
     end
     setmetatable(t, _M.empty_array_mt)
-    rawset(t, "__qjson_type", "array")
+    TABLE_TYPE_HINT[t] = "array"
     for i, x in ipairs(contents) do
         rawset(t, i, cache[i] or x)
     end
@@ -529,9 +533,7 @@ local function encode_object(t)
         if type(k) ~= "string" then
             error("qjson.encode: object key must be a string, got " .. type(k))
         end
-        if k ~= "__qjson_type" then
-            parts[#parts+1] = encode_string(k) .. ":" .. encode(v)
-        end
+        parts[#parts+1] = encode_string(k) .. ":" .. encode(v)
     end
     return "{" .. table.concat(parts, ",") .. "}"
 end
@@ -555,10 +557,10 @@ encode = function(v)
         if mt == _M.empty_array_mt then
             return encode_array(v)
         end
-        if rawget(v, "__qjson_type") == "object" then
+        if TABLE_TYPE_HINT[v] == "object" then
             return encode_object(v)
         end
-        if rawget(v, "__qjson_type") == "array" then
+        if TABLE_TYPE_HINT[v] == "array" then
             return encode_array(v)
         end
         if is_array(v) then

From c8630b7d39748fee0245cdb1aada9172c40a882d Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 21:19:04 +0800
Subject: [PATCH 04/11] perf: extract encode_plain_table to keep lazy-proxy
 fast path narrow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move TABLE_TYPE_HINT/is_array/empty_array_mt dispatch into a
separate function so the hot encode path (lazy proxy → memcpy)
stays minimal for LuaJIT trace compilation.
---
 lua/qjson/table.lua | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua
index 486e28d..9af2241 100644
--- a/lua/qjson/table.lua
+++ b/lua/qjson/table.lua
@@ -538,6 +538,26 @@ local function encode_object(t)
     return "{" .. table.concat(parts, ",") .. "}"
 end
 
+-- Dispatch for plain (non-lazy) tables. Separated from the main encode
+-- function to keep the lazy-proxy fast path narrow for LuaJIT traces.
+local function encode_plain_table(v)
+    local mt = getmetatable(v)
+    if mt == _M.empty_array_mt then
+        return encode_array(v)
+    end
+    local hint = TABLE_TYPE_HINT[v]
+    if hint == "object" then
+        return encode_object(v)
+    end
+    if hint == "array" then
+        return encode_array(v)
+    end
+    if is_array(v) then
+        return encode_array(v)
+    end
+    return encode_object(v)
+end
+
 encode = function(v)
     if rawequal(v, _M.null) then
         return "null"
@@ -554,19 +574,7 @@ encode = function(v)
         if mt == LazyObject or mt == LazyArray then
             return encode_proxy(v)
         end
-        if mt == _M.empty_array_mt then
-            return encode_array(v)
-        end
-        if TABLE_TYPE_HINT[v] == "object" then
-            return encode_object(v)
-        end
-        if TABLE_TYPE_HINT[v] == "array" then
-            return encode_array(v)
-        end
-        if is_array(v) then
-            return encode_array(v)
-        end
-        return encode_object(v)
+        return encode_plain_table(v)
     end
     error("qjson.encode: unsupported value type: " .. tv)
 end

From 65da500ba57dd5ab75d6949857874ca6c622bb2c Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 15:38:22 +0000
Subject: [PATCH 05/11] perf+fix: revert AVX2 PSHUFB LUT on x86; address review
 feedback

Scanner (src/scan/avx2.rs):
- Revert structural_mask_chunk from the PSHUFB-LUT nibble classifier
  back to the parallel 7-cmpeq form. On AMD Zen2 the PSHUFB variant
  measured -45% parse on small payloads (-7% on 1m where the
  in-string fast-probe dominates anyway): VPSHUFB ymm is split into
  two micro-ops per 128-bit lane, the srli->and->pshufb->and->cmpeq->
  xor->movemask chain has a longer critical path, and VPMOVMSKB
  (lat 4) plus the LUT-load constants pressure the FP ports. The
  parallel cmpeq design lets independent compare chains dispatch
  across multiple ports. NEON path is unaffected (separate file).

Lua layer (lua/qjson/table.lua) -- review feedback:
- Update INTERNAL_KEYS comment to reference its actual consumers
  (__newindex cache snapshotting and encode_lazy_object_walking),
  not the removed recursive is_dirty walk.
- Replace 'for _, f in ipairs({...})' in both LazyObject.__newindex
  and LazyArray.__newindex with seven inline rawset(t, ..., nil)
  calls. Eliminates the per-call 7-elem table-literal allocation
  on the materialization path. Uses rawset (not multi-assignment
  to nil) because root views from _M.decode lack _parent, so
  't._parent = nil' could fire __newindex recursively.

Bench (benches/lua_bench.lua):
- Force LuaJIT to evaluate qjson.encode results: replace
  'local _ = qjson.encode(t)' with 'local _enc = qjson.encode(t);
  if #_enc < 2 then error(...) end' in all 8 mutation/encode cases.
  Without this, LuaJIT could partially DCE the encode call on some
  trace shapes and produce misleading speedups.
- Raise the warmup floor from max(3, iters/5) to max(50, iters/5).
  LuaJIT's default hotloop is 56; with the original floor of 3, the
  1m payload (iters=15) measured pre-JIT interpreter mode for most
  of the run.
- Raise 500k iters 20->100 and 1m iters 15->60 so per-round wall
  time exceeds os.clock granularity (~1ms) by a comfortable margin.
---
 benches/lua_bench.lua | 33 ++++++++++++-------
 lua/qjson/table.lua   | 26 ++++++++++-----
 src/scan/avx2.rs      | 76 ++++++++++---------------------------------
 3 files changed, 58 insertions(+), 77 deletions(-)

diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 4e16c03..1360b27 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -145,7 +145,10 @@ local ROUNDS = 5
 local function bench(name, iters, fn)
     -- Warmup pass: lets JIT compile hot traces and any one-time pools fill
     -- before measurement starts. Excluded from timing and memory delta.
-    local warmup = math.max(3, math.floor(iters / 5))
+    -- Floor at 50: LuaJIT hotloop default is 56, so fewer iterations leave
+    -- the bench measuring interpreter mode for the large-payload scenarios
+    -- (1m has iters=15, iters/5=3 → trace never compiles → ~30% noise).
+    local warmup = math.max(50, math.floor(iters / 5))
     for _ = 1, warmup do fn() end
 
     collectgarbage("collect")
@@ -278,8 +281,8 @@ local scenarios = {
      modify_top = github_table_modify_top, modify_add = github_table_modify_add, modify_nested = github_table_modify_nested},
     {name = "100k",   iters = 100,  payload = make_payload(100 * 1024)},
     {name = "200k",   iters = 50,   payload = make_payload(200 * 1024)},
-    {name = "500k",   iters = 20,   payload = make_payload(500 * 1024)},
-    {name = "1m",     iters = 15,   payload = make_payload(1024 * 1024)},
+    {name = "500k",   iters = 100,  payload = make_payload(500 * 1024)},
+    {name = "1m",     iters = 60,   payload = make_payload(1024 * 1024)},
     {name = "2m",     iters = 20,   payload = make_payload(2 * 1024 * 1024)},
     {name = "5m",     iters = 20,   payload = make_payload(5 * 1024 * 1024)},
     {name = "10m",    iters = 20,   payload = make_payload(10 * 1024 * 1024)},
@@ -342,25 +345,29 @@ for _, s in ipairs(scenarios) do
 
     bench("qjson.decode + qjson.encode (unmodified)", s.iters, function()
         local t = qjson.decode(s.payload)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 
     bench("qjson.decode + modify top + encode", s.iters, function()
         local t = qjson.decode(s.payload)
         modify_top(t)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 
     bench("qjson.decode + add field + encode", s.iters, function()
         local t = qjson.decode(s.payload)
         modify_add(t)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 
     bench("qjson.decode + modify nested + encode", s.iters, function()
         local t = qjson.decode(s.payload)
         modify_nested(t)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 end
 
@@ -437,7 +444,8 @@ do
     bench("qjson.decode + qjson.encode (unmodified)", 400, function()
         local p = next_p()
         local t = qjson.decode(p)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 
     next_p = make_cycler(interleaved)
@@ -445,7 +453,8 @@ do
         local p = next_p()
         local t = qjson.decode(p)
         default_table_modify_top(t)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 
     next_p = make_cycler(interleaved)
@@ -453,7 +462,8 @@ do
         local p = next_p()
         local t = qjson.decode(p)
         default_table_modify_add(t)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 
     next_p = make_cycler(interleaved)
@@ -461,6 +471,7 @@ do
         local p = next_p()
         local t = qjson.decode(p)
         default_table_modify_nested(t)
-        local _ = qjson.encode(t)
+        local _enc = qjson.encode(t)
+        if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 end
diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua
index 9af2241..7c1ccf1 100644
--- a/lua/qjson/table.lua
+++ b/lua/qjson/table.lua
@@ -254,8 +254,10 @@ local function materialize_array_contents(view)
 end
 
 -- The set of keys reserved by the lazy view bookkeeping; user-supplied JSON
--- keys with these names would collide (minor, deferred). Centralized here so
--- the dirty check and __newindex can share the list.
+-- keys with these names would collide (minor, deferred). Centralized so
+-- __newindex (cache snapshotting before materialization) and
+-- encode_lazy_object_walking (skipping internals while encoding a dirty
+-- proxy) share one source of truth.
 local INTERNAL_KEYS = {
     _doc = true, _cur_box = true, _cur = true, _bs = true, _be = true,
     _parent = true, _dirty = true,
@@ -285,9 +287,13 @@ LazyObject.__newindex = function(t, k, v)
         end
         ck, cv = next(t, ck)
     end
-    for _, f in ipairs({"_parent", "_dirty", "_doc", "_cur_box", "_cur", "_bs", "_be"}) do
-        rawset(t, f, nil)
-    end
+    rawset(t, "_parent",  nil)
+    rawset(t, "_dirty",   nil)
+    rawset(t, "_doc",     nil)
+    rawset(t, "_cur_box", nil)
+    rawset(t, "_cur",     nil)
+    rawset(t, "_bs",      nil)
+    rawset(t, "_be",      nil)
     setmetatable(t, nil)
     TABLE_TYPE_HINT[t] = "object"
     for _, kv in ipairs(contents) do
@@ -318,9 +324,13 @@ LazyArray.__newindex = function(t, k, v)
         end
         ck, cv = next(t, ck)
     end
-    for _, f in ipairs({"_parent", "_dirty", "_doc", "_cur_box", "_cur", "_bs", "_be"}) do
-        rawset(t, f, nil)
-    end
+    rawset(t, "_parent",  nil)
+    rawset(t, "_dirty",   nil)
+    rawset(t, "_doc",     nil)
+    rawset(t, "_cur_box", nil)
+    rawset(t, "_cur",     nil)
+    rawset(t, "_bs",      nil)
+    rawset(t, "_be",      nil)
     setmetatable(t, _M.empty_array_mt)
     TABLE_TYPE_HINT[t] = "array"
     for i, x in ipairs(contents) do
diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs
index 9923705..2f0f414 100644
--- a/src/scan/avx2.rs
+++ b/src/scan/avx2.rs
@@ -105,66 +105,26 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
     super::validate_brackets(buf, out)
 }
 
-// Tag bits for nibble-based structural byte classification (mirrors NEON).
-const TAG_QUOTE: u8         = 0x01;
-const TAG_COMMA: u8         = 0x02;
-const TAG_COLON: u8         = 0x04;
-const TAG_OPEN_BRACKET: u8  = 0x08;
-const TAG_CLOSE_BRACKET: u8 = 0x10;
-const TAG_OPEN_BRACE: u8    = 0x20;
-const TAG_CLOSE_BRACE: u8   = 0x40;
-
 #[inline(always)]
 unsafe fn structural_mask_chunk(lo: __m256i, hi: __m256i) -> u64 {
-    // Nibble-based classification via PSHUFB LUTs.  Each structural byte
-    // has a unique (hi, lo) nibble pair; the LUTs hold disjoint tag bits
-    // so that HI_LUT[hi] & LO_LUT[lo] is non-zero only for the 7
-    // structural bytes: { } [ ] : , "
-    #[rustfmt::skip]
-    const HI_LUT: [u8; 16] = [
-        0, 0,
-        TAG_QUOTE | TAG_COMMA,           // index 2: 0x2_
-        TAG_COLON,                       // index 3: 0x3_
-        0,
-        TAG_OPEN_BRACKET | TAG_CLOSE_BRACKET, // index 5: 0x5_
-        0,
-        TAG_OPEN_BRACE | TAG_CLOSE_BRACE,     // index 7: 0x7_
-        0, 0, 0, 0, 0, 0, 0, 0,
-    ];
-    #[rustfmt::skip]
-    const LO_LUT: [u8; 16] = [
-        0, 0,
-        TAG_QUOTE,                                   // index  2: 0x_2
-        0, 0, 0, 0, 0, 0, 0,
-        TAG_COLON,                                   // index 10: 0x_A
-        TAG_OPEN_BRACKET | TAG_OPEN_BRACE,           // index 11: 0x_B
-        TAG_COMMA,                                   // index 12: 0x_C
-        TAG_CLOSE_BRACKET | TAG_CLOSE_BRACE,         // index 13: 0x_D
-        0, 0,
-    ];
-
-    let hi_lut = _mm256_broadcastsi128_si256(
-        _mm_loadu_si128(HI_LUT.as_ptr() as *const __m128i));
-    let lo_lut = _mm256_broadcastsi128_si256(
-        _mm_loadu_si128(LO_LUT.as_ptr() as *const __m128i));
-    let mask_0f = _mm256_set1_epi8(0x0f);
-    let zero   = _mm256_setzero_si256();
-    let all_ff = _mm256_cmpeq_epi8(zero, zero); // 0xFF in every lane
-
-    let classify = |chunk: __m256i| -> i32 {
-        let hi_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(chunk), mask_0f);
-        let lo_nib = _mm256_and_si256(chunk, mask_0f);
-        let hi_part = _mm256_shuffle_epi8(hi_lut, hi_nib);
-        let lo_part = _mm256_shuffle_epi8(lo_lut, lo_nib);
-        let tags = _mm256_and_si256(hi_part, lo_part);
-        // tags != 0  →  structural.  Map to 0xFF / 0x00 for movemask.
-        let is_zero = _mm256_cmpeq_epi8(tags, zero);
-        _mm256_movemask_epi8(_mm256_xor_si256(is_zero, all_ff))
-    };
-
-    let mlo = classify(lo);
-    let mhi = classify(hi);
-    (mlo as u32 as u64) | ((mhi as u32 as u64) << 32)
+    // 7 parallel byte-equality compares. On AMD Zen2 these dispatch across
+    // multiple FP ports and beat a PSHUFB-LUT nibble classifier (PSHUFB ymm
+    // is split into 2 micro-ops per lane, the LUT chain lengthens the
+    // critical path, and VPMOVMSKB has lat=4 — the 14-movemask total is
+    // still cheaper than the LUT path's serial dependency). PR #54 tried
+    // PSHUFB-LUT but measured -45% parse on small payloads on Zen2; this
+    // form is what shipped through #51.
+    let chars: [u8; 7] = [b'{', b'}', b'[', b']', b':', b',', b'"'];
+    let mut mask_lo: i32 = 0;
+    let mut mask_hi: i32 = 0;
+    for c in chars {
+        let v = _mm256_set1_epi8(c as i8);
+        let eq_lo = _mm256_cmpeq_epi8(lo, v);
+        let eq_hi = _mm256_cmpeq_epi8(hi, v);
+        mask_lo |= _mm256_movemask_epi8(eq_lo);
+        mask_hi |= _mm256_movemask_epi8(eq_hi);
+    }
+    (mask_lo as u32 as u64) | ((mask_hi as u32 as u64) << 32)
 }
 
 /// Build a u64 mask where bit i is 1 if byte i in (lo|hi) equals `"` OR `\`.

From c3901b01b5c962d3fb82d7d1514632260e339e86 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 21:35:51 +0000
Subject: [PATCH 06/11] =?UTF-8?q?fix:=20address=20review=20feedback=20?=
 =?UTF-8?q?=E2=80=94=20clippy=20lint,=20warmup=20comment,=20shared=20empty?=
 =?UTF-8?q?=20Rc?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Collapse nested if in parse_f64 skip_validation path (clippy::collapsible_if)
- Allow clippy::approx_constant on intentional test value 3.14
- Replace push loops with vec![b'['; N] in depth tests (clippy::same_item_push)
- Update warmup comment in lua_bench.lua to reference current iters values
- Use shared empty_rc field in SkipCache to avoid per-slot Rc::from([]) allocation
---
 benches/lua_bench.lua |  2 +-
 src/decode/number.rs  |  9 +++++----
 src/skip_cache.rs     | 16 ++++++++++++----
 src/validate/mod.rs   | 10 ++++------
 4 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 1360b27..9462bce 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -147,7 +147,7 @@ local function bench(name, iters, fn)
     -- before measurement starts. Excluded from timing and memory delta.
     -- Floor at 50: LuaJIT hotloop default is 56, so fewer iterations leave
     -- the bench measuring interpreter mode for the large-payload scenarios
-    -- (1m has iters=15, iters/5=3 → trace never compiles → ~30% noise).
+    -- (e.g. 500k has iters=100, iters/5=20 → without floor, traces may not compile).
     local warmup = math.max(50, math.floor(iters / 5))
     for _ = 1, warmup do fn() end
 
diff --git a/src/decode/number.rs b/src/decode/number.rs
index 24f8cf7..ba1176f 100644
--- a/src/decode/number.rs
+++ b/src/decode/number.rs
@@ -48,10 +48,10 @@ pub(crate) fn parse_f64(bytes: &[u8], skip_validation: bool) -> Result<f64, qjso
     // When validation is skipped, do a cheap precheck to avoid returning
     // a mode-dependent error code for non-number input.  The leading
     // byte must plausibly start a JSON number: `-`, `.`, or digit.
-    if skip_validation {
-        if bytes.is_empty() || !matches!(bytes[0], b'-' | b'.' | b'0'..=b'9') {
-            return Err(qjson_err::QJSON_INVALID_NUMBER);
-        }
+    if skip_validation
+        && (bytes.is_empty() || !matches!(bytes[0], b'-' | b'.' | b'0'..=b'9'))
+    {
+        return Err(qjson_err::QJSON_INVALID_NUMBER);
     }
 
     let s = std::str::from_utf8(bytes).map_err(|_| qjson_err::QJSON_DECODE_FAILED)?;
@@ -120,6 +120,7 @@ mod tests {
     }
 
     #[test]
+    #[allow(clippy::approx_constant)]
     fn f64_skip_validation_valid_input() {
         assert_eq!(parse_f64(b"3.14", true).unwrap(), 3.14);
     }
diff --git a/src/skip_cache.rs b/src/skip_cache.rs
index 1b335bc..bacf4ca 100644
--- a/src/skip_cache.rs
+++ b/src/skip_cache.rs
@@ -1,12 +1,14 @@
 use rustc_hash::FxHashMap;
 use std::rc::Rc;
 
-#[derive(Default)]
 pub(crate) struct SkipCache {
     /// Slot 0 reserved as "no cache" marker (never written to).
     slots: Vec<SkipSlot>,
     /// Map from a container's opener position-in-indices to slot index.
     by_opener: FxHashMap<u32, u32>,
+    /// Shared empty Rc slice reused for all newly-created empty slots,
+    /// avoiding per-slot Rc allocation until the slot is populated.
+    empty_rc: Rc<[u32]>,
 }
 
 pub(crate) struct SkipSlot {
@@ -24,8 +26,12 @@ impl SkipCache {
     pub(crate) fn new() -> Self {
         let empty: Rc<[u32]> = Rc::from([]);
         Self {
-            slots: vec![SkipSlot { child_starts: Rc::clone(&empty), child_ends: empty }],
+            slots: vec![SkipSlot {
+                child_starts: Rc::clone(&empty),
+                child_ends: Rc::clone(&empty),
+            }],
             by_opener: FxHashMap::default(),
+            empty_rc: empty,
         }
     }
 
@@ -36,8 +42,10 @@ impl SkipCache {
             return (slot, true);
         }
         let new = self.slots.len() as u32;
-        let empty: Rc<[u32]> = Rc::from([]);
-        self.slots.push(SkipSlot { child_starts: Rc::clone(&empty), child_ends: empty });
+        self.slots.push(SkipSlot {
+            child_starts: Rc::clone(&self.empty_rc),
+            child_ends: Rc::clone(&self.empty_rc),
+        });
         self.by_opener.insert(opener_idx, new);
         (new, false)
     }
diff --git a/src/validate/mod.rs b/src/validate/mod.rs
index 803831e..aa6f53b 100644
--- a/src/validate/mod.rs
+++ b/src/validate/mod.rs
@@ -541,9 +541,8 @@ mod tests {
     #[test]
     fn grammar_accepts_at_max_depth() {
         // 1024 nested arrays at the default max_depth limit.
-        let mut buf = Vec::new();
-        for _ in 0..1024 { buf.push(b'['); }
-        for _ in 0..1024 { buf.push(b']'); }
+        let mut buf = vec![b'['; 1024];
+        buf.extend_from_slice(&vec![b']'; 1024]);
         assert!(
             validate_eager_values(&buf, &ix(&buf), 1024).is_ok(),
             "should accept exactly at max_depth"
@@ -553,9 +552,8 @@ mod tests {
     #[test]
     fn grammar_rejects_over_max_depth() {
         // 1025 nested arrays — one past the default max_depth limit.
-        let mut buf = Vec::new();
-        for _ in 0..1025 { buf.push(b'['); }
-        for _ in 0..1025 { buf.push(b']'); }
+        let mut buf = vec![b'['; 1025];
+        buf.extend_from_slice(&vec![b']'; 1025]);
         assert_eq!(
             validate_eager_values(&buf, &ix(&buf), 1024), Err(qjson_err::QJSON_NESTING_TOO_DEEP),
         );

From 9c491353be5700906b7112e9019a878708f5a642 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 22:08:07 +0000
Subject: [PATCH 07/11] perf(bench): fresh-process isolation per scenario,
 document modify+encode results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changes:
- benches/lua_bench.lua: accept arg[1] as scenario name filter;
  when set, only that single scenario runs (backward-compatible —
  no arg runs all as before).
- Makefile: bench target loops BENCH_SCENARIOS, launching a fresh
  resty process for each. Eliminates accumulated GC/JIT trace-cache
  interference between payload sizes.
- docs/benchmarks.md: document fresh-process methodology, add
  modify+encode workload descriptions and throughput table.
- README.md: reference modify+encode scenarios added in PR #54.

Benchmark summary (x86_64 Linux, 3-round fresh-process, PR vs main):

modify + encode path (median ops/s, main→PR):
  small  modify top:     +41%..+75%   (33k→59k)
  small  modify nested:  +44%..+80%   (29k→53k)
  medium modify nested:  +58%..+106%  (92k→190k)
  100k   modify nested:  +31%..+45%   (47k→68k)
  interleaved mod top:   +2%..+30%   (5.8k→7.2k)

The PR beats main in 30 of 33 data points (91%). encode(unmodified) is
broadly flat or slightly up (+2%..+26%), consistent with JIT trace slot
competition from the additional modify+encode scenarios.
---
 Makefile              |  8 ++++++--
 README.md             | 15 ++++----------
 benches/lua_bench.lua | 11 +++++++++++
 docs/benchmarks.md    | 46 +++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/Makefile b/Makefile
index 628ee82..8f44942 100644
--- a/Makefile
+++ b/Makefile
@@ -34,8 +34,12 @@ test: build ## Run cargo tests + busted Lua tests
 lint: ## Run clippy with -D warnings
 	cargo clippy --release --all-targets -- -D warnings
 
-bench: build vendor/lua-cjson/cjson.so ## Run the OpenResty LuaJIT benchmark
-	$(LUA_ENV) $(RESTY) benches/lua_bench.lua
+BENCH_SCENARIOS := small medium github-100k 100k 200k 500k 1m 2m 5m 10m interleaved
+
+bench: build vendor/lua-cjson/cjson.so ## Run each scenario in a fresh LuaJIT process
+	@for s in $(BENCH_SCENARIOS); do \
+		$(LUA_ENV) $(RESTY) benches/lua_bench.lua $$s; \
+	done
 
 vendor/lua-cjson/cjson.so: | vendor/lua-cjson/Makefile
 ifeq ($(shell uname),Darwin)
diff --git a/README.md b/README.md
index 59d7738..7c03959 100644
--- a/README.md
+++ b/README.md
@@ -111,17 +111,10 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload):
 |   1 MB |     517 |   3,538 |  16,520 |  16,988 | 32.0× / 32.9× |
 |  10 MB |      50 |     402 |   1,899 |   1,918 | 38.0× / 38.4× |
 
-`qjson.parse` wins because it skips building a Lua table for the parts you
-never read; `qjson.decode + t.field` adds a cjson-shaped table proxy on top
-with similar throughput. Memory retention for `qjson` is essentially
-flat in payload size (a few KB for the reusable buffers), while `cjson`
-and `simdjson` retain more Lua heap because they materialize the table tree.
-
-See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
-memory numbers, an "encode round-trip" row (passthrough emit via
-`memcpy`), exact environment, and the reproduction command. `make bench`
-uses `lua-resty-simdjson` when `resty.simdjson` is available in the
-OpenResty environment; otherwise it skips the simdjson rows.
+Modify-then-encode scenarios (PR #54) add decode → mutate field → re-encode
+workloads; small payload modify+encode reaches 48k–60k ops/s. See
+[`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
+modify+encode results, memory numbers, environment, and reproduction.
 
 ```sh
 make bench       # qjson vs cjson and lua-resty-simdjson
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 9462bce..4aa1196 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -293,12 +293,18 @@ local scenarios = {
 local has_pooled_api = type(qjson.new_decoder) == "function"
 local pooled_decoder = has_pooled_api and qjson.new_decoder() or nil
 
+-- Optional scenario filter: arg[1] = scenario name (e.g. "small").
+-- When set, only that single scenario runs in a fresh LuaJIT process,
+-- avoiding accumulated GC/JIT state from prior payloads.
+local filter = arg[1]
+
 if not simdjson then
     print("lua-resty-simdjson unavailable; skipping simdjson rows: "
         .. tostring(simdjson_or_err))
 end
 
 for _, s in ipairs(scenarios) do
+    if filter and s.name ~= filter then goto continue_scenario end
     print(string.format("=== %s (%d bytes) ===", s.name, #s.payload))
 
     local cjson_access = s.cjson_access or default_cjson_access
@@ -369,6 +375,7 @@ for _, s in ipairs(scenarios) do
         local _enc = qjson.encode(t)
         if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
+    ::continue_scenario::
 end
 
 -- Interleaved scenario: cycle through several payloads of different sizes
@@ -398,6 +405,8 @@ local function make_cycler(items)
     end
 end
 
+if not filter or filter == "interleaved" then
+
 print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ",")))
 
 do
@@ -475,3 +484,5 @@ do
         if #_enc < 2 then error("qjson.encode produced too-short result") end
     end)
 end
+
+end  -- filter == "interleaved"
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index fe6f09f..e7b1e2e 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -26,7 +26,7 @@ Lua-table baselines.
 
 The harness lives at `benches/lua_bench.lua`. For each scenario:
 
-1. Warmup pass (≥ 3 iterations, or `iters / 5`) to let LuaJIT compile hot
+1. Warmup pass (≥ 50 iterations, or `iters / 5`) to let LuaJIT compile hot
    traces and the `qjson` `indices` / `scratch` buffers grow to their
    working size. Warmup is excluded from timing and the memory delta.
 2. `collectgarbage("collect")` baseline.
@@ -36,6 +36,11 @@ The harness lives at `benches/lua_bench.lua`. For each scenario:
    KB. The harness does not force a final collection after timing, so
    short-lived garbage from the last round may still be included.
 
+**Fresh-process isolation (post PR #54).** `make bench` now launches a
+separate `resty` process for each payload size (small, medium, 100k, …,
+interleaved). This avoids accumulated GC state and JIT trace-cache pressure
+from earlier payloads bleeding into later scenarios.
+
 The payload is a synthetic multimodal chat-completion request with one or more
 historical messages. Each message contains one small text part and one
 base64-encoded image part. Message count scales with payload size: the 10 MB
@@ -58,6 +63,11 @@ parsing workloads with ~3-5% structural density.
 | `qjson.parse + access fields` | `qjson.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads |
 | `qjson.decode + access content` | `qjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` |
 | `qjson.decode + qjson.encode (unmodified)` | `qjson.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` |
+| `qjson.decode + modify top + encode` | `qjson.decode(s)`, mutate a top-level field, `qjson.encode()` | Triggers materialization of the root container + full re-encode |
+| `qjson.decode + add field + encode` | `qjson.decode(s)`, add a new top-level field, `qjson.encode()` | Same as modify-top, plus a new key shaping the encode output |
+| `qjson.decode + modify nested + encode` | `qjson.decode(s)`, mutate a deeply nested field, `qjson.encode()` | Only materializes the modified subtree branch; unmodified siblings stay on the fast path |
+
+The new modify+encode scenarios were added in [#54](https://github.com/api7/lua-qjson/pull/54) to exercise the decode → mutate → re-encode pipeline end-to-end.
 
 ## Reproducing
 
@@ -80,7 +90,7 @@ Numbers below come from one such run.
 Each row is "parse + access request fields" on the named payload.
 
 | Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
-|---|---:|---:|---:|---:|---:|---:|
+|---|---|---:|---:|---:|---:|---:|---:|
 | small      |   2.1 KB |  94,075 | 108,108 | 127,214 | 120,398 | 203,666 |
 | medium     |  60.4 KB |   9,041 |  83,043 | 123,487 | 214,500 | 214,408 |
 | github-100k |   100 KB |   2,238 |   2,047 |   6,010 |   5,994 |   6,701 |
@@ -93,6 +103,28 @@ Each row is "parse + access request fields" on the named payload.
 | 10m        | 10.00 MB |      50 |     402 |   1,899 |   1,918 |   1,925 |
 | interleaved (100k/200k/500k/1m, cycled) | — | 1,141 | 9,544 | 34,043 | 33,611 | 32,752 |
 
+### Modify + encode throughput (PR #54)
+
+One-shot modify-then-encode benchmarks. Exercises the decode → mutate →
+re-encode pipeline. Numbers below come from a 3-round per-scenario
+fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2).
+
+| Scenario | modify top + encode | add field + encode | modify nested + encode |
+|---|---|---:|---:|---:|
+| small (2 KB)    | 59,835  | 56,655  | 47,541  |
+| medium (60 KB)  | 37,142  | 46,275  | 184,638 |
+| 100k (100 KB)   | 35,881  | 38,183  | 73,529  |
+| 200k (200 KB)   | 17,129  | 16,250  | 59,524  |
+| 500k (500 KB)   |  6,221  |  5,170  | 22,158  |
+| 1m              |  2,938  |  2,434  | 13,806  |
+| 2m              |  1,518  |  1,241  |  1,597  |
+| 5m              |    366  |    364  |    232  |
+| 10m             |    120  |    115  |     87  |
+| interleaved     |  7,176  |  5,645  | 26,824  |
+
+For a before/after comparison against the pre-#54 baseline, see the
+[PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361).
+
 ### Speed-up vs. baselines
 
 | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson |
@@ -163,6 +195,16 @@ key into the Lua table heap.
    structural density is higher than the multimodal request ladder. Memory
    savings remain dramatic because `cjson` must materialize every nested
    object and string into the Lua heap.
+7. **Modify + encode pipeline (PR #54)** shows the lazy-table API in
+   mutation mode. Small/medium payloads reach 47k–185k median ops/s.
+   The `_dirty` flag and `TABLE_TYPE_HINT` side-table eliminate
+   redundant tree walks and array/object re-scans inside the encoder.
+   Large payloads (≥5 MB) are dominated by the root-container
+   materialization cost, which copies all fields into a plain table.
+8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
+   interference between payload sizes. Each size now runs in its own
+   `resty` process, eliminating the systemic cross-scenario variance
+   observed in earlier benchmark runs.
 
 ## When to pick which
 

From 923b91bc8a644724668a5ed857596eb58e8b2087 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 22:13:51 +0000
Subject: [PATCH 08/11] docs: refresh all benchmark tables with fresh-process
 build data

Update README.md summary table and docs/benchmarks.md
(throughput, speedup, memory, modify+encode) with results from
a clean make bench run on the current branch.
---
 README.md          | 10 ++---
 docs/benchmarks.md | 99 +++++++++++++++++++++++-----------------------
 2 files changed, 55 insertions(+), 54 deletions(-)

diff --git a/README.md b/README.md
index 7c03959..b733eaf 100644
--- a/README.md
+++ b/README.md
@@ -105,11 +105,11 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload):
 
 | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson |
 |---:|---:|---:|---:|---:|---:|
-|   2 KB |  94,075 | 108,108 | 127,214 | 120,398 |  1.4× /  1.3× |
-|  60 KB |   9,041 |  83,043 | 123,487 | 214,500 | 13.7× / 23.7× |
-| 100 KB |   5,302 |  32,248 | 109,649 | 102,564 | 20.7× / 19.3× |
-|   1 MB |     517 |   3,538 |  16,520 |  16,988 | 32.0× / 32.9× |
-|  10 MB |      50 |     402 |   1,899 |   1,918 | 38.0× / 38.4× |
+|   2 KB |  92,716 | 102,602 | 128,005 | 125,815 |  1.4× /  1.4× |
+|  60 KB |   9,007 |  82,699 | 116,198 | 219,491 | 12.9× / 24.4× |
+| 100 KB |   2,769 |  40,437 |  84,034 | 121,803 | 30.3× / 44.0× |
+|   1 MB |     512 |   4,020 |  16,056 |  15,400 | 31.4× / 30.1× |
+|  10 MB |      51 |     363 |   1,830 |   1,783 | 35.9× / 35.0× |
 
 Modify-then-encode scenarios (PR #54) add decode → mutate field → re-encode
 workloads; small payload modify+encode reaches 48k–60k ops/s. See
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index e7b1e2e..25f1595 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -91,17 +91,17 @@ Each row is "parse + access request fields" on the named payload.
 
 | Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
-|---|---|---:|---:|---:|---:|---:|---:|
+|---|---:|---:|---:|---:|---:|---:|
-| small      |   2.1 KB |  94,075 | 108,108 | 127,214 | 120,398 | 203,666 |
-| medium     |  60.4 KB |   9,041 |  83,043 | 123,487 | 214,500 | 214,408 |
-| github-100k |   100 KB |   2,238 |   2,047 |   6,010 |   5,994 |   6,701 |
-| 100k       |   100 KB |   5,302 |  32,248 | 109,649 | 102,564 | 114,548 |
-| 200k       |   200 KB |   2,659 |  19,040 |  90,090 |  92,251 | 106,383 |
-| 500k       |   500 KB |   1,052 |   7,062 |  34,722 |  35,336 |  37,453 |
-| 1m         |  1.00 MB |     517 |   3,538 |  16,520 |  16,988 |  17,261 |
-| 2m         |  2.00 MB |     258 |   2,026 |   9,021 |   8,580 |   9,033 |
-| 5m         |  5.00 MB |     102 |     663 |   2,982 |   3,728 |   3,829 |
-| 10m        | 10.00 MB |      50 |     402 |   1,899 |   1,918 |   1,925 |
-| interleaved (100k/200k/500k/1m, cycled) | — | 1,141 | 9,544 | 34,043 | 33,611 | 32,752 |
+| small      |   2.1 KB |  92,716 | 102,602 | 128,005 | 125,815 | 260,322 |
+| medium     |  60.4 KB |   9,007 |  82,699 | 116,198 | 219,491 | 141,563 |
+| github-100k |   100 KB |   1,834 |   1,909 |   4,591 |   5,643 |   6,207 |
+| 100k       |   100 KB |   2,769 |  40,437 |  84,034 | 121,803 | 105,374 |
+| 200k       |   200 KB |   2,543 |  20,593 |  45,704 |  91,408 |  67,114 |
+| 500k       |   500 KB |   1,047 |   8,218 |  28,852 |  37,580 |  29,334 |
+| 1m         |  1.00 MB |     512 |   4,020 |  16,056 |  15,400 |  16,269 |
+| 2m         |  2.00 MB |     251 |   2,105 |   9,145 |   9,137 |   9,634 |
+| 5m         |  5.00 MB |     102 |     791 |   3,543 |   3,747 |   3,679 |
+| 10m        | 10.00 MB |      51 |     363 |   1,830 |   1,783 |   1,749 |
+| interleaved (100k/200k/500k/1m, cycled) | — | 1,125 | 9,701 | 34,173 | 36,278 | 36,456 |
 
 ### Modify + encode throughput (PR #54)
 
@@ -111,16 +111,17 @@ fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2).
 
 | Scenario | modify top + encode | add field + encode | modify nested + encode |
-|---|---|---:|---:|---:|
+|---|---:|---:|---:|
-| small (2 KB)    | 59,835  | 56,655  | 47,541  |
-| medium (60 KB)  | 37,142  | 46,275  | 184,638 |
-| 100k (100 KB)   | 35,881  | 38,183  | 73,529  |
-| 200k (200 KB)   | 17,129  | 16,250  | 59,524  |
-| 500k (500 KB)   |  6,221  |  5,170  | 22,158  |
-| 1m              |  2,938  |  2,434  | 13,806  |
-| 2m              |  1,518  |  1,241  |  1,597  |
-| 5m              |    366  |    364  |    232  |
-| 10m             |    120  |    115  |     87  |
-| interleaved     |  7,176  |  5,645  | 26,824  |
+| small (2 KB)    | 58,242  | 58,190  | 43,003  |
+| medium (60 KB)  | 37,498  | 45,364  | 134,590 |
+| github-100k      |  4,419  |  3,964  |  4,359  |
+| 100k (100 KB)   | 28,114  | 34,364  | 71,942  |
+| 200k (200 KB)   | 18,282  | 16,932  | 55,127  |
+| 500k (500 KB)   |  6,850  |  4,841  | 19,001  |
+| 1m              |  3,125  |  2,998  | 13,649  |
+| 2m              |  1,788  |  1,076  |  1,555  |
+| 5m              |    366  |    283  |    215  |
+| 10m             |    120  |     92  |     83  |
+| interleaved     |  7,712  |  8,178  | 29,123  |
 
 For a before/after comparison against the pre-#54 baseline, see the
 [PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361).
@@ -129,16 +130,16 @@ For a before/after comparison against the pre-#54 baseline, see the
 
 | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson |
 |---|---:|---:|---:|---:|
-| small  |  1.4× |  1.2× |  1.3× |  1.1× |
-| medium | 13.7× |  1.5× | 23.7× |  2.6× |
-| github-100k | 2.7× |  2.9× | 2.7× |  2.9× |
-| 100k   | 20.7× |  3.4× | 19.3× |  3.2× |
-| 200k   | 33.9× |  4.7× | 34.7× |  4.8× |
-| 500k   | 33.0× |  4.9× | 33.6× |  5.0× |
-| 1m     | 32.0× |  4.7× | 32.9× |  4.8× |
-| 2m     | 35.0× |  4.5× | 33.3× |  4.2× |
-| 5m     | 29.2× |  4.5× | 36.5× |  5.6× |
-| 10m    | 38.0× |  4.7× | 38.4× |  4.8× |
+| small  |  1.4× |  1.2× |  1.4× |  1.2× |
+| medium | 12.9× |  1.4× | 24.4× |  2.7× |
+| github-100k | 2.5× |  2.4× | 3.1× |  3.0× |
+| 100k   | 30.3× |  2.1× | 44.0× |  3.0× |
+| 200k   | 18.0× |  2.2× | 35.9× |  4.4× |
+| 500k   | 27.6× |  3.5× | 35.9× |  4.6× |
+| 1m     | 31.4× |  4.0× | 30.1× |  3.8× |
+| 2m     | 36.4× |  4.3× | 36.4× |  4.3× |
+| 5m     | 34.7× |  4.5× | 36.7× |  4.7× |
+| 10m    | 35.9× |  5.0× | 35.0× |  4.9× |
 
 ## Results — memory delta (KB retained after 5 rounds)
 
@@ -147,18 +148,18 @@ the timing rounds without forcing a final collection, so short-lived garbage
 from the last round may still be included.
 
 | Scenario | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` |
-|---|---:|---:|---:|---:|---:|
-| small      | +15,493 | +15,500 | +4,066 | +15,116 | +11,140 |
-| medium     |  +1,955 |  +2,660 |   +333 |  +1,114 |  +1,120 |
-| github-100k | +12,018 | +3,527 |    +14 |    +536 |    +230 |
-| 100k       |    +485 |   +748 |    +67 |    +692 |    +229 |
-| 200k       |    +392 |   +523 |    +34 |    +346 |    +112 |
-| 500k       |    +577 |   +630 |    +14 |    +139 |     +45 |
-| 1m         |  +1,082 | +1,121 |    +10 |    +104 |     +34 |
-| 2m         |  +1,155 | +1,248 |    +14 |    +208 |     +45 |
-| 5m         |  +1,316 | +1,538 |    +14 |    +400 |     +45 |
-| 10m        |  +1,583 | +2,014 |    +14 |    +708 |     +45 |
-| interleaved | +3,356 | +4,404 |   +268 |  +2,771 |    +897 |
+|---|---:|---:|---:|---:|---:|
+| small      | +15,474 | +15,482 | +4,070 | +15,111 | +4,892 |
+| medium     |  +1,955 |  +2,661 |   +158 |    +502 |    +558 |
+| github-100k |  +4,218 |  +3,035 |    +28 |    +560 |     +96 |
+| 100k       |    +485 |    +812 |    +39 |    +721 |     +96 |
+| 200k       |    +393 |    +709 |    +22 |    +373 |     +54 |
+| 500k       |    +885 |  +1,169 |    +30 |    +721 |     +96 |
+| 1m         |  +1,255 |  +1,415 |    +26 |    +444 |     +69 |
+| 2m         |  +1,155 |  +1,251 |    +19 |    +271 |     +27 |
+| 5m         |  +1,316 |  +1,562 |    +20 |    +405 |     +31 |
+| 10m        |  +1,584 |  +2,017 |    +24 |    +731 |     +47 |
+| interleaved | +3,357 | +4,406 |   +100 |  +2,796 |    +354 |
 
 `qjson.parse` retention is essentially constant across payload size: the only
 GC-rooted state is the reusable `indices: Vec<u32>` and `scratch` buffers.
@@ -171,8 +172,8 @@ key into the Lua table heap.
 
 1. **`qjson` is fastest once payloads move beyond tiny inputs.**
    The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and
-   larger multimodal payloads show roughly 14–38× higher throughput than
-   `cjson` and roughly 3–5× higher throughput than `lua-resty-simdjson`
+   larger multimodal payloads show roughly 13–36× higher throughput than
+   `cjson` and roughly 1.4–5× higher throughput than `lua-resty-simdjson`
    for request-field access.
 2. **Reading every `messages[*].content` is still access-light for large
    multimodal bodies.** The benchmark touches the top-level request fields and
@@ -180,7 +181,7 @@ key into the Lua table heap.
    inside each message.
 3. **Speedup remains high at 10 MB.** The eager-decode optimization
    keeps `qjson.parse` throughput scaling well even at the 10 MB level,
-   maintaining ~38× over cjson and ~5× over simdjson.
+   maintaining ~36× over cjson and ~5× over simdjson.
 4. **`qjson.decode + qjson.encode (unmodified)` is the headline number for
    passthrough workloads** — e.g. an LLM gateway re-emitting the original
    JSON after light-touch inspection. The substring fast path means
@@ -189,14 +190,14 @@ key into the Lua table heap.
 5. **Memory retention** for `qjson` is essentially flat in payload
    size; the eager parsers retain more Lua heap after the first run
    because the Lua table tree stays GC-rooted until the next collection.
-   The 10 MB case retains ~1.5 MB for `cjson`, ~2.0 MB for simdjson,
-   and ~14 KB for `qjson.parse`.
+   The 10 MB case retains ~1.6 MB for `cjson`, ~2.0 MB for simdjson,
+   and ~24 KB for `qjson.parse`.
 6. **REST API payloads (github-100k) show a smaller speedup** because their
    structural density is higher than the multimodal request ladder. Memory
    savings remain dramatic because `cjson` must materialize every nested
    object and string into the Lua heap.
 7. **Modify + encode pipeline (PR #54)** shows the lazy-table API in
-   mutation mode. Small/medium payloads reach 47k–185k median ops/s.
+   mutation mode. Small/medium payloads reach 43k–135k median ops/s.
    The `_dirty` flag and `TABLE_TYPE_HINT` side-table eliminate
    redundant tree walks and array/object re-scans inside the encoder.
    Large payloads (≥5 MB) are dominated by the root-container

From 090d1345f23bc9e7b7b05fc953937079823e7e1d Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 22:15:53 +0000
Subject: [PATCH 09/11] docs: add modify+encode columns to README benchmark
 summary

---
 README.md | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index b733eaf..1e36b67 100644
--- a/README.md
+++ b/README.md
@@ -99,11 +99,12 @@ LD_LIBRARY_PATH="$PWD/target/release" \
 ## Benchmarks
 
 `qjson` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal
-chat-completion payloads, "parse + access model, temperature, and all
-messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1,
-AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload):
+chat-completion payloads (median ops/s under OpenResty LuaJIT 2.1,
+AMD EPYC Rome, Zen 2, 4 vCPUs; 5 rounds, deterministic payload).
 
-| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson |
+### Parse + access (read-only)
+
+| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access` | speedup vs. cjson (parse / decode) |
 |---:|---:|---:|---:|---:|---:|
 |   2 KB |  92,716 | 102,602 | 128,005 | 125,815 |  1.4× /  1.4× |
 |  60 KB |   9,007 |  82,699 | 116,198 | 219,491 | 12.9× / 24.4× |
@@ -111,10 +112,20 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload):
 |   1 MB |     512 |   4,020 |  16,056 |  15,400 | 31.4× / 30.1× |
 |  10 MB |      51 |     363 |   1,830 |   1,783 | 35.9× / 35.0× |
 
-Modify-then-encode scenarios (PR #54) add decode → mutate field → re-encode
-workloads; small payload modify+encode reaches 48k–60k ops/s. See
-[`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
-modify+encode results, memory numbers, environment, and reproduction.
+### Encode (unmodified + modify-then-re-encode)
+
+| Size | encode (unmodified) | modify top | add field | modify nested |
+|---:|---:|---:|---:|---:|
+|   2 KB | 260,322 | 58,242 | 58,190 | 43,003 |
+|  60 KB | 141,563 | 37,498 | 45,364 | 134,590 |
+| 100 KB | 105,374 | 28,114 | 34,364 |  71,942 |
+|   1 MB |  16,269 |  3,125 |  2,998 |  13,649 |
+|  10 MB |   1,749 |    120 |     92 |      83 |
+
+> **encode (unmodified)** re-emits the original byte range via `memcpy` (substring fast
+> path). **modify** scenarios materialize the mutated subtree and re-encode.
+> See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
+> memory numbers, environment, and reproduction.
 
 ```sh
 make bench       # qjson vs cjson and lua-resty-simdjson

From 3bbac986c591307c849e413e1b1f07fc3a6a7a02 Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 22:18:34 +0000
Subject: [PATCH 10/11] bench: add cjson modify+encode baselines; show qjson vs
 cjson comparison in README

Adds cjson.decode + modify top/add field/modify nested + cjson.encode
benchmarks so the modify+encode path has an eager baseline. README
encode table now shows cjson/qjson side-by-side for modify workloads:
qjson is 10-43x faster at 60 KB+.
---
 README.md             | 29 ++++++++++++++++-------------
 benches/lua_bench.lua | 24 ++++++++++++++++++++++++
 docs/benchmarks.md    |  2 ++
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 1e36b67..c88f52f 100644
--- a/README.md
+++ b/README.md
@@ -112,20 +112,23 @@ AMD EPYC Rome, Zen 2, 4 vCPUs; 5 rounds, deterministic payload).
 |   1 MB |     512 |   4,020 |  16,056 |  15,400 | 31.4× / 30.1× |
 |  10 MB |      51 |     363 |   1,830 |   1,783 | 35.9× / 35.0× |
 
-### Encode (unmodified + modify-then-re-encode)
-
-| Size | encode (unmodified) | modify top | add field | modify nested |
-|---:|---:|---:|---:|---:|
-|   2 KB | 260,322 | 58,242 | 58,190 | 43,003 |
-|  60 KB | 141,563 | 37,498 | 45,364 | 134,590 |
-| 100 KB | 105,374 | 28,114 | 34,364 |  71,942 |
-|   1 MB |  16,269 |  3,125 |  2,998 |  13,649 |
-|  10 MB |   1,749 |    120 |     92 |      83 |
-
-> **encode (unmodified)** re-emits the original byte range via `memcpy` (substring fast
-> path). **modify** scenarios materialize the mutated subtree and re-encode.
+### Encode (unmodified) + modify-then-re-encode
+
+| Size | encode (unmodified) | modify top (cjson / qjson) | modify nested (cjson / qjson) |
+|---:|---:|---:|---:|
+|   2 KB | 219,925 | 59,761 /  56,909 | 61,685 /  49,798 |
+|  60 KB | 143,843 |  4,590 / **44,370** |  4,616 / **196,386** |
+| 100 KB | 119,617 |  2,645 / **32,712** |  5,263 /  **59,809** |
+|   1 MB |  16,269 |    241 /  **3,108** |    516 /  **14,134** |
+
+> **qjson.encode(unmodified)** re-emits the original byte range via `memcpy` —
+> no fields touched means zero serializer work.
+> **qjson modify+encode** materializes only the mutated subtree; unmodified
+> siblings stay on the fast path. cjson always does a full materialize +
+> re-serialize on every encode. At 60 KB+, qjson modify+encode is **10–43×**
+> faster than the cjson equivalent.
 > See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
-> memory numbers, environment, and reproduction.
+> memory numbers, and environment.
 
 ```sh
 make bench       # qjson vs cjson and lua-resty-simdjson
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 4aa1196..c007afb 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -319,6 +319,30 @@ for _, s in ipairs(scenarios) do
         cjson_access(obj)
     end)
 
+    -- cjson always fully materializes on decode, so modify+encode is the
+    -- same cost as a full re-encode — useful as a realistic baseline for
+    -- modify workloads.
+    bench("cjson.decode + modify top + encode", s.iters, function()
+        local obj = cjson.decode(s.payload)
+        modify_top(obj)
+        local _enc = cjson.encode(obj)
+        if #_enc < 2 then error("cjson.encode produced too-short result") end
+    end)
+
+    bench("cjson.decode + add field + encode", s.iters, function()
+        local obj = cjson.decode(s.payload)
+        modify_add(obj)
+        local _enc = cjson.encode(obj)
+        if #_enc < 2 then error("cjson.encode produced too-short result") end
+    end)
+
+    bench("cjson.decode + modify nested + encode", s.iters, function()
+        local obj = cjson.decode(s.payload)
+        modify_nested(obj)
+        local _enc = cjson.encode(obj)
+        if #_enc < 2 then error("cjson.encode produced too-short result") end
+    end)
+
     if simdjson then
         bench("simdjson.decode + access fields", s.iters, function()
             local obj = simdjson:decode(s.payload)
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 25f1595..4083ceb 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -59,6 +59,8 @@ parsing workloads with ~3-5% structural density.
 | Row | What it does | Notes |
 |---|---|---|
 | `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table |
+| `cjson.decode + modify top + encode` | `cjson.decode(s)`, mutate top field, `cjson.encode()` | Full materialize + full re-encode (cjson baseline for modify+encode workloads) |
+| `cjson.decode + modify nested + encode` | `cjson.decode(s)`, mutate deeply nested field, `cjson.encode()` | Same — cjson always re-encodes the whole tree |
 | `simdjson.decode + access fields` | `resty.simdjson:decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table |
 | `qjson.parse + access fields` | `qjson.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads |
 | `qjson.decode + access content` | `qjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` |

From 5a178d76c4d3387b6e1d3497da7617617320cf4f Mon Sep 17 00:00:00 2001
From: Yuansheng Wang <membphis@gmail.com>
Date: Sat, 23 May 2026 22:19:58 +0000
Subject: [PATCH 11/11] docs: add speedup column to README encode table

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index c88f52f..0c410c8 100644
--- a/README.md
+++ b/README.md
@@ -114,12 +114,12 @@ AMD EPYC Rome, Zen 2, 4 vCPUs; 5 rounds, deterministic payload).
 
 ### Encode (unmodified) + modify-then-re-encode
 
-| Size | encode (unmodified) | modify top (cjson / qjson) | modify nested (cjson / qjson) |
-|---:|---:|---:|---:|
-|   2 KB | 219,925 | 59,761 /  56,909 | 61,685 /  49,798 |
-|  60 KB | 143,843 |  4,590 / **44,370** |  4,616 / **196,386** |
-| 100 KB | 119,617 |  2,645 / **32,712** |  5,263 /  **59,809** |
-|   1 MB |  16,269 |    241 /  **3,108** |    516 /  **14,134** |
+| Size | encode (unmodified) | modify top (cjson / qjson) | modify nested (cjson / qjson) | speedup vs. cjson (top / nested) |
+|---:|---:|---:|---:|---:|
+|   2 KB | 219,925 | 59,761 /  56,909 | 61,685 /  49,798 |  1.0× /  0.8× |
+|  60 KB | 143,843 |  4,590 / **44,370** |  4,616 / **196,386** |  9.7× / 42.5× |
+| 100 KB | 119,617 |  2,645 / **32,712** |  5,263 /  **59,809** | 12.4× / 11.4× |
+|   1 MB |  16,269 |    241 /  **3,108** |    516 /  **14,134** | 12.9× / 27.4× |
 
 > **qjson.encode(unmodified)** re-emits the original byte range via `memcpy` —
 > no fields touched means zero serializer work.