From c7574e37859e12f24f5f95642597e31de9f8a83f Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 20:44:13 +0800 Subject: [PATCH 01/11] perf: Lua encode bulk-copy + Rc cache + AVX2 LUT + validation stack - Replace per-byte encode_string with bulk-copy segments and fast escape-free path (zero table allocations for clean strings) - Replace recursive is_dirty with O(1) _dirty flag propagated via parent chain on __newindex - Eliminate is_array pre-scan by tagging materialized tables with __qjson_type marker during materialization - Replace Vec::clone in SkipCache with Rc<[u32]> for O(1) cache-hit access instead of O(n) vector clone - Replace 7-cmpeq loop in AVX2 structural_mask_chunk with PSHUFB-based nibble LUT classification - Replace heap-allocated validation state stack with fixed-size [CtxKind; 64] array for typical JSON depths - Replace byte-by-byte float-detection in parse_i64 with memchr::memchr3 - Add modified-encode benchmark scenarios (modify top / add field / modify nested) to bench harness - Add correctness tests for modified-encode round-trip --- benches/lua_bench.lua | 77 +++++++++++++++++++++++- lua/qjson/table.lua | 107 +++++++++++++++++++++------------- src/cursor.rs | 14 +++-- src/decode/number.rs | 2 +- src/scan/avx2.rs | 71 +++++++++++++++++----- src/skip_cache.rs | 11 ++-- src/validate/mod.rs | 80 +++++++++++++++++++------ tests/lua/lazy_table_spec.lua | 50 ++++++++++++++++ 8 files changed, 329 insertions(+), 83 deletions(-) diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 30a3977..81639b8 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -220,6 +220,21 @@ local function default_table_access(t) end end +local function default_table_modify_top(t) + t.model = "new-model" + t.temperature = 0.0 +end + +local function default_table_modify_add(t) + t.stream = true +end + +local function default_table_modify_nested(t) + if t.messages and qjson.len(t.messages) > 0 then + t.messages[1].content = "modified" + 
end +end + -- GitHub issues accessors: array of issues, access first issue's fields local function github_cjson_access(obj) local _ = obj[1] and obj[1].id @@ -239,11 +254,26 @@ local function github_table_access(t) local _ = t[1] and t[1].user and t[1].user.login end +local function github_table_modify_top(t) + t[1].title = "modified title" +end + +local function github_table_modify_add(t) + t.extra_field = true +end + +local function github_table_modify_nested(t) + if t[1] and t[1].user then + t[1].user.login = "modified-user" + end +end + local scenarios = { {name = "small", iters = 5000, payload = read_file("benches/fixtures/small_api.json")}, {name = "medium", iters = 500, payload = read_file("benches/fixtures/medium_resp.json")}, {name = "github-100k", iters = 100, payload = make_github_issues_payload(100 * 1024), - cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access}, + cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access, + modify_top = github_table_modify_top, modify_add = github_table_modify_add, modify_nested = github_table_modify_nested}, {name = "100k", iters = 100, payload = make_payload(100 * 1024)}, {name = "200k", iters = 50, payload = make_payload(200 * 1024)}, {name = "500k", iters = 20, payload = make_payload(500 * 1024)}, @@ -269,6 +299,9 @@ for _, s in ipairs(scenarios) do local cjson_access = s.cjson_access or default_cjson_access local qjson_access = s.qjson_access or default_qjson_access local table_access = s.table_access or default_table_access + local modify_top = s.modify_top or default_table_modify_top + local modify_add = s.modify_add or default_table_modify_add + local modify_nested = s.modify_nested or default_table_modify_nested bench("cjson.decode + access fields", s.iters, function() local obj = cjson.decode(s.payload) @@ -309,6 +342,24 @@ for _, s in ipairs(scenarios) do local t = qjson.decode(s.payload) local _ = 
qjson.encode(t) end) + + bench("qjson.decode + modify top + encode", s.iters, function() + local t = qjson.decode(s.payload) + modify_top(t) + local _ = qjson.encode(t) + end) + + bench("qjson.decode + add field + encode", s.iters, function() + local t = qjson.decode(s.payload) + modify_add(t) + local _ = qjson.encode(t) + end) + + bench("qjson.decode + modify nested + encode", s.iters, function() + local t = qjson.decode(s.payload) + modify_nested(t) + local _ = qjson.encode(t) + end) end -- Interleaved scenario: cycle through several payloads of different sizes @@ -386,4 +437,28 @@ do local t = qjson.decode(p) local _ = qjson.encode(t) end) + + next_p = make_cycler(interleaved) + bench("qjson.decode + modify top + encode", 400, function() + local p = next_p() + local t = qjson.decode(p) + default_table_modify_top(t) + local _ = qjson.encode(t) + end) + + next_p = make_cycler(interleaved) + bench("qjson.decode + add field + encode", 400, function() + local p = next_p() + local t = qjson.decode(p) + default_table_modify_add(t) + local _ = qjson.encode(t) + end) + + next_p = make_cycler(interleaved) + bench("qjson.decode + modify nested + encode", 400, function() + local p = next_p() + local t = qjson.decode(p) + default_table_modify_nested(t) + local _ = qjson.encode(t) + end) end diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index 86f50d0..754e3f8 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -65,6 +65,8 @@ local function wrap_child(parent_view, src_box) local own_box = ffi.new("qjson_cursor[1]") ffi.copy(own_box, src_box, ffi.sizeof("qjson_cursor")) return { + _parent = parent_view, + _dirty = false, _doc = parent_view._doc, _cur_box = own_box, -- keep cdata alive _cur = own_box[0], -- stable reference into own_box @@ -252,6 +254,7 @@ end -- the dirty check and __newindex can share the list. 
local INTERNAL_KEYS = { _doc = true, _cur_box = true, _cur = true, _bs = true, _be = true, + _parent = true, _dirty = true, } -- On first write, walk all existing key/value pairs into a plain table, @@ -260,10 +263,14 @@ local INTERNAL_KEYS = { -- Existing rawget-cached entries (e.g. previously returned child proxies) -- are preserved so callers' references remain valid. LazyObject.__newindex = function(t, k, v) + -- Mark dirty from this view up to the root. + local cur = t + while cur do + rawset(cur, "_dirty", true) + cur = rawget(cur, "_parent") + end local contents = materialize_object_contents(t) -- Snapshot user-key cache BEFORE nilling internals. - -- Use next() for raw iteration: pairs() invokes __pairs on lazy tables, - -- walking the full JSON via FFI instead of the Lua-side rawget cache. local cache = {} local ck, cv = next(t) while ck ~= nil do @@ -272,8 +279,11 @@ LazyObject.__newindex = function(t, k, v) end ck, cv = next(t, ck) end - t._doc, t._cur_box, t._cur, t._bs, t._be = nil, nil, nil, nil, nil + for _, f in ipairs({"_parent", "_dirty", "_doc", "_cur_box", "_cur", "_bs", "_be"}) do + rawset(t, f, nil) + end setmetatable(t, nil) + rawset(t, "__qjson_type", "object") for _, kv in ipairs(contents) do rawset(t, kv[1], cache[kv[1]] or kv[2]) end @@ -284,10 +294,14 @@ end -- switch to empty_array_mt (no lazy machinery), then apply the assignment. -- Existing rawget-cached entries are preserved so callers' references remain valid. LazyArray.__newindex = function(t, k, v) + -- Mark dirty from this view up to the root. + local cur = t + while cur do + rawset(cur, "_dirty", true) + cur = rawget(cur, "_parent") + end local contents = materialize_array_contents(t) -- Snapshot integer-key cache BEFORE nilling internals. - -- Use next() for raw iteration: pairs() would invoke __pairs on lazy arrays, - -- walking the full JSON via FFI instead of the Lua-side rawget cache. 
local cache = {} local ck, cv = next(t) while ck ~= nil do @@ -296,8 +310,11 @@ LazyArray.__newindex = function(t, k, v) end ck, cv = next(t, ck) end - t._doc, t._cur_box, t._cur, t._bs, t._be = nil, nil, nil, nil, nil + for _, f in ipairs({"_parent", "_dirty", "_doc", "_cur_box", "_cur", "_bs", "_be"}) do + rawset(t, f, nil) + end setmetatable(t, _M.empty_array_mt) + rawset(t, "__qjson_type", "array") for i, x in ipairs(contents) do rawset(t, i, cache[i] or x) end @@ -328,6 +345,7 @@ function _M.decode(json_str) error("qjson: root byte-span failed") end local view = { + _dirty = false, _doc = doc, _cur_box = root_box, -- keep the box alive; _cur is a stable reference _cur = root_box[0], @@ -370,23 +388,42 @@ _M.materialize = materialize local string_byte = string.byte local string_format = string.format --- Minimal JSON string escaper covering the cjson default set. +-- Escape lookup table: byte value → escape sequence string (or nil if safe). +local ESCAPES = { + [0x22] = '\\"', + [0x5C] = '\\\\', + [0x0A] = '\\n', + [0x0D] = '\\r', + [0x09] = '\\t', + [0x08] = '\\b', + [0x0C] = '\\f', +} + +-- JSON string escaper with bulk-copy fast path. +-- Scans for bytes that need escaping; copies clean segments via s:sub. +-- For strings with no escapes, returns '"' .. s .. '"' with zero table allocations. 
local function encode_string(s) - local out = {'"'} - for i = 1, #s do + local n = #s + local last, i = 1, 1 + local out = nil -- lazily create table only when escapes found + while i <= n do local b = string_byte(s, i) - if b == 0x22 then out[#out+1] = '\\"' - elseif b == 0x5C then out[#out+1] = '\\\\' - elseif b == 0x0A then out[#out+1] = '\\n' - elseif b == 0x0D then out[#out+1] = '\\r' - elseif b == 0x09 then out[#out+1] = '\\t' - elseif b == 0x08 then out[#out+1] = '\\b' - elseif b == 0x0C then out[#out+1] = '\\f' - elseif b < 0x20 then out[#out+1] = string_format('\\u%04x', b) - else out[#out+1] = string.char(b) + local esc = ESCAPES[b] + if esc or b < 0x20 then + if not out then out = {'"'} end + if i > last then out[#out + 1] = s:sub(last, i - 1) end + if esc then + out[#out + 1] = esc + else + out[#out + 1] = string_format('\\u%04x', b) + end + last = i + 1 end + i = i + 1 end - out[#out+1] = '"' + if not out then return '"' .. s .. '"' end + if last <= n then out[#out + 1] = s:sub(last, n) end + out[#out + 1] = '"' return table.concat(out) end @@ -400,27 +437,6 @@ local function encode_number(n) return string_format("%.14g", n) end --- A lazy subtree is "dirty" if any cached descendant has been materialized --- (no longer carries Lazy* metatable). Non-cached descendants are guaranteed --- untouched, so we only need to walk the rawget-cached entries. -local function is_dirty(v) - if type(v) ~= "table" then return false end - local mt = getmetatable(v) - if mt ~= LazyObject and mt ~= LazyArray then - return true -- materialized - end - -- Use next() for raw table iteration: pairs() would invoke __pairs on - -- lazy tables, walking the full JSON via FFI instead of the Lua cache. 
- local k, child = next(v) - while k ~= nil do - if not INTERNAL_KEYS[k] then - if is_dirty(child) then return true end - end - k, child = next(v, k) - end - return false -end - -- Forward declaration so encode_lazy_object_walking, encode_lazy_array_walking, -- and encode_array/encode_object can reference encode before its definition is -- complete (Lua resolves upvalues at call time, but the slot must be declared first). @@ -471,7 +487,7 @@ local function encode_lazy_array_walking(t) end local function encode_proxy(t) - if not is_dirty(t) then + if not t._dirty then -- Fast path: no mutations — slice the original buffer bytes. return t._doc._hold:sub(t._bs + 1, t._be) end @@ -530,6 +546,15 @@ encode = function(v) if mt == LazyObject or mt == LazyArray then return encode_proxy(v) end + if mt == _M.empty_array_mt then + return encode_array(v) + end + if rawget(v, "__qjson_type") == "object" then + return encode_object(v) + end + if rawget(v, "__qjson_type") == "array" then + return encode_array(v) + end if is_array(v) then return encode_array(v) end diff --git a/src/cursor.rs b/src/cursor.rs index bf38d40..82ce885 100644 --- a/src/cursor.rs +++ b/src/cursor.rs @@ -1,6 +1,7 @@ use crate::doc::Document; use crate::error::qjson_err; use crate::path::{PathIter, PathSeg}; +use std::rc::Rc; #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub(crate) struct Cursor { @@ -62,9 +63,10 @@ fn walk_children(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result Result Result Ok(c), diff --git a/src/decode/number.rs b/src/decode/number.rs index 74839ff..24f8cf7 100644 --- a/src/decode/number.rs +++ b/src/decode/number.rs @@ -19,7 +19,7 @@ pub(crate) fn parse_i64(bytes: &[u8], skip_validation: bool) -> Result) -> Result<(), usize> { super::validate_brackets(buf, out) } +// Tag bits for nibble-based structural byte classification (mirrors NEON). 
+const TAG_QUOTE: u8 = 0x01; +const TAG_COMMA: u8 = 0x02; +const TAG_COLON: u8 = 0x04; +const TAG_OPEN_BRACKET: u8 = 0x08; +const TAG_CLOSE_BRACKET: u8 = 0x10; +const TAG_OPEN_BRACE: u8 = 0x20; +const TAG_CLOSE_BRACE: u8 = 0x40; + #[inline(always)] unsafe fn structural_mask_chunk(lo: __m256i, hi: __m256i) -> u64 { - // For each byte, set 1 if byte is one of: { } [ ] : , " - // Bit-OR results from 7 byte-equality compares. - let chars: [u8; 7] = [b'{', b'}', b'[', b']', b':', b',', b'"']; - let mut mask_lo: i32 = 0; - let mut mask_hi: i32 = 0; - for c in chars { - let v = _mm256_set1_epi8(c as i8); - let eq_lo = _mm256_cmpeq_epi8(lo, v); - let eq_hi = _mm256_cmpeq_epi8(hi, v); - mask_lo |= _mm256_movemask_epi8(eq_lo); - mask_hi |= _mm256_movemask_epi8(eq_hi); - } - (mask_lo as u32 as u64) | ((mask_hi as u32 as u64) << 32) + // Nibble-based classification via PSHUFB LUTs. Each structural byte + // has a unique (hi, lo) nibble pair; the LUTs hold disjoint tag bits + // so that HI_LUT[hi] & LO_LUT[lo] is non-zero only for the 7 + // structural bytes: { } [ ] : , " + #[rustfmt::skip] + const HI_LUT: [u8; 16] = [ + 0, 0, + TAG_QUOTE | TAG_COMMA, // index 2: 0x2_ + TAG_COLON, // index 3: 0x3_ + 0, + TAG_OPEN_BRACKET | TAG_CLOSE_BRACKET, // index 5: 0x5_ + 0, + TAG_OPEN_BRACE | TAG_CLOSE_BRACE, // index 7: 0x7_ + 0, 0, 0, 0, 0, 0, 0, 0, + ]; + #[rustfmt::skip] + const LO_LUT: [u8; 16] = [ + 0, 0, + TAG_QUOTE, // index 2: 0x_2 + 0, 0, 0, 0, 0, 0, 0, + TAG_COLON, // index 10: 0x_A + TAG_OPEN_BRACKET | TAG_OPEN_BRACE, // index 11: 0x_B + TAG_COMMA, // index 12: 0x_C + TAG_CLOSE_BRACKET | TAG_CLOSE_BRACE, // index 13: 0x_D + 0, 0, + ]; + + let hi_lut = _mm256_broadcastsi128_si256( + _mm_loadu_si128(HI_LUT.as_ptr() as *const __m128i)); + let lo_lut = _mm256_broadcastsi128_si256( + _mm_loadu_si128(LO_LUT.as_ptr() as *const __m128i)); + let mask_0f = _mm256_set1_epi8(0x0f); + let zero = _mm256_setzero_si256(); + let all_ff = _mm256_cmpeq_epi8(zero, zero); // 0xFF in every lane + 
+ let classify = |chunk: __m256i| -> i32 { + let hi_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(chunk), mask_0f); + let lo_nib = _mm256_and_si256(chunk, mask_0f); + let hi_part = _mm256_shuffle_epi8(hi_lut, hi_nib); + let lo_part = _mm256_shuffle_epi8(lo_lut, lo_nib); + let tags = _mm256_and_si256(hi_part, lo_part); + // tags != 0 → structural. Map to 0xFF / 0x00 for movemask. + let is_zero = _mm256_cmpeq_epi8(tags, zero); + _mm256_movemask_epi8(_mm256_xor_si256(is_zero, all_ff)) + }; + + let mlo = classify(lo); + let mhi = classify(hi); + (mlo as u32 as u64) | ((mhi as u32 as u64) << 32) } /// Build a u64 mask where bit i is 1 if byte i in (lo|hi) equals `"` OR `\`. diff --git a/src/skip_cache.rs b/src/skip_cache.rs index 6b6b1dd..1b335bc 100644 --- a/src/skip_cache.rs +++ b/src/skip_cache.rs @@ -1,4 +1,5 @@ use rustc_hash::FxHashMap; +use std::rc::Rc; #[derive(Default)] pub(crate) struct SkipCache { @@ -12,17 +13,18 @@ pub(crate) struct SkipSlot { /// child_starts[i] = position in doc.indices of the i-th child's leading /// marker. For object children this is the key's opening '"'; for array /// children, the value's first marker. - pub(crate) child_starts: Vec, + pub(crate) child_starts: Rc<[u32]>, /// child_ends[i] = the `cursor_end` value for the i-th child (i.e. the /// idx_end to put in a Cursor pointing at that child's value). Storing /// this lets cache-hit resolution skip the brace-counting find_value_span. 
- pub(crate) child_ends: Vec, + pub(crate) child_ends: Rc<[u32]>, } impl SkipCache { pub(crate) fn new() -> Self { + let empty: Rc<[u32]> = Rc::from([]); Self { - slots: vec![SkipSlot { child_starts: Vec::new(), child_ends: Vec::new() }], + slots: vec![SkipSlot { child_starts: Rc::clone(&empty), child_ends: empty }], by_opener: FxHashMap::default(), } } @@ -34,7 +36,8 @@ impl SkipCache { return (slot, true); } let new = self.slots.len() as u32; - self.slots.push(SkipSlot { child_starts: Vec::new(), child_ends: Vec::new() }); + let empty: Rc<[u32]> = Rc::from([]); + self.slots.push(SkipSlot { child_starts: Rc::clone(&empty), child_ends: empty }); self.by_opener.insert(opener_idx, new); (new, false) } diff --git a/src/validate/mod.rs b/src/validate/mod.rs index a9ce958..803831e 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -145,10 +145,56 @@ pub(crate) fn validate_eager_values( indices: &[u32], max_depth: u32, ) -> Result<(), qjson_err> { - // Stack of container contexts; the top is the current state. - // We use a single seed entry `CtxKind::Top` for the root value. - let mut stack: Vec = Vec::with_capacity(16); - stack.push(CtxKind::Top); + // Fixed-size stack avoids heap allocation for typical JSON depths. + const STACK_CAP: usize = 64; + let mut stack_buf: [CtxKind; STACK_CAP] = [CtxKind::Top; STACK_CAP]; + let mut sp: usize = 1; // next free slot (= len) + let mut fallback: Option> = None; + + macro_rules! push { + ($kind:expr) => { + if sp < STACK_CAP { + stack_buf[sp] = $kind; + sp += 1; + } else { + let fb = fallback.get_or_insert_with(|| { + let mut v: Vec = Vec::with_capacity(STACK_CAP + 16); + v.extend_from_slice(&stack_buf[..sp]); + v + }); + sp = STACK_CAP.wrapping_add(fb.len() + 1); + fb.push($kind); + } + }; + } + macro_rules! 
pop { + () => {{ + if sp <= STACK_CAP { + if sp == 0 { None } + else { sp -= 1; Some(stack_buf[sp]) } + } else { + let fb = fallback.as_mut().unwrap(); + let val = fb.pop(); + if fb.is_empty() { sp = STACK_CAP; } + val + } + }}; + } + macro_rules! last_mut { + () => {{ + if sp <= STACK_CAP { + if sp == 0 { None } else { Some(&mut stack_buf[sp - 1]) } + } else { + fallback.as_mut().unwrap().last_mut() + } + }}; + } + macro_rules! stack_len { + () => { if sp <= STACK_CAP { sp } else { fallback.as_ref().map_or(0, |v| v.len()) } }; + } + macro_rules! stack_is_empty { + () => { stack_len!() == 0 }; + } // Byte position just past the previous structural we consumed — // i.e. the start of the current gap. A gap may contain a scalar @@ -165,11 +211,11 @@ pub(crate) fn validate_eager_values( // First, consume any scalar token sitting in the gap before // this structural. This may transition the current state from // a value-expecting form to its "AfterValue" form. - consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?; + consume_scalar_gap(buf, prev_end, pos, last_mut!().unwrap())?; match b { b'{' | b'[' => { - let cur = stack.last_mut().unwrap(); + let cur = last_mut!().unwrap(); match *cur { CtxKind::Top | CtxKind::ArrAfterOpen @@ -178,10 +224,10 @@ pub(crate) fn validate_eager_values( // Transition parent to AfterValue ahead of the // descent; the inner container's close pops back. 
*cur = parent_after_value(*cur); - if stack.len() > max_depth as usize { + if stack_len!() > max_depth as usize { return Err(qjson_err::QJSON_NESTING_TOO_DEEP); } - stack.push(if b == b'{' { + push!(if b == b'{' { CtxKind::ObjAfterOpen } else { CtxKind::ArrAfterOpen @@ -193,25 +239,25 @@ pub(crate) fn validate_eager_values( i += 1; } b'}' => { - let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let top = pop!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; if !matches!(top, CtxKind::ObjAfterOpen | CtxKind::ObjAfterValue) { return Err(qjson_err::QJSON_PARSE_ERROR); } - if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); } + if stack_is_empty!() { return Err(qjson_err::QJSON_PARSE_ERROR); } prev_end = pos + 1; i += 1; } b']' => { - let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let top = pop!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; if !matches!(top, CtxKind::ArrAfterOpen | CtxKind::ArrAfterValue) { return Err(qjson_err::QJSON_PARSE_ERROR); } - if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); } + if stack_is_empty!() { return Err(qjson_err::QJSON_PARSE_ERROR); } prev_end = pos + 1; i += 1; } b',' => { - let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; match *cur { CtxKind::ArrAfterValue => *cur = CtxKind::ArrAfterComma, CtxKind::ObjAfterValue => *cur = CtxKind::ObjAfterComma, @@ -221,7 +267,7 @@ pub(crate) fn validate_eager_values( i += 1; } b':' => { - let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; match *cur { CtxKind::ObjAfterKey => *cur = CtxKind::ObjAfterColon, _ => return Err(qjson_err::QJSON_PARSE_ERROR), @@ -239,7 +285,7 @@ pub(crate) fn validate_eager_values( } strings::validate_string_span(&buf[pos + 1 .. 
close])?; - let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; match *cur { // Key position in an object. CtxKind::ObjAfterOpen | CtxKind::ObjAfterComma => { @@ -264,11 +310,11 @@ pub(crate) fn validate_eager_values( // Tail: a top-level scalar root (e.g. `42`, `true`) lives in the // gap after the last structural — or, if there are no structurals, // the whole buffer. - consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?; + consume_scalar_gap(buf, prev_end, buf.len(), last_mut!().unwrap())?; // After the walk, the stack must hold exactly one frame: the root // context, which must be `TopDone` (root value consumed). - if stack.len() != 1 || stack[0] != CtxKind::TopDone { + if stack_len!() != 1 || stack_buf[0] != CtxKind::TopDone { return Err(qjson_err::QJSON_PARSE_ERROR); } Ok(()) diff --git a/tests/lua/lazy_table_spec.lua b/tests/lua/lazy_table_spec.lua index 2769d39..532833c 100644 --- a/tests/lua/lazy_table_spec.lua +++ b/tests/lua/lazy_table_spec.lua @@ -390,4 +390,54 @@ describe("qjson.encode — nested mutations propagate", function() inner.x = 99 assert.are.equal(99, t.a.x) end) + + it("modifies top-level field and encodes correctly", function() + local cjson = require("cjson") + local t = qjson.decode('{"model":"gpt-4","temperature":0.7}') + t.model = "gpt-5" + local out = qjson.encode(t) + local parsed = cjson.decode(out) + assert.are.equal("gpt-5", parsed.model) + assert.are.equal(0.7, parsed.temperature) + end) + + it("adds new field and encodes correctly", function() + local cjson = require("cjson") + local t = qjson.decode('{"a":1}') + t.b = true + local out = qjson.encode(t) + local parsed = cjson.decode(out) + assert.are.equal(1, parsed.a) + assert.are.equal(true, parsed.b) + end) + + it("modifies nested field and encodes correctly", function() + local cjson = require("cjson") + local t = qjson.decode('{"messages":[{"role":"user","content":"hello"}]}') + 
t.messages[1].content = "world" + local out = qjson.encode(t) + local parsed = cjson.decode(out) + assert.are.equal("user", parsed.messages[1].role) + assert.are.equal("world", parsed.messages[1].content) + end) + + it("encodes unmodified proxy via fast path", function() + local json = '{"a":1,"b":"text","c":true}' + local t = qjson.decode(json) + local out = qjson.encode(t) + local cjson = require("cjson") + local parsed = cjson.decode(out) + assert.are.equal(1, parsed.a) + assert.are.equal("text", parsed.b) + assert.are.equal(true, parsed.c) + end) + + it("encodes string with escapes correctly", function() + local t = qjson.decode('{"key":"value"}') + t.key = 'line1\nline2\t"quoted"' + local out = qjson.encode(t) + local cjson = require("cjson") + local parsed = cjson.decode(out) + assert.are.equal('line1\nline2\t"quoted"', parsed.key) + end) end) From 913f6a868a79c70c89f371e5bd44cd9b1495d7a9 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 20:51:31 +0800 Subject: [PATCH 02/11] fix: prevent __qjson_type and _dirty leakage into encode output - Skip __qjson_type internal marker in encode_object() pairs iteration - Guard dirty propagation to stop at non-lazy ancestors, preventing _dirty rawset on already-materialized tables --- lua/qjson/table.lua | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index 754e3f8..c19da24 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -266,6 +266,8 @@ LazyObject.__newindex = function(t, k, v) -- Mark dirty from this view up to the root. local cur = t while cur do + local mt = getmetatable(cur) + if mt ~= LazyObject and mt ~= LazyArray then break end rawset(cur, "_dirty", true) cur = rawget(cur, "_parent") end @@ -297,6 +299,8 @@ LazyArray.__newindex = function(t, k, v) -- Mark dirty from this view up to the root. 
local cur = t while cur do + local mt = getmetatable(cur) + if mt ~= LazyObject and mt ~= LazyArray then break end rawset(cur, "_dirty", true) cur = rawget(cur, "_parent") end @@ -525,7 +529,9 @@ local function encode_object(t) if type(k) ~= "string" then error("qjson.encode: object key must be a string, got " .. type(k)) end - parts[#parts+1] = encode_string(k) .. ":" .. encode(v) + if k ~= "__qjson_type" then + parts[#parts+1] = encode_string(k) .. ":" .. encode(v) + end end return "{" .. table.concat(parts, ",") .. "}" end From 113e083b4f32489aa356700d03fa801e6df1a2c6 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 20:55:31 +0800 Subject: [PATCH 03/11] fix: use weak side-table for type hints, fix array-root modify_add - Replace __qjson_type string key with local weak side-table (TABLE_TYPE_HINT) to prevent collision with user payload keys - Fix github_table_modify_add to mutate first element instead of array root so the added field appears in encoded output --- benches/lua_bench.lua | 4 +++- lua/qjson/table.lua | 16 +++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 81639b8..4e16c03 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -259,7 +259,9 @@ local function github_table_modify_top(t) end local function github_table_modify_add(t) - t.extra_field = true + if t[1] then + t[1].extra_field = true + end end local function github_table_modify_nested(t) diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index c19da24..486e28d 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -23,6 +23,10 @@ else _M.empty_array_mt = { __jsontype = "array" } end +-- Weak side-table for container type hints, avoiding collision with +-- user-visible keys. Maps materialized table → "object" | "array". +local TABLE_TYPE_HINT = setmetatable({}, { __mode = "k" }) + -- Box scratch used for one-shot FFI returns. 
Reused across calls to avoid -- per-call allocation; safe because the parent Doc / lazy view holds the -- buffer alive and these are read-and-copy. @@ -285,7 +289,7 @@ LazyObject.__newindex = function(t, k, v) rawset(t, f, nil) end setmetatable(t, nil) - rawset(t, "__qjson_type", "object") + TABLE_TYPE_HINT[t] = "object" for _, kv in ipairs(contents) do rawset(t, kv[1], cache[kv[1]] or kv[2]) end @@ -318,7 +322,7 @@ LazyArray.__newindex = function(t, k, v) rawset(t, f, nil) end setmetatable(t, _M.empty_array_mt) - rawset(t, "__qjson_type", "array") + TABLE_TYPE_HINT[t] = "array" for i, x in ipairs(contents) do rawset(t, i, cache[i] or x) end @@ -529,9 +533,7 @@ local function encode_object(t) if type(k) ~= "string" then error("qjson.encode: object key must be a string, got " .. type(k)) end - if k ~= "__qjson_type" then - parts[#parts+1] = encode_string(k) .. ":" .. encode(v) - end + parts[#parts+1] = encode_string(k) .. ":" .. encode(v) end return "{" .. table.concat(parts, ",") .. "}" end @@ -555,10 +557,10 @@ encode = function(v) if mt == _M.empty_array_mt then return encode_array(v) end - if rawget(v, "__qjson_type") == "object" then + if TABLE_TYPE_HINT[v] == "object" then return encode_object(v) end - if rawget(v, "__qjson_type") == "array" then + if TABLE_TYPE_HINT[v] == "array" then return encode_array(v) end if is_array(v) then From c8630b7d39748fee0245cdb1aada9172c40a882d Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 21:19:04 +0800 Subject: [PATCH 04/11] perf: extract encode_plain_table to keep lazy-proxy fast path narrow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move TABLE_TYPE_HINT/is_array/empty_array_mt dispatch into a separate function so the hot encode path (lazy proxy → memcpy) stays minimal for LuaJIT trace compilation. 
--- lua/qjson/table.lua | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index 486e28d..9af2241 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -538,6 +538,26 @@ local function encode_object(t) return "{" .. table.concat(parts, ",") .. "}" end +-- Dispatch for plain (non-lazy) tables. Separated from the main encode +-- function to keep the lazy-proxy fast path narrow for LuaJIT traces. +local function encode_plain_table(v) + local mt = getmetatable(v) + if mt == _M.empty_array_mt then + return encode_array(v) + end + local hint = TABLE_TYPE_HINT[v] + if hint == "object" then + return encode_object(v) + end + if hint == "array" then + return encode_array(v) + end + if is_array(v) then + return encode_array(v) + end + return encode_object(v) +end + encode = function(v) if rawequal(v, _M.null) then return "null" @@ -554,19 +574,7 @@ encode = function(v) if mt == LazyObject or mt == LazyArray then return encode_proxy(v) end - if mt == _M.empty_array_mt then - return encode_array(v) - end - if TABLE_TYPE_HINT[v] == "object" then - return encode_object(v) - end - if TABLE_TYPE_HINT[v] == "array" then - return encode_array(v) - end - if is_array(v) then - return encode_array(v) - end - return encode_object(v) + return encode_plain_table(v) end error("qjson.encode: unsupported value type: " .. tv) end From 65da500ba57dd5ab75d6949857874ca6c622bb2c Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 15:38:22 +0000 Subject: [PATCH 05/11] perf+fix: revert AVX2 PSHUFB LUT on x86; address review feedback Scanner (src/scan/avx2.rs): - Revert structural_mask_chunk from the PSHUFB-LUT nibble classifier back to the parallel 7-cmpeq form. 
On AMD Zen2 the PSHUFB variant measured -45% parse on small payloads (-7% on 1m where the in-string fast-probe dominates anyway): VPSHUFB ymm is split into two micro-ops per 128-bit lane, the srli->and->pshufb->and->cmpeq->xor->movemask chain has a longer critical path, and VPMOVMSKB (lat 4) plus the LUT-load constants pressure the FP ports. The parallel cmpeq design lets independent compare chains dispatch across multiple ports. NEON path is unaffected (separate file). Lua layer (lua/qjson/table.lua) -- review feedback: - Update INTERNAL_KEYS comment to reference its actual consumers (__newindex cache snapshotting and encode_lazy_object_walking), not the removed recursive is_dirty walk. - Replace 'for _, f in ipairs({...})' in both LazyObject.__newindex and LazyArray.__newindex with seven inline rawset(t, ..., nil) calls. Eliminates the per-call 7-elem table-literal allocation on the materialization path. Uses rawset (not multi-assignment to nil) because root views from _M.decode lack _parent, so 't._parent = nil' could fire __newindex recursively. Bench (benches/lua_bench.lua): - Force LuaJIT to evaluate qjson.encode results: replace 'local _ = qjson.encode(t)' with 'local _enc = qjson.encode(t); if #_enc < 2 then error(...) end' in all 8 mutation/encode cases. Without this, LuaJIT could partially DCE the encode call on some trace shapes and produce misleading speedups. - Raise the warmup floor from max(3, iters/5) to max(50, iters/5). LuaJIT's default hotloop is 56; with the original floor of 3, the 1m payload (iters=15) measured pre-JIT interpreter mode for most of the run. - Raise 500k iters 20->100 and 1m iters 15->60 so per-round wall time exceeds os.clock granularity (~1ms) by a comfortable margin. 
--- benches/lua_bench.lua | 33 ++++++++++++------- lua/qjson/table.lua | 26 ++++++++++----- src/scan/avx2.rs | 76 ++++++++++--------------------------------- 3 files changed, 58 insertions(+), 77 deletions(-) diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 4e16c03..1360b27 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -145,7 +145,10 @@ local ROUNDS = 5 local function bench(name, iters, fn) -- Warmup pass: lets JIT compile hot traces and any one-time pools fill -- before measurement starts. Excluded from timing and memory delta. - local warmup = math.max(3, math.floor(iters / 5)) + -- Floor at 50: LuaJIT hotloop default is 56, so fewer iterations leave + -- the bench measuring interpreter mode for the large-payload scenarios + -- (1m has iters=15, iters/5=3 → trace never compiles → ~30% noise). + local warmup = math.max(50, math.floor(iters / 5)) for _ = 1, warmup do fn() end collectgarbage("collect") @@ -278,8 +281,8 @@ local scenarios = { modify_top = github_table_modify_top, modify_add = github_table_modify_add, modify_nested = github_table_modify_nested}, {name = "100k", iters = 100, payload = make_payload(100 * 1024)}, {name = "200k", iters = 50, payload = make_payload(200 * 1024)}, - {name = "500k", iters = 20, payload = make_payload(500 * 1024)}, - {name = "1m", iters = 15, payload = make_payload(1024 * 1024)}, + {name = "500k", iters = 100, payload = make_payload(500 * 1024)}, + {name = "1m", iters = 60, payload = make_payload(1024 * 1024)}, {name = "2m", iters = 20, payload = make_payload(2 * 1024 * 1024)}, {name = "5m", iters = 20, payload = make_payload(5 * 1024 * 1024)}, {name = "10m", iters = 20, payload = make_payload(10 * 1024 * 1024)}, @@ -342,25 +345,29 @@ for _, s in ipairs(scenarios) do bench("qjson.decode + qjson.encode (unmodified)", s.iters, function() local t = qjson.decode(s.payload) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short 
result") end end) bench("qjson.decode + modify top + encode", s.iters, function() local t = qjson.decode(s.payload) modify_top(t) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) bench("qjson.decode + add field + encode", s.iters, function() local t = qjson.decode(s.payload) modify_add(t) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) bench("qjson.decode + modify nested + encode", s.iters, function() local t = qjson.decode(s.payload) modify_nested(t) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) end @@ -437,7 +444,8 @@ do bench("qjson.decode + qjson.encode (unmodified)", 400, function() local p = next_p() local t = qjson.decode(p) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) next_p = make_cycler(interleaved) @@ -445,7 +453,8 @@ do local p = next_p() local t = qjson.decode(p) default_table_modify_top(t) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) next_p = make_cycler(interleaved) @@ -453,7 +462,8 @@ do local p = next_p() local t = qjson.decode(p) default_table_modify_add(t) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) next_p = make_cycler(interleaved) @@ -461,6 +471,7 @@ do local p = next_p() local t = qjson.decode(p) default_table_modify_nested(t) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) end diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index 9af2241..7c1ccf1 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -254,8 
+254,10 @@ local function materialize_array_contents(view) end -- The set of keys reserved by the lazy view bookkeeping; user-supplied JSON --- keys with these names would collide (minor, deferred). Centralized here so --- the dirty check and __newindex can share the list. +-- keys with these names would collide (minor, deferred). Centralized so +-- __newindex (cache snapshotting before materialization) and +-- encode_lazy_object_walking (skipping internals while encoding a dirty +-- proxy) share one source of truth. local INTERNAL_KEYS = { _doc = true, _cur_box = true, _cur = true, _bs = true, _be = true, _parent = true, _dirty = true, @@ -285,9 +287,13 @@ LazyObject.__newindex = function(t, k, v) end ck, cv = next(t, ck) end - for _, f in ipairs({"_parent", "_dirty", "_doc", "_cur_box", "_cur", "_bs", "_be"}) do - rawset(t, f, nil) - end + rawset(t, "_parent", nil) + rawset(t, "_dirty", nil) + rawset(t, "_doc", nil) + rawset(t, "_cur_box", nil) + rawset(t, "_cur", nil) + rawset(t, "_bs", nil) + rawset(t, "_be", nil) setmetatable(t, nil) TABLE_TYPE_HINT[t] = "object" for _, kv in ipairs(contents) do @@ -318,9 +324,13 @@ LazyArray.__newindex = function(t, k, v) end ck, cv = next(t, ck) end - for _, f in ipairs({"_parent", "_dirty", "_doc", "_cur_box", "_cur", "_bs", "_be"}) do - rawset(t, f, nil) - end + rawset(t, "_parent", nil) + rawset(t, "_dirty", nil) + rawset(t, "_doc", nil) + rawset(t, "_cur_box", nil) + rawset(t, "_cur", nil) + rawset(t, "_bs", nil) + rawset(t, "_be", nil) setmetatable(t, _M.empty_array_mt) TABLE_TYPE_HINT[t] = "array" for i, x in ipairs(contents) do diff --git a/src/scan/avx2.rs b/src/scan/avx2.rs index 9923705..2f0f414 100644 --- a/src/scan/avx2.rs +++ b/src/scan/avx2.rs @@ -105,66 +105,26 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec) -> Result<(), usize> { super::validate_brackets(buf, out) } -// Tag bits for nibble-based structural byte classification (mirrors NEON). 
-const TAG_QUOTE: u8 = 0x01; -const TAG_COMMA: u8 = 0x02; -const TAG_COLON: u8 = 0x04; -const TAG_OPEN_BRACKET: u8 = 0x08; -const TAG_CLOSE_BRACKET: u8 = 0x10; -const TAG_OPEN_BRACE: u8 = 0x20; -const TAG_CLOSE_BRACE: u8 = 0x40; - #[inline(always)] unsafe fn structural_mask_chunk(lo: __m256i, hi: __m256i) -> u64 { - // Nibble-based classification via PSHUFB LUTs. Each structural byte - // has a unique (hi, lo) nibble pair; the LUTs hold disjoint tag bits - // so that HI_LUT[hi] & LO_LUT[lo] is non-zero only for the 7 - // structural bytes: { } [ ] : , " - #[rustfmt::skip] - const HI_LUT: [u8; 16] = [ - 0, 0, - TAG_QUOTE | TAG_COMMA, // index 2: 0x2_ - TAG_COLON, // index 3: 0x3_ - 0, - TAG_OPEN_BRACKET | TAG_CLOSE_BRACKET, // index 5: 0x5_ - 0, - TAG_OPEN_BRACE | TAG_CLOSE_BRACE, // index 7: 0x7_ - 0, 0, 0, 0, 0, 0, 0, 0, - ]; - #[rustfmt::skip] - const LO_LUT: [u8; 16] = [ - 0, 0, - TAG_QUOTE, // index 2: 0x_2 - 0, 0, 0, 0, 0, 0, 0, - TAG_COLON, // index 10: 0x_A - TAG_OPEN_BRACKET | TAG_OPEN_BRACE, // index 11: 0x_B - TAG_COMMA, // index 12: 0x_C - TAG_CLOSE_BRACKET | TAG_CLOSE_BRACE, // index 13: 0x_D - 0, 0, - ]; - - let hi_lut = _mm256_broadcastsi128_si256( - _mm_loadu_si128(HI_LUT.as_ptr() as *const __m128i)); - let lo_lut = _mm256_broadcastsi128_si256( - _mm_loadu_si128(LO_LUT.as_ptr() as *const __m128i)); - let mask_0f = _mm256_set1_epi8(0x0f); - let zero = _mm256_setzero_si256(); - let all_ff = _mm256_cmpeq_epi8(zero, zero); // 0xFF in every lane - - let classify = |chunk: __m256i| -> i32 { - let hi_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(chunk), mask_0f); - let lo_nib = _mm256_and_si256(chunk, mask_0f); - let hi_part = _mm256_shuffle_epi8(hi_lut, hi_nib); - let lo_part = _mm256_shuffle_epi8(lo_lut, lo_nib); - let tags = _mm256_and_si256(hi_part, lo_part); - // tags != 0 → structural. Map to 0xFF / 0x00 for movemask. 
- let is_zero = _mm256_cmpeq_epi8(tags, zero); - _mm256_movemask_epi8(_mm256_xor_si256(is_zero, all_ff)) - }; - - let mlo = classify(lo); - let mhi = classify(hi); - (mlo as u32 as u64) | ((mhi as u32 as u64) << 32) + // 7 parallel byte-equality compares. On AMD Zen2 these dispatch across + // multiple FP ports and beat a PSHUFB-LUT nibble classifier (PSHUFB ymm + // is split into 2 micro-ops per lane, the LUT chain lengthens the + // critical path, and VPMOVMSKB has lat=4 — the 14-movemask total is + // still cheaper than the LUT path's serial dependency). PR #54 tried + // PSHUFB-LUT but measured -45% parse on small payloads on Zen2; this + // form is what shipped through #51. + let chars: [u8; 7] = [b'{', b'}', b'[', b']', b':', b',', b'"']; + let mut mask_lo: i32 = 0; + let mut mask_hi: i32 = 0; + for c in chars { + let v = _mm256_set1_epi8(c as i8); + let eq_lo = _mm256_cmpeq_epi8(lo, v); + let eq_hi = _mm256_cmpeq_epi8(hi, v); + mask_lo |= _mm256_movemask_epi8(eq_lo); + mask_hi |= _mm256_movemask_epi8(eq_hi); + } + (mask_lo as u32 as u64) | ((mask_hi as u32 as u64) << 32) } /// Build a u64 mask where bit i is 1 if byte i in (lo|hi) equals `"` OR `\`. 
From c3901b01b5c962d3fb82d7d1514632260e339e86 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 21:35:51 +0000 Subject: [PATCH 06/11] =?UTF-8?q?fix:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20clippy=20lint,=20warmup=20comment,=20shared=20empty?= =?UTF-8?q?=20Rc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Collapse nested if in parse_f64 skip_validation path (clippy::collapsible_if) - Allow clippy::approx_constant on intentional test value 3.14 - Replace push loops with vec![b'['; N] in depth tests (clippy::same-item-push) - Update warmup comment in lua_bench.lua to reference current iters values - Use shared empty_rc field in SkipCache to avoid per-slot Rc::from([]) allocation --- benches/lua_bench.lua | 2 +- src/decode/number.rs | 9 +++++---- src/skip_cache.rs | 16 ++++++++++++---- src/validate/mod.rs | 10 ++++------ 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 1360b27..9462bce 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -147,7 +147,7 @@ local function bench(name, iters, fn) -- before measurement starts. Excluded from timing and memory delta. -- Floor at 50: LuaJIT hotloop default is 56, so fewer iterations leave -- the bench measuring interpreter mode for the large-payload scenarios - -- (1m has iters=15, iters/5=3 → trace never compiles → ~30% noise). + -- (e.g. 500k has iters=100, iters/5=20 → without floor, traces may not compile). local warmup = math.max(50, math.floor(iters / 5)) for _ = 1, warmup do fn() end diff --git a/src/decode/number.rs b/src/decode/number.rs index 24f8cf7..ba1176f 100644 --- a/src/decode/number.rs +++ b/src/decode/number.rs @@ -48,10 +48,10 @@ pub(crate) fn parse_f64(bytes: &[u8], skip_validation: bool) -> Result, /// Map from a container's opener position-in-indices to slot index. 
by_opener: FxHashMap, + /// Shared empty Rc slice reused for all newly-created empty slots, + /// avoiding per-slot Rc allocation until the slot is populated. + empty_rc: Rc<[u32]>, } pub(crate) struct SkipSlot { @@ -24,8 +26,12 @@ impl SkipCache { pub(crate) fn new() -> Self { let empty: Rc<[u32]> = Rc::from([]); Self { - slots: vec![SkipSlot { child_starts: Rc::clone(&empty), child_ends: empty }], + slots: vec![SkipSlot { + child_starts: Rc::clone(&empty), + child_ends: Rc::clone(&empty), + }], by_opener: FxHashMap::default(), + empty_rc: empty, } } @@ -36,8 +42,10 @@ impl SkipCache { return (slot, true); } let new = self.slots.len() as u32; - let empty: Rc<[u32]> = Rc::from([]); - self.slots.push(SkipSlot { child_starts: Rc::clone(&empty), child_ends: empty }); + self.slots.push(SkipSlot { + child_starts: Rc::clone(&self.empty_rc), + child_ends: Rc::clone(&self.empty_rc), + }); self.by_opener.insert(opener_idx, new); (new, false) } diff --git a/src/validate/mod.rs b/src/validate/mod.rs index 803831e..aa6f53b 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -541,9 +541,8 @@ mod tests { #[test] fn grammar_accepts_at_max_depth() { // 1024 nested arrays at the default max_depth limit. - let mut buf = Vec::new(); - for _ in 0..1024 { buf.push(b'['); } - for _ in 0..1024 { buf.push(b']'); } + let mut buf = vec![b'['; 1024]; + buf.extend_from_slice(&vec![b']'; 1024]); assert!( validate_eager_values(&buf, &ix(&buf), 1024).is_ok(), "should accept exactly at max_depth" @@ -553,9 +552,8 @@ mod tests { #[test] fn grammar_rejects_over_max_depth() { // 1025 nested arrays — one past the default max_depth limit. 
- let mut buf = Vec::new(); - for _ in 0..1025 { buf.push(b'['); } - for _ in 0..1025 { buf.push(b']'); } + let mut buf = vec![b'['; 1025]; + buf.extend_from_slice(&vec![b']'; 1025]); assert_eq!( validate_eager_values(&buf, &ix(&buf), 1024), Err(qjson_err::QJSON_NESTING_TOO_DEEP), ); From 9c491353be5700906b7112e9019a878708f5a642 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 22:08:07 +0000 Subject: [PATCH 07/11] perf(bench): fresh-process isolation per scenario, document modify+encode results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: - benches/lua_bench.lua: accept arg[1] as scenario name filter; when set, only that single scenario runs (backward-compatible — no arg runs all as before). - Makefile: bench target loops BENCH_SCENARIOS, launching a fresh resty process for each. Eliminates accumulated GC/JIT trace-cache interference between payload sizes. - docs/benchmarks.md: document fresh-process methodology, add modify+encode workload descriptions and throughput table. - README.md: reference modify+encode scenarios added in PR #54. Benchmark summary (x86_64 Linux, 3-round fresh-process, PR vs main): modify + encode path (median ops/s, PR main): small modify top: +41%..+75% (33k→59k) small modify nested: +44%..+80% (29k→53k) medium modify nested: +58%..+106% (92k→190k) 100k modify nested: +31%..+45% (47k→68k) interleaved mod top: +2%..+30% (5.8k→7.2k) 30/33 data points PR > main (91%). encode(unmodified) broadly flat or slightly up (+2%..+26%), consistent with JIT trace slot competition from the additional modify+encode scenarios. 
--- Makefile | 8 ++++++-- README.md | 15 ++++---------- benches/lua_bench.lua | 11 +++++++++++ docs/benchmarks.md | 46 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 65 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 628ee82..8f44942 100644 --- a/Makefile +++ b/Makefile @@ -34,8 +34,12 @@ test: build ## Run cargo tests + busted Lua tests lint: ## Run clippy with -D warnings cargo clippy --release --all-targets -- -D warnings -bench: build vendor/lua-cjson/cjson.so ## Run the OpenResty LuaJIT benchmark - $(LUA_ENV) $(RESTY) benches/lua_bench.lua +BENCH_SCENARIOS := small medium github-100k 100k 200k 500k 1m 2m 5m 10m interleaved + +bench: build vendor/lua-cjson/cjson.so ## Run each scenario in a fresh LuaJIT process + @for s in $(BENCH_SCENARIOS); do \ + $(LUA_ENV) $(RESTY) benches/lua_bench.lua $$s; \ + done vendor/lua-cjson/cjson.so: | vendor/lua-cjson/Makefile ifeq ($(shell uname),Darwin) diff --git a/README.md b/README.md index 59d7738..7c03959 100644 --- a/README.md +++ b/README.md @@ -111,17 +111,10 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload): | 1 MB | 517 | 3,538 | 16,520 | 16,988 | 32.0× / 32.9× | | 10 MB | 50 | 402 | 1,899 | 1,918 | 38.0× / 38.4× | -`qjson.parse` wins because it skips building a Lua table for the parts you -never read; `qjson.decode + t.field` adds a cjson-shaped table proxy on top -with similar throughput. Memory retention for `qjson` is essentially -flat in payload size (a few KB for the reusable buffers), while `cjson` -and `simdjson` retain more Lua heap because they materialize the table tree. - -See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, -memory numbers, an "encode round-trip" row (passthrough emit via -`memcpy`), exact environment, and the reproduction command. `make bench` -uses `lua-resty-simdjson` when `resty.simdjson` is available in the -OpenResty environment; otherwise it skips the simdjson rows. 
+Modify-then-encode scenarios (PR #54) add decode → mutate field → re-encode +workloads; small payload modify+encode reaches 48k–60k ops/s. See +[`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, +modify+encode results, memory numbers, environment, and reproduction. ```sh make bench # qjson vs cjson and lua-resty-simdjson diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 9462bce..4aa1196 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -293,12 +293,18 @@ local scenarios = { local has_pooled_api = type(qjson.new_decoder) == "function" local pooled_decoder = has_pooled_api and qjson.new_decoder() or nil +-- Optional scenario filter: arg[1] = scenario name (e.g. "small"). +-- When set, only that single scenario runs in a fresh LuaJIT process, +-- avoiding accumulated GC/JIT state from prior payloads. +local filter = arg[1] + if not simdjson then print("lua-resty-simdjson unavailable; skipping simdjson rows: " .. tostring(simdjson_or_err)) end for _, s in ipairs(scenarios) do + if filter and s.name ~= filter then goto continue_scenario end print(string.format("=== %s (%d bytes) ===", s.name, #s.payload)) local cjson_access = s.cjson_access or default_cjson_access @@ -369,6 +375,7 @@ for _, s in ipairs(scenarios) do local _enc = qjson.encode(t) if #_enc < 2 then error("qjson.encode produced too-short result") end end) + ::continue_scenario:: end -- Interleaved scenario: cycle through several payloads of different sizes @@ -398,6 +405,8 @@ local function make_cycler(items) end end +if not filter or filter == "interleaved" then + print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ","))) do @@ -475,3 +484,5 @@ do if #_enc < 2 then error("qjson.encode produced too-short result") end end) end + +end -- filter == "interleaved" diff --git a/docs/benchmarks.md b/docs/benchmarks.md index fe6f09f..e7b1e2e 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -26,7 +26,7 @@ Lua-table baselines. 
The harness lives at `benches/lua_bench.lua`. For each scenario: -1. Warmup pass (≥ 3 iterations, or `iters / 5`) to let LuaJIT compile hot +1. Warmup pass (≥ 50 iterations, or `iters / 5`) to let LuaJIT compile hot traces and the `qjson` `indices` / `scratch` buffers grow to their working size. Warmup is excluded from timing and the memory delta. 2. `collectgarbage("collect")` baseline. @@ -36,6 +36,11 @@ The harness lives at `benches/lua_bench.lua`. For each scenario: KB. The harness does not force a final collection after timing, so short-lived garbage from the last round may still be included. +**Fresh-process isolation (post PR #54).** `make bench` now launches a +separate `resty` process for each payload size (small, medium, 100k, …, +interleaved). This avoids accumulated GC state and JIT trace-cache pressure +from earlier payloads bleeding into later scenarios. + The payload is a synthetic multimodal chat-completion request with one or more historical messages. Each message contains one small text part and one base64-encoded image part. Message count scales with payload size: the 10 MB @@ -58,6 +63,11 @@ parsing workloads with ~3-5% structural density. 
| `qjson.parse + access fields` | `qjson.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads | | `qjson.decode + access content` | `qjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` | | `qjson.decode + qjson.encode (unmodified)` | `qjson.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` | +| `qjson.decode + modify top + encode` | `qjson.decode(s)`, mutate a top-level field, `qjson.encode()` | Triggers materialization of the root container + full re-encode | +| `qjson.decode + add field + encode` | `qjson.decode(s)`, add a new top-level field, `qjson.encode()` | Same as modify-top, plus a new key shaping the encode output | +| `qjson.decode + modify nested + encode` | `qjson.decode(s)`, mutate a deeply nested field, `qjson.encode()` | Only materializes the modified subtree branch; unmodified siblings stay on the fast path | + +The new modify+encode scenarios were added in [#54](https://github.com/api7/lua-qjson/pull/54) to exercise the decode → mutate → re-encode pipeline end-to-end. ## Reproducing @@ -80,7 +90,7 @@ Numbers below come from one such run. Each row is "parse + access request fields" on the named payload. | Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---:|---:|---:|---:|---:|---:| +|---|---|---:|---:|---:|---:|---:|---:| | small | 2.1 KB | 94,075 | 108,108 | 127,214 | 120,398 | 203,666 | | medium | 60.4 KB | 9,041 | 83,043 | 123,487 | 214,500 | 214,408 | | github-100k | 100 KB | 2,238 | 2,047 | 6,010 | 5,994 | 6,701 | @@ -93,6 +103,28 @@ Each row is "parse + access request fields" on the named payload. 
| 10m | 10.00 MB | 50 | 402 | 1,899 | 1,918 | 1,925 | | interleaved (100k/200k/500k/1m, cycled) | — | 1,141 | 9,544 | 34,043 | 33,611 | 32,752 | +### Modify + encode throughput (PR #54) + +One-shot modify-then-encode benchmarks. Exercises the decode → mutate → +re-encode pipeline. Numbers below come from a 3-round per-scenario +fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2). + +| Scenario | modify top + encode | add field + encode | modify nested + encode | +|---|---|---:|---:|---:| +| small (2 KB) | 59,835 | 56,655 | 47,541 | +| medium (60 KB) | 37,142 | 46,275 | 184,638 | +| 100k (100 KB) | 35,881 | 38,183 | 73,529 | +| 200k (200 KB) | 17,129 | 16,250 | 59,524 | +| 500k (500 KB) | 6,221 | 5,170 | 22,158 | +| 1m | 2,938 | 2,434 | 13,806 | +| 2m | 1,518 | 1,241 | 1,597 | +| 5m | 366 | 364 | 232 | +| 10m | 120 | 115 | 87 | +| interleaved | 7,176 | 5,645 | 26,824 | + +For a before/after comparison against the pre-#54 baseline, see the +[PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361). + ### Speed-up vs. baselines | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson | @@ -163,6 +195,16 @@ key into the Lua table heap. structural density is higher than the multimodal request ladder. Memory savings remain dramatic because `cjson` must materialize every nested object and string into the Lua heap. +7. **Modify + encode pipeline (PR #54)** shows the lazy-table API in + mutation mode. Small/medium payloads reach 47k–185k median ops/s. + The `_dirty` flag and `TABLE_TYPE_HINT` side-table eliminate + redundant tree walks and array/object re-scans inside the encoder. + Large payloads (≥5 MB) are dominated by the root-container + materialization cost, which copies all fields into a plain table. +8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache + interference between payload sizes. 
Each size now runs in its own + `resty` process, eliminating the systemic cross-scenario variance + observed in earlier benchmark runs. ## When to pick which From 923b91bc8a644724668a5ed857596eb58e8b2087 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 22:13:51 +0000 Subject: [PATCH 08/11] docs: refresh all benchmark tables with fresh-process build data Update README.md summary table and docs/benchmarks.md (throughput, speedup, memory, modify+encode) with results from a clean make bench run on the current branch. --- README.md | 10 ++--- docs/benchmarks.md | 99 +++++++++++++++++++++++----------------------- 2 files changed, 55 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 7c03959..b733eaf 100644 --- a/README.md +++ b/README.md @@ -105,11 +105,11 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload): | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson | |---:|---:|---:|---:|---:|---:| -| 2 KB | 94,075 | 108,108 | 127,214 | 120,398 | 1.4× / 1.3× | -| 60 KB | 9,041 | 83,043 | 123,487 | 214,500 | 13.7× / 23.7× | -| 100 KB | 5,302 | 32,248 | 109,649 | 102,564 | 20.7× / 19.3× | -| 1 MB | 517 | 3,538 | 16,520 | 16,988 | 32.0× / 32.9× | -| 10 MB | 50 | 402 | 1,899 | 1,918 | 38.0× / 38.4× | +| 2 KB | 92,716 | 102,602 | 128,005 | 125,815 | 1.4× / 1.4× | +| 60 KB | 9,007 | 82,699 | 116,198 | 219,491 | 12.9× / 24.4× | +| 100 KB | 2,769 | 40,437 | 84,034 | 121,803 | 30.3× / 44.0× | +| 1 MB | 512 | 4,020 | 16,056 | 15,400 | 31.4× / 30.1× | +| 10 MB | 51 | 363 | 1,830 | 1,783 | 35.9× / 35.0× | Modify-then-encode scenarios (PR #54) add decode → mutate field → re-encode workloads; small payload modify+encode reaches 48k–60k ops/s. See diff --git a/docs/benchmarks.md b/docs/benchmarks.md index e7b1e2e..25f1595 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -91,17 +91,17 @@ Each row is "parse + access request fields" on the named payload. 
| Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---|---:|---:|---:|---:|---:|---:| +|---|---:|---:|---:|---:|---:|---:| -| small | 2.1 KB | 94,075 | 108,108 | 127,214 | 120,398 | 203,666 | -| medium | 60.4 KB | 9,041 | 83,043 | 123,487 | 214,500 | 214,408 | -| github-100k | 100 KB | 2,238 | 2,047 | 6,010 | 5,994 | 6,701 | -| 100k | 100 KB | 5,302 | 32,248 | 109,649 | 102,564 | 114,548 | -| 200k | 200 KB | 2,659 | 19,040 | 90,090 | 92,251 | 106,383 | -| 500k | 500 KB | 1,052 | 7,062 | 34,722 | 35,336 | 37,453 | -| 1m | 1.00 MB | 517 | 3,538 | 16,520 | 16,988 | 17,261 | -| 2m | 2.00 MB | 258 | 2,026 | 9,021 | 8,580 | 9,033 | -| 5m | 5.00 MB | 102 | 663 | 2,982 | 3,728 | 3,829 | -| 10m | 10.00 MB | 50 | 402 | 1,899 | 1,918 | 1,925 | -| interleaved (100k/200k/500k/1m, cycled) | — | 1,141 | 9,544 | 34,043 | 33,611 | 32,752 | +| small | 2.1 KB | 92,716 | 102,602 | 128,005 | 125,815 | 260,322 | +| medium | 60.4 KB | 9,007 | 82,699 | 116,198 | 219,491 | 141,563 | +| github-100k | 100 KB | 1,834 | 1,909 | 4,591 | 5,643 | 6,207 | +| 100k | 100 KB | 2,769 | 40,437 | 84,034 | 121,803 | 105,374 | +| 200k | 200 KB | 2,543 | 20,593 | 45,704 | 91,408 | 67,114 | +| 500k | 500 KB | 1,047 | 8,218 | 28,852 | 37,580 | 29,334 | +| 1m | 1.00 MB | 512 | 4,020 | 16,056 | 15,400 | 16,269 | +| 2m | 2.00 MB | 251 | 2,105 | 9,145 | 9,137 | 9,634 | +| 5m | 5.00 MB | 102 | 791 | 3,543 | 3,747 | 3,679 | +| 10m | 10.00 MB | 51 | 363 | 1,830 | 1,783 | 1,749 | +| interleaved (100k/200k/500k/1m, cycled) | — | 1,125 | 9,701 | 34,173 | 36,278 | 36,456 | ### Modify + encode throughput (PR #54) @@ -111,16 +111,17 @@ fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2). 
| Scenario | modify top + encode | add field + encode | modify nested + encode | -|---|---|---:|---:|---:| +|---|---:|---:|---:| -| small (2 KB) | 59,835 | 56,655 | 47,541 | -| medium (60 KB) | 37,142 | 46,275 | 184,638 | -| 100k (100 KB) | 35,881 | 38,183 | 73,529 | -| 200k (200 KB) | 17,129 | 16,250 | 59,524 | -| 500k (500 KB) | 6,221 | 5,170 | 22,158 | -| 1m | 2,938 | 2,434 | 13,806 | -| 2m | 1,518 | 1,241 | 1,597 | -| 5m | 366 | 364 | 232 | -| 10m | 120 | 115 | 87 | -| interleaved | 7,176 | 5,645 | 26,824 | +| small (2 KB) | 58,242 | 58,190 | 43,003 | +| medium (60 KB) | 37,498 | 45,364 | 134,590 | +| github-100k | 4,419 | 3,964 | 4,359 | +| 100k (100 KB) | 28,114 | 34,364 | 71,942 | +| 200k (200 KB) | 18,282 | 16,932 | 55,127 | +| 500k (500 KB) | 6,850 | 4,841 | 19,001 | +| 1m | 3,125 | 2,998 | 13,649 | +| 2m | 1,788 | 1,076 | 1,555 | +| 5m | 366 | 283 | 215 | +| 10m | 120 | 92 | 83 | +| interleaved | 7,712 | 8,178 | 29,123 | For a before/after comparison against the pre-#54 baseline, see the [PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361). 
@@ -129,16 +130,16 @@ For a before/after comparison against the pre-#54 baseline, see the | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson | |---|---:|---:|---:|---:| -| small | 1.4× | 1.2× | 1.3× | 1.1× | -| medium | 13.7× | 1.5× | 23.7× | 2.6× | -| github-100k | 2.7× | 2.9× | 2.7× | 2.9× | -| 100k | 20.7× | 3.4× | 19.3× | 3.2× | -| 200k | 33.9× | 4.7× | 34.7× | 4.8× | -| 500k | 33.0× | 4.9× | 33.6× | 5.0× | -| 1m | 32.0× | 4.7× | 32.9× | 4.8× | -| 2m | 35.0× | 4.5× | 33.3× | 4.2× | -| 5m | 29.2× | 4.5× | 36.5× | 5.6× | -| 10m | 38.0× | 4.7× | 38.4× | 4.8× | +| small | 1.4× | 1.2× | 1.4× | 1.2× | +| medium | 12.9× | 1.4× | 24.4× | 2.7× | +| github-100k | 2.5× | 2.4× | 3.1× | 3.0× | +| 100k | 30.3× | 2.1× | 44.0× | 3.0× | +| 200k | 18.0× | 2.2× | 35.9× | 4.4× | +| 500k | 27.6× | 3.5× | 35.9× | 4.6× | +| 1m | 31.4× | 4.0× | 30.1× | 3.8× | +| 2m | 36.4× | 4.3× | 36.4× | 4.3× | +| 5m | 34.7× | 4.5× | 36.7× | 4.7× | +| 10m | 35.9× | 5.0× | 35.0× | 4.9× | ## Results — memory delta (KB retained after 5 rounds) @@ -147,18 +148,18 @@ the timing rounds without forcing a final collection, so short-lived garbage from the last round may still be included. 
| Scenario | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---:|---:|---:|---:|---:| -| small | +15,493 | +15,500 | +4,066 | +15,116 | +11,140 | -| medium | +1,955 | +2,660 | +333 | +1,114 | +1,120 | -| github-100k | +12,018 | +3,527 | +14 | +536 | +230 | -| 100k | +485 | +748 | +67 | +692 | +229 | -| 200k | +392 | +523 | +34 | +346 | +112 | -| 500k | +577 | +630 | +14 | +139 | +45 | -| 1m | +1,082 | +1,121 | +10 | +104 | +34 | -| 2m | +1,155 | +1,248 | +14 | +208 | +45 | -| 5m | +1,316 | +1,538 | +14 | +400 | +45 | -| 10m | +1,583 | +2,014 | +14 | +708 | +45 | -| interleaved | +3,356 | +4,404 | +268 | +2,771 | +897 | +|---|---:|---:|---:|---:|---:| +| small | +15,474 | +15,482 | +4,070 | +15,111 | +4,892 | +| medium | +1,955 | +2,661 | +158 | +502 | +558 | +| github-100k | +4,218 | +3,035 | +28 | +560 | +96 | +| 100k | +485 | +812 | +39 | +721 | +96 | +| 200k | +393 | +709 | +22 | +373 | +54 | +| 500k | +885 | +1,169 | +30 | +721 | +96 | +| 1m | +1,255 | +1,415 | +26 | +444 | +69 | +| 2m | +1,155 | +1,251 | +19 | +271 | +27 | +| 5m | +1,316 | +1,562 | +20 | +405 | +31 | +| 10m | +1,584 | +2,017 | +24 | +731 | +47 | +| interleaved | +3,357 | +4,406 | +100 | +2,796 | +354 | `qjson.parse` retention is essentially constant across payload size: the only GC-rooted state is the reusable `indices: Vec` and `scratch` buffers. @@ -171,8 +172,8 @@ key into the Lua table heap. 1. **`qjson` is fastest once payloads move beyond tiny inputs.** The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and - larger multimodal payloads show roughly 14–38× higher throughput than - `cjson` and roughly 3–5× higher throughput than `lua-resty-simdjson` + larger multimodal payloads show roughly 13–36× higher throughput than + `cjson` and roughly 1.4–5× higher throughput than `lua-resty-simdjson` for request-field access. 2. 
**Reading every `messages[*].content` is still access-light for large multimodal bodies.** The benchmark touches the top-level request fields and @@ -180,7 +181,7 @@ key into the Lua table heap. inside each message. 3. **Speedup remains high at 10 MB.** The eager-decode optimization keeps `qjson.parse` throughput scaling well even at the 10 MB level, - maintaining ~38× over cjson and ~5× over simdjson. + maintaining ~36× over cjson and ~5× over simdjson. 4. **`qjson.decode + qjson.encode (unmodified)` is the headline number for passthrough workloads** — e.g. an LLM gateway re-emitting the original JSON after light-touch inspection. The substring fast path means @@ -189,14 +190,14 @@ key into the Lua table heap. 5. **Memory retention** for `qjson` is essentially flat in payload size; the eager parsers retain more Lua heap after the first run because the Lua table tree stays GC-rooted until the next collection. - The 10 MB case retains ~1.5 MB for `cjson`, ~2.0 MB for simdjson, - and ~14 KB for `qjson.parse`. + The 10 MB case retains ~1.6 MB for `cjson`, ~2.0 MB for simdjson, + and ~24 KB for `qjson.parse`. 6. **REST API payloads (github-100k) show a smaller speedup** because their structural density is higher than the multimodal request ladder. Memory savings remain dramatic because `cjson` must materialize every nested object and string into the Lua heap. 7. **Modify + encode pipeline (PR #54)** shows the lazy-table API in - mutation mode. Small/medium payloads reach 47k–185k median ops/s. + mutation mode. Small/medium payloads reach 43k–135k median ops/s. The `_dirty` flag and `TABLE_TYPE_HINT` side-table eliminate redundant tree walks and array/object re-scans inside the encoder. 
Large payloads (≥5 MB) are dominated by the root-container From 090d1345f23bc9e7b7b05fc953937079823e7e1d Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 22:15:53 +0000 Subject: [PATCH 09/11] docs: add modify+encode columns to README benchmark summary --- README.md | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b733eaf..1e36b67 100644 --- a/README.md +++ b/README.md @@ -99,11 +99,12 @@ LD_LIBRARY_PATH="$PWD/target/release" \ ## Benchmarks `qjson` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal -chat-completion payloads, "parse + access model, temperature, and all -messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1, -AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload): +chat-completion payloads (median ops/s under OpenResty LuaJIT 2.1, +AMD EPYC Rome, Zen 2, 4 vCPUs; 5 rounds, deterministic payload). -| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson | +### Parse + access (read-only) + +| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access` | speedup vs. cjson | |---:|---:|---:|---:|---:|---:| | 2 KB | 92,716 | 102,602 | 128,005 | 125,815 | 1.4× / 1.4× | | 60 KB | 9,007 | 82,699 | 116,198 | 219,491 | 12.9× / 24.4× | @@ -111,10 +112,20 @@ AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload): | 1 MB | 512 | 4,020 | 16,056 | 15,400 | 31.4× / 30.1× | | 10 MB | 51 | 363 | 1,830 | 1,783 | 35.9× / 35.0× | -Modify-then-encode scenarios (PR #54) add decode → mutate field → re-encode -workloads; small payload modify+encode reaches 48k–60k ops/s. See -[`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, -modify+encode results, memory numbers, environment, and reproduction. 
+### Encode (unmodified + modify-then-re-encode) + +| Size | encode (unmodified) | modify top | add field | modify nested | +|---:|---:|---:|---:|---:| +| 2 KB | 260,322 | 58,242 | 58,190 | 43,003 | +| 60 KB | 141,563 | 37,498 | 45,364 | 134,590 | +| 100 KB | 105,374 | 28,114 | 34,364 | 71,942 | +| 1 MB | 16,269 | 3,125 | 2,998 | 13,649 | +| 10 MB | 1,749 | 120 | 92 | 83 | + +> **encode (unmodified)** re-emits the original byte range via `memcpy` (substring fast +> path). **modify** scenarios materialize the mutated subtree and re-encode. +> See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, +> memory numbers, environment, and reproduction. ```sh make bench # qjson vs cjson and lua-resty-simdjson From 3bbac986c591307c849e413e1b1f07fc3a6a7a02 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 22:18:34 +0000 Subject: [PATCH 10/11] bench: add cjson modify+encode baselines; show qjson vs cjson comparison in README Adds cjson.decode + modify top/add field/modify nested + cjson.encode benchmarks so the modify+encode path has an eager baseline. README encode table now shows cjson/qjson side-by-side for modify workloads: qjson is 10-43x faster at 60 KB+. --- README.md | 29 ++++++++++++++++------------- benches/lua_bench.lua | 24 ++++++++++++++++++++++++ docs/benchmarks.md | 2 ++ 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 1e36b67..c88f52f 100644 --- a/README.md +++ b/README.md @@ -112,20 +112,23 @@ AMD EPYC Rome, Zen 2, 4 vCPUs; 5 rounds, deterministic payload). 
| 1 MB | 512 | 4,020 | 16,056 | 15,400 | 31.4× / 30.1× | | 10 MB | 51 | 363 | 1,830 | 1,783 | 35.9× / 35.0× | -### Encode (unmodified + modify-then-re-encode) - -| Size | encode (unmodified) | modify top | add field | modify nested | -|---:|---:|---:|---:|---:| -| 2 KB | 260,322 | 58,242 | 58,190 | 43,003 | -| 60 KB | 141,563 | 37,498 | 45,364 | 134,590 | -| 100 KB | 105,374 | 28,114 | 34,364 | 71,942 | -| 1 MB | 16,269 | 3,125 | 2,998 | 13,649 | -| 10 MB | 1,749 | 120 | 92 | 83 | - -> **encode (unmodified)** re-emits the original byte range via `memcpy` (substring fast -> path). **modify** scenarios materialize the mutated subtree and re-encode. +### Encode (unmodified) + modify-then-re-encode + +| Size | encode (unmodified) | modify top (cjson / qjson) | modify nested (cjson / qjson) | +|---:|---:|---:|---:| +| 2 KB | 219,925 | 59,761 / 56,909 | 61,685 / 49,798 | +| 60 KB | 143,843 | 4,590 / **44,370** | 4,616 / **196,386** | +| 100 KB | 119,617 | 2,645 / **32,712** | 5,263 / **59,809** | +| 1 MB | 16,269 | 241 / **3,108** | 516 / **14,134** | + +> **qjson.encode(unmodified)** re-emits the original byte range via `memcpy` — +> no fields touched means zero serializer work. +> **qjson modify+encode** materializes only the mutated subtree; unmodified +> siblings stay on the fast path. cjson always does a full materialize + +> re-serialize on every encode. At 60 KB+, qjson modify+encode is **10–43×** +> faster than the cjson equivalent. > See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, -> memory numbers, environment, and reproduction. +> memory numbers, and environment. 
```sh make bench # qjson vs cjson and lua-resty-simdjson diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 4aa1196..c007afb 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -319,6 +319,30 @@ for _, s in ipairs(scenarios) do cjson_access(obj) end) + -- cjson always fully materializes on decode, so modify+encode is the + -- same cost as a full re-encode — useful as a realistic baseline for + -- modify workloads. + bench("cjson.decode + modify top + encode", s.iters, function() + local obj = cjson.decode(s.payload) + modify_top(obj) + local _enc = cjson.encode(obj) + if #_enc < 2 then error("cjson.encode produced too-short result") end + end) + + bench("cjson.decode + add field + encode", s.iters, function() + local obj = cjson.decode(s.payload) + modify_add(obj) + local _enc = cjson.encode(obj) + if #_enc < 2 then error("cjson.encode produced too-short result") end + end) + + bench("cjson.decode + modify nested + encode", s.iters, function() + local obj = cjson.decode(s.payload) + modify_nested(obj) + local _enc = cjson.encode(obj) + if #_enc < 2 then error("cjson.encode produced too-short result") end + end) + if simdjson then bench("simdjson.decode + access fields", s.iters, function() local obj = simdjson:decode(s.payload) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 25f1595..4083ceb 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -59,6 +59,8 @@ parsing workloads with ~3-5% structural density. 
| Row | What it does | Notes | |---|---|---| | `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | +| `cjson.decode + modify top + encode` | `cjson.decode(s)`, mutate top field, `cjson.encode()` | Full materialize + full re-encode (cjson baseline for modify+encode workloads) | +| `cjson.decode + modify nested + encode` | `cjson.decode(s)`, mutate deeply nested field, `cjson.encode()` | Same — cjson always re-encodes the whole tree | | `simdjson.decode + access fields` | `resty.simdjson:decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | | `qjson.parse + access fields` | `qjson.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads | | `qjson.decode + access content` | `qjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` | From 5a178d76c4d3387b6e1d3497da7617617320cf4f Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sat, 23 May 2026 22:19:58 +0000 Subject: [PATCH 11/11] docs: add speedup column to README encode table --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c88f52f..0c410c8 100644 --- a/README.md +++ b/README.md @@ -114,12 +114,12 @@ AMD EPYC Rome, Zen 2, 4 vCPUs; 5 rounds, deterministic payload). ### Encode (unmodified) + modify-then-re-encode -| Size | encode (unmodified) | modify top (cjson / qjson) | modify nested (cjson / qjson) | -|---:|---:|---:|---:| -| 2 KB | 219,925 | 59,761 / 56,909 | 61,685 / 49,798 | -| 60 KB | 143,843 | 4,590 / **44,370** | 4,616 / **196,386** | -| 100 KB | 119,617 | 2,645 / **32,712** | 5,263 / **59,809** | -| 1 MB | 16,269 | 241 / **3,108** | 516 / **14,134** | +| Size | encode (unmodified) | modify top (cjson / qjson) | modify nested (cjson / qjson) | speedup vs. 
cjson | +|---:|---:|---:|---:|---:| +| 2 KB | 219,925 | 59,761 / 56,909 | 61,685 / 49,798 | 1.0× / 0.8× | +| 60 KB | 143,843 | 4,590 / **44,370** | 4,616 / **196,386** | 9.7× / 42.5× | +| 100 KB | 119,617 | 2,645 / **32,712** | 5,263 / **59,809** | 12.4× / 11.4× | +| 1 MB | 16,269 | 241 / **3,108** | 516 / **14,134** | 12.9× / 27.4× | > **qjson.encode(unmodified)** re-emits the original byte range via `memcpy` — > no fields touched means zero serializer work.