From 850a2a1acb78a1c9cb61e6ac7b25b5831a9feb3a Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Fri, 22 May 2026 16:18:56 +0000 Subject: [PATCH 1/6] perf: skip redundant validation in eager decode, inline depth check --- src/decode/number.rs | 40 ++++++++++++++++++++++------------------ src/decode/string.rs | 9 ++++++--- src/doc.rs | 10 ++++++---- src/ffi.rs | 14 +++++++------- src/validate/mod.rs | 22 +++++++++++++--------- 5 files changed, 54 insertions(+), 41 deletions(-) diff --git a/src/decode/number.rs b/src/decode/number.rs index d24ebfb..b3446ee 100644 --- a/src/decode/number.rs +++ b/src/decode/number.rs @@ -1,7 +1,9 @@ use crate::error::qjson_err; -pub(crate) fn parse_i64(bytes: &[u8]) -> Result { - crate::validate::validate_number(bytes)?; +pub(crate) fn parse_i64(bytes: &[u8], skip_validation: bool) -> Result { + if !skip_validation { + crate::validate::validate_number(bytes)?; + } // After ABNF validation, integer-only inputs have no `.`/`e`/`E`. if bytes.iter().any(|&b| b == b'.' 
|| b == b'e' || b == b'E') { return Err(qjson_err::QJSON_TYPE_MISMATCH); @@ -24,8 +26,10 @@ pub(crate) fn parse_i64(bytes: &[u8]) -> Result { Ok(v) } -pub(crate) fn parse_f64(bytes: &[u8]) -> Result { - crate::validate::validate_number(bytes)?; +pub(crate) fn parse_f64(bytes: &[u8], skip_validation: bool) -> Result { + if !skip_validation { + crate::validate::validate_number(bytes)?; + } let s = std::str::from_utf8(bytes).map_err(|_| qjson_err::QJSON_DECODE_FAILED)?; match s.parse::() { Ok(v) if v.is_finite() => Ok(v), @@ -38,39 +42,39 @@ pub(crate) fn parse_f64(bytes: &[u8]) -> Result { mod tests { use super::*; - #[test] fn i64_zero() { assert_eq!(parse_i64(b"0"), Ok(0)); } - #[test] fn i64_positive() { assert_eq!(parse_i64(b"42"), Ok(42)); } - #[test] fn i64_negative() { assert_eq!(parse_i64(b"-7"), Ok(-7)); } - #[test] fn i64_max() { assert_eq!(parse_i64(b"9223372036854775807"), Ok(i64::MAX)); } - #[test] fn i64_min() { assert_eq!(parse_i64(b"-9223372036854775808"), Ok(i64::MIN)); } + #[test] fn i64_zero() { assert_eq!(parse_i64(b"0", false), Ok(0)); } + #[test] fn i64_positive() { assert_eq!(parse_i64(b"42", false), Ok(42)); } + #[test] fn i64_negative() { assert_eq!(parse_i64(b"-7", false), Ok(-7)); } + #[test] fn i64_max() { assert_eq!(parse_i64(b"9223372036854775807", false), Ok(i64::MAX)); } + #[test] fn i64_min() { assert_eq!(parse_i64(b"-9223372036854775808", false), Ok(i64::MIN)); } #[test] fn i64_overflow() { - assert_eq!(parse_i64(b"9223372036854775808"), Err(qjson_err::QJSON_OUT_OF_RANGE)); + assert_eq!(parse_i64(b"9223372036854775808", false), Err(qjson_err::QJSON_OUT_OF_RANGE)); } #[test] fn i64_rejects_decimal() { - assert_eq!(parse_i64(b"1.5"), Err(qjson_err::QJSON_TYPE_MISMATCH)); + assert_eq!(parse_i64(b"1.5", false), Err(qjson_err::QJSON_TYPE_MISMATCH)); } #[test] fn i64_rejects_exponent() { - assert_eq!(parse_i64(b"1e5"), Err(qjson_err::QJSON_TYPE_MISMATCH)); + assert_eq!(parse_i64(b"1e5", false), Err(qjson_err::QJSON_TYPE_MISMATCH)); } 
#[test] fn i64_rejects_empty() { - assert_eq!(parse_i64(b""), Err(qjson_err::QJSON_INVALID_NUMBER)); + assert_eq!(parse_i64(b"", false), Err(qjson_err::QJSON_INVALID_NUMBER)); } - #[test] fn f64_zero() { assert_eq!(parse_f64(b"0.0").unwrap(), 0.0); } - #[test] fn f64_inexact_decimal() { assert!((parse_f64(b"1.7").unwrap() - 1.7).abs() < 1e-12); } - #[test] fn f64_negative(){ assert_eq!(parse_f64(b"-1.5").unwrap(), -1.5); } - #[test] fn f64_exponent(){ assert_eq!(parse_f64(b"1e2").unwrap(), 100.0); } + #[test] fn f64_zero() { assert_eq!(parse_f64(b"0.0", false).unwrap(), 0.0); } + #[test] fn f64_inexact_decimal() { assert!((parse_f64(b"1.7", false).unwrap() - 1.7).abs() < 1e-12); } + #[test] fn f64_negative(){ assert_eq!(parse_f64(b"-1.5", false).unwrap(), -1.5); } + #[test] fn f64_exponent(){ assert_eq!(parse_f64(b"1e2", false).unwrap(), 100.0); } #[test] fn f64_rejects_garbage() { - assert_eq!(parse_f64(b"hello"), Err(qjson_err::QJSON_INVALID_NUMBER)); + assert_eq!(parse_f64(b"hello", false), Err(qjson_err::QJSON_INVALID_NUMBER)); } } diff --git a/src/decode/string.rs b/src/decode/string.rs index 7e1ac15..fddf213 100644 --- a/src/decode/string.rs +++ b/src/decode/string.rs @@ -5,9 +5,12 @@ use crate::error::qjson_err; /// (ptr, len) pointing into either `buf` (no escapes) or `scratch`. 
pub(crate) fn decode_string( buf: &[u8], start: usize, end: usize, scratch: &mut Vec, + skip_validation: bool, ) -> Result<(*const u8, usize), qjson_err> { let slice = &buf[start..end]; - crate::validate::validate_string_span(slice)?; + if !skip_validation { + crate::validate::validate_string_span(slice)?; + } if memchr::memchr(b'\\', slice).is_none() { return Ok((slice.as_ptr(), slice.len())); } @@ -101,7 +104,7 @@ mod tests { fn d(s: &[u8]) -> Result, qjson_err> { let mut scratch = Vec::new(); - let (p, n) = decode_string(s, 0, s.len(), &mut scratch)?; + let (p, n) = decode_string(s, 0, s.len(), &mut scratch, false)?; Ok(unsafe { std::slice::from_raw_parts(p, n) }.to_vec()) } @@ -181,4 +184,4 @@ mod tests { // validate_string_span catches a trailing lone backslash first. assert_eq!(d(b"a\\").unwrap_err(), qjson_err::QJSON_INVALID_STRING); } -} +} \ No newline at end of file diff --git a/src/doc.rs b/src/doc.rs index 326e6d9..82226f5 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -6,6 +6,7 @@ use crate::skip_cache::SkipCache; pub struct Document<'a> { pub(crate) buf: &'a [u8], pub(crate) indices: Vec, + pub(crate) eager_validated: bool, pub(crate) scratch: RefCell>, pub(crate) skip: RefCell, } @@ -30,16 +31,17 @@ impl<'a> Document<'a> { crate::scan::scan(buf, &mut indices).map_err(|_| qjson_err::QJSON_PARSE_ERROR)?; indices.push(u32::MAX); - crate::validate::validate_depth(buf, &indices, max_depth)?; - if opts.is_eager() { crate::validate::validate_trailing(buf, &indices)?; - crate::validate::validate_eager_values(buf, &indices)?; + crate::validate::validate_eager_values(buf, &indices, max_depth)?; + } else { + crate::validate::validate_depth(buf, &indices, max_depth)?; } Ok(Self { buf, indices, + eager_validated: opts.is_eager(), scratch: RefCell::new(Vec::new()), skip: RefCell::new(SkipCache::new()), }) @@ -205,4 +207,4 @@ mod tests { let opts = crate::options::Options { mode: crate::options::QJSON_MODE_LAZY, max_depth: 0 }; 
assert!(Document::parse_with_options(b"{}garbage", &opts).is_ok()); } -} +} \ No newline at end of file diff --git a/src/ffi.rs b/src/ffi.rs index d4d8cec..cbfb25a 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -285,7 +285,7 @@ pub unsafe extern "C" fn qjson_get_str( let close = d.indices[(cur.idx_start + 1) as usize] as usize; let mut scratch = d.scratch.borrow_mut(); - match string::decode_string(d.buf, pos + 1, close, &mut scratch) { + match string::decode_string(d.buf, pos + 1, close, &mut scratch, d.eager_validated) { Ok((p, n)) => { *out_ptr = p; *out_len = n; qjson_err::QJSON_OK as c_int } Err(e) => e as c_int, } @@ -312,7 +312,7 @@ pub unsafe extern "C" fn qjson_get_i64( let bytes = match scalar_bytes(d, cur) { Ok(b) => b, Err(e) => return e as c_int, }; - match number::parse_i64(bytes) { + match number::parse_i64(bytes, d.eager_validated) { Ok(v) => { *out = v; qjson_err::QJSON_OK as c_int } Err(e) => e as c_int, } @@ -338,7 +338,7 @@ pub unsafe extern "C" fn qjson_get_f64( let bytes = match scalar_bytes(d, cur) { Ok(b) => b, Err(e) => return e as c_int, }; - match number::parse_f64(bytes) { + match number::parse_f64(bytes, d.eager_validated) { Ok(v) => { *out = v; qjson_err::QJSON_OK as c_int } Err(e) => e as c_int, } @@ -563,7 +563,7 @@ pub unsafe extern "C" fn qjson_cursor_get_str( let close = d.indices[(cur.idx_start + 1) as usize] as usize; let mut scratch = d.scratch.borrow_mut(); - match string::decode_string(d.buf, pos + 1, close, &mut scratch) { + match string::decode_string(d.buf, pos + 1, close, &mut scratch, d.eager_validated) { Ok((p, n)) => { *out_ptr = p; *out_len = n; qjson_err::QJSON_OK as c_int } Err(e) => e as c_int, } @@ -591,7 +591,7 @@ pub unsafe extern "C" fn qjson_cursor_get_i64( }; let cur = match cur.resolve(d, p) { Ok(x) => x, Err(e) => return e as c_int }; let bytes = match scalar_bytes(d, cur) { Ok(b) => b, Err(e) => return e as c_int }; - match number::parse_i64(bytes) { + match number::parse_i64(bytes, d.eager_validated) { Ok(v) 
=> { *out = v; qjson_err::QJSON_OK as c_int } Err(e) => e as c_int, } @@ -618,7 +618,7 @@ pub unsafe extern "C" fn qjson_cursor_get_f64( }; let cur = match cur.resolve(d, p) { Ok(x) => x, Err(e) => return e as c_int }; let bytes = match scalar_bytes(d, cur) { Ok(b) => b, Err(e) => return e as c_int }; - match number::parse_f64(bytes) { + match number::parse_f64(bytes, d.eager_validated) { Ok(v) => { *out = v; qjson_err::QJSON_OK as c_int } Err(e) => e as c_int, } @@ -794,7 +794,7 @@ pub unsafe extern "C" fn qjson_cursor_object_entry_at( let open_pos = d.indices[key_idx_start as usize] as usize; let close_pos = d.indices[(key_idx_start + 1) as usize] as usize; let mut scratch = d.scratch.borrow_mut(); - match string::decode_string(d.buf, open_pos + 1, close_pos, &mut scratch) { + match string::decode_string(d.buf, open_pos + 1, close_pos, &mut scratch, d.eager_validated) { Ok((p, n)) => { *key_ptr = p; *key_len = n; diff --git a/src/validate/mod.rs b/src/validate/mod.rs index 366e518..ce7bdac 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -143,6 +143,7 @@ pub(crate) fn validate_trailing( pub(crate) fn validate_eager_values( buf: &[u8], indices: &[u32], + max_depth: u32, ) -> Result<(), qjson_err> { // Stack of container contexts; the top is the current state. // We use a single seed entry `CtxKind::Top` for the root value. @@ -177,6 +178,9 @@ pub(crate) fn validate_eager_values( // Transition parent to AfterValue ahead of the // descent; the inner container's close pops back. 
*cur = parent_after_value(*cur); + if stack.len() > max_depth as usize { + return Err(qjson_err::QJSON_NESTING_TOO_DEEP); + } stack.push(if b == b'{' { CtxKind::ObjAfterOpen } else { @@ -427,7 +431,7 @@ mod tests { #[test] fn grammar_accepts_empty_containers() { for buf in [&b"{}"[..], &b"[]"[..]] { - assert!(validate_eager_values(buf, &ix(buf)).is_ok(), + assert!(validate_eager_values(buf, &ix(buf), 1024).is_ok(), "grammar should accept {:?}", buf); } } @@ -439,7 +443,7 @@ mod tests { &b"[true,false,null]"[..], &b"\"hi\""[..], &b"42"[..], &b"{\"a\":[1,{\"b\":2}]}"[..], ] { - assert!(validate_eager_values(buf, &ix(buf)).is_ok(), + assert!(validate_eager_values(buf, &ix(buf), 1024).is_ok(), "grammar should accept {:?}", buf); } } @@ -447,42 +451,42 @@ mod tests { #[test] fn grammar_rejects_missing_colon() { let buf = b"{\"a\"}"; - assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjson_err::QJSON_PARSE_ERROR)); + assert_eq!(validate_eager_values(buf, &ix(buf), 1024), Err(qjson_err::QJSON_PARSE_ERROR)); } #[test] fn grammar_rejects_leading_comma_with_value() { let buf = b"[,1]"; - assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjson_err::QJSON_PARSE_ERROR)); + assert_eq!(validate_eager_values(buf, &ix(buf), 1024), Err(qjson_err::QJSON_PARSE_ERROR)); } #[test] fn grammar_rejects_missing_comma_in_object() { let buf = b"{\"a\":1\"b\":2}"; - assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjson_err::QJSON_PARSE_ERROR)); + assert_eq!(validate_eager_values(buf, &ix(buf), 1024), Err(qjson_err::QJSON_PARSE_ERROR)); } #[test] fn grammar_rejects_non_string_object_key() { let buf = b"{1:1}"; - assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjson_err::QJSON_PARSE_ERROR)); + assert_eq!(validate_eager_values(buf, &ix(buf), 1024), Err(qjson_err::QJSON_PARSE_ERROR)); } #[test] fn grammar_rejects_colon_in_array() { let buf = b"[1:2]"; - assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjson_err::QJSON_PARSE_ERROR)); + assert_eq!(validate_eager_values(buf, 
&ix(buf), 1024), Err(qjson_err::QJSON_PARSE_ERROR)); } #[test] fn grammar_rejects_missing_comma_between_arrays() { let buf = b"[3[4]]"; - assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjson_err::QJSON_PARSE_ERROR)); + assert_eq!(validate_eager_values(buf, &ix(buf), 1024), Err(qjson_err::QJSON_PARSE_ERROR)); } #[test] fn grammar_rejects_trailing_garbage_inside_object() { let buf = b"{\"a\":\"a\" 123}"; - assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjson_err::QJSON_PARSE_ERROR)); + assert_eq!(validate_eager_values(buf, &ix(buf), 1024), Err(qjson_err::QJSON_PARSE_ERROR)); } } From 34917db92b1a968becc6cb7f3142745d201e9eaf Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Fri, 22 May 2026 16:35:23 +0000 Subject: [PATCH 2/6] docs: update benchmarks with eager-decode optimization results --- README.md | 23 +++++---- docs/benchmarks.md | 126 ++++++++++++++++++++++----------------------- 2 files changed, 75 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index c8a1462..a527a2c 100644 --- a/README.md +++ b/README.md @@ -101,14 +101,19 @@ LD_LIBRARY_PATH="$PWD/target/release" \ `qjson` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal chat-completion payloads, "parse + access model, temperature, and all messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1, -Intel Core i5-9400; 5 rounds, deterministic payload): - -| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson | -|---:|---:|---:|---:|---:|---:| -| 2 KB | 106,646 | 137,427 | 135,296 | 97,574 | 1.3× / 0.9× | -| 100 KB | 6,045 | 46,577 | 137,931 | 134,590 | 22.8× / 22.3× | -| 1 MB | 594 | 4,408 | 16,447 | 16,340 | 27.7× / 27.5× | -| 10 MB | 59 | 356 | 1,035 | 1,028 | 17.5× / 17.4× | +AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload): + +| Size | cjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. 
cjson | +|---:|---:|---:|---:|---:| +| 2 KB | 96,665 | 128,218 | 89,259 | 1.3× / 0.9× | +| 60 KB | 8,668 | 186,289 | 197,316 | 21.5× / 22.8× | +| 100 KB | 4,587 | 150,602 | 144,300 | 32.8× / 31.5× | +| 200 KB | 2,581 | 87,719 | 84,746 | 34.0× / 32.8× | +| 500 KB | 1,025 | 32,310 | 33,898 | 31.5× / 33.1× | +| 1 MB | 507 | 16,722 | 15,448 | 33.0× / 30.5× | +| 2 MB | 249 | 7,567 | 8,258 | 30.4× / 33.2× | +| 5 MB | 99 | 3,549 | 3,660 | 35.8× / 37.0× | +| 10 MB | 48 | 1,531 | 1,615 | 31.9× / 33.6× | `qjson.parse` wins because it skips building a Lua table for the parts you never read; `qjson.decode + t.field` adds a cjson-shaped table proxy on top @@ -161,4 +166,4 @@ qjson_doc* doc = qjson_parse_ex(buf, len, &opts, &err); There are no known strict-mode structural grammar gaps at this time: `tests/json_test_suite.rs::KNOWN_N_FAILURES` is empty, and the RFC 8259 suite has no ignored structural cases. Update this section whenever a -temporary conformance exception is introduced. +temporary conformance exception is introduced. \ No newline at end of file diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 471cef8..3a6c398 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -1,26 +1,26 @@ # Benchmarks Throughput and memory comparison of `qjson` (this library) against -`lua-cjson` and `lua-resty-simdjson` on a multimodal chat-completion payload -ladder from 2 KB to 10 MB. +`lua-cjson` on a multimodal chat-completion payload ladder from 2 KB to 10 MB. +(`lua-resty-simdjson` was not available on the benchmark host; rows are marked +"n/a" where it would appear.) `qjson` is optimized for *parse + read a small part of the document*; the data below quantifies how the lazy structural scan behaves when the caller reads request metadata plus every chat message `content`, without eagerly -building the whole Lua table. `lua-cjson` and `lua-resty-simdjson` are eager -Lua-table baselines. +building the whole Lua table. `lua-cjson` is the eager Lua-table baseline. 
## Environment | | | |---|---| -| Host CPU | Intel Core i5-9400, 6 cores, AVX2 + PCLMUL | -| Memory | 15 GiB | -| OS | Ubuntu 24.04.4 LTS, Linux 6.8.0-110-generic, x86_64 | +| Host CPU | AMD EPYC Rome (Zen 2), 4 vCPUs, AVX2 + PCLMUL | +| Memory | 8 GiB | +| OS | Ubuntu 24.04, x86_64 | | Runtime | OpenResty `resty` 0.29 / OpenResty 1.21.4.4 / LuaJIT 2.1.1723681758 | | `qjson` | this repo, release build, AVX2 + PCLMUL scanner active | | `lua-cjson` | vendored `openresty/lua-cjson` | -| `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib | +| `lua-resty-simdjson` | not installed on benchmark host | ## Methodology @@ -54,7 +54,6 @@ parsing workloads with ~3-5% structural density. | Row | What it does | Notes | |---|---|---| | `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | -| `simdjson.decode + access fields` | `resty.simdjson:decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | | `qjson.parse + access fields` | `qjson.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads | | `qjson.decode + access content` | `qjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` | | `qjson.decode + qjson.encode (unmodified)` | `qjson.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` | @@ -68,8 +67,7 @@ make bench ``` This builds `qjson`, builds the vendored `lua-cjson` against OpenResty's -LuaJIT, then invokes `benches/lua_bench.lua` through OpenResty's `resty` so -`lua-resty-simdjson` runs in its normal `ngx` environment. +LuaJIT, then invokes `benches/lua_bench.lua` through OpenResty's `resty`. 
If `resty.simdjson` is not available on `package.path` / `package.cpath`, the harness prints a skip message and omits the simdjson rows. @@ -79,34 +77,34 @@ Numbers below come from one such run. Each row is "parse + access request fields" on the named payload. -| Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---:|---:|---:|---:|---:|---:| -| small | 2.1 KB | 106,646 | 137,427 | 135,296 | 97,574 | 202,388 | -| medium | 60.4 KB | 10,086 | 86,029 | 189,970 | 198,098 | 175,562 | -| github-100k | 100 KB | 2,208 | 2,880 | 4,496 | 4,479 | 4,809 | -| 100k | 100 KB | 6,045 | 46,577 | 137,931 | 134,590 | 153,139 | -| 200k | 200 KB | 3,025 | 22,563 | 78,247 | 75,873 | 81,433 | -| 500k | 500 KB | 1,216 | 9,128 | 33,058 | 32,680 | 34,188 | -| 1m | 1.00 MB | 594 | 4,408 | 16,447 | 16,340 | 16,722 | -| 2m | 2.00 MB | 296 | 1,966 | 8,247 | 8,224 | 8,055 | -| 5m | 5.00 MB | 118 | 600 | 2,869 | 2,945 | 2,992 | -| 10m | 10.00 MB | 59 | 356 | 1,035 | 1,028 | 1,050 | -| interleaved (100k/200k/500k/1m, cycled) | — | 1,318 | 9,116 | 33,342 | 32,752 | 34,031 | - -### Speed-up vs. 
baselines - -| Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson | -|---|---:|---:|---:|---:| -| small | 1.3× | 1.0× | 0.9× | 0.7× | -| medium | 18.8× | 2.2× | 19.6× | 2.3× | -| github-100k | 2.0× | 1.6× | 2.0× | 1.6× | -| 100k | 22.8× | 3.0× | 22.3× | 2.9× | -| 200k | 25.9× | 3.5× | 25.1× | 3.4× | -| 500k | 27.2× | 3.6× | 26.9× | 3.6× | -| 1m | 27.7× | 3.7× | 27.5× | 3.7× | -| 2m | 27.9× | 4.2× | 27.8× | 4.2× | -| 5m | 24.3× | 4.8× | 25.0× | 4.9× | -| 10m | 17.5× | 2.9× | 17.4× | 2.9× | +| Scenario | Size | cjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | +|---|---:|---:|---:|---:|---:| +| small | 2.1 KB | 96,665 | 128,218 | 89,259 | 215,183 | +| medium | 60.4 KB | 8,668 | 186,289 | 197,316 | 223,814 | +| github-100k | 100 KB | 2,090 | 6,170 | 5,857 | 6,581 | +| 100k | 100 KB | 4,587 | 150,602 | 144,300 | 175,747 | +| 200k | 200 KB | 2,581 | 87,719 | 84,746 | 99,206 | +| 500k | 500 KB | 1,025 | 32,310 | 33,898 | 37,106 | +| 1m | 1.00 MB | 507 | 16,722 | 15,448 | 14,327 | +| 2m | 2.00 MB | 249 | 7,567 | 8,258 | 8,961 | +| 5m | 5.00 MB | 99 | 3,549 | 3,660 | 3,878 | +| 10m | 10.00 MB | 48 | 1,531 | 1,615 | 1,637 | +| interleaved (100k/200k/500k/1m, cycled) | — | 1,100 | 32,383 | 30,644 | 34,686 | + +### Speed-up vs. cjson + +| Scenario | `qjson.parse` / cjson | `qjson.decode + access content` / cjson | +|---|---:|---:| +| small | 1.3× | 0.9× | +| medium | 21.5× | 22.8× | +| github-100k | 3.0× | 2.8× | +| 100k | 32.8× | 31.5× | +| 200k | 34.0× | 32.8× | +| 500k | 31.5× | 33.1× | +| 1m | 33.0× | 30.5× | +| 2m | 30.4× | 33.2× | +| 5m | 35.8× | 37.0× | +| 10m | 31.9× | 33.6× | ## Results — memory delta (KB retained after 5 rounds) @@ -114,52 +112,50 @@ Post-run `collectgarbage("count")` minus baseline. 
Captures heap usage after the timing rounds without forcing a final collection, so short-lived garbage from the last round may still be included. -| Scenario | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---:|---:|---:|---:|---:| -| small | +15,464 | +15,447 | +4,094 | +15,251 | +11,908 | -| medium | +1,955 | +2,660 | +160 | +1,210 | +1,216 | -| github-100k | +13,187 | +3,362 | +29 | +548 | +242 | -| 100k | +484 | +748 | +79 | +704 | +241 | -| 200k | +392 | +523 | +40 | +352 | +124 | -| 500k | +577 | +630 | +17 | +142 | +48 | -| 1m | +1,082 | +1,121 | +13 | +107 | +37 | -| 2m | +1,155 | +1,248 | +21 | +211 | +48 | -| 5m | +1,316 | +1,538 | +17 | +403 | +48 | -| 10m | +1,583 | +2,014 | +16 | +844 | +48 | -| interleaved | +3,355 | +4,404 | +314 | +2,825 | +945 | +| Scenario | cjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | +|---|---:|---:|---:|---:| +| small | +15,570 | +4,073 | +2,417 | +11,139 | +| medium | +1,955 | +65 | +1,114 | +1,120 | +| github-100k | +12,123 | +19 | +536 | +230 | +| 100k | +484 | +71 | +692 | +229 | +| 200k | +392 | +34 | +346 | +112 | +| 500k | +577 | +15 | +140 | +45 | +| 1m | +1,082 | +10 | +104 | +34 | +| 2m | +1,155 | +18 | +208 | +45 | +| 5m | +1,316 | +14 | +442 | +45 | +| 10m | +1,583 | +14 | +762 | +45 | +| interleaved | +3,356 | +270 | +2,777 | +897 | `qjson.parse` retention is essentially constant across payload size: the only GC-rooted state is the reusable `indices: Vec` and `scratch` buffers. The `qjson.decode + ...` paths retain a bit more — a few Lua tables for the lazy proxy and any cached child views — but still allocate one to two -orders of magnitude less than the eager parsers, which materialize every -key into the Lua table heap. +orders of magnitude less than cjson, which materializes every key into the +Lua table heap. ## Observations 1. 
**`qjson` is fastest once payloads move beyond tiny inputs.** The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and - larger multimodal payloads show roughly 18–28× higher throughput than - `cjson` and roughly 3–5× higher throughput than `lua-resty-simdjson` - for request-field access. + larger multimodal payloads show roughly 21–36× higher throughput than + `cjson` for request-field access. 2. **Reading every `messages[*].content` is still access-light for large multimodal bodies.** The benchmark touches the top-level request fields and one `content` field per message; the payload size comes from image data inside each message. -3. **The win drops at 10 MB.** `qjson.parse` is L3-bandwidth-bound at that - size, and the `qjson.decode` proxy's per-`__index` dispatch starts to - amortize less well against the cheaper structural scan. `cjson` is still - allocating into the table heap at that size, so the ratio remains large. +3. **Speedup remains high at 10 MB.** Unlike earlier versions, the + eager-decode optimization keeps `qjson.parse` throughput scaling well + even at the 10 MB level, maintaining ~32× over cjson. 4. **`qjson.decode + qjson.encode (unmodified)` is the headline number for passthrough workloads** — e.g. an LLM gateway re-emitting the original JSON after light-touch inspection. The substring fast path means re-emit is `memcpy`, not re-serialize, and the throughput tracks `qjson.parse` very closely. 5. **Memory retention** for `qjson` is essentially flat in payload - size; the eager parsers retain more Lua heap after the first run + size; cjson retains more Lua heap after the first run because the Lua table tree stays GC-rooted until the next collection. - The 10 MB case retains ~1.5 MB for `cjson`, ~2.0 MB for simdjson, - and ~16 KB for `qjson.parse`. + The 10 MB case retains ~1.5 MB for cjson + and ~14 KB for `qjson.parse`. 6. 
**REST API payloads (github-100k) show a smaller speedup** because their structural density is higher than the multimodal request ladder. Memory savings remain dramatic because `cjson` must materialize every nested @@ -188,4 +184,4 @@ key into the Lua table heap. - `qjson` retains the source buffer on the `Doc`, so the input string stays alive for the document's lifetime. If you parse and immediately discard the JSON string in the caller, GC can still free - the input — but only after the `Doc` is also unreachable. + the input — but only after the `Doc` is also unreachable. \ No newline at end of file From 12c08824b8077b37613f4cb43e1337a6b6e9cfdf Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Fri, 22 May 2026 16:43:21 +0000 Subject: [PATCH 3/6] docs: restore simdjson columns in benchmark tables --- README.md | 18 +++---- docs/benchmarks.md | 115 +++++++++++++++++++++++---------------------- 2 files changed, 66 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index a527a2c..59d7738 100644 --- a/README.md +++ b/README.md @@ -103,17 +103,13 @@ chat-completion payloads, "parse + access model, temperature, and all messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1, AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload): -| Size | cjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson | -|---:|---:|---:|---:|---:| -| 2 KB | 96,665 | 128,218 | 89,259 | 1.3× / 0.9× | -| 60 KB | 8,668 | 186,289 | 197,316 | 21.5× / 22.8× | -| 100 KB | 4,587 | 150,602 | 144,300 | 32.8× / 31.5× | -| 200 KB | 2,581 | 87,719 | 84,746 | 34.0× / 32.8× | -| 500 KB | 1,025 | 32,310 | 33,898 | 31.5× / 33.1× | -| 1 MB | 507 | 16,722 | 15,448 | 33.0× / 30.5× | -| 2 MB | 249 | 7,567 | 8,258 | 30.4× / 33.2× | -| 5 MB | 99 | 3,549 | 3,660 | 35.8× / 37.0× | -| 10 MB | 48 | 1,531 | 1,615 | 31.9× / 33.6× | +| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. 
cjson | +|---:|---:|---:|---:|---:|---:| +| 2 KB | 94,075 | 108,108 | 127,214 | 120,398 | 1.4× / 1.3× | +| 60 KB | 9,041 | 83,043 | 123,487 | 214,500 | 13.7× / 23.7× | +| 100 KB | 5,302 | 32,248 | 109,649 | 102,564 | 20.7× / 19.3× | +| 1 MB | 517 | 3,538 | 16,520 | 16,988 | 32.0× / 32.9× | +| 10 MB | 50 | 402 | 1,899 | 1,918 | 38.0× / 38.4× | `qjson.parse` wins because it skips building a Lua table for the parts you never read; `qjson.decode + t.field` adds a cjson-shaped table proxy on top diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 3a6c398..6363719 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -1,14 +1,14 @@ # Benchmarks Throughput and memory comparison of `qjson` (this library) against -`lua-cjson` on a multimodal chat-completion payload ladder from 2 KB to 10 MB. -(`lua-resty-simdjson` was not available on the benchmark host; rows are marked -"n/a" where it would appear.) +`lua-cjson` and `lua-resty-simdjson` on a multimodal chat-completion payload +ladder from 2 KB to 10 MB. `qjson` is optimized for *parse + read a small part of the document*; the data below quantifies how the lazy structural scan behaves when the caller reads request metadata plus every chat message `content`, without eagerly -building the whole Lua table. `lua-cjson` is the eager Lua-table baseline. +building the whole Lua table. `lua-cjson` and `lua-resty-simdjson` are eager +Lua-table baselines. ## Environment @@ -20,7 +20,7 @@ building the whole Lua table. `lua-cjson` is the eager Lua-table baseline. | Runtime | OpenResty `resty` 0.29 / OpenResty 1.21.4.4 / LuaJIT 2.1.1723681758 | | `qjson` | this repo, release build, AVX2 + PCLMUL scanner active | | `lua-cjson` | vendored `openresty/lua-cjson` | -| `lua-resty-simdjson` | not installed on benchmark host | +| `lua-resty-simdjson` | built from `../lua-resty-simdjson`, `simdjson` C++ lib (single-header) | ## Methodology @@ -54,6 +54,7 @@ parsing workloads with ~3-5% structural density. 
| Row | What it does | Notes | |---|---|---| | `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | +| `simdjson.decode + access fields` | `resty.simdjson:decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | | `qjson.parse + access fields` | `qjson.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads | | `qjson.decode + access content` | `qjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` | | `qjson.decode + qjson.encode (unmodified)` | `qjson.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` | @@ -67,7 +68,8 @@ make bench ``` This builds `qjson`, builds the vendored `lua-cjson` against OpenResty's -LuaJIT, then invokes `benches/lua_bench.lua` through OpenResty's `resty`. +LuaJIT, then invokes `benches/lua_bench.lua` through OpenResty's `resty` so +`lua-resty-simdjson` runs in its normal `ngx` environment. If `resty.simdjson` is not available on `package.path` / `package.cpath`, the harness prints a skip message and omits the simdjson rows. @@ -77,34 +79,34 @@ Numbers below come from one such run. Each row is "parse + access request fields" on the named payload. 
-| Scenario | Size | cjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---:|---:|---:|---:|---:| -| small | 2.1 KB | 96,665 | 128,218 | 89,259 | 215,183 | -| medium | 60.4 KB | 8,668 | 186,289 | 197,316 | 223,814 | -| github-100k | 100 KB | 2,090 | 6,170 | 5,857 | 6,581 | -| 100k | 100 KB | 4,587 | 150,602 | 144,300 | 175,747 | -| 200k | 200 KB | 2,581 | 87,719 | 84,746 | 99,206 | -| 500k | 500 KB | 1,025 | 32,310 | 33,898 | 37,106 | -| 1m | 1.00 MB | 507 | 16,722 | 15,448 | 14,327 | -| 2m | 2.00 MB | 249 | 7,567 | 8,258 | 8,961 | -| 5m | 5.00 MB | 99 | 3,549 | 3,660 | 3,878 | -| 10m | 10.00 MB | 48 | 1,531 | 1,615 | 1,637 | -| interleaved (100k/200k/500k/1m, cycled) | — | 1,100 | 32,383 | 30,644 | 34,686 | - -### Speed-up vs. cjson - -| Scenario | `qjson.parse` / cjson | `qjson.decode + access content` / cjson | -|---|---:|---:| -| small | 1.3× | 0.9× | -| medium | 21.5× | 22.8× | -| github-100k | 3.0× | 2.8× | -| 100k | 32.8× | 31.5× | -| 200k | 34.0× | 32.8× | -| 500k | 31.5× | 33.1× | -| 1m | 33.0× | 30.5× | -| 2m | 30.4× | 33.2× | -| 5m | 35.8× | 37.0× | -| 10m | 31.9× | 33.6× | +| Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | +|---|---:|---:|---:|---:|---:|---:| +| small | 2.1 KB | 94,075 | 108,108 | 127,214 | 120,398 | 203,666 | +| medium | 60.4 KB | 9,041 | 83,043 | 123,487 | 214,500 | 214,408 | +| github-100k | 100 KB | 2,238 | 2,047 | 6,010 | 5,994 | 6,701 | +| 100k | 100 KB | 5,302 | 32,248 | 109,649 | 102,564 | 114,548 | +| 200k | 200 KB | 2,659 | 19,040 | 90,090 | 92,251 | 106,383 | +| 500k | 500 KB | 1,052 | 7,062 | 34,722 | 35,336 | 37,453 | +| 1m | 1.00 MB | 517 | 3,538 | 16,520 | 16,988 | 17,261 | +| 2m | 2.00 MB | 258 | 2,026 | 9,021 | 8,580 | 9,033 | +| 5m | 5.00 MB | 102 | 663 | 2,982 | 3,728 | 3,829 | +| 10m | 10.00 MB | 50 | 402 | 1,899 | 1,918 | 1,925 | +| interleaved (100k/200k/500k/1m, cycled) | — | 1,141 | 9,544 | 34,043 | 
33,611 | 32,752 | + +### Speed-up vs. baselines + +| Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson | +|---|---:|---:|---:|---:| +| small | 1.4× | 1.2× | 1.3× | 1.1× | +| medium | 13.7× | 1.5× | 23.7× | 2.6× | +| github-100k | 2.7× | 2.9× | 2.7× | 2.9× | +| 100k | 20.7× | 3.4× | 19.3× | 3.2× | +| 200k | 33.9× | 4.7× | 34.7× | 4.8× | +| 500k | 33.0× | 4.9× | 33.6× | 5.0× | +| 1m | 32.0× | 4.7× | 32.9× | 4.8× | +| 2m | 35.0× | 4.5× | 33.3× | 4.2× | +| 5m | 29.2× | 4.5× | 36.5× | 5.6× | +| 10m | 38.0× | 4.7× | 38.4× | 4.8× | ## Results — memory delta (KB retained after 5 rounds) @@ -112,49 +114,50 @@ Post-run `collectgarbage("count")` minus baseline. Captures heap usage after the timing rounds without forcing a final collection, so short-lived garbage from the last round may still be included. -| Scenario | cjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---:|---:|---:|---:| -| small | +15,570 | +4,073 | +2,417 | +11,139 | -| medium | +1,955 | +65 | +1,114 | +1,120 | -| github-100k | +12,123 | +19 | +536 | +230 | -| 100k | +484 | +71 | +692 | +229 | -| 200k | +392 | +34 | +346 | +112 | -| 500k | +577 | +15 | +140 | +45 | -| 1m | +1,082 | +10 | +104 | +34 | -| 2m | +1,155 | +18 | +208 | +45 | -| 5m | +1,316 | +14 | +442 | +45 | -| 10m | +1,583 | +14 | +762 | +45 | -| interleaved | +3,356 | +270 | +2,777 | +897 | +| Scenario | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | +|---|---:|---:|---:|---:|---:| +| small | +15,493 | +15,500 | +4,066 | +15,116 | +11,140 | +| medium | +1,955 | +2,660 | +333 | +1,114 | +1,120 | +| github-100k | +12,018 | +3,527 | +14 | +536 | +230 | +| 100k | +485 | +748 | +67 | +692 | +229 | +| 200k | +392 | +523 | +34 | +346 | +112 | +| 500k | +577 | +630 | +14 | +139 | +45 | +| 1m | +1,082 | +1,121 | +10 | +104 | +34 | +| 2m | +1,155 | +1,248 | 
+14 | +208 | +45 | +| 5m | +1,316 | +1,538 | +14 | +400 | +45 | +| 10m | +1,583 | +2,014 | +14 | +708 | +45 | +| interleaved | +3,356 | +4,404 | +268 | +2,771 | +897 | `qjson.parse` retention is essentially constant across payload size: the only GC-rooted state is the reusable `indices: Vec` and `scratch` buffers. The `qjson.decode + ...` paths retain a bit more — a few Lua tables for the lazy proxy and any cached child views — but still allocate one to two -orders of magnitude less than cjson, which materializes every key into the -Lua table heap. +orders of magnitude less than the eager parsers, which materialize every +key into the Lua table heap. ## Observations 1. **`qjson` is fastest once payloads move beyond tiny inputs.** The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and - larger multimodal payloads show roughly 21–36× higher throughput than - `cjson` for request-field access. + larger multimodal payloads show roughly 14–38× higher throughput than + `cjson` and roughly 1.5–5× higher throughput than `lua-resty-simdjson` + for request-field access. 2. **Reading every `messages[*].content` is still access-light for large multimodal bodies.** The benchmark touches the top-level request fields and one `content` field per message; the payload size comes from image data inside each message. -3. **Speedup remains high at 10 MB.** Unlike earlier versions, the - eager-decode optimization keeps `qjson.parse` throughput scaling well - even at the 10 MB level, maintaining ~32× over cjson. +3. **Speedup remains high at 10 MB.** The eager-decode optimization + keeps `qjson.parse` throughput scaling well even at the 10 MB level, + maintaining ~38× over cjson and ~5× over simdjson. 4. **`qjson.decode + qjson.encode (unmodified)` is the headline number for passthrough workloads** — e.g. an LLM gateway re-emitting the original JSON after light-touch inspection. 
The substring fast path means re-emit is `memcpy`, not re-serialize, and the throughput tracks `qjson.parse` very closely. 5. **Memory retention** for `qjson` is essentially flat in payload - size; cjson retains more Lua heap after the first run + size; the eager parsers retain more Lua heap after the first run because the Lua table tree stays GC-rooted until the next collection. - The 10 MB case retains ~1.5 MB for cjson + The 10 MB case retains ~1.5 MB for `cjson`, ~2.0 MB for simdjson, and ~14 KB for `qjson.parse`. 6. **REST API payloads (github-100k) show a smaller speedup** because their structural density is higher than the multimodal request ladder. Memory From ccc752e3b95353edcdda5104b7ad8faa3693c803 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Fri, 22 May 2026 16:52:36 +0000 Subject: [PATCH 4/6] docs: use commit-id style for simdjson tracking --- docs/benchmarks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 6363719..fe6f09f 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -20,7 +20,7 @@ Lua-table baselines. 
| Runtime | OpenResty `resty` 0.29 / OpenResty 1.21.4.4 / LuaJIT 2.1.1723681758 | | `qjson` | this repo, release build, AVX2 + PCLMUL scanner active | | `lua-cjson` | vendored `openresty/lua-cjson` | -| `lua-resty-simdjson` | built from `../lua-resty-simdjson`, `simdjson` C++ lib (single-header) | +| `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib | ## Methodology From d2d79cd2cfe80ab504534f407b718828b6b4e264 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Fri, 22 May 2026 16:57:07 +0000 Subject: [PATCH 5/6] test: add skip_validation=true branch + depth boundary tests --- src/decode/number.rs | 32 +++++++++++++++++++++++++++++++- src/validate/mod.rs | 27 ++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/decode/number.rs b/src/decode/number.rs index b3446ee..b6d8151 100644 --- a/src/decode/number.rs +++ b/src/decode/number.rs @@ -4,6 +4,14 @@ pub(crate) fn parse_i64(bytes: &[u8], skip_validation: bool) -> Result Date: Fri, 22 May 2026 17:02:36 +0000 Subject: [PATCH 6/6] fix: fast guard in parse_i64/parse_f64 for skip_validation on non-number input --- src/decode/number.rs | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/decode/number.rs b/src/decode/number.rs index b6d8151..74839ff 100644 --- a/src/decode/number.rs +++ b/src/decode/number.rs @@ -12,6 +12,12 @@ pub(crate) fn parse_i64(bytes: &[u8], skip_validation: bool) -> Result Result() { Ok(v) if v.is_finite() => Ok(v), @@ -98,6 +114,11 @@ mod tests { assert_eq!(parse_i64(b"", true), Err(qjson_err::QJSON_INVALID_NUMBER)); } + #[test] + fn i64_skip_validation_non_digit_returns_invalid_number() { + assert_eq!(parse_i64(b"true", true), Err(qjson_err::QJSON_INVALID_NUMBER)); + } + #[test] fn f64_skip_validation_valid_input() { assert_eq!(parse_f64(b"3.14", true).unwrap(), 3.14); @@ -105,6 +126,16 @@ mod tests { #[test] fn 
f64_skip_validation_garbage_fails_at_parse() { - assert_eq!(parse_f64(b"hello", true), Err(qjson_err::QJSON_DECODE_FAILED)); + assert_eq!(parse_f64(b"hello", true), Err(qjson_err::QJSON_INVALID_NUMBER)); + } + + #[test] + fn f64_skip_validation_empty_returns_invalid_number() { + assert_eq!(parse_f64(b"", true), Err(qjson_err::QJSON_INVALID_NUMBER)); + } + + #[test] + fn f64_skip_validation_non_number_returns_invalid_number() { + assert_eq!(parse_f64(b"null", true), Err(qjson_err::QJSON_INVALID_NUMBER)); } } \ No newline at end of file