diff --git a/Makefile b/Makefile index 628ee82..8f44942 100644 --- a/Makefile +++ b/Makefile @@ -34,8 +34,12 @@ test: build ## Run cargo tests + busted Lua tests lint: ## Run clippy with -D warnings cargo clippy --release --all-targets -- -D warnings -bench: build vendor/lua-cjson/cjson.so ## Run the OpenResty LuaJIT benchmark - $(LUA_ENV) $(RESTY) benches/lua_bench.lua +BENCH_SCENARIOS := small medium github-100k 100k 200k 500k 1m 2m 5m 10m interleaved + +bench: build vendor/lua-cjson/cjson.so ## Run each scenario in a fresh LuaJIT process + @for s in $(BENCH_SCENARIOS); do \ + $(LUA_ENV) $(RESTY) benches/lua_bench.lua $$s; \ + done vendor/lua-cjson/cjson.so: | vendor/lua-cjson/Makefile ifeq ($(shell uname),Darwin) diff --git a/README.md b/README.md index 59d7738..0c410c8 100644 --- a/README.md +++ b/README.md @@ -99,29 +99,36 @@ LD_LIBRARY_PATH="$PWD/target/release" \ ## Benchmarks `qjson` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal -chat-completion payloads, "parse + access model, temperature, and all -messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1, -AMD EPYC Rome (Zen 2, 4 vCPUs); 5 rounds, deterministic payload): +chat-completion payloads (median ops/s under OpenResty LuaJIT 2.1, +AMD EPYC Rome, Zen 2, 4 vCPUs; 5 rounds, deterministic payload). -| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | speedup vs. cjson | +### Parse + access (read-only) + +| Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access` | speedup vs. 
cjson | |---:|---:|---:|---:|---:|---:| -| 2 KB | 94,075 | 108,108 | 127,214 | 120,398 | 1.4× / 1.3× | -| 60 KB | 9,041 | 83,043 | 123,487 | 214,500 | 13.7× / 23.7× | -| 100 KB | 5,302 | 32,248 | 109,649 | 102,564 | 20.7× / 19.3× | -| 1 MB | 517 | 3,538 | 16,520 | 16,988 | 32.0× / 32.9× | -| 10 MB | 50 | 402 | 1,899 | 1,918 | 38.0× / 38.4× | - -`qjson.parse` wins because it skips building a Lua table for the parts you -never read; `qjson.decode + t.field` adds a cjson-shaped table proxy on top -with similar throughput. Memory retention for `qjson` is essentially -flat in payload size (a few KB for the reusable buffers), while `cjson` -and `simdjson` retain more Lua heap because they materialize the table tree. - -See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, -memory numbers, an "encode round-trip" row (passthrough emit via -`memcpy`), exact environment, and the reproduction command. `make bench` -uses `lua-resty-simdjson` when `resty.simdjson` is available in the -OpenResty environment; otherwise it skips the simdjson rows. +| 2 KB | 92,716 | 102,602 | 128,005 | 125,815 | 1.4× / 1.4× | +| 60 KB | 9,007 | 82,699 | 116,198 | 219,491 | 12.9× / 24.4× | +| 100 KB | 2,769 | 40,437 | 84,034 | 121,803 | 30.3× / 44.0× | +| 1 MB | 512 | 4,020 | 16,056 | 15,400 | 31.4× / 30.1× | +| 10 MB | 51 | 363 | 1,830 | 1,783 | 35.9× / 35.0× | + +### Encode (unmodified) + modify-then-re-encode + +| Size | encode (unmodified) | modify top (cjson / qjson) | modify nested (cjson / qjson) | speedup vs. cjson | +|---:|---:|---:|---:|---:| +| 2 KB | 219,925 | 59,761 / 56,909 | 61,685 / 49,798 | 1.0× / 0.8× | +| 60 KB | 143,843 | 4,590 / **44,370** | 4,616 / **196,386** | 9.7× / 42.5× | +| 100 KB | 119,617 | 2,645 / **32,712** | 5,263 / **59,809** | 12.4× / 11.4× | +| 1 MB | 16,269 | 241 / **3,108** | 516 / **14,134** | 12.9× / 27.4× | + +> **qjson.encode(unmodified)** re-emits the original byte range via `memcpy` — +> no fields touched means zero serializer work. 
+> **qjson modify+encode** materializes only the mutated subtree; unmodified +> siblings stay on the fast path. cjson always does a full materialize + +> re-serialize on every encode. At 60 KB+, qjson modify+encode is **10–43×** +> faster than the cjson equivalent. +> See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, +> memory numbers, and environment. ```sh make bench # qjson vs cjson and lua-resty-simdjson diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 30a3977..c007afb 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -145,7 +145,10 @@ local ROUNDS = 5 local function bench(name, iters, fn) -- Warmup pass: lets JIT compile hot traces and any one-time pools fill -- before measurement starts. Excluded from timing and memory delta. - local warmup = math.max(3, math.floor(iters / 5)) + -- Floor at 50: LuaJIT hotloop default is 56, so fewer iterations leave + -- the bench measuring interpreter mode for the large-payload scenarios + -- (e.g. 500k has iters=100, iters/5=20 → without floor, traces may not compile). 
+ local warmup = math.max(50, math.floor(iters / 5)) for _ = 1, warmup do fn() end collectgarbage("collect") @@ -220,6 +223,21 @@ local function default_table_access(t) end end +local function default_table_modify_top(t) + t.model = "new-model" + t.temperature = 0.0 +end + +local function default_table_modify_add(t) + t.stream = true +end + +local function default_table_modify_nested(t) + if t.messages and qjson.len(t.messages) > 0 then + t.messages[1].content = "modified" + end +end + -- GitHub issues accessors: array of issues, access first issue's fields local function github_cjson_access(obj) local _ = obj[1] and obj[1].id @@ -239,15 +257,32 @@ local function github_table_access(t) local _ = t[1] and t[1].user and t[1].user.login end +local function github_table_modify_top(t) + t[1].title = "modified title" +end + +local function github_table_modify_add(t) + if t[1] then + t[1].extra_field = true + end +end + +local function github_table_modify_nested(t) + if t[1] and t[1].user then + t[1].user.login = "modified-user" + end +end + local scenarios = { {name = "small", iters = 5000, payload = read_file("benches/fixtures/small_api.json")}, {name = "medium", iters = 500, payload = read_file("benches/fixtures/medium_resp.json")}, {name = "github-100k", iters = 100, payload = make_github_issues_payload(100 * 1024), - cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access}, + cjson_access = github_cjson_access, qjson_access = github_qjson_access, table_access = github_table_access, + modify_top = github_table_modify_top, modify_add = github_table_modify_add, modify_nested = github_table_modify_nested}, {name = "100k", iters = 100, payload = make_payload(100 * 1024)}, {name = "200k", iters = 50, payload = make_payload(200 * 1024)}, - {name = "500k", iters = 20, payload = make_payload(500 * 1024)}, - {name = "1m", iters = 15, payload = make_payload(1024 * 1024)}, + {name = "500k", iters = 100, payload = 
make_payload(500 * 1024)}, + {name = "1m", iters = 60, payload = make_payload(1024 * 1024)}, {name = "2m", iters = 20, payload = make_payload(2 * 1024 * 1024)}, {name = "5m", iters = 20, payload = make_payload(5 * 1024 * 1024)}, {name = "10m", iters = 20, payload = make_payload(10 * 1024 * 1024)}, @@ -258,23 +293,56 @@ local scenarios = { local has_pooled_api = type(qjson.new_decoder) == "function" local pooled_decoder = has_pooled_api and qjson.new_decoder() or nil +-- Optional scenario filter: arg[1] = scenario name (e.g. "small"). +-- When set, only that single scenario runs in a fresh LuaJIT process, +-- avoiding accumulated GC/JIT state from prior payloads. +local filter = arg[1] + if not simdjson then print("lua-resty-simdjson unavailable; skipping simdjson rows: " .. tostring(simdjson_or_err)) end for _, s in ipairs(scenarios) do + if filter and s.name ~= filter then goto continue_scenario end print(string.format("=== %s (%d bytes) ===", s.name, #s.payload)) local cjson_access = s.cjson_access or default_cjson_access local qjson_access = s.qjson_access or default_qjson_access local table_access = s.table_access or default_table_access + local modify_top = s.modify_top or default_table_modify_top + local modify_add = s.modify_add or default_table_modify_add + local modify_nested = s.modify_nested or default_table_modify_nested bench("cjson.decode + access fields", s.iters, function() local obj = cjson.decode(s.payload) cjson_access(obj) end) + -- cjson always fully materializes on decode, so modify+encode is the + -- same cost as a full re-encode — useful as a realistic baseline for + -- modify workloads. 
+ bench("cjson.decode + modify top + encode", s.iters, function() + local obj = cjson.decode(s.payload) + modify_top(obj) + local _enc = cjson.encode(obj) + if #_enc < 2 then error("cjson.encode produced too-short result") end + end) + + bench("cjson.decode + add field + encode", s.iters, function() + local obj = cjson.decode(s.payload) + modify_add(obj) + local _enc = cjson.encode(obj) + if #_enc < 2 then error("cjson.encode produced too-short result") end + end) + + bench("cjson.decode + modify nested + encode", s.iters, function() + local obj = cjson.decode(s.payload) + modify_nested(obj) + local _enc = cjson.encode(obj) + if #_enc < 2 then error("cjson.encode produced too-short result") end + end) + if simdjson then bench("simdjson.decode + access fields", s.iters, function() local obj = simdjson:decode(s.payload) @@ -307,8 +375,31 @@ for _, s in ipairs(scenarios) do bench("qjson.decode + qjson.encode (unmodified)", s.iters, function() local t = qjson.decode(s.payload) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) + + bench("qjson.decode + modify top + encode", s.iters, function() + local t = qjson.decode(s.payload) + modify_top(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end + end) + + bench("qjson.decode + add field + encode", s.iters, function() + local t = qjson.decode(s.payload) + modify_add(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end + end) + + bench("qjson.decode + modify nested + encode", s.iters, function() + local t = qjson.decode(s.payload) + modify_nested(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end + end) + ::continue_scenario:: end -- Interleaved scenario: cycle through several payloads of different sizes @@ -338,6 +429,8 @@ local function make_cycler(items) end end +if not filter 
or filter == "interleaved" then + print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ","))) do @@ -384,6 +477,36 @@ do bench("qjson.decode + qjson.encode (unmodified)", 400, function() local p = next_p() local t = qjson.decode(p) - local _ = qjson.encode(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end + end) + + next_p = make_cycler(interleaved) + bench("qjson.decode + modify top + encode", 400, function() + local p = next_p() + local t = qjson.decode(p) + default_table_modify_top(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end + end) + + next_p = make_cycler(interleaved) + bench("qjson.decode + add field + encode", 400, function() + local p = next_p() + local t = qjson.decode(p) + default_table_modify_add(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end + end) + + next_p = make_cycler(interleaved) + bench("qjson.decode + modify nested + encode", 400, function() + local p = next_p() + local t = qjson.decode(p) + default_table_modify_nested(t) + local _enc = qjson.encode(t) + if #_enc < 2 then error("qjson.encode produced too-short result") end end) end + +end -- filter == "interleaved" diff --git a/docs/benchmarks.md b/docs/benchmarks.md index fe6f09f..4083ceb 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -26,7 +26,7 @@ Lua-table baselines. The harness lives at `benches/lua_bench.lua`. For each scenario: -1. Warmup pass (≥ 3 iterations, or `iters / 5`) to let LuaJIT compile hot +1. Warmup pass (≥ 50 iterations, or `iters / 5`) to let LuaJIT compile hot traces and the `qjson` `indices` / `scratch` buffers grow to their working size. Warmup is excluded from timing and the memory delta. 2. `collectgarbage("collect")` baseline. @@ -36,6 +36,11 @@ The harness lives at `benches/lua_bench.lua`. For each scenario: KB. 
The harness does not force a final collection after timing, so short-lived garbage from the last round may still be included. +**Fresh-process isolation (post PR #54).** `make bench` now launches a +separate `resty` process for each payload size (small, medium, 100k, …, +interleaved). This avoids accumulated GC state and JIT trace-cache pressure +from earlier payloads bleeding into later scenarios. + The payload is a synthetic multimodal chat-completion request with one or more historical messages. Each message contains one small text part and one base64-encoded image part. Message count scales with payload size: the 10 MB @@ -54,10 +59,17 @@ parsing workloads with ~3-5% structural density. | Row | What it does | Notes | |---|---|---| | `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | +| `cjson.decode + modify top + encode` | `cjson.decode(s)`, mutate top field, `cjson.encode()` | Full materialize + full re-encode (cjson baseline for modify+encode workloads) | +| `cjson.decode + modify nested + encode` | `cjson.decode(s)`, mutate deeply nested field, `cjson.encode()` | Same — cjson always re-encodes the whole tree | | `simdjson.decode + access fields` | `resty.simdjson:decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | | `qjson.parse + access fields` | `qjson.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads | | `qjson.decode + access content` | `qjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` | | `qjson.decode + qjson.encode (unmodified)` | `qjson.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` | +| `qjson.decode + modify top + encode` | `qjson.decode(s)`, mutate a top-level field, 
`qjson.encode()` | Triggers materialization of the root container + full re-encode | +| `qjson.decode + add field + encode` | `qjson.decode(s)`, add a new top-level field, `qjson.encode()` | Same as modify-top, plus a new key shaping the encode output | +| `qjson.decode + modify nested + encode` | `qjson.decode(s)`, mutate a deeply nested field, `qjson.encode()` | Only materializes the modified subtree branch; unmodified siblings stay on the fast path | + +The new modify+encode scenarios were added in [#54](https://github.com/api7/lua-qjson/pull/54) to exercise the decode → mutate → re-encode pipeline end-to-end. ## Reproducing @@ -80,33 +92,56 @@ Numbers below come from one such run. Each row is "parse + access request fields" on the named payload. | Scenario | Size | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---:|---:|---:|---:|---:|---:| -| small | 2.1 KB | 94,075 | 108,108 | 127,214 | 120,398 | 203,666 | -| medium | 60.4 KB | 9,041 | 83,043 | 123,487 | 214,500 | 214,408 | -| github-100k | 100 KB | 2,238 | 2,047 | 6,010 | 5,994 | 6,701 | -| 100k | 100 KB | 5,302 | 32,248 | 109,649 | 102,564 | 114,548 | -| 200k | 200 KB | 2,659 | 19,040 | 90,090 | 92,251 | 106,383 | -| 500k | 500 KB | 1,052 | 7,062 | 34,722 | 35,336 | 37,453 | -| 1m | 1.00 MB | 517 | 3,538 | 16,520 | 16,988 | 17,261 | -| 2m | 2.00 MB | 258 | 2,026 | 9,021 | 8,580 | 9,033 | -| 5m | 5.00 MB | 102 | 663 | 2,982 | 3,728 | 3,829 | -| 10m | 10.00 MB | 50 | 402 | 1,899 | 1,918 | 1,925 | -| interleaved (100k/200k/500k/1m, cycled) | — | 1,141 | 9,544 | 34,043 | 33,611 | 32,752 | +|---|---:|---:|---:|---:|---:|---:| +| small | 2.1 KB | 92,716 | 102,602 | 128,005 | 125,815 | 260,322 | +| medium | 60.4 KB | 9,007 | 82,699 | 116,198 | 219,491 | 141,563 | +| github-100k | 100 KB | 1,834 | 1,909 | 4,591 | 5,643 | 6,207 | +| 100k | 100 KB | 2,769 | 40,437 | 84,034 | 121,803 | 105,374 | +| 200k | 200 KB | 2,543 | 20,593 | 45,704 | 91,408 | 
67,114 | +| 500k | 500 KB | 1,047 | 8,218 | 28,852 | 37,580 | 29,334 | +| 1m | 1.00 MB | 512 | 4,020 | 16,056 | 15,400 | 16,269 | +| 2m | 2.00 MB | 251 | 2,105 | 9,145 | 9,137 | 9,634 | +| 5m | 5.00 MB | 102 | 791 | 3,543 | 3,747 | 3,679 | +| 10m | 10.00 MB | 51 | 363 | 1,830 | 1,783 | 1,749 | +| interleaved (100k/200k/500k/1m, cycled) | — | 1,125 | 9,701 | 34,173 | 36,278 | 36,456 | + +### Modify + encode throughput (PR #54) + +One-shot modify-then-encode benchmarks. Exercises the decode → mutate → +re-encode pipeline. Numbers below come from a 3-round per-scenario +fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2). + +| Scenario | modify top + encode | add field + encode | modify nested + encode | +|---|---:|---:|---:| +| small (2 KB) | 58,242 | 58,190 | 43,003 | +| medium (60 KB) | 37,498 | 45,364 | 134,590 | +| github-100k | 4,419 | 3,964 | 4,359 | +| 100k (100 KB) | 28,114 | 34,364 | 71,942 | +| 200k (200 KB) | 18,282 | 16,932 | 55,127 | +| 500k (500 KB) | 6,850 | 4,841 | 19,001 | +| 1m | 3,125 | 2,998 | 13,649 | +| 2m | 1,788 | 1,076 | 1,555 | +| 5m | 366 | 283 | 215 | +| 10m | 120 | 92 | 83 | +| interleaved | 7,712 | 8,178 | 29,123 | + +For a before/after comparison against the pre-#54 baseline, see the +[PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361). ### Speed-up vs. 
baselines | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson | |---|---:|---:|---:|---:| -| small | 1.4× | 1.2× | 1.3× | 1.1× | -| medium | 13.7× | 1.5× | 23.7× | 2.6× | -| github-100k | 2.7× | 2.9× | 2.7× | 2.9× | -| 100k | 20.7× | 3.4× | 19.3× | 3.2× | -| 200k | 33.9× | 4.7× | 34.7× | 4.8× | -| 500k | 33.0× | 4.9× | 33.6× | 5.0× | -| 1m | 32.0× | 4.7× | 32.9× | 4.8× | -| 2m | 35.0× | 4.5× | 33.3× | 4.2× | -| 5m | 29.2× | 4.5× | 36.5× | 5.6× | -| 10m | 38.0× | 4.7× | 38.4× | 4.8× | +| small | 1.4× | 1.2× | 1.4× | 1.2× | +| medium | 12.9× | 1.4× | 24.4× | 2.7× | +| github-100k | 2.5× | 2.4× | 3.1× | 3.0× | +| 100k | 30.3× | 2.1× | 44.0× | 3.0× | +| 200k | 18.0× | 2.2× | 35.9× | 4.4× | +| 500k | 27.6× | 3.5× | 35.9× | 4.6× | +| 1m | 31.4× | 4.0× | 30.1× | 3.8× | +| 2m | 36.4× | 4.3× | 36.4× | 4.3× | +| 5m | 34.7× | 4.5× | 36.7× | 4.7× | +| 10m | 35.9× | 5.0× | 35.0× | 4.9× | ## Results — memory delta (KB retained after 5 rounds) @@ -115,18 +150,18 @@ the timing rounds without forcing a final collection, so short-lived garbage from the last round may still be included. 
| Scenario | cjson | simdjson | `qjson.parse` | `qjson.decode + access content` | `qjson.decode + qjson.encode` | -|---|---:|---:|---:|---:|---:| -| small | +15,493 | +15,500 | +4,066 | +15,116 | +11,140 | -| medium | +1,955 | +2,660 | +333 | +1,114 | +1,120 | -| github-100k | +12,018 | +3,527 | +14 | +536 | +230 | -| 100k | +485 | +748 | +67 | +692 | +229 | -| 200k | +392 | +523 | +34 | +346 | +112 | -| 500k | +577 | +630 | +14 | +139 | +45 | -| 1m | +1,082 | +1,121 | +10 | +104 | +34 | -| 2m | +1,155 | +1,248 | +14 | +208 | +45 | -| 5m | +1,316 | +1,538 | +14 | +400 | +45 | -| 10m | +1,583 | +2,014 | +14 | +708 | +45 | -| interleaved | +3,356 | +4,404 | +268 | +2,771 | +897 | +|---|---:|---:|---:|---:|---:| +| small | +15,474 | +15,482 | +4,070 | +15,111 | +4,892 | +| medium | +1,955 | +2,661 | +158 | +502 | +558 | +| github-100k | +4,218 | +3,035 | +28 | +560 | +96 | +| 100k | +485 | +812 | +39 | +721 | +96 | +| 200k | +393 | +709 | +22 | +373 | +54 | +| 500k | +885 | +1,169 | +30 | +721 | +96 | +| 1m | +1,255 | +1,415 | +26 | +444 | +69 | +| 2m | +1,155 | +1,251 | +19 | +271 | +27 | +| 5m | +1,316 | +1,562 | +20 | +405 | +31 | +| 10m | +1,584 | +2,017 | +24 | +731 | +47 | +| interleaved | +3,357 | +4,406 | +100 | +2,796 | +354 | `qjson.parse` retention is essentially constant across payload size: the only GC-rooted state is the reusable `indices: Vec` and `scratch` buffers. @@ -139,8 +174,8 @@ key into the Lua table heap. 1. **`qjson` is fastest once payloads move beyond tiny inputs.** The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and - larger multimodal payloads show roughly 14–38× higher throughput than - `cjson` and roughly 3–5× higher throughput than `lua-resty-simdjson` + larger multimodal payloads show roughly 13–36× higher throughput than + `cjson` and roughly 1.4–5× higher throughput than `lua-resty-simdjson` for request-field access. 2. 
**Reading every `messages[*].content` is still access-light for large multimodal bodies.** The benchmark touches the top-level request fields and @@ -148,7 +183,7 @@ key into the Lua table heap. inside each message. 3. **Speedup remains high at 10 MB.** The eager-decode optimization keeps `qjson.parse` throughput scaling well even at the 10 MB level, - maintaining ~38× over cjson and ~5× over simdjson. + maintaining ~36× over cjson and ~5× over simdjson. 4. **`qjson.decode + qjson.encode (unmodified)` is the headline number for passthrough workloads** — e.g. an LLM gateway re-emitting the original JSON after light-touch inspection. The substring fast path means @@ -157,12 +192,22 @@ key into the Lua table heap. 5. **Memory retention** for `qjson` is essentially flat in payload size; the eager parsers retain more Lua heap after the first run because the Lua table tree stays GC-rooted until the next collection. - The 10 MB case retains ~1.5 MB for `cjson`, ~2.0 MB for simdjson, - and ~14 KB for `qjson.parse`. + The 10 MB case retains ~1.6 MB for `cjson`, ~2.0 MB for simdjson, + and ~24 KB for `qjson.parse`. 6. **REST API payloads (github-100k) show a smaller speedup** because their structural density is higher than the multimodal request ladder. Memory savings remain dramatic because `cjson` must materialize every nested object and string into the Lua heap. +7. **Modify + encode pipeline (PR #54)** shows the lazy-table API in + mutation mode. Small/medium payloads reach 43k–135k median ops/s. + The `_dirty` flag and `TABLE_TYPE_HINT` side-table eliminate + redundant tree walks and array/object re-scans inside the encoder. + Large payloads (≥5 MB) are dominated by the root-container + materialization cost, which copies all fields into a plain table. +8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache + interference between payload sizes. 
Each size now runs in its own + `resty` process, eliminating the systemic cross-scenario variance + observed in earlier benchmark runs. ## When to pick which diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index 86f50d0..7c1ccf1 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -23,6 +23,10 @@ else _M.empty_array_mt = { __jsontype = "array" } end +-- Weak side-table for container type hints, avoiding collision with +-- user-visible keys. Maps materialized table → "object" | "array". +local TABLE_TYPE_HINT = setmetatable({}, { __mode = "k" }) + -- Box scratch used for one-shot FFI returns. Reused across calls to avoid -- per-call allocation; safe because the parent Doc / lazy view holds the -- buffer alive and these are read-and-copy. @@ -65,6 +69,8 @@ local function wrap_child(parent_view, src_box) local own_box = ffi.new("qjson_cursor[1]") ffi.copy(own_box, src_box, ffi.sizeof("qjson_cursor")) return { + _parent = parent_view, + _dirty = false, _doc = parent_view._doc, _cur_box = own_box, -- keep cdata alive _cur = own_box[0], -- stable reference into own_box @@ -248,10 +254,13 @@ local function materialize_array_contents(view) end -- The set of keys reserved by the lazy view bookkeeping; user-supplied JSON --- keys with these names would collide (minor, deferred). Centralized here so --- the dirty check and __newindex can share the list. +-- keys with these names would collide (minor, deferred). Centralized so +-- __newindex (cache snapshotting before materialization) and +-- encode_lazy_object_walking (skipping internals while encoding a dirty +-- proxy) share one source of truth. local INTERNAL_KEYS = { _doc = true, _cur_box = true, _cur = true, _bs = true, _be = true, + _parent = true, _dirty = true, } -- On first write, walk all existing key/value pairs into a plain table, @@ -260,10 +269,16 @@ local INTERNAL_KEYS = { -- Existing rawget-cached entries (e.g. 
previously returned child proxies) -- are preserved so callers' references remain valid. LazyObject.__newindex = function(t, k, v) + -- Mark dirty from this view up to the root. + local cur = t + while cur do + local mt = getmetatable(cur) + if mt ~= LazyObject and mt ~= LazyArray then break end + rawset(cur, "_dirty", true) + cur = rawget(cur, "_parent") + end local contents = materialize_object_contents(t) -- Snapshot user-key cache BEFORE nilling internals. - -- Use next() for raw iteration: pairs() invokes __pairs on lazy tables, - -- walking the full JSON via FFI instead of the Lua-side rawget cache. local cache = {} local ck, cv = next(t) while ck ~= nil do @@ -272,8 +287,15 @@ LazyObject.__newindex = function(t, k, v) end ck, cv = next(t, ck) end - t._doc, t._cur_box, t._cur, t._bs, t._be = nil, nil, nil, nil, nil + rawset(t, "_parent", nil) + rawset(t, "_dirty", nil) + rawset(t, "_doc", nil) + rawset(t, "_cur_box", nil) + rawset(t, "_cur", nil) + rawset(t, "_bs", nil) + rawset(t, "_be", nil) setmetatable(t, nil) + TABLE_TYPE_HINT[t] = "object" for _, kv in ipairs(contents) do rawset(t, kv[1], cache[kv[1]] or kv[2]) end @@ -284,10 +306,16 @@ end -- switch to empty_array_mt (no lazy machinery), then apply the assignment. -- Existing rawget-cached entries are preserved so callers' references remain valid. LazyArray.__newindex = function(t, k, v) + -- Mark dirty from this view up to the root. + local cur = t + while cur do + local mt = getmetatable(cur) + if mt ~= LazyObject and mt ~= LazyArray then break end + rawset(cur, "_dirty", true) + cur = rawget(cur, "_parent") + end local contents = materialize_array_contents(t) -- Snapshot integer-key cache BEFORE nilling internals. - -- Use next() for raw iteration: pairs() would invoke __pairs on lazy arrays, - -- walking the full JSON via FFI instead of the Lua-side rawget cache. 
local cache = {} local ck, cv = next(t) while ck ~= nil do @@ -296,8 +324,15 @@ LazyArray.__newindex = function(t, k, v) end ck, cv = next(t, ck) end - t._doc, t._cur_box, t._cur, t._bs, t._be = nil, nil, nil, nil, nil + rawset(t, "_parent", nil) + rawset(t, "_dirty", nil) + rawset(t, "_doc", nil) + rawset(t, "_cur_box", nil) + rawset(t, "_cur", nil) + rawset(t, "_bs", nil) + rawset(t, "_be", nil) setmetatable(t, _M.empty_array_mt) + TABLE_TYPE_HINT[t] = "array" for i, x in ipairs(contents) do rawset(t, i, cache[i] or x) end @@ -328,6 +363,7 @@ function _M.decode(json_str) error("qjson: root byte-span failed") end local view = { + _dirty = false, _doc = doc, _cur_box = root_box, -- keep the box alive; _cur is a stable reference _cur = root_box[0], @@ -370,23 +406,42 @@ _M.materialize = materialize local string_byte = string.byte local string_format = string.format --- Minimal JSON string escaper covering the cjson default set. +-- Escape lookup table: byte value → escape sequence string (or nil if safe). +local ESCAPES = { + [0x22] = '\\"', + [0x5C] = '\\\\', + [0x0A] = '\\n', + [0x0D] = '\\r', + [0x09] = '\\t', + [0x08] = '\\b', + [0x0C] = '\\f', +} + +-- JSON string escaper with bulk-copy fast path. +-- Scans for bytes that need escaping; copies clean segments via s:sub. +-- For strings with no escapes, returns '"' .. s .. '"' with zero table allocations. 
local function encode_string(s) - local out = {'"'} - for i = 1, #s do + local n = #s + local last, i = 1, 1 + local out = nil -- lazily create table only when escapes found + while i <= n do local b = string_byte(s, i) - if b == 0x22 then out[#out+1] = '\\"' - elseif b == 0x5C then out[#out+1] = '\\\\' - elseif b == 0x0A then out[#out+1] = '\\n' - elseif b == 0x0D then out[#out+1] = '\\r' - elseif b == 0x09 then out[#out+1] = '\\t' - elseif b == 0x08 then out[#out+1] = '\\b' - elseif b == 0x0C then out[#out+1] = '\\f' - elseif b < 0x20 then out[#out+1] = string_format('\\u%04x', b) - else out[#out+1] = string.char(b) + local esc = ESCAPES[b] + if esc or b < 0x20 then + if not out then out = {'"'} end + if i > last then out[#out + 1] = s:sub(last, i - 1) end + if esc then + out[#out + 1] = esc + else + out[#out + 1] = string_format('\\u%04x', b) + end + last = i + 1 end + i = i + 1 end - out[#out+1] = '"' + if not out then return '"' .. s .. '"' end + if last <= n then out[#out + 1] = s:sub(last, n) end + out[#out + 1] = '"' return table.concat(out) end @@ -400,27 +455,6 @@ local function encode_number(n) return string_format("%.14g", n) end --- A lazy subtree is "dirty" if any cached descendant has been materialized --- (no longer carries Lazy* metatable). Non-cached descendants are guaranteed --- untouched, so we only need to walk the rawget-cached entries. -local function is_dirty(v) - if type(v) ~= "table" then return false end - local mt = getmetatable(v) - if mt ~= LazyObject and mt ~= LazyArray then - return true -- materialized - end - -- Use next() for raw table iteration: pairs() would invoke __pairs on - -- lazy tables, walking the full JSON via FFI instead of the Lua cache. 
- local k, child = next(v) - while k ~= nil do - if not INTERNAL_KEYS[k] then - if is_dirty(child) then return true end - end - k, child = next(v, k) - end - return false -end - -- Forward declaration so encode_lazy_object_walking, encode_lazy_array_walking, -- and encode_array/encode_object can reference encode before its definition is -- complete (Lua resolves upvalues at call time, but the slot must be declared first). @@ -471,7 +505,7 @@ local function encode_lazy_array_walking(t) end local function encode_proxy(t) - if not is_dirty(t) then + if not t._dirty then -- Fast path: no mutations — slice the original buffer bytes. return t._doc._hold:sub(t._bs + 1, t._be) end @@ -514,6 +548,26 @@ local function encode_object(t) return "{" .. table.concat(parts, ",") .. "}" end +-- Dispatch for plain (non-lazy) tables. Separated from the main encode +-- function to keep the lazy-proxy fast path narrow for LuaJIT traces. +local function encode_plain_table(v) + local mt = getmetatable(v) + if mt == _M.empty_array_mt then + return encode_array(v) + end + local hint = TABLE_TYPE_HINT[v] + if hint == "object" then + return encode_object(v) + end + if hint == "array" then + return encode_array(v) + end + if is_array(v) then + return encode_array(v) + end + return encode_object(v) +end + encode = function(v) if rawequal(v, _M.null) then return "null" @@ -530,10 +584,7 @@ encode = function(v) if mt == LazyObject or mt == LazyArray then return encode_proxy(v) end - if is_array(v) then - return encode_array(v) - end - return encode_object(v) + return encode_plain_table(v) end error("qjson.encode: unsupported value type: " .. 
tv) end diff --git a/src/cursor.rs b/src/cursor.rs index bf38d40..82ce885 100644 --- a/src/cursor.rs +++ b/src/cursor.rs @@ -1,6 +1,7 @@ use crate::doc::Document; use crate::error::qjson_err; use crate::path::{PathIter, PathSeg}; +use std::rc::Rc; #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub(crate) struct Cursor { @@ -62,9 +63,10 @@ fn walk_children(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result Result Result Ok(c), diff --git a/src/decode/number.rs b/src/decode/number.rs index 74839ff..ba1176f 100644 --- a/src/decode/number.rs +++ b/src/decode/number.rs @@ -19,7 +19,7 @@ pub(crate) fn parse_i64(bytes: &[u8], skip_validation: bool) -> Result Result) -> Result<(), usize> { #[inline(always)] unsafe fn structural_mask_chunk(lo: __m256i, hi: __m256i) -> u64 { - // For each byte, set 1 if byte is one of: { } [ ] : , " - // Bit-OR results from 7 byte-equality compares. + // 7 parallel byte-equality compares. On AMD Zen2 these dispatch across + // multiple FP ports and beat a PSHUFB-LUT nibble classifier (PSHUFB ymm + // is split into 2 micro-ops per lane, the LUT chain lengthens the + // critical path, and VPMOVMSKB has lat=4 — the 14-movemask total is + // still cheaper than the LUT path's serial dependency). PR #54 tried + // PSHUFB-LUT but measured -45% parse on small payloads on Zen2; this + // form is what shipped through #51. let chars: [u8; 7] = [b'{', b'}', b'[', b']', b':', b',', b'"']; let mut mask_lo: i32 = 0; let mut mask_hi: i32 = 0; diff --git a/src/skip_cache.rs b/src/skip_cache.rs index 6b6b1dd..bacf4ca 100644 --- a/src/skip_cache.rs +++ b/src/skip_cache.rs @@ -1,29 +1,37 @@ use rustc_hash::FxHashMap; +use std::rc::Rc; -#[derive(Default)] pub(crate) struct SkipCache { /// Slot 0 reserved as "no cache" marker (never written to). slots: Vec, /// Map from a container's opener position-in-indices to slot index. 
by_opener: FxHashMap, + /// Shared empty Rc slice reused for all newly-created empty slots, + /// avoiding per-slot Rc allocation until the slot is populated. + empty_rc: Rc<[u32]>, } pub(crate) struct SkipSlot { /// child_starts[i] = position in doc.indices of the i-th child's leading /// marker. For object children this is the key's opening '"'; for array /// children, the value's first marker. - pub(crate) child_starts: Vec, + pub(crate) child_starts: Rc<[u32]>, /// child_ends[i] = the `cursor_end` value for the i-th child (i.e. the /// idx_end to put in a Cursor pointing at that child's value). Storing /// this lets cache-hit resolution skip the brace-counting find_value_span. - pub(crate) child_ends: Vec, + pub(crate) child_ends: Rc<[u32]>, } impl SkipCache { pub(crate) fn new() -> Self { + let empty: Rc<[u32]> = Rc::from([]); Self { - slots: vec![SkipSlot { child_starts: Vec::new(), child_ends: Vec::new() }], + slots: vec![SkipSlot { + child_starts: Rc::clone(&empty), + child_ends: Rc::clone(&empty), + }], by_opener: FxHashMap::default(), + empty_rc: empty, } } @@ -34,7 +42,10 @@ impl SkipCache { return (slot, true); } let new = self.slots.len() as u32; - self.slots.push(SkipSlot { child_starts: Vec::new(), child_ends: Vec::new() }); + self.slots.push(SkipSlot { + child_starts: Rc::clone(&self.empty_rc), + child_ends: Rc::clone(&self.empty_rc), + }); self.by_opener.insert(opener_idx, new); (new, false) } diff --git a/src/validate/mod.rs b/src/validate/mod.rs index a9ce958..aa6f53b 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -145,10 +145,56 @@ pub(crate) fn validate_eager_values( indices: &[u32], max_depth: u32, ) -> Result<(), qjson_err> { - // Stack of container contexts; the top is the current state. - // We use a single seed entry `CtxKind::Top` for the root value. - let mut stack: Vec = Vec::with_capacity(16); - stack.push(CtxKind::Top); + // Fixed-size stack avoids heap allocation for typical JSON depths. 
+ const STACK_CAP: usize = 64; + let mut stack_buf: [CtxKind; STACK_CAP] = [CtxKind::Top; STACK_CAP]; + let mut sp: usize = 1; // next free slot (= len) + let mut fallback: Option> = None; + + macro_rules! push { + ($kind:expr) => { + if sp < STACK_CAP { + stack_buf[sp] = $kind; + sp += 1; + } else { + let fb = fallback.get_or_insert_with(|| { + let mut v: Vec = Vec::with_capacity(STACK_CAP + 16); + v.extend_from_slice(&stack_buf[..sp]); + v + }); + sp = STACK_CAP.wrapping_add(fb.len() + 1); + fb.push($kind); + } + }; + } + macro_rules! pop { + () => {{ + if sp <= STACK_CAP { + if sp == 0 { None } + else { sp -= 1; Some(stack_buf[sp]) } + } else { + let fb = fallback.as_mut().unwrap(); + let val = fb.pop(); + if fb.is_empty() { sp = STACK_CAP; } + val + } + }}; + } + macro_rules! last_mut { + () => {{ + if sp <= STACK_CAP { + if sp == 0 { None } else { Some(&mut stack_buf[sp - 1]) } + } else { + fallback.as_mut().unwrap().last_mut() + } + }}; + } + macro_rules! stack_len { + () => { if sp <= STACK_CAP { sp } else { fallback.as_ref().map_or(0, |v| v.len()) } }; + } + macro_rules! stack_is_empty { + () => { stack_len!() == 0 }; + } // Byte position just past the previous structural we consumed — // i.e. the start of the current gap. A gap may contain a scalar @@ -165,11 +211,11 @@ pub(crate) fn validate_eager_values( // First, consume any scalar token sitting in the gap before // this structural. This may transition the current state from // a value-expecting form to its "AfterValue" form. - consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?; + consume_scalar_gap(buf, prev_end, pos, last_mut!().unwrap())?; match b { b'{' | b'[' => { - let cur = stack.last_mut().unwrap(); + let cur = last_mut!().unwrap(); match *cur { CtxKind::Top | CtxKind::ArrAfterOpen @@ -178,10 +224,10 @@ pub(crate) fn validate_eager_values( // Transition parent to AfterValue ahead of the // descent; the inner container's close pops back. 
*cur = parent_after_value(*cur); - if stack.len() > max_depth as usize { + if stack_len!() > max_depth as usize { return Err(qjson_err::QJSON_NESTING_TOO_DEEP); } - stack.push(if b == b'{' { + push!(if b == b'{' { CtxKind::ObjAfterOpen } else { CtxKind::ArrAfterOpen @@ -193,25 +239,25 @@ pub(crate) fn validate_eager_values( i += 1; } b'}' => { - let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let top = pop!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; if !matches!(top, CtxKind::ObjAfterOpen | CtxKind::ObjAfterValue) { return Err(qjson_err::QJSON_PARSE_ERROR); } - if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); } + if stack_is_empty!() { return Err(qjson_err::QJSON_PARSE_ERROR); } prev_end = pos + 1; i += 1; } b']' => { - let top = stack.pop().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let top = pop!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; if !matches!(top, CtxKind::ArrAfterOpen | CtxKind::ArrAfterValue) { return Err(qjson_err::QJSON_PARSE_ERROR); } - if stack.is_empty() { return Err(qjson_err::QJSON_PARSE_ERROR); } + if stack_is_empty!() { return Err(qjson_err::QJSON_PARSE_ERROR); } prev_end = pos + 1; i += 1; } b',' => { - let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; match *cur { CtxKind::ArrAfterValue => *cur = CtxKind::ArrAfterComma, CtxKind::ObjAfterValue => *cur = CtxKind::ObjAfterComma, @@ -221,7 +267,7 @@ pub(crate) fn validate_eager_values( i += 1; } b':' => { - let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; match *cur { CtxKind::ObjAfterKey => *cur = CtxKind::ObjAfterColon, _ => return Err(qjson_err::QJSON_PARSE_ERROR), @@ -239,7 +285,7 @@ pub(crate) fn validate_eager_values( } strings::validate_string_span(&buf[pos + 1 .. 
close])?; - let cur = stack.last_mut().ok_or(qjson_err::QJSON_PARSE_ERROR)?; + let cur = last_mut!().ok_or(qjson_err::QJSON_PARSE_ERROR)?; match *cur { // Key position in an object. CtxKind::ObjAfterOpen | CtxKind::ObjAfterComma => { @@ -264,11 +310,11 @@ pub(crate) fn validate_eager_values( // Tail: a top-level scalar root (e.g. `42`, `true`) lives in the // gap after the last structural — or, if there are no structurals, // the whole buffer. - consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?; + consume_scalar_gap(buf, prev_end, buf.len(), last_mut!().unwrap())?; // After the walk, the stack must hold exactly one frame: the root // context, which must be `TopDone` (root value consumed). - if stack.len() != 1 || stack[0] != CtxKind::TopDone { + if stack_len!() != 1 || stack_buf[0] != CtxKind::TopDone { return Err(qjson_err::QJSON_PARSE_ERROR); } Ok(()) @@ -495,9 +541,8 @@ mod tests { #[test] fn grammar_accepts_at_max_depth() { // 1024 nested arrays at the default max_depth limit. - let mut buf = Vec::new(); - for _ in 0..1024 { buf.push(b'['); } - for _ in 0..1024 { buf.push(b']'); } + let mut buf = vec![b'['; 1024]; + buf.extend_from_slice(&vec![b']'; 1024]); assert!( validate_eager_values(&buf, &ix(&buf), 1024).is_ok(), "should accept exactly at max_depth" @@ -507,9 +552,8 @@ mod tests { #[test] fn grammar_rejects_over_max_depth() { // 1025 nested arrays — one past the default max_depth limit. 
- let mut buf = Vec::new(); - for _ in 0..1025 { buf.push(b'['); } - for _ in 0..1025 { buf.push(b']'); } + let mut buf = vec![b'['; 1025]; + buf.extend_from_slice(&vec![b']'; 1025]); assert_eq!( validate_eager_values(&buf, &ix(&buf), 1024), Err(qjson_err::QJSON_NESTING_TOO_DEEP), ); diff --git a/tests/lua/lazy_table_spec.lua b/tests/lua/lazy_table_spec.lua index 2769d39..532833c 100644 --- a/tests/lua/lazy_table_spec.lua +++ b/tests/lua/lazy_table_spec.lua @@ -390,4 +390,54 @@ describe("qjson.encode — nested mutations propagate", function() inner.x = 99 assert.are.equal(99, t.a.x) end) + + it("modifies top-level field and encodes correctly", function() + local cjson = require("cjson") + local t = qjson.decode('{"model":"gpt-4","temperature":0.7}') + t.model = "gpt-5" + local out = qjson.encode(t) + local parsed = cjson.decode(out) + assert.are.equal("gpt-5", parsed.model) + assert.are.equal(0.7, parsed.temperature) + end) + + it("adds new field and encodes correctly", function() + local cjson = require("cjson") + local t = qjson.decode('{"a":1}') + t.b = true + local out = qjson.encode(t) + local parsed = cjson.decode(out) + assert.are.equal(1, parsed.a) + assert.are.equal(true, parsed.b) + end) + + it("modifies nested field and encodes correctly", function() + local cjson = require("cjson") + local t = qjson.decode('{"messages":[{"role":"user","content":"hello"}]}') + t.messages[1].content = "world" + local out = qjson.encode(t) + local parsed = cjson.decode(out) + assert.are.equal("user", parsed.messages[1].role) + assert.are.equal("world", parsed.messages[1].content) + end) + + it("encodes unmodified proxy via fast path", function() + local json = '{"a":1,"b":"text","c":true}' + local t = qjson.decode(json) + local out = qjson.encode(t) + local cjson = require("cjson") + local parsed = cjson.decode(out) + assert.are.equal(1, parsed.a) + assert.are.equal("text", parsed.b) + assert.are.equal(true, parsed.c) + end) + + it("encodes string with escapes correctly", 
function() + local t = qjson.decode('{"key":"value"}') + t.key = 'line1\nline2\t"quoted"' + local out = qjson.encode(t) + local cjson = require("cjson") + local parsed = cjson.decode(out) + assert.are.equal('line1\nline2\t"quoted"', parsed.key) + end) end)