diff --git a/CLAUDE.md b/CLAUDE.md index cec0555..f2ab0c8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,7 +14,7 @@ The `Makefile` is the canonical entry point; `make help` lists targets. make build # cargo build --release → target/release/libquickdecode.so make test # cargo test --release + busted Lua tests make lint # cargo clippy -D warnings + cargo fmt --check -make bench # LuaJIT vs lua-cjson on benches/fixtures +make bench # OpenResty LuaJIT benchmark vs lua-cjson and simdjson ``` Under the hood / for narrower invocations: @@ -79,7 +79,7 @@ src/ lua/quickdecode.lua LuaJIT wrapper (ffi.cdef + Doc/Cursor metatables) include/lua_quick_decode.h public C header tests/ Rust integration tests + tests/lua/ busted suite -benches/ lua_bench.lua vs lua-cjson; fixtures/ has small_api.json + medium_resp.json +benches/ lua_bench.lua vs lua-cjson/simdjson; fixtures/ has small_api.json + medium_resp.json ``` The enum values in `src/error.rs` are duplicated in `include/lua_quick_decode.h` and `lua/quickdecode.lua` (the latter only encodes the `T_*` type tags and `NOT_FOUND = 2`). Keep all three in sync when adding/renumbering codes. 
diff --git a/Makefile b/Makefile index dcf93ee..086b577 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,20 @@ -# Overridable: `make bench LUAJIT=/path/to/luajit LUA_CPATH='...'` -LUAJIT ?= $(shell command -v luajit 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit) -LUA_CPATH ?= ./vendor/lua-cjson/?.so;./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/openresty/luajit/lib/lua/5.1/?.so - -LUAJIT_PREFIX ?= $(shell dirname $$(dirname $$(command -v $(LUAJIT) 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit))) +# Overridable: `make bench OPENRESTY=/path/to/openresty LUAJIT=/path/to/luajit RESTY=/path/to/resty LUA_PATH='...' LUA_CPATH='...'` +OPENRESTY ?= /usr/local/openresty +OPENRESTY_LUAJIT := $(OPENRESTY)/luajit/bin/luajit +OPENRESTY_RESTY := $(OPENRESTY)/bin/resty +LUAJIT ?= $(shell if [ -x "$(OPENRESTY_LUAJIT)" ]; then echo "$(OPENRESTY_LUAJIT)"; else command -v luajit 2>/dev/null || echo luajit; fi) +RESTY ?= $(shell if [ -x "$(OPENRESTY_RESTY)" ]; then echo "$(OPENRESTY_RESTY)"; else command -v resty 2>/dev/null || echo resty; fi) +LUA_PATH ?= ./lua/?.lua;$(OPENRESTY)/lualib/?.lua;$(OPENRESTY)/lualib/?/init.lua;; +LUA_CPATH ?= ./vendor/lua-cjson/?.so;./target/release/lib?.so;./?.so;$(OPENRESTY)/lualib/?.so;/usr/local/lib/lua/5.1/?.so;$(OPENRESTY)/luajit/lib/lua/5.1/?.so + +LUAJIT_PREFIX ?= $(shell dirname $$(dirname $$(command -v $(LUAJIT) 2>/dev/null || echo $(OPENRESTY_LUAJIT)))) LUAJIT_INC ?= $(LUAJIT_PREFIX)/include/luajit-2.1 LIB_DIR := $(CURDIR)/target/release ifeq ($(shell uname),Darwin) -LUA_ENV := DYLD_LIBRARY_PATH=$(LIB_DIR) LUA_CPATH='$(LUA_CPATH)' +LUA_ENV := DYLD_LIBRARY_PATH=$(LIB_DIR) LUA_PATH='$(LUA_PATH)' LUA_CPATH='$(LUA_CPATH)' else -LUA_ENV := LD_LIBRARY_PATH=$(LIB_DIR) LUA_CPATH='$(LUA_CPATH)' +LUA_ENV := LD_LIBRARY_PATH=$(LIB_DIR) LUA_PATH='$(LUA_PATH)' LUA_CPATH='$(LUA_CPATH)' endif .PHONY: help build test lint bench clean @@ -29,8 +34,8 @@ test: build ## Run cargo tests + busted Lua tests lint: ## Run clippy with -D warnings cargo 
clippy --release --all-targets -- -D warnings -bench: build vendor/lua-cjson/cjson.so ## Run the LuaJIT vs cjson benchmark - $(LUA_ENV) $(LUAJIT) benches/lua_bench.lua +bench: build vendor/lua-cjson/cjson.so ## Run the OpenResty LuaJIT benchmark + $(LUA_ENV) $(RESTY) benches/lua_bench.lua vendor/lua-cjson/cjson.so: | vendor/lua-cjson/Makefile ifeq ($(shell uname),Darwin) diff --git a/README.md b/README.md index bc9d0c2..18d93c8 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the ## Status -Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson but tuning is pending — see `Roadmap / Deferred` below. +Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson. ## Building @@ -83,38 +83,31 @@ busted tests/lua --lpath='./lua/?.lua' --cpath='./target/release/lib?.so' ## Benchmarks `quickdecode` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal -chat-completion payloads, "parse + access 3 fields" workload (median ops/s -under LuaJIT 2.1, Skylake; 5 rounds, deterministic payload): +chat-completion payloads, "parse + access model, temperature, and all +messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1, +Intel Core i5-9400; 5 rounds, deterministic payload): -| Size | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. 
cjson | +| Size | cjson | simdjson | `qd.parse` | `qd.decode + access content` | speedup vs. cjson | |---:|---:|---:|---:|---:|---:| -| 2 KB | 39,414 | 54,395 | 117,233 | 126,807 | 3.0× / 3.2× | -| 100 KB | 2,589 | 19,944 | 72,202 | 61,162 | 27.9× / 23.6× | -| 1 MB | 355 | 2,048 | 12,723 | 12,448 | 35.8× / 35.1× | -| 10 MB | 32 | 128 | 537 | 609 | 16.8× / 19.0× | +| 2 KB | 106,646 | 137,427 | 135,296 | 97,574 | 1.3× / 0.9× | +| 100 KB | 6,045 | 46,577 | 137,931 | 134,590 | 22.8× / 22.3× | +| 1 MB | 594 | 4,408 | 16,447 | 16,340 | 27.7× / 27.5× | +| 10 MB | 59 | 356 | 1,035 | 1,028 | 17.5× / 17.4× | `qd.parse` wins because it skips building a Lua table for the parts you never read; `qd.decode + t.field` adds a cjson-shaped table proxy on top with similar throughput. Memory retention for `quickdecode` is essentially -flat in payload size (a few KB for the reusable buffers), where `cjson` -and `simdjson` retain ~1× the input size as live Lua-table state. - -ARM64 (Apple M4, NEON/PMULL scanner, same workload): - -| Size | cjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson | -|---:|---:|---:|---:|---:| -| 2 KB | 237,124 | 705,000 | 390,000 | 3.0× / 1.6× | -| 100 KB | 14,667 | 232,000 | 208,000 | 15.8× / 14.2× | -| 1 MB | 1,494 | 33,700 | 33,000 | 22.6× / 22.1× | -| 10 MB | 150 | 3,376 | 3,454 | 22.5× / 23.0× | +flat in payload size (a few KB for the reusable buffers), while `cjson` +and `simdjson` retain more Lua heap because they materialize the table tree. See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder, memory numbers, an "encode round-trip" row (passthrough emit via -`memcpy`), the pure-decode (no-access) comparison, and the exact -methodology + reproduction command. +`memcpy`), exact environment, and the reproduction command. `make bench` +uses `lua-resty-simdjson` when `resty.simdjson` is available in the +OpenResty environment; otherwise it skips the simdjson rows. 
```sh -make bench # quickdecode vs cjson +make bench # quickdecode vs cjson and lua-resty-simdjson ``` ## RFC 8259 conformance diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 7f2c1de..a1b26d4 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -3,6 +3,10 @@ package.cpath = package.cpath .. ";./target/release/lib?.so" local qd = require("quickdecode") local cjson = require("cjson") +local simdjson_ok, simdjson_or_err = pcall(function() + return require("resty.simdjson").new() +end) +local simdjson = simdjson_ok and simdjson_or_err or nil local function read_file(p) local f = assert(io.open(p, "rb")) @@ -11,19 +15,14 @@ local function read_file(p) return s end --- Shape: a multimodal chat-completion request with one ~1.5K text question --- and N base64-encoded image parts (each 50-500 KB) until the payload reaches --- target_bytes. Mirrors the production case the bench is meant to reflect. +-- Shape: a multimodal chat-completion request with one or more historical +-- messages. Each message contains one small text part and one base64-encoded +-- image part. The number of messages scales with payload size: a 10 MB request +-- has roughly ten 1 MB image-bearing messages. -- --- Image sizes are drawn from a deterministic Park-Miller LCG (not math.random, --- which delegates to libc rand() and varies across machines) so the same --- target_bytes produces byte-identical output on any LuaJIT 2.1 host. --- --- Size accuracy: the normal-branch upper is `min(500K, remaining)` so the --- loop cannot overshoot during steady state. When fewer than 50 KB remain --- the final image falls through to `math.max(1024, remaining)` — undershoot --- is at most a few hundred bytes; worst-case overshoot is ~1 KB (only when --- `remaining < 1024`, which the seed=42 walk does not hit for our ladder). +-- Size accuracy: payload sizing is approximate. 
Message separators, role +-- strings, and the 1 KB minimum image size can add small drift from +-- `target_bytes` on tiny scenarios; larger scenarios stay close to target. -- GitHub-style payload: simulates /repos/{owner}/{repo}/issues response. -- Each issue has ~20 fields including nested user object, labels array, -- and realistic string lengths (URLs, timestamps, markdown body). @@ -117,41 +116,28 @@ local function make_b64(size) end local function make_payload(target_bytes) - local rng_state = 42 - local function rng_range(lo, hi) - -- Park-Miller minimal-standard LCG: a=48271, m=2^31-1. Multiplication - -- fits in double precision (48271 * 2^31 < 2^53). - rng_state = (rng_state * 48271) % 2147483647 - return lo + (rng_state % (hi - lo + 1)) - end - - local text = string.rep("Q", 1500) + local message_count = math.max(1, math.ceil(target_bytes / (1024 * 1024))) + local envelope = '{"model":"gpt-4-vision","temperature":0.7,"messages":[]}' + local text = string.rep("Q", 256) local text_part = '{"type":"text","text":"' .. text .. '"}' - local parts = { text_part } - local current = 200 + #text_part -- approx outer envelope overhead - - while current < target_bytes do - local remaining = target_bytes - current - local img_size - if remaining < 50 * 1024 then - -- Final image: shrink below the 50 KB floor so the label matches - -- the actual payload size. Bench iters all see the same payload - -- regardless, so the smaller tail blob doesn't change what's - -- being measured. - img_size = math.max(1024, remaining) - else - local upper = math.min(500 * 1024, remaining) - img_size = rng_range(50 * 1024, upper) - end - local b64 = make_b64(img_size) - local img_part = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,' - .. b64 .. 
'"}}' - parts[#parts + 1] = img_part - current = current + #img_part + 1 -- +1 for comma + local image_prefix = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,' + local image_suffix = '"}}' + local message_overhead = #('{"role":"user","content":[,]}') + #text_part + + #image_prefix + #image_suffix + local remaining = target_bytes - #envelope - (message_count * message_overhead) + local image_size = math.max(1024, math.floor(remaining / message_count)) + + local messages = {} + for i = 1, message_count do + local role = i % 2 == 1 and "user" or "assistant" + local b64 = make_b64(image_size) + local image_part = image_prefix .. b64 .. image_suffix + messages[i] = '{"role":"' .. role .. '","content":[' + .. text_part .. "," .. image_part .. ']}' end - return '{"model":"gpt-4-vision","temperature":0.7,"messages":' - .. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}' + return '{"model":"gpt-4-vision","temperature":0.7,"messages":[' + .. table.concat(messages, ",") .. ']}' end local ROUNDS = 5 @@ -190,19 +176,48 @@ end local function default_cjson_access(obj) local _ = obj.model local _ = obj.temperature - local _ = obj.messages and obj.messages[1] and obj.messages[1].role + if obj.messages then + for _, msg in ipairs(obj.messages) do + local _ = msg.content + end + end +end + +local content_paths_by_message_count = {} + +local function content_paths(n) + local paths = content_paths_by_message_count[n] + if paths then + return paths + end + + paths = {} + for i = 0, n - 1 do + paths[i + 1] = "messages[" .. i .. 
"].content" + end + content_paths_by_message_count[n] = paths + return paths end local function default_qd_access(d) local _ = d:get_str("model") local _ = d:get_f64("temperature") - local _ = d:get_str("messages[0].role") + local n = d:len("messages") or 0 + local paths = content_paths(n) + for i = 1, n do + local _ = d:typeof(paths[i]) + end end local function default_table_access(t) local _ = t.model local _ = t.temperature - local _ = t.messages and t.messages[1] and t.messages[1].role + if t.messages then + for i = 1, qd.len(t.messages) do + local msg = t.messages[i] + local _ = msg.content + end + end end -- GitHub issues accessors: array of issues, access first issue's fields @@ -243,6 +258,11 @@ local scenarios = { local has_pooled_api = type(qd.new_decoder) == "function" local pooled_decoder = has_pooled_api and qd.new_decoder() or nil +if not simdjson then + print("lua-resty-simdjson unavailable; skipping simdjson rows: " + .. tostring(simdjson_or_err)) +end + for _, s in ipairs(scenarios) do print(string.format("=== %s (%d bytes) ===", s.name, #s.payload)) @@ -250,18 +270,25 @@ for _, s in ipairs(scenarios) do local qd_access = s.qd_access or default_qd_access local table_access = s.table_access or default_table_access - bench("cjson.decode + access 3 fields", s.iters, function() + bench("cjson.decode + access fields", s.iters, function() local obj = cjson.decode(s.payload) cjson_access(obj) end) - bench("quickdecode.parse + access 3 fields", s.iters, function() + if simdjson then + bench("simdjson.decode + access fields", s.iters, function() + local obj = simdjson:decode(s.payload) + cjson_access(obj) + end) + end + + bench("quickdecode.parse + access fields", s.iters, function() local d = qd.parse(s.payload) qd_access(d) end) if has_pooled_api then - bench("quickdecode pooled :parse + access 3 fields", s.iters, function() + bench("quickdecode pooled :parse + access fields", s.iters, function() local d = pooled_decoder:parse(s.payload) qd_access(d) end) 
@@ -273,7 +300,7 @@ for _, s in ipairs(scenarios) do end) end - bench("qd.decode + t.field x3", s.iters, function() + bench("qd.decode + access content", s.iters, function() local t = qd.decode(s.payload) table_access(t) end) @@ -315,41 +342,42 @@ print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ", do local next_p = make_cycler(interleaved) - bench("cjson.decode + access 3 fields", 400, function() + bench("cjson.decode + access fields", 400, function() local p = next_p() local obj = cjson.decode(p) - local _ = obj.model - local _ = obj.temperature - local _ = obj.messages and obj.messages[1] and obj.messages[1].role + default_cjson_access(obj) end) + if simdjson then + next_p = make_cycler(interleaved) + bench("simdjson.decode + access fields", 400, function() + local p = next_p() + local obj = simdjson:decode(p) + default_cjson_access(obj) + end) + end + next_p = make_cycler(interleaved) - bench("quickdecode.parse + access 3 fields", 400, function() + bench("quickdecode.parse + access fields", 400, function() local p = next_p() local d = qd.parse(p) - local _ = d:get_str("model") - local _ = d:get_f64("temperature") - local _ = d:get_str("messages[0].role") + default_qd_access(d) end) if has_pooled_api then next_p = make_cycler(interleaved) - bench("quickdecode pooled :parse + access 3 fields", 400, function() + bench("quickdecode pooled :parse + access fields", 400, function() local p = next_p() local d = pooled_decoder:parse(p) - local _ = d:get_str("model") - local _ = d:get_f64("temperature") - local _ = d:get_str("messages[0].role") + default_qd_access(d) end) end next_p = make_cycler(interleaved) - bench("qd.decode + t.field x3", 400, function() + bench("qd.decode + access content", 400, function() local p = next_p() local t = qd.decode(p) - local _ = t.model - local _ = t.temperature - local _ = t.messages and t.messages[1] and t.messages[1].role + default_table_access(t) end) next_p = make_cycler(interleaved) diff --git 
a/docs/benchmarks.md b/docs/benchmarks.md index ea4eae9..ef0dab9 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -4,29 +4,23 @@ Throughput and memory comparison of `quickdecode` (this library) against `lua-cjson` and `lua-resty-simdjson` on a multimodal chat-completion payload ladder from 2 KB to 10 MB. -`quickdecode` is optimized for *parse + read a small number of fields*; the -data below quantifies how the lazy structural scan beats an eager build-the- -whole-table parser, and where the gap narrows. `lua-cjson` is the baseline. -`lua-resty-simdjson` (a Lua binding over the simdjson C++ library, eager) is -included to show how much of the win comes from SIMD vs. from skipping the -table build. +`quickdecode` is optimized for *parse + read a small part of the document*; +the data below quantifies how the lazy structural scan behaves when the caller +reads request metadata plus every chat message `content`, without eagerly +building the whole Lua table. `lua-cjson` and `lua-resty-simdjson` are eager +Lua-table baselines. ## Environment | | | |---|---| -| Host CPU | Intel Xeon (Skylake, IBRS), 4 cores | -| Memory | 7.6 GiB | -| OS | Linux x86_64 | -| Runtime | OpenResty `resty` 0.29 / openresty 1.29.2.3 / LuaJIT 2.1 ROLLING | +| Host CPU | Intel Core i5-9400, 6 cores, AVX2 + PCLMUL | +| Memory | 15 GiB | +| OS | Ubuntu 24.04.4 LTS, Linux 6.8.0-110-generic, x86_64 | +| Runtime | OpenResty `resty` 0.29 / OpenResty 1.21.4.4 / LuaJIT 2.1.1723681758 | | `quickdecode` | this repo, release build, AVX2 + PCLMUL scanner active | -| `lua-cjson` | bundled with OpenResty | -| `lua-resty-simdjson` | upstream `main` build at `/tmp/lua-resty-simdjson`, simdjson C++ pinned by that repo | - -The bench uses the OpenResty `resty` CLI because `lua-resty-simdjson` pulls in -`ngx.null` / `ngx.sleep` at load time and cannot run under bare LuaJIT -without an OpenResty environment. `lua-cjson` and `quickdecode` themselves -run fine under bare LuaJIT. 
+| `lua-cjson` | vendored `openresty/lua-cjson` | +| `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib | ## Methodology @@ -39,14 +33,15 @@ The harness lives at `benches/lua_bench.lua`. For each scenario: 3. 5 rounds × N iterations of the workload; report the **median** ops/s across rounds (mean + range also reported in the raw output). 4. Final `collectgarbage("count")` to capture the post-run memory delta in - KB — measures GC-rooted state retained by the parser, not transient - per-call allocations. + KB. The harness does not force a final collection after timing, so + short-lived garbage from the last round may still be included. -The payload is a synthetic multimodal chat-completion request — one -~1.5 KB text part plus N base64-encoded image parts of 50–500 KB each -until the target size is reached. The image size sequence comes from a -Park–Miller LCG with `seed=42` rather than `math.random` so the payload is -byte-identical across hosts. +The payload is a synthetic multimodal chat-completion request with one or more +historical messages. Each message contains one small text part and one +base64-encoded image part. Message count scales with payload size: the 10 MB +scenario has roughly ten messages, each carrying one ~1 MB image, so the +access pattern matches request bodies where every historical message includes +an image. A separate `github-100k` scenario simulates a GitHub Issues API response (`/repos/{owner}/{repo}/issues`) with ~100 KB of realistic REST API @@ -58,92 +53,80 @@ parsing workloads with ~3-5% structural density. 
| Row | What it does | Notes | |---|---|---| -| `cjson.decode + access 3 fields` | `cjson.decode(s)` then read 3 fields | Eager Lua table | -| `resty.simdjson:decode + access 3 fields` | `parser:decode(s)` then read 3 fields | Eager Lua table; **parser instance is reused** across iterations (the upstream-recommended pattern) | -| `quickdecode.parse + access 3 fields` | `qd.parse(s)` then `d:get_str/get_f64` × 3 | Lazy structural scan; explicit path-based reads | -| `qd.decode + t.field x3` | `qd.decode(s)` then `t.model` / `t.temperature` / `t.messages[1].role` | Lazy table proxy; reads go through `__index` | +| `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | +| `simdjson.decode + access fields` | `resty.simdjson:decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table | +| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads | +| `qd.decode + access content` | `qd.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` | | `qd.decode + qd.encode (unmodified)` | `qd.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` | ## Reproducing -The straight comparison against `cjson` is one command: +Run the full comparison with one command: ```sh make bench ``` -This invokes `benches/lua_bench.lua` with `LD_LIBRARY_PATH=target/release` -and a `LUA_CPATH` that picks up `cjson` from the system locations. It does -**not** include `lua-resty-simdjson`. - -To also include `lua-resty-simdjson` you need (1) the library installed -somewhere `package.cpath` can reach the `.so`, (2) its Lua wrapper on -`package.path`, and (3) the bench script patched to require it. 
The patch -that adds the bench rows is a small `pcall(require, "resty.simdjson")` block; -keep it local — it is not part of the upstream bench file. Run it through -`resty` so the `ngx.*` symbols are available: - -```sh -LD_LIBRARY_PATH=$PWD/target/release \ -LUA_CPATH='/path/to/lua-resty-simdjson/?.so;./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so' \ -LUA_PATH='/path/to/lua-resty-simdjson/lib/?.lua;/path/to/lua-resty-simdjson/lib/?/init.lua;./lua/?.lua;;' \ -/usr/local/openresty/bin/resty benches/lua_bench.lua -``` +This builds `quickdecode`, builds the vendored `lua-cjson` against OpenResty's +LuaJIT, then invokes `benches/lua_bench.lua` through OpenResty's `resty` so +`lua-resty-simdjson` runs in its normal `ngx` environment. +If `resty.simdjson` is not available on `package.path` / `package.cpath`, the +harness prints a skip message and omits the simdjson rows. Numbers below come from one such run. ## Results — throughput (median ops/s) -Each row is "parse + access 3 fields" on the named payload. +Each row is "parse + access request fields" on the named payload. 
-| Scenario | Size | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | `qd.decode + qd.encode` | +| Scenario | Size | cjson | simdjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` | |---|---:|---:|---:|---:|---:|---:| -| small | 2.1 KB | 39,414 | 54,395 | 117,233 | 126,807 | 268,240 | -| medium | 60.4 KB | 5,600 | 40,180 | 90,074 | 120,627 | 126,263 | -| github-100k | 100 KB | 5,373 | — | 27,020 | 27,367 | 36,430 | -| 100k | 100 KB | 2,589 | 19,944 | 72,202 | 61,162 | 80,257 | -| 200k | 200 KB | 1,414 | 14,397 | 57,670 | 48,031 | 58,548 | -| 500k | 500 KB | 722 | 5,882 | 34,602 | 33,167 | 36,900 | -| 1m | 1.00 MB | 355 | 2,048 | 12,723 | 12,448 | 12,669 | -| 2m | 2.00 MB | 157 | 886 | 7,143 | 6,521 | 7,432 | -| 5m | 5.00 MB | 64 | 250 | 2,509 | 2,235 | 2,552 | -| 10m | 10.00 MB | 32 | 128 | 537 | 609 | 540 | -| interleaved (100k/200k/500k/1m, cycled) | — | 723 | 4,399 | 21,424 | 23,378 | 24,004 | +| small | 2.1 KB | 106,646 | 137,427 | 135,296 | 97,574 | 202,388 | +| medium | 60.4 KB | 10,086 | 86,029 | 189,970 | 198,098 | 175,562 | +| github-100k | 100 KB | 2,208 | 2,880 | 4,496 | 4,479 | 4,809 | +| 100k | 100 KB | 6,045 | 46,577 | 137,931 | 134,590 | 153,139 | +| 200k | 200 KB | 3,025 | 22,563 | 78,247 | 75,873 | 81,433 | +| 500k | 500 KB | 1,216 | 9,128 | 33,058 | 32,680 | 34,188 | +| 1m | 1.00 MB | 594 | 4,408 | 16,447 | 16,340 | 16,722 | +| 2m | 2.00 MB | 296 | 1,966 | 8,247 | 8,224 | 8,055 | +| 5m | 5.00 MB | 118 | 600 | 2,869 | 2,945 | 2,992 | +| 10m | 10.00 MB | 59 | 356 | 1,035 | 1,028 | 1,050 | +| interleaved (100k/200k/500k/1m, cycled) | — | 1,318 | 9,116 | 33,342 | 32,752 | 34,031 | ### Speed-up vs. 
baselines -| Scenario | simdjson / cjson | `qd.parse` / cjson | `qd.parse` / simdjson | `qd.decode + access` / cjson | +| Scenario | `qd.parse` / cjson | `qd.parse` / simdjson | `qd.decode + access content` / cjson | `qd.decode + access content` / simdjson | |---|---:|---:|---:|---:| -| small | 1.4× | 3.0× | 2.2× | 3.2× | -| medium | 7.2× | 16.1× | 2.2× | 21.5× | -| github-100k | — | 5.0× | — | 5.1× | -| 100k | 7.7× | 27.9× | 3.6× | 23.6× | -| 200k | 10.2× | 40.8× | 4.0× | 34.0× | -| 500k | 8.1× | 47.9× | 5.9× | 45.9× | -| 1m | 5.8× | 35.8× | 6.2× | 35.1× | -| 2m | 5.6× | 45.5× | 8.1× | 41.5× | -| 5m | 3.9× | 39.2× | 10.0× | 34.9× | -| 10m | 4.0× | 16.8× | 4.2× | 19.0× | +| small | 1.3× | 1.0× | 0.9× | 0.7× | +| medium | 18.8× | 2.2× | 19.6× | 2.3× | +| github-100k | 2.0× | 1.6× | 2.0× | 1.6× | +| 100k | 22.8× | 3.0× | 22.3× | 2.9× | +| 200k | 25.9× | 3.5× | 25.1× | 3.4× | +| 500k | 27.2× | 3.6× | 26.9× | 3.6× | +| 1m | 27.7× | 3.7× | 27.5× | 3.7× | +| 2m | 27.9× | 4.2× | 27.8× | 4.2× | +| 5m | 24.3× | 4.8× | 25.0× | 4.9× | +| 10m | 17.5× | 2.9× | 17.4× | 2.9× | ## Results — memory delta (KB retained after 5 rounds) -Post-run `collectgarbage("count")` minus baseline. Captures GC-rooted state -the parser retains across iterations; transient per-call allocations are -collected before the snapshot. +Post-run `collectgarbage("count")` minus baseline. Captures heap usage after +the timing rounds without forcing a final collection, so short-lived garbage +from the last round may still be included. 
-| Scenario | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | `qd.decode + qd.encode` | +| Scenario | cjson | simdjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` | |---|---:|---:|---:|---:|---:| -| small | +15,881 | +16,284 | +1,338 | +4,337 | +11,140 | -| medium | +1,955 | +2,661 | +66 | +500 | +1,120 | -| github-100k | +12,867 | — | +19 | +592 | +273 | -| 100k | +601 | +950 | +18 | +429 | +229 | -| 200k | +505 | +722 | +7 | +206 | +112 | -| 500k | +648 | +757 | +3 | +83 | +45 | -| 1m | +1,151 | +1,246 | +2 | +62 | +34 | -| 2m | +2,311 | +2,510 | +3 | +82 | +45 | -| 5m | +5,723 | +6,191 | +3 | +82 | +45 | -| 10m | +11,262 | +12,053 | +3 | +83 | +45 | -| interleaved | +4,509 | +6,464 | +53 | +1,671 | +898 | +| small | +15,464 | +15,447 | +4,094 | +15,251 | +11,908 | +| medium | +1,955 | +2,660 | +160 | +1,210 | +1,216 | +| github-100k | +13,187 | +3,362 | +29 | +548 | +242 | +| 100k | +484 | +748 | +79 | +704 | +241 | +| 200k | +392 | +523 | +40 | +352 | +124 | +| 500k | +577 | +630 | +17 | +142 | +48 | +| 1m | +1,082 | +1,121 | +13 | +107 | +37 | +| 2m | +1,155 | +1,248 | +21 | +211 | +48 | +| 5m | +1,316 | +1,538 | +17 | +403 | +48 | +| 10m | +1,583 | +2,014 | +16 | +844 | +48 | +| interleaved | +3,355 | +4,404 | +314 | +2,825 | +945 | `qd.parse` retention is essentially constant across payload size: the only GC-rooted state is the reusable `indices: Vec` and `scratch` buffers. @@ -152,62 +135,40 @@ lazy proxy and any cached child views — but still allocate one to two orders of magnitude less than the eager parsers, which materialize every key into the Lua table heap. -## Pure-decode comparison (no field access) - -Where the rows above measure "decode + use a few fields", this isolates -parse-time only. `cjson` and `simdjson` must still materialize a full Lua -table (no API to stop short of that); `qd.parse` does only the structural -scan and the skip-cache prep, deferring all per-field decode to whoever -later asks. 
Captures the upper bound of the lazy win. - -| Scenario | cjson | simdjson | `qd.parse` | `qd.parse` / cjson | `qd.parse` / simdjson | -|---|---:|---:|---:|---:|---:| -| small | 47,699 | 72,776 | 264,985 | 5.6× | 3.6× | -| medium | 6,698 | 48,328 | 105,485 | 15.7× | 2.2× | -| 100k | 3,944 | 35,753 | 154,321 | 39.1× | 4.3× | -| 200k | 1,974 | 17,403 | 80,386 | 40.7× | 4.6× | -| 500k | 773 | 6,911 | 35,149 | 45.5× | 5.1× | -| 1m | 362 | 2,611 | 14,691 | 40.6× | 5.6× | -| 2m | 179 | 1,197 | 7,516 | 42.0× | 6.3× | -| 5m | 74 | 293 | 2,876 | 38.9× | 9.8× | -| 10m | 37 | 143 | 665 | 18.0× | 4.7× | - ## Observations -1. **`simdjson` is 4–10× faster than `cjson` in the medium-to-large range**; - the gap narrows at both ends — very small payloads are dominated by - fixed per-call overhead, very large ones become memory-bandwidth bound on - the Lua-table build. -2. **`quickdecode` is 16–48× faster than `cjson` and 2–10× faster than - `simdjson`** on this workload. The win is not from SIMD — `simdjson` - already has that — but from never building a Lua table. Field reads pay - their own cost, but most fields are never read. +1. **`quickdecode` is fastest once payloads move beyond tiny inputs.** + The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and + larger multimodal payloads show roughly 17–28× higher throughput than + `cjson` and roughly 2–5× higher throughput than `lua-resty-simdjson` + for request-field access. +2. **Reading every `messages[*].content` is still access-light for large + multimodal bodies.** The benchmark touches the top-level request fields and + one `content` field per message; the payload size comes from image data + inside each message. 3. **The win drops at 10 MB.** `qd.parse` is L3-bandwidth-bound at that size, and the `qd.decode` proxy's per-`__index` dispatch starts to - amortize less well against the cheaper structural scan. 
Other parsers - are still allocating into the table heap at that size, so they degrade - too, but the ratio compresses. + amortize less well against the cheaper structural scan. `cjson` is still + allocating into the table heap at that size, so the ratio remains large. 4. **`qd.decode + qd.encode (unmodified)` is the headline number for passthrough workloads** — e.g. an LLM gateway re-emitting the original JSON after light-touch inspection. The substring fast path means re-emit is `memcpy`, not re-serialize, and the throughput tracks `qd.parse` very closely. 5. **Memory retention** for `quickdecode` is essentially flat in payload - size; the eager parsers retain ~1× the input size after the first run + size; the eager parsers retain more Lua heap after the first run because the Lua table tree stays GC-rooted until the next collection. - The 10 MB case retains ~11 MB for `cjson` / `simdjson`, ~3 KB for - `qd.parse`. -6. **REST API payloads (github-100k) show a 5× speedup** — lower than the - multimodal payloads because the structural density is higher (~3-5% vs - <0.1%). However, memory savings remain dramatic: 677× less retention - (12.8 MB → 19 KB) because `cjson` must materialize every nested object - and string into the Lua heap. + The 10 MB case retains ~1.5 MB for `cjson`, ~2.0 MB for `simdjson`, + and ~16 KB for `qd.parse`. +6. **REST API payloads (github-100k) show a smaller speedup** because their + structural density is higher than the multimodal request ladder. Memory + savings remain dramatic because `cjson` must materialize every nested + object and string into the Lua heap. ## When to pick which -- **Read most/all fields** → `cjson` or `simdjson`. `simdjson` is a near- - drop-in faster replacement (pool the parser). -- **Parse, read a few fields, discard / re-emit** → `quickdecode`. The +- **Read most/all fields** → `cjson`. +- **Parse, read selected fields, discard / re-emit** → `quickdecode`. 
The bigger the payload and the smaller the read fraction, the larger the win. `qd.decode` / `qd.encode` gives a `cjson`-shaped surface; `qd.parse` + path getters is the lower-level API with slightly higher peak @@ -222,9 +183,8 @@ later asks. Captures the upper bound of the lazy win. do, broadly. - Workload is biased toward string-heavy payloads (chat-completion image parts). Object-key-heavy JSON shifts the picture: more structural work - per byte and less raw `memcpy`, so the SIMD scanners (`simdjson`, - `quickdecode`'s AVX2 path) get further ahead of `cjson` and the - table-build cost on the eager side rises. + per byte and less raw `memcpy`, while the table-build cost on the eager + side rises. - `quickdecode` retains the source buffer on the `Doc`, so the input string stays alive for the document's lifetime. If you parse and immediately discard the JSON string in the caller, GC can still free