From d1f7457fea638a60923a02f04cb5c8ed7850aa55 Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 10:06:31 +0800
Subject: [PATCH 1/9] bench: read message content in lua benchmark

---
 benches/lua_bench.lua |  57 +++++++++----
 docs/benchmarks.md    | 192 ++++++++++++++++--------------------------
 2 files changed, 114 insertions(+), 135 deletions(-)

diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 7f2c1de..c64dc84 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -190,19 +190,31 @@ end
 local function default_cjson_access(obj)
     local _ = obj.model
     local _ = obj.temperature
-    local _ = obj.messages and obj.messages[1] and obj.messages[1].role
+    if obj.messages then
+        for _, msg in ipairs(obj.messages) do
+            local _ = msg.content
+        end
+    end
 end
 
 local function default_qd_access(d)
     local _ = d:get_str("model")
     local _ = d:get_f64("temperature")
-    local _ = d:get_str("messages[0].role")
+    local n = d:len("messages") or 0
+    for i = 0, n - 1 do
+        local _ = d:typeof("messages[" .. i .. "].content")
+    end
 end
 
 local function default_table_access(t)
     local _ = t.model
     local _ = t.temperature
-    local _ = t.messages and t.messages[1] and t.messages[1].role
+    if t.messages then
+        for i = 1, qd.len(t.messages) do
+            local msg = t.messages[i]
+            local _ = msg.content
+        end
+    end
 end
 
 -- GitHub issues accessors: array of issues, access first issue's fields
@@ -250,18 +262,18 @@ for _, s in ipairs(scenarios) do
     local qd_access = s.qd_access or default_qd_access
     local table_access = s.table_access or default_table_access
 
-    bench("cjson.decode + access 3 fields", s.iters, function()
+    bench("cjson.decode + access fields", s.iters, function()
         local obj = cjson.decode(s.payload)
         cjson_access(obj)
     end)
 
-    bench("quickdecode.parse + access 3 fields", s.iters, function()
+    bench("quickdecode.parse + access fields", s.iters, function()
         local d = qd.parse(s.payload)
         qd_access(d)
     end)
 
     if has_pooled_api then
-        bench("quickdecode pooled :parse + access 3 fields", s.iters, function()
+        bench("quickdecode pooled :parse + access fields", s.iters, function()
             local d = pooled_decoder:parse(s.payload)
             qd_access(d)
         end)
@@ -273,7 +285,7 @@ for _, s in ipairs(scenarios) do
         end)
     end
 
-    bench("qd.decode + t.field x3", s.iters, function()
+    bench("qd.decode + access content", s.iters, function()
         local t = qd.decode(s.payload)
         table_access(t)
     end)
@@ -315,41 +327,56 @@ print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ",
 
 do
     local next_p = make_cycler(interleaved)
-    bench("cjson.decode + access 3 fields", 400, function()
+    bench("cjson.decode + access fields", 400, function()
         local p = next_p()
         local obj = cjson.decode(p)
         local _ = obj.model
         local _ = obj.temperature
-        local _ = obj.messages and obj.messages[1] and obj.messages[1].role
+        if obj.messages then
+            for _, msg in ipairs(obj.messages) do
+                local _ = msg.content
+            end
+        end
     end)
 
     next_p = make_cycler(interleaved)
-    bench("quickdecode.parse + access 3 fields", 400, function()
+    bench("quickdecode.parse + access fields", 400, function()
         local p = next_p()
         local d = qd.parse(p)
         local _ = d:get_str("model")
         local _ = d:get_f64("temperature")
-        local _ = d:get_str("messages[0].role")
+        local n = d:len("messages") or 0
+        for i = 0, n - 1 do
+            local _ = d:typeof("messages[" .. i .. "].content")
+        end
     end)
 
     if has_pooled_api then
         next_p = make_cycler(interleaved)
-        bench("quickdecode pooled :parse + access 3 fields", 400, function()
+        bench("quickdecode pooled :parse + access fields", 400, function()
             local p = next_p()
             local d = pooled_decoder:parse(p)
             local _ = d:get_str("model")
             local _ = d:get_f64("temperature")
-            local _ = d:get_str("messages[0].role")
+            local n = d:len("messages") or 0
+            for i = 0, n - 1 do
+                local _ = d:typeof("messages[" .. i .. "].content")
+            end
         end)
     end
 
     next_p = make_cycler(interleaved)
-    bench("qd.decode + t.field x3", 400, function()
+    bench("qd.decode + access content", 400, function()
         local p = next_p()
         local t = qd.decode(p)
         local _ = t.model
         local _ = t.temperature
-        local _ = t.messages and t.messages[1] and t.messages[1].role
+        if t.messages then
+            for i = 1, qd.len(t.messages) do
+                local msg = t.messages[i]
+                local _ = msg.content
+            end
+        end
     end)
 
     next_p = make_cycler(interleaved)
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index ea4eae9..764eb4e 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -1,32 +1,23 @@
 # Benchmarks
 
 Throughput and memory comparison of `quickdecode` (this library) against
-`lua-cjson` and `lua-resty-simdjson` on a multimodal chat-completion payload
-ladder from 2 KB to 10 MB.
+`lua-cjson` on a multimodal chat-completion payload ladder from 2 KB to 10 MB.
 
-`quickdecode` is optimized for *parse + read a small number of fields*; the
-data below quantifies how the lazy structural scan beats an eager build-the-
-whole-table parser, and where the gap narrows. `lua-cjson` is the baseline.
-`lua-resty-simdjson` (a Lua binding over the simdjson C++ library, eager) is
-included to show how much of the win comes from SIMD vs. from skipping the
-table build.
+`quickdecode` is optimized for *parse + read a small part of the document*;
+the data below quantifies how the lazy structural scan behaves when the caller
+reads request metadata plus every chat message `content`, without eagerly
+building the whole Lua table. `lua-cjson` is the eager-table baseline.
 
 ## Environment
 
 | | |
 |---|---|
 | Host CPU | Intel Xeon (Skylake, IBRS), 4 cores |
-| Memory | 7.6 GiB |
+| Memory | 15 GiB |
 | OS | Linux x86_64 |
-| Runtime | OpenResty `resty` 0.29 / openresty 1.29.2.3 / LuaJIT 2.1 ROLLING |
+| Runtime | Homebrew LuaJIT 2.1.1774896198 |
 | `quickdecode` | this repo, release build, AVX2 + PCLMUL scanner active |
-| `lua-cjson` | bundled with OpenResty |
-| `lua-resty-simdjson` | upstream `main` build at `/tmp/lua-resty-simdjson`, simdjson C++ pinned by that repo |
-
-The bench uses the OpenResty `resty` CLI because `lua-resty-simdjson` pulls in
-`ngx.null` / `ngx.sleep` at load time and cannot run under bare LuaJIT
-without an OpenResty environment. `lua-cjson` and `quickdecode` themselves
-run fine under bare LuaJIT.
+| `lua-cjson` | vendored `openresty/lua-cjson` |
 
 ## Methodology
 
@@ -58,10 +49,9 @@ parsing workloads with ~3-5% structural density.
 
 | Row | What it does | Notes |
 |---|---|---|
-| `cjson.decode + access 3 fields` | `cjson.decode(s)` then read 3 fields | Eager Lua table |
-| `resty.simdjson:decode + access 3 fields` | `parser:decode(s)` then read 3 fields | Eager Lua table; **parser instance is reused** across iterations (the upstream-recommended pattern) |
-| `quickdecode.parse + access 3 fields` | `qd.parse(s)` then `d:get_str/get_f64` × 3 | Lazy structural scan; explicit path-based reads |
-| `qd.decode + t.field x3` | `qd.decode(s)` then `t.model` / `t.temperature` / `t.messages[1].role` | Lazy table proxy; reads go through `__index` |
+| `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `message.content` | Eager Lua table |
+| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then read every `messages[i].content` type | Lazy structural scan; explicit path-based reads |
+| `qd.decode + access content` | `qd.decode(s)`, read `model` / `temperature`, then read every `message.content` | Lazy table proxy; reads go through `__index` |
 | `qd.decode + qd.encode (unmodified)` | `qd.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` |
 
 ## Reproducing
@@ -73,57 +63,42 @@ make bench
 ```
 
 This invokes `benches/lua_bench.lua` with `LD_LIBRARY_PATH=target/release`
-and a `LUA_CPATH` that picks up `cjson` from the system locations. It does
-**not** include `lua-resty-simdjson`.
-
-To also include `lua-resty-simdjson` you need (1) the library installed
-somewhere `package.cpath` can reach the `.so`, (2) its Lua wrapper on
-`package.path`, and (3) the bench script patched to require it. The patch
-that adds the bench rows is a small `pcall(require, "resty.simdjson")` block;
-keep it local — it is not part of the upstream bench file. Run it through
-`resty` so the `ngx.*` symbols are available:
-
-```sh
-LD_LIBRARY_PATH=$PWD/target/release \
-LUA_CPATH='/path/to/lua-resty-simdjson/?.so;./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so' \
-LUA_PATH='/path/to/lua-resty-simdjson/lib/?.lua;/path/to/lua-resty-simdjson/lib/?/init.lua;./lua/?.lua;;' \
-/usr/local/openresty/bin/resty benches/lua_bench.lua
-```
+and a `LUA_CPATH` that picks up the vendored `lua-cjson` build.
 
 Numbers below come from one such run.
 
 ## Results — throughput (median ops/s)
 
-Each row is "parse + access 3 fields" on the named payload.
-
-| Scenario | Size | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | `qd.decode + qd.encode` |
-|---|---:|---:|---:|---:|---:|---:|
-| small      |   2.1 KB | 39,414 | 54,395 | 117,233 | 126,807 | 268,240 |
-| medium     |  60.4 KB |  5,600 | 40,180 |  90,074 | 120,627 | 126,263 |
-| github-100k |   100 KB |  5,373 |      — |  27,020 |  27,367 |  36,430 |
-| 100k       |   100 KB |  2,589 | 19,944 |  72,202 |  61,162 |  80,257 |
-| 200k       |   200 KB |  1,414 | 14,397 |  57,670 |  48,031 |  58,548 |
-| 500k       |   500 KB |    722 |  5,882 |  34,602 |  33,167 |  36,900 |
-| 1m         |  1.00 MB |    355 |  2,048 |  12,723 |  12,448 |  12,669 |
-| 2m         |  2.00 MB |    157 |    886 |   7,143 |   6,521 |   7,432 |
-| 5m         |  5.00 MB |     64 |    250 |   2,509 |   2,235 |   2,552 |
-| 10m        | 10.00 MB |     32 |    128 |     537 |     609 |     540 |
-| interleaved (100k/200k/500k/1m, cycled) |  — |    723 |  4,399 |  21,424 |  23,378 |  24,004 |
+Each row is "parse + access request fields" on the named payload.
+
+| Scenario | Size | cjson | `qd.parse` | `qd.decode + access` | `qd.decode + qd.encode` |
+|---|---:|---:|---:|---:|---:|
+| small      |   2.1 KB | 114,427 | 133,536 |  82,816 | 146,101 |
+| medium     |  60.4 KB |   8,178 | 161,342 | 143,719 | 147,275 |
+| github-100k |   100 KB |   2,431 |   4,474 |   4,449 |   4,774 |
+| 100k       |   100 KB |   4,865 | 135,501 | 102,987 | 114,943 |
+| 200k       |   200 KB |   2,443 |  72,780 |  62,189 |  67,295 |
+| 500k       |   500 KB |     979 |  32,000 |  29,412 |  30,534 |
+| 1m         |  1.00 MB |     478 |  16,538 |  15,723 |  16,043 |
+| 2m         |  2.00 MB |     238 |   8,319 |   8,055 |   8,183 |
+| 5m         |  5.00 MB |      94 |   2,933 |   2,926 |   2,982 |
+| 10m        | 10.00 MB |      47 |   1,015 |   1,042 |   1,065 |
+| interleaved (100k/200k/500k/1m, cycled) | — | 1,066 | 32,717 | 29,089 | 30,969 |
 
 ### Speed-up vs. baselines
 
-| Scenario | simdjson / cjson | `qd.parse` / cjson | `qd.parse` / simdjson | `qd.decode + access` / cjson |
-|---|---:|---:|---:|---:|
-| small  | 1.4× |  3.0× | 2.2× |  3.2× |
-| medium | 7.2× | 16.1× | 2.2× | 21.5× |
-| github-100k | — | 5.0× | — | 5.1× |
-| 100k   | 7.7× | 27.9× | 3.6× | 23.6× |
-| 200k   | 10.2× | 40.8× | 4.0× | 34.0× |
-| 500k   | 8.1× | 47.9× | 5.9× | 45.9× |
-| 1m     | 5.8× | 35.8× | 6.2× | 35.1× |
-| 2m     | 5.6× | 45.5× | 8.1× | 41.5× |
-| 5m     | 3.9× | 39.2× | 10.0× | 34.9× |
-| 10m    | 4.0× | 16.8× | 4.2× | 19.0× |
+| Scenario | `qd.parse` / cjson | `qd.decode + access` / cjson |
+|---|---:|---:|
+| small  |  1.2× |  0.7× |
+| medium | 19.7× | 17.6× |
+| github-100k | 1.8× | 1.8× |
+| 100k   | 27.9× | 21.2× |
+| 200k   | 29.8× | 25.5× |
+| 500k   | 32.7× | 30.0× |
+| 1m     | 34.6× | 32.9× |
+| 2m     | 35.0× | 33.8× |
+| 5m     | 31.2× | 31.1× |
+| 10m    | 21.6× | 22.2× |
 
 ## Results — memory delta (KB retained after 5 rounds)
 
@@ -131,19 +106,19 @@ Post-run `collectgarbage("count")` minus baseline. Captures GC-rooted state
 the parser retains across iterations; transient per-call allocations are
 collected before the snapshot.
 
-| Scenario | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | `qd.decode + qd.encode` |
-|---|---:|---:|---:|---:|---:|
-| small      | +15,881 | +16,284 | +1,338 | +4,337 | +11,140 |
-| medium     |  +1,955 |  +2,661 |    +66 |   +500 |  +1,120 |
-| github-100k | +12,867 |       — |    +19 |   +592 |    +273 |
-| 100k       |    +601 |    +950 |    +18 |   +429 |    +229 |
-| 200k       |    +505 |    +722 |     +7 |   +206 |    +112 |
-| 500k       |    +648 |    +757 |     +3 |    +83 |     +45 |
-| 1m         |  +1,151 |  +1,246 |     +2 |    +62 |     +34 |
-| 2m         |  +2,311 |  +2,510 |     +3 |    +82 |     +45 |
-| 5m         |  +5,723 |  +6,191 |     +3 |    +82 |     +45 |
-| 10m        | +11,262 | +12,053 |     +3 |    +83 |     +45 |
-| interleaved |  +4,509 |  +6,464 |    +53 | +1,671 |    +898 |
+| Scenario | cjson | `qd.parse` | `qd.decode + access` | `qd.decode + qd.encode` |
+|---|---:|---:|---:|---:|
+| small      | +15,974 | +4,069 | +17,425 | +13,478 |
+| medium     |  +1,955 |    +81 |  +1,349 |  +1,349 |
+| github-100k | +12,655 |   +83 |    +592 |    +273 |
+| 100k       |    +601 |   +77 |    +739 |    +270 |
+| 200k       |    +506 |   +34 |    +370 |    +135 |
+| 500k       |    +648 |   +14 |    +149 |     +54 |
+| 1m         |  +1,151 |   +10 |    +111 |     +41 |
+| 2m         |  +2,312 |   +14 |    +148 |     +55 |
+| 5m         |  +5,723 |   +14 |    +148 |     +55 |
+| 10m        | +11,262 |   +14 |    +148 |     +54 |
+| interleaved | +4,508 |   +70 |  +3,002 |  +1,079 |
 
 `qd.parse` retention is essentially constant across payload size: the only
 GC-rooted state is the reusable `indices: Vec<u32>` and `scratch` buffers.
@@ -152,41 +127,21 @@ lazy proxy and any cached child views — but still allocate one to two
 orders of magnitude less than the eager parsers, which materialize every
 key into the Lua table heap.
 
-## Pure-decode comparison (no field access)
-
-Where the rows above measure "decode + use a few fields", this isolates
-parse-time only. `cjson` and `simdjson` must still materialize a full Lua
-table (no API to stop short of that); `qd.parse` does only the structural
-scan and the skip-cache prep, deferring all per-field decode to whoever
-later asks. Captures the upper bound of the lazy win.
-
-| Scenario | cjson | simdjson | `qd.parse` | `qd.parse` / cjson | `qd.parse` / simdjson |
-|---|---:|---:|---:|---:|---:|
-| small  |  47,699 | 72,776 | 264,985 |  5.6× | 3.6× |
-| medium |   6,698 | 48,328 | 105,485 | 15.7× | 2.2× |
-| 100k   |   3,944 | 35,753 | 154,321 | 39.1× | 4.3× |
-| 200k   |   1,974 | 17,403 |  80,386 | 40.7× | 4.6× |
-| 500k   |     773 |  6,911 |  35,149 | 45.5× | 5.1× |
-| 1m     |     362 |  2,611 |  14,691 | 40.6× | 5.6× |
-| 2m     |     179 |  1,197 |   7,516 | 42.0× | 6.3× |
-| 5m     |      74 |    293 |   2,876 | 38.9× | 9.8× |
-| 10m    |      37 |    143 |     665 | 18.0× | 4.7× |
-
 ## Observations
 
-1. **`simdjson` is 4–10× faster than `cjson` in the medium-to-large range**;
-   the gap narrows at both ends — very small payloads are dominated by
-   fixed per-call overhead, very large ones become memory-bandwidth bound on
-   the Lua-table build.
-2. **`quickdecode` is 16–48× faster than `cjson` and 2–10× faster than
-   `simdjson`** on this workload. The win is not from SIMD — `simdjson`
-   already has that — but from never building a Lua table. Field reads pay
-   their own cost, but most fields are never read.
+1. **`quickdecode` is fastest once payloads move beyond tiny inputs.**
+   The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and
+   larger multimodal payloads show roughly 20–35× higher throughput than
+   `cjson` for request-field access.
+2. **Reading every message `content` is still access-light for large
+   multimodal bodies.** The benchmark touches the top-level request fields and
+   one `content` field per message, but it does not materialize every nested
+   image part or base64 string unless that field is read through the lazy table
+   API.
 3. **The win drops at 10 MB.** `qd.parse` is L3-bandwidth-bound at that
    size, and the `qd.decode` proxy's per-`__index` dispatch starts to
-   amortize less well against the cheaper structural scan. Other parsers
-   are still allocating into the table heap at that size, so they degrade
-   too, but the ratio compresses.
+   amortize less well against the cheaper structural scan. `cjson` is still
+   allocating into the table heap at that size, so the ratio remains large.
 4. **`qd.decode + qd.encode (unmodified)` is the headline number for
    passthrough workloads** — e.g. an LLM gateway re-emitting the original
    JSON after light-touch inspection. The substring fast path means
@@ -195,19 +150,17 @@ later asks. Captures the upper bound of the lazy win.
 5. **Memory retention** for `quickdecode` is essentially flat in payload
    size; the eager parsers retain ~1× the input size after the first run
    because the Lua table tree stays GC-rooted until the next collection.
-   The 10 MB case retains ~11 MB for `cjson` / `simdjson`, ~3 KB for
+   The 10 MB case retains ~11 MB for `cjson`, ~14 KB for
    `qd.parse`.
-6. **REST API payloads (github-100k) show a 5× speedup** — lower than the
-   multimodal payloads because the structural density is higher (~3-5% vs
-   <0.1%). However, memory savings remain dramatic: 677× less retention
-   (12.8 MB → 19 KB) because `cjson` must materialize every nested object
-   and string into the Lua heap.
+6. **REST API payloads (github-100k) show a smaller speedup** because their
+   structural density is higher than the multimodal request ladder. Memory
+   savings remain dramatic because `cjson` must materialize every nested
+   object and string into the Lua heap.
 
 ## When to pick which
 
-- **Read most/all fields** → `cjson` or `simdjson`. `simdjson` is a near-
-  drop-in faster replacement (pool the parser).
-- **Parse, read a few fields, discard / re-emit** → `quickdecode`. The
+- **Read most/all fields** → `cjson`.
+- **Parse, read selected fields, discard / re-emit** → `quickdecode`. The
   bigger the payload and the smaller the read fraction, the larger the
   win. `qd.decode` / `qd.encode` gives a `cjson`-shaped surface; `qd.parse`
   + path getters is the lower-level API with slightly higher peak
@@ -222,9 +175,8 @@ later asks. Captures the upper bound of the lazy win.
   do, broadly.
 - Workload is biased toward string-heavy payloads (chat-completion image
   parts). Object-key-heavy JSON shifts the picture: more structural work
-  per byte and less raw `memcpy`, so the SIMD scanners (`simdjson`,
-  `quickdecode`'s AVX2 path) get further ahead of `cjson` and the
-  table-build cost on the eager side rises.
+  per byte and less raw `memcpy`, while the table-build cost on the eager
+  side rises.
 - `quickdecode` retains the source buffer on the `Doc`, so the input
   string stays alive for the document's lifetime. If you parse and
   immediately discard the JSON string in the caller, GC can still free

From 99f08757cc069a4647ed21ce6234f5ee37734dd3 Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 10:15:09 +0800
Subject: [PATCH 2/9] bench: reuse content access helpers

---
 benches/lua_bench.lua | 31 +++--------------------
 docs/benchmarks.md    | 58 +++++++++++++++++++++----------------------
 2 files changed, 33 insertions(+), 56 deletions(-)

diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index c64dc84..d75607c 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -330,25 +330,14 @@ do
     bench("cjson.decode + access fields", 400, function()
         local p = next_p()
         local obj = cjson.decode(p)
-        local _ = obj.model
-        local _ = obj.temperature
-        if obj.messages then
-            for _, msg in ipairs(obj.messages) do
-                local _ = msg.content
-            end
-        end
+        default_cjson_access(obj)
     end)
 
     next_p = make_cycler(interleaved)
     bench("quickdecode.parse + access fields", 400, function()
         local p = next_p()
         local d = qd.parse(p)
-        local _ = d:get_str("model")
-        local _ = d:get_f64("temperature")
-        local n = d:len("messages") or 0
-        for i = 0, n - 1 do
-            local _ = d:typeof("messages[" .. i .. "].content")
-        end
+        default_qd_access(d)
     end)
 
     if has_pooled_api then
@@ -356,12 +345,7 @@ do
         bench("quickdecode pooled :parse + access fields", 400, function()
             local p = next_p()
             local d = pooled_decoder:parse(p)
-            local _ = d:get_str("model")
-            local _ = d:get_f64("temperature")
-            local n = d:len("messages") or 0
-            for i = 0, n - 1 do
-                local _ = d:typeof("messages[" .. i .. "].content")
-            end
+            default_qd_access(d)
         end)
     end
 
@@ -369,14 +353,7 @@ do
     bench("qd.decode + access content", 400, function()
         local p = next_p()
         local t = qd.decode(p)
-        local _ = t.model
-        local _ = t.temperature
-        if t.messages then
-            for i = 1, qd.len(t.messages) do
-                local msg = t.messages[i]
-                local _ = msg.content
-            end
-        end
+        default_table_access(t)
     end)
 
     next_p = make_cycler(interleaved)
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 764eb4e..f931b99 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -50,7 +50,7 @@ parsing workloads with ~3-5% structural density.
 | Row | What it does | Notes |
 |---|---|---|
 | `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `message.content` | Eager Lua table |
-| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then read every `messages[i].content` type | Lazy structural scan; explicit path-based reads |
+| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then read every `messages[i].content` | Lazy structural scan; explicit path-based reads |
 | `qd.decode + access content` | `qd.decode(s)`, read `model` / `temperature`, then read every `message.content` | Lazy table proxy; reads go through `__index` |
 | `qd.decode + qd.encode (unmodified)` | `qd.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` |
 
@@ -73,32 +73,32 @@ Each row is "parse + access request fields" on the named payload.
 
 | Scenario | Size | cjson | `qd.parse` | `qd.decode + access` | `qd.decode + qd.encode` |
 |---|---:|---:|---:|---:|---:|
-| small      |   2.1 KB | 114,427 | 133,536 |  82,816 | 146,101 |
-| medium     |  60.4 KB |   8,178 | 161,342 | 143,719 | 147,275 |
-| github-100k |   100 KB |   2,431 |   4,474 |   4,449 |   4,774 |
-| 100k       |   100 KB |   4,865 | 135,501 | 102,987 | 114,943 |
-| 200k       |   200 KB |   2,443 |  72,780 |  62,189 |  67,295 |
-| 500k       |   500 KB |     979 |  32,000 |  29,412 |  30,534 |
-| 1m         |  1.00 MB |     478 |  16,538 |  15,723 |  16,043 |
-| 2m         |  2.00 MB |     238 |   8,319 |   8,055 |   8,183 |
-| 5m         |  5.00 MB |      94 |   2,933 |   2,926 |   2,982 |
-| 10m        | 10.00 MB |      47 |   1,015 |   1,042 |   1,065 |
-| interleaved (100k/200k/500k/1m, cycled) | — | 1,066 | 32,717 | 29,089 | 30,969 |
+| small      |   2.1 KB | 113,541 | 132,830 |  82,169 | 148,117 |
+| medium     |  60.4 KB |   8,219 | 198,413 | 140,845 | 149,298 |
+| github-100k |   100 KB |   2,410 |   4,505 |   4,450 |   4,781 |
+| 100k       |   100 KB |   4,869 | 135,501 |  97,752 | 111,982 |
+| 200k       |   200 KB |   2,441 |  73,964 |  60,753 |  65,963 |
+| 500k       |   500 KB |     978 |  31,797 |  28,902 |  30,166 |
+| 1m         |  1.00 MB |     478 |  16,287 |  15,560 |  15,890 |
+| 2m         |  2.00 MB |     237 |   8,180 |   7,877 |   7,764 |
+| 5m         |  5.00 MB |      94 |   2,899 |   2,930 |   2,969 |
+| 10m        | 10.00 MB |      47 |   1,044 |   1,046 |   1,049 |
+| interleaved (100k/200k/500k/1m, cycled) | — | 1,066 | 32,204 | 28,696 | 30,485 |
 
 ### Speed-up vs. baselines
 
 | Scenario | `qd.parse` / cjson | `qd.decode + access` / cjson |
 |---|---:|---:|
 | small  |  1.2× |  0.7× |
-| medium | 19.7× | 17.6× |
-| github-100k | 1.8× | 1.8× |
-| 100k   | 27.9× | 21.2× |
-| 200k   | 29.8× | 25.5× |
-| 500k   | 32.7× | 30.0× |
-| 1m     | 34.6× | 32.9× |
-| 2m     | 35.0× | 33.8× |
-| 5m     | 31.2× | 31.1× |
-| 10m    | 21.6× | 22.2× |
+| medium | 24.1× | 17.1× |
+| github-100k | 1.9× | 1.8× |
+| 100k   | 27.8× | 20.1× |
+| 200k   | 30.3× | 24.9× |
+| 500k   | 32.5× | 29.6× |
+| 1m     | 34.1× | 32.6× |
+| 2m     | 34.5× | 33.2× |
+| 5m     | 30.8× | 31.2× |
+| 10m    | 22.2× | 22.3× |
 
 ## Results — memory delta (KB retained after 5 rounds)
 
@@ -108,17 +108,17 @@ collected before the snapshot.
 
 | Scenario | cjson | `qd.parse` | `qd.decode + access` | `qd.decode + qd.encode` |
 |---|---:|---:|---:|---:|
-| small      | +15,974 | +4,069 | +17,425 | +13,478 |
-| medium     |  +1,955 |    +81 |  +1,349 |  +1,349 |
-| github-100k | +12,655 |   +83 |    +592 |    +273 |
-| 100k       |    +601 |   +77 |    +739 |    +270 |
-| 200k       |    +506 |   +34 |    +370 |    +135 |
-| 500k       |    +648 |   +14 |    +149 |     +54 |
+| small      | +15,977 | +4,069 | +17,403 | +13,478 |
+| medium     |  +1,955 |    +66 |  +1,349 |  +1,349 |
+| github-100k | +12,761 |   +19 |    +591 |    +273 |
+| 100k       |    +602 |   +71 |    +739 |    +270 |
+| 200k       |    +505 |   +34 |    +370 |    +136 |
+| 500k       |    +648 |   +14 |    +148 |     +54 |
 | 1m         |  +1,151 |   +10 |    +111 |     +41 |
-| 2m         |  +2,312 |   +14 |    +148 |     +55 |
+| 2m         |  +2,312 |   +14 |    +148 |     +54 |
 | 5m         |  +5,723 |   +14 |    +148 |     +55 |
 | 10m        | +11,262 |   +14 |    +148 |     +54 |
-| interleaved | +4,508 |   +70 |  +3,002 |  +1,079 |
+| interleaved | +4,509 |  +271 |  +2,955 |  +1,079 |
 
 `qd.parse` retention is essentially constant across payload size: the only
 GC-rooted state is the reusable `indices: Vec<u32>` and `scratch` buffers.

From 15ae933e492d55e0f0c87378b6990701b320088d Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 10:16:15 +0800
Subject: [PATCH 3/9] docs: clarify benchmark access labels

---
 docs/benchmarks.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index f931b99..81e2c09 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -50,7 +50,7 @@ parsing workloads with ~3-5% structural density.
 | Row | What it does | Notes |
 |---|---|---|
 | `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `message.content` | Eager Lua table |
-| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then read every `messages[i].content` | Lazy structural scan; explicit path-based reads |
+| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then touch every `messages[i].content` path | Lazy structural scan; explicit path-based reads |
 | `qd.decode + access content` | `qd.decode(s)`, read `model` / `temperature`, then read every `message.content` | Lazy table proxy; reads go through `__index` |
 | `qd.decode + qd.encode (unmodified)` | `qd.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` |
 
@@ -71,7 +71,7 @@ Numbers below come from one such run.
 
 Each row is "parse + access request fields" on the named payload.
 
-| Scenario | Size | cjson | `qd.parse` | `qd.decode + access` | `qd.decode + qd.encode` |
+| Scenario | Size | cjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
 |---|---:|---:|---:|---:|---:|
 | small      |   2.1 KB | 113,541 | 132,830 |  82,169 | 148,117 |
 | medium     |  60.4 KB |   8,219 | 198,413 | 140,845 | 149,298 |
@@ -87,7 +87,7 @@ Each row is "parse + access request fields" on the named payload.
 
 ### Speed-up vs. baselines
 
-| Scenario | `qd.parse` / cjson | `qd.decode + access` / cjson |
+| Scenario | `qd.parse` / cjson | `qd.decode + access content` / cjson |
 |---|---:|---:|
 | small  |  1.2× |  0.7× |
 | medium | 24.1× | 17.1× |
@@ -106,7 +106,7 @@ Post-run `collectgarbage("count")` minus baseline. Captures GC-rooted state
 the parser retains across iterations; transient per-call allocations are
 collected before the snapshot.
 
-| Scenario | cjson | `qd.parse` | `qd.decode + access` | `qd.decode + qd.encode` |
+| Scenario | cjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
 |---|---:|---:|---:|---:|
 | small      | +15,977 | +4,069 | +17,403 | +13,478 |
 | medium     |  +1,955 |    +66 |  +1,349 |  +1,349 |

From 6f3d93f83423c915623ce5577add1de2ca0d5413 Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 10:30:01 +0800
Subject: [PATCH 4/9] bench: model image content across messages

---
 benches/lua_bench.lua | 79 ++++++++++++++++++++-----------------
 docs/benchmarks.md    | 92 +++++++++++++++++++++----------------------
 2 files changed, 88 insertions(+), 83 deletions(-)

diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index d75607c..40ca4aa 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -11,9 +11,10 @@ local function read_file(p)
     return s
 end
 
--- Shape: a multimodal chat-completion request with one ~1.5K text question
--- and N base64-encoded image parts (each 50-500 KB) until the payload reaches
--- target_bytes. Mirrors the production case the bench is meant to reflect.
+-- Shape: a multimodal chat-completion request with multiple historical
+-- messages. Each message contains one small text part and one base64-encoded
+-- image part. The number of messages scales with payload size: a 10 MB request
+-- has roughly ten 1 MB image-bearing messages.
 --
 -- Image sizes are drawn from a deterministic Park-Miller LCG (not math.random,
 -- which delegates to libc rand() and varies across machines) so the same
@@ -117,41 +118,28 @@ local function make_b64(size)
 end
 
 local function make_payload(target_bytes)
-    local rng_state = 42
-    local function rng_range(lo, hi)
-        -- Park-Miller minimal-standard LCG: a=48271, m=2^31-1. Multiplication
-        -- fits in double precision (48271 * 2^31 < 2^53).
-        rng_state = (rng_state * 48271) % 2147483647
-        return lo + (rng_state % (hi - lo + 1))
-    end
-
-    local text = string.rep("Q", 1500)
+    local message_count = math.max(1, math.ceil(target_bytes / (1024 * 1024)))
+    local envelope = '{"model":"gpt-4-vision","temperature":0.7,"messages":[]}'
+    local text = string.rep("Q", 256)
     local text_part = '{"type":"text","text":"' .. text .. '"}'
-    local parts = { text_part }
-    local current = 200 + #text_part  -- approx outer envelope overhead
-
-    while current < target_bytes do
-        local remaining = target_bytes - current
-        local img_size
-        if remaining < 50 * 1024 then
-            -- Final image: shrink below the 50 KB floor so the label matches
-            -- the actual payload size. Bench iters all see the same payload
-            -- regardless, so the smaller tail blob doesn't change what's
-            -- being measured.
-            img_size = math.max(1024, remaining)
-        else
-            local upper = math.min(500 * 1024, remaining)
-            img_size = rng_range(50 * 1024, upper)
-        end
-        local b64 = make_b64(img_size)
-        local img_part = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
-            .. b64 .. '"}}'
-        parts[#parts + 1] = img_part
-        current = current + #img_part + 1  -- +1 for comma
+    local image_prefix = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
+    local image_suffix = '"}}'
+    local message_overhead = #('{"role":"user","content":[,]}') + #text_part
+        + #image_prefix + #image_suffix
+    local remaining = target_bytes - #envelope - (message_count * message_overhead)
+    local image_size = math.max(1024, math.floor(remaining / message_count))
+
+    local messages = {}
+    for i = 1, message_count do
+        local role = i % 2 == 1 and "user" or "assistant"
+        local b64 = make_b64(image_size)
+        local image_part = image_prefix .. b64 .. image_suffix
+        messages[i] = '{"role":"' .. role .. '","content":['
+            .. text_part .. "," .. image_part .. ']}'
     end
 
-    return '{"model":"gpt-4-vision","temperature":0.7,"messages":'
-        .. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}'
+    return '{"model":"gpt-4-vision","temperature":0.7,"messages":['
+        .. table.concat(messages, ",") .. ']}'
 end
 
 local ROUNDS = 5
@@ -197,12 +185,29 @@ local function default_cjson_access(obj)
     end
 end
 
+local content_paths_by_message_count = {}
+
+local function content_paths(n)
+    local paths = content_paths_by_message_count[n]
+    if paths then
+        return paths
+    end
+
+    paths = {}
+    for i = 0, n - 1 do
+        paths[i + 1] = "messages[" .. i .. "].content"
+    end
+    content_paths_by_message_count[n] = paths
+    return paths
+end
+
 local function default_qd_access(d)
     local _ = d:get_str("model")
     local _ = d:get_f64("temperature")
     local n = d:len("messages") or 0
-    for i = 0, n - 1 do
-        local _ = d:typeof("messages[" .. i .. "].content")
+    local paths = content_paths(n)
+    for i = 1, n do
+        local _ = d:typeof(paths[i])
     end
 end
 
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 81e2c09..ff960e3 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -33,11 +33,12 @@ The harness lives at `benches/lua_bench.lua`. For each scenario:
    KB — measures GC-rooted state retained by the parser, not transient
    per-call allocations.
 
-The payload is a synthetic multimodal chat-completion request — one
-~1.5 KB text part plus N base64-encoded image parts of 50–500 KB each
-until the target size is reached. The image size sequence comes from a
-Park–Miller LCG with `seed=42` rather than `math.random` so the payload is
-byte-identical across hosts.
+The payload is a synthetic multimodal chat-completion request with multiple
+historical messages. Each message contains one small text part and one
+base64-encoded image part. Message count scales with payload size: the 10 MB
+scenario has roughly ten messages, each carrying one ~1 MB image, so the
+access pattern matches request bodies where every historical message includes
+an image.
 
 A separate `github-100k` scenario simulates a GitHub Issues API response
 (`/repos/{owner}/{repo}/issues`) with ~100 KB of realistic REST API
@@ -49,9 +50,9 @@ parsing workloads with ~3-5% structural density.
 
 | Row | What it does | Notes |
 |---|---|---|
-| `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `message.content` | Eager Lua table |
-| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then touch every `messages[i].content` path | Lazy structural scan; explicit path-based reads |
-| `qd.decode + access content` | `qd.decode(s)`, read `model` / `temperature`, then read every `message.content` | Lazy table proxy; reads go through `__index` |
+| `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table |
+| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads |
+| `qd.decode + access content` | `qd.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` |
 | `qd.decode + qd.encode (unmodified)` | `qd.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` |
 
 ## Reproducing
@@ -73,52 +74,52 @@ Each row is "parse + access request fields" on the named payload.
 
 | Scenario | Size | cjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
 |---|---:|---:|---:|---:|---:|
-| small      |   2.1 KB | 113,541 | 132,830 |  82,169 | 148,117 |
-| medium     |  60.4 KB |   8,219 | 198,413 | 140,845 | 149,298 |
-| github-100k |   100 KB |   2,410 |   4,505 |   4,450 |   4,781 |
-| 100k       |   100 KB |   4,869 | 135,501 |  97,752 | 111,982 |
-| 200k       |   200 KB |   2,441 |  73,964 |  60,753 |  65,963 |
-| 500k       |   500 KB |     978 |  31,797 |  28,902 |  30,166 |
-| 1m         |  1.00 MB |     478 |  16,287 |  15,560 |  15,890 |
-| 2m         |  2.00 MB |     237 |   8,180 |   7,877 |   7,764 |
-| 5m         |  5.00 MB |      94 |   2,899 |   2,930 |   2,969 |
-| 10m        | 10.00 MB |      47 |   1,044 |   1,046 |   1,049 |
-| interleaved (100k/200k/500k/1m, cycled) | — | 1,066 | 32,204 | 28,696 | 30,485 |
+| small      |   2.1 KB | 113,056 | 132,184 |  81,769 | 145,722 |
+| medium     |  60.4 KB |   8,194 | 196,773 | 142,086 | 147,406 |
+| github-100k |   100 KB |   2,424 |   4,510 |   4,444 |   4,783 |
+| 100k       |   100 KB |   4,874 | 144,509 | 100,100 | 107,527 |
+| 200k       |   200 KB |   2,446 |  78,247 |  64,350 |  69,832 |
+| 500k       |   500 KB |     982 |  33,003 |  30,211 |  31,299 |
+| 1m         |  1.00 MB |     478 |  16,930 |  16,146 |  16,358 |
+| 2m         |  2.00 MB |     238 |   8,361 |   8,127 |   8,302 |
+| 5m         |  5.00 MB |      95 |   2,939 |   2,923 |   2,979 |
+| 10m        | 10.00 MB |      48 |   1,046 |   1,045 |     955 |
+| interleaved (100k/200k/500k/1m, cycled) | — | 1,063 | 33,498 | 30,595 | 31,646 |
 
 ### Speed-up vs. baselines
 
 | Scenario | `qd.parse` / cjson | `qd.decode + access content` / cjson |
 |---|---:|---:|
 | small  |  1.2× |  0.7× |
-| medium | 24.1× | 17.1× |
+| medium | 24.0× | 17.3× |
 | github-100k | 1.9× | 1.8× |
-| 100k   | 27.8× | 20.1× |
-| 200k   | 30.3× | 24.9× |
-| 500k   | 32.5× | 29.6× |
-| 1m     | 34.1× | 32.6× |
-| 2m     | 34.5× | 33.2× |
-| 5m     | 30.8× | 31.2× |
-| 10m    | 22.2× | 22.3× |
+| 100k   | 29.6× | 20.5× |
+| 200k   | 32.0× | 26.3× |
+| 500k   | 33.6× | 30.8× |
+| 1m     | 35.4× | 33.8× |
+| 2m     | 35.1× | 34.1× |
+| 5m     | 30.9× | 30.8× |
+| 10m    | 21.8× | 21.8× |
 
 ## Results — memory delta (KB retained after 5 rounds)
 
-Post-run `collectgarbage("count")` minus baseline. Captures GC-rooted state
-the parser retains across iterations; transient per-call allocations are
-collected before the snapshot.
+Post-run `collectgarbage("count")` minus baseline. Captures heap usage after
+the timing rounds without forcing a final collection, so short-lived garbage
+from the last round may still be included.
 
 | Scenario | cjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
 |---|---:|---:|---:|---:|
-| small      | +15,977 | +4,069 | +17,403 | +13,478 |
-| medium     |  +1,955 |    +66 |  +1,349 |  +1,349 |
-| github-100k | +12,761 |   +19 |    +591 |    +273 |
-| 100k       |    +602 |   +71 |    +739 |    +270 |
-| 200k       |    +505 |   +34 |    +370 |    +136 |
-| 500k       |    +648 |   +14 |    +148 |     +54 |
-| 1m         |  +1,151 |   +10 |    +111 |     +41 |
-| 2m         |  +2,312 |   +14 |    +148 |     +54 |
-| 5m         |  +5,723 |   +14 |    +148 |     +55 |
-| 10m        | +11,262 |   +14 |    +148 |     +54 |
-| interleaved | +4,509 |  +271 |  +2,955 |  +1,079 |
+| small      | +15,985 | +4,069 | +17,408 | +13,478 |
+| medium     |  +1,955 |    +67 |  +1,349 |  +1,349 |
+| github-100k | +12,761 |   +20 |    +591 |    +273 |
+| 100k       |    +485 |   +74 |    +739 |    +270 |
+| 200k       |    +392 |   +34 |    +370 |    +135 |
+| 500k       |    +577 |   +14 |    +148 |     +54 |
+| 1m         |  +1,082 |   +10 |    +111 |     +41 |
+| 2m         |  +1,155 |   +18 |    +217 |     +54 |
+| 5m         |  +1,316 |   +14 |    +409 |     +54 |
+| 10m        |  +1,583 |   +14 |    +717 |     +54 |
+| interleaved | +3,356 |  +271 |  +2,955 |  +1,080 |
 
 `qd.parse` retention is essentially constant across payload size: the only
 GC-rooted state is the reusable `indices: Vec<u32>` and `scratch` buffers.
@@ -133,11 +134,10 @@ key into the Lua table heap.
    The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and
    larger multimodal payloads show roughly 20–35× higher throughput than
    `cjson` for request-field access.
-2. **Reading every message `content` is still access-light for large
+2. **Reading every `messages[*].content` is still access-light for large
    multimodal bodies.** The benchmark touches the top-level request fields and
-   one `content` field per message, but it does not materialize every nested
-   image part or base64 string unless that field is read through the lazy table
-   API.
+   one `content` field per message; the payload size comes from image data
+   inside each message.
 3. **The win drops at 10 MB.** `qd.parse` is L3-bandwidth-bound at that
    size, and the `qd.decode` proxy's per-`__index` dispatch starts to
    amortize less well against the cheaper structural scan. `cjson` is still
@@ -150,7 +150,7 @@ key into the Lua table heap.
 5. **Memory retention** for `quickdecode` is essentially flat in payload
    size; the eager parsers retain ~1× the input size after the first run
    because the Lua table tree stays GC-rooted until the next collection.
-   The 10 MB case retains ~11 MB for `cjson`, ~14 KB for
+   The 10 MB case retains ~1.5 MB for `cjson`, ~14 KB for
    `qd.parse`.
 6. **REST API payloads (github-100k) show a smaller speedup** because their
    structural density is higher than the multimodal request ladder. Memory

From 4c03716a0b89d0816a9db6461c325a1bac83b562 Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 10:40:50 +0800
Subject: [PATCH 5/9] bench: compare simdjson under openresty

---
 CLAUDE.md             |   4 +-
 Makefile              |  21 +++++----
 README.md             |  35 ++++++--------
 benches/lua_bench.lua |  13 ++++++
 docs/benchmarks.md    | 104 ++++++++++++++++++++++--------------------
 5 files changed, 95 insertions(+), 82 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index cec0555..f2ab0c8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -14,7 +14,7 @@ The `Makefile` is the canonical entry point; `make help` lists targets.
 make build              # cargo build --release  → target/release/libquickdecode.so
 make test               # cargo test --release + busted Lua tests
 make lint               # cargo clippy -D warnings + cargo fmt --check
-make bench              # LuaJIT vs lua-cjson on benches/fixtures
+make bench              # OpenResty LuaJIT benchmark vs lua-cjson and simdjson
 ```
 
 Under the hood / for narrower invocations:
@@ -79,7 +79,7 @@ src/
 lua/quickdecode.lua    LuaJIT wrapper (ffi.cdef + Doc/Cursor metatables)
 include/lua_quick_decode.h  public C header
 tests/                Rust integration tests + tests/lua/ busted suite
-benches/              lua_bench.lua vs lua-cjson; fixtures/ has small_api.json + medium_resp.json
+benches/              lua_bench.lua vs lua-cjson/simdjson; fixtures/ has small_api.json + medium_resp.json
 ```
 
 The enum values in `src/error.rs` are duplicated in `include/lua_quick_decode.h` and `lua/quickdecode.lua` (the latter only encodes the `T_*` type tags and `NOT_FOUND = 2`). Keep all three in sync when adding/renumbering codes.
diff --git a/Makefile b/Makefile
index dcf93ee..97071b0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,18 @@
-# Overridable: `make bench LUAJIT=/path/to/luajit LUA_CPATH='...'`
-LUAJIT    ?= $(shell command -v luajit 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit)
-LUA_CPATH ?= ./vendor/lua-cjson/?.so;./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/openresty/luajit/lib/lua/5.1/?.so
-
-LUAJIT_PREFIX ?= $(shell dirname $$(dirname $$(command -v $(LUAJIT) 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit)))
+# Overridable: `make bench LUAJIT=/path/to/luajit RESTY=/path/to/resty LUA_CPATH='...'`
+OPENRESTY ?= /usr/local/openresty
+LUAJIT    ?= $(OPENRESTY)/luajit/bin/luajit
+RESTY     ?= $(OPENRESTY)/bin/resty
+LUA_PATH  ?= ./lua/?.lua;$(OPENRESTY)/lualib/?.lua;$(OPENRESTY)/lualib/?/init.lua;;
+LUA_CPATH ?= ./vendor/lua-cjson/?.so;./target/release/lib?.so;./?.so;$(OPENRESTY)/lualib/?.so;/usr/local/lib/lua/5.1/?.so;$(OPENRESTY)/luajit/lib/lua/5.1/?.so
+
+LUAJIT_PREFIX ?= $(shell dirname $$(dirname $$(command -v $(LUAJIT) 2>/dev/null || echo $(OPENRESTY)/luajit/bin/luajit)))
 LUAJIT_INC    ?= $(LUAJIT_PREFIX)/include/luajit-2.1
 
 LIB_DIR := $(CURDIR)/target/release
 ifeq ($(shell uname),Darwin)
-LUA_ENV := DYLD_LIBRARY_PATH=$(LIB_DIR) LUA_CPATH='$(LUA_CPATH)'
+LUA_ENV := DYLD_LIBRARY_PATH=$(LIB_DIR) LUA_PATH='$(LUA_PATH)' LUA_CPATH='$(LUA_CPATH)'
 else
-LUA_ENV := LD_LIBRARY_PATH=$(LIB_DIR) LUA_CPATH='$(LUA_CPATH)'
+LUA_ENV := LD_LIBRARY_PATH=$(LIB_DIR) LUA_PATH='$(LUA_PATH)' LUA_CPATH='$(LUA_CPATH)'
 endif
 
 .PHONY: help build test lint bench clean
@@ -29,8 +32,8 @@ test: build ## Run cargo tests + busted Lua tests
 lint: ## Run clippy with -D warnings
 	cargo clippy --release --all-targets -- -D warnings
 
-bench: build vendor/lua-cjson/cjson.so ## Run the LuaJIT vs cjson benchmark
-	$(LUA_ENV) $(LUAJIT) benches/lua_bench.lua
+bench: build vendor/lua-cjson/cjson.so ## Run the OpenResty LuaJIT benchmark
+	$(LUA_ENV) $(RESTY) benches/lua_bench.lua
 
 vendor/lua-cjson/cjson.so: | vendor/lua-cjson/Makefile
 ifeq ($(shell uname),Darwin)
diff --git a/README.md b/README.md
index bc9d0c2..6dd604a 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the
 
 ## Status
 
-Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson but tuning is pending — see `Roadmap / Deferred` below.
+Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson but tuning is pending — see `Roadmap / Deferred` below.
 
 ## Building
 
@@ -83,38 +83,29 @@ busted tests/lua --lpath='./lua/?.lua' --cpath='./target/release/lib?.so'
 ## Benchmarks
 
 `quickdecode` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal
-chat-completion payloads, "parse + access 3 fields" workload (median ops/s
-under LuaJIT 2.1, Skylake; 5 rounds, deterministic payload):
+chat-completion payloads, "parse + access model, temperature, and every
+message content" workload (median ops/s under OpenResty LuaJIT 2.1,
+Skylake; 5 rounds, deterministic payload):
 
-| Size | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson |
+| Size | cjson | simdjson | `qd.parse` | `qd.decode + content` | speedup vs. cjson |
 |---:|---:|---:|---:|---:|---:|
-|   2 KB | 39,414 | 54,395 | 117,233 | 126,807 |  3.0× / 3.2× |
-| 100 KB |  2,589 | 19,944 |  72,202 |  61,162 | 27.9× / 23.6× |
-|   1 MB |    355 |  2,048 |  12,723 |  12,448 | 35.8× / 35.1× |
-|  10 MB |     32 |    128 |     537 |     609 | 16.8× / 19.0× |
+|   2 KB | 106,646 | 137,427 | 135,296 |  97,574 |  1.3× /  0.9× |
+| 100 KB |   6,045 |  46,577 | 137,931 | 134,590 | 22.8× / 22.3× |
+|   1 MB |     594 |   4,408 |  16,447 |  16,340 | 27.7× / 27.5× |
+|  10 MB |      59 |     356 |   1,035 |   1,028 | 17.5× / 17.4× |
 
 `qd.parse` wins because it skips building a Lua table for the parts you
 never read; `qd.decode + t.field` adds a cjson-shaped table proxy on top
 with similar throughput. Memory retention for `quickdecode` is essentially
-flat in payload size (a few KB for the reusable buffers), where `cjson`
-and `simdjson` retain ~1× the input size as live Lua-table state.
-
-ARM64 (Apple M4, NEON/PMULL scanner, same workload):
-
-| Size | cjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson |
-|---:|---:|---:|---:|---:|
-|   2 KB | 237,124 | 705,000 | 390,000 |  3.0× /  1.6× |
-| 100 KB |  14,667 | 232,000 | 208,000 | 15.8× / 14.2× |
-|   1 MB |   1,494 |  33,700 |  33,000 | 22.6× / 22.1× |
-|  10 MB |     150 |   3,376 |   3,454 | 22.5× / 23.0× |
+flat in payload size (a few KB for the reusable buffers), while `cjson`
+and `simdjson` retain more Lua heap because they materialize the table tree.
 
 See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
 memory numbers, an "encode round-trip" row (passthrough emit via
-`memcpy`), the pure-decode (no-access) comparison, and the exact
-methodology + reproduction command.
+`memcpy`), and the exact methodology + reproduction command.
 
 ```sh
-make bench       # quickdecode vs cjson
+make bench       # quickdecode vs cjson and lua-resty-simdjson
 ```
 
 ## RFC 8259 conformance
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 40ca4aa..a92c99b 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -3,6 +3,7 @@ package.cpath = package.cpath .. ";./target/release/lib?.so"
 
 local qd    = require("quickdecode")
 local cjson = require("cjson")
+local simdjson = require("resty.simdjson").new()
 
 local function read_file(p)
     local f = assert(io.open(p, "rb"))
@@ -272,6 +273,11 @@ for _, s in ipairs(scenarios) do
         cjson_access(obj)
     end)
 
+    bench("simdjson.decode + access fields", s.iters, function()
+        local obj = simdjson:decode(s.payload)
+        cjson_access(obj)
+    end)
+
     bench("quickdecode.parse + access fields", s.iters, function()
         local d = qd.parse(s.payload)
         qd_access(d)
@@ -338,6 +344,13 @@ do
         default_cjson_access(obj)
     end)
 
+    next_p = make_cycler(interleaved)
+    bench("simdjson.decode + access fields", 400, function()
+        local p = next_p()
+        local obj = simdjson:decode(p)
+        default_cjson_access(obj)
+    end)
+
     next_p = make_cycler(interleaved)
     bench("quickdecode.parse + access fields", 400, function()
         local p = next_p()
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index ff960e3..065e899 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -1,12 +1,14 @@
 # Benchmarks
 
 Throughput and memory comparison of `quickdecode` (this library) against
-`lua-cjson` on a multimodal chat-completion payload ladder from 2 KB to 10 MB.
+`lua-cjson` and `lua-resty-simdjson` on a multimodal chat-completion payload
+ladder from 2 KB to 10 MB.
 
 `quickdecode` is optimized for *parse + read a small part of the document*;
 the data below quantifies how the lazy structural scan behaves when the caller
 reads request metadata plus every chat message `content`, without eagerly
-building the whole Lua table. `lua-cjson` is the eager-table baseline.
+building the whole Lua table. `lua-cjson` and `lua-resty-simdjson` are eager
+Lua-table baselines.
 
 ## Environment
 
@@ -15,9 +17,10 @@ building the whole Lua table. `lua-cjson` is the eager-table baseline.
 | Host CPU | Intel Xeon (Skylake, IBRS), 4 cores |
 | Memory | 15 GiB |
 | OS | Linux x86_64 |
-| Runtime | Homebrew LuaJIT 2.1.1774896198 |
+| Runtime | OpenResty `resty` 0.29 / LuaJIT 2.1.1723681758 |
 | `quickdecode` | this repo, release build, AVX2 + PCLMUL scanner active |
 | `lua-cjson` | vendored `openresty/lua-cjson` |
+| `lua-resty-simdjson` | OpenResty lualib `resty.simdjson` |
 
 ## Methodology
 
@@ -51,20 +54,22 @@ parsing workloads with ~3-5% structural density.
 | Row | What it does | Notes |
 |---|---|---|
 | `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table |
+| `simdjson.decode + access fields` | `resty.simdjson:decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table |
 | `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads |
 | `qd.decode + access content` | `qd.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` |
 | `qd.decode + qd.encode (unmodified)` | `qd.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` |
 
 ## Reproducing
 
-The straight comparison against `cjson` is one command:
+Run the full comparison with one command:
 
 ```sh
 make bench
 ```
 
-This invokes `benches/lua_bench.lua` with `LD_LIBRARY_PATH=target/release`
-and a `LUA_CPATH` that picks up the vendored `lua-cjson` build.
+This builds `quickdecode`, builds the vendored `lua-cjson` against OpenResty's
+LuaJIT, then invokes `benches/lua_bench.lua` through OpenResty's `resty` so
+`lua-resty-simdjson` runs in its normal `ngx` environment.
 
 Numbers below come from one such run.
 
@@ -72,34 +77,34 @@ Numbers below come from one such run.
 
 Each row is "parse + access request fields" on the named payload.
 
-| Scenario | Size | cjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
-|---|---:|---:|---:|---:|---:|
-| small      |   2.1 KB | 113,056 | 132,184 |  81,769 | 145,722 |
-| medium     |  60.4 KB |   8,194 | 196,773 | 142,086 | 147,406 |
-| github-100k |   100 KB |   2,424 |   4,510 |   4,444 |   4,783 |
-| 100k       |   100 KB |   4,874 | 144,509 | 100,100 | 107,527 |
-| 200k       |   200 KB |   2,446 |  78,247 |  64,350 |  69,832 |
-| 500k       |   500 KB |     982 |  33,003 |  30,211 |  31,299 |
-| 1m         |  1.00 MB |     478 |  16,930 |  16,146 |  16,358 |
-| 2m         |  2.00 MB |     238 |   8,361 |   8,127 |   8,302 |
-| 5m         |  5.00 MB |      95 |   2,939 |   2,923 |   2,979 |
-| 10m        | 10.00 MB |      48 |   1,046 |   1,045 |     955 |
-| interleaved (100k/200k/500k/1m, cycled) | — | 1,063 | 33,498 | 30,595 | 31,646 |
+| Scenario | Size | cjson | simdjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
+|---|---:|---:|---:|---:|---:|---:|
+| small      |   2.1 KB | 106,646 | 137,427 | 135,296 |  97,574 | 202,388 |
+| medium     |  60.4 KB |  10,086 |  86,029 | 189,970 | 198,098 | 175,562 |
+| github-100k |   100 KB |   2,208 |   2,880 |   4,496 |   4,479 |   4,809 |
+| 100k       |   100 KB |   6,045 |  46,577 | 137,931 | 134,590 | 153,139 |
+| 200k       |   200 KB |   3,025 |  22,563 |  78,247 |  75,873 |  81,433 |
+| 500k       |   500 KB |   1,216 |   9,128 |  33,058 |  32,680 |  34,188 |
+| 1m         |  1.00 MB |     594 |   4,408 |  16,447 |  16,340 |  16,722 |
+| 2m         |  2.00 MB |     296 |   1,966 |   8,247 |   8,224 |   8,055 |
+| 5m         |  5.00 MB |     118 |     600 |   2,869 |   2,945 |   2,992 |
+| 10m        | 10.00 MB |      59 |     356 |   1,035 |   1,028 |   1,050 |
+| interleaved (100k/200k/500k/1m, cycled) | — | 1,318 | 9,116 | 33,342 | 32,752 | 34,031 |
 
 ### Speed-up vs. baselines
 
-| Scenario | `qd.parse` / cjson | `qd.decode + access content` / cjson |
-|---|---:|---:|
-| small  |  1.2× |  0.7× |
-| medium | 24.0× | 17.3× |
-| github-100k | 1.9× | 1.8× |
-| 100k   | 29.6× | 20.5× |
-| 200k   | 32.0× | 26.3× |
-| 500k   | 33.6× | 30.8× |
-| 1m     | 35.4× | 33.8× |
-| 2m     | 35.1× | 34.1× |
-| 5m     | 30.9× | 30.8× |
-| 10m    | 21.8× | 21.8× |
+| Scenario | `qd.parse` / cjson | `qd.parse` / simdjson | `qd.decode + access content` / cjson | `qd.decode + access content` / simdjson |
+|---|---:|---:|---:|---:|
+| small  |  1.3× |  1.0× |  0.9× |  0.7× |
+| medium | 18.8× |  2.2× | 19.6× |  2.3× |
+| github-100k | 2.0× |  1.6× | 2.0× |  1.6× |
+| 100k   | 22.8× |  3.0× | 22.3× |  2.9× |
+| 200k   | 25.9× |  3.5× | 25.1× |  3.4× |
+| 500k   | 27.2× |  3.6× | 26.9× |  3.6× |
+| 1m     | 27.7× |  3.7× | 27.5× |  3.7× |
+| 2m     | 27.9× |  4.2× | 27.8× |  4.2× |
+| 5m     | 24.3× |  4.8× | 25.0× |  4.9× |
+| 10m    | 17.5× |  2.9× | 17.4× |  2.9× |
 
 ## Results — memory delta (KB retained after 5 rounds)
 
@@ -107,19 +112,19 @@ Post-run `collectgarbage("count")` minus baseline. Captures heap usage after
 the timing rounds without forcing a final collection, so short-lived garbage
 from the last round may still be included.
 
-| Scenario | cjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
-|---|---:|---:|---:|---:|
-| small      | +15,985 | +4,069 | +17,408 | +13,478 |
-| medium     |  +1,955 |    +67 |  +1,349 |  +1,349 |
-| github-100k | +12,761 |   +20 |    +591 |    +273 |
-| 100k       |    +485 |   +74 |    +739 |    +270 |
-| 200k       |    +392 |   +34 |    +370 |    +135 |
-| 500k       |    +577 |   +14 |    +148 |     +54 |
-| 1m         |  +1,082 |   +10 |    +111 |     +41 |
-| 2m         |  +1,155 |   +18 |    +217 |     +54 |
-| 5m         |  +1,316 |   +14 |    +409 |     +54 |
-| 10m        |  +1,583 |   +14 |    +717 |     +54 |
-| interleaved | +3,356 |  +271 |  +2,955 |  +1,080 |
+| Scenario | cjson | simdjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
+|---|---:|---:|---:|---:|---:|
+| small      | +15,464 | +15,447 | +4,094 | +15,251 | +11,908 |
+| medium     |  +1,955 |  +2,660 |   +160 |  +1,210 |  +1,216 |
+| github-100k | +13,187 | +3,362 |   +29 |    +548 |    +242 |
+| 100k       |    +484 |   +748 |   +79 |    +704 |    +241 |
+| 200k       |    +392 |   +523 |   +40 |    +352 |    +124 |
+| 500k       |    +577 |   +630 |   +17 |    +142 |     +48 |
+| 1m         |  +1,082 | +1,121 |   +13 |    +107 |     +37 |
+| 2m         |  +1,155 | +1,248 |   +21 |    +211 |     +48 |
+| 5m         |  +1,316 | +1,538 |   +17 |    +403 |     +48 |
+| 10m        |  +1,583 | +2,014 |   +16 |    +844 |     +48 |
+| interleaved | +3,355 | +4,404 |  +314 |  +2,825 |    +945 |
 
 `qd.parse` retention is essentially constant across payload size: the only
 GC-rooted state is the reusable `indices: Vec<u32>` and `scratch` buffers.
@@ -132,8 +137,9 @@ key into the Lua table heap.
 
 1. **`quickdecode` is fastest once payloads move beyond tiny inputs.**
    The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and
-   larger multimodal payloads show roughly 20–35× higher throughput than
-   `cjson` for request-field access.
+   larger multimodal payloads show roughly 18–28× higher throughput than
+   `cjson` and roughly 2–5× higher throughput than `lua-resty-simdjson`
+   for request-field access.
 2. **Reading every `messages[*].content` is still access-light for large
    multimodal bodies.** The benchmark touches the top-level request fields and
    one `content` field per message; the payload size comes from image data
@@ -148,10 +154,10 @@ key into the Lua table heap.
    re-emit is `memcpy`, not re-serialize, and the throughput tracks
    `qd.parse` very closely.
 5. **Memory retention** for `quickdecode` is essentially flat in payload
-   size; the eager parsers retain ~1× the input size after the first run
+   size; the eager parsers retain more Lua heap after the first run
    because the Lua table tree stays GC-rooted until the next collection.
-   The 10 MB case retains ~1.5 MB for `cjson`, ~14 KB for
-   `qd.parse`.
+   The 10 MB case retains ~1.5 MB for `cjson`, ~2.0 MB for `simdjson`,
+   and ~16 KB for `qd.parse`.
 6. **REST API payloads (github-100k) show a smaller speedup** because their
    structural density is higher than the multimodal request ladder. Memory
    savings remain dramatic because `cjson` must materialize every nested

From b2c414b1d0b40546db5b8d8b9eec949b2751d8c1 Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 14:41:28 +0800
Subject: [PATCH 6/9] docs: align benchmark environment and comments

---
 README.md             |  6 +++---
 benches/lua_bench.lua | 14 ++++----------
 docs/benchmarks.md    | 12 ++++++------
 3 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 6dd604a..cb277f8 100644
--- a/README.md
+++ b/README.md
@@ -84,10 +84,10 @@ busted tests/lua --lpath='./lua/?.lua' --cpath='./target/release/lib?.so'
 
 `quickdecode` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal
 chat-completion payloads, "parse + access model, temperature, and every
-message content" workload (median ops/s under OpenResty LuaJIT 2.1,
-Skylake; 5 rounds, deterministic payload):
+`messages[*].content` path" workload (median ops/s under OpenResty LuaJIT 2.1,
+Intel Core i5-9400; 5 rounds, deterministic payload):
 
-| Size | cjson | simdjson | `qd.parse` | `qd.decode + content` | speedup vs. cjson |
+| Size | cjson | simdjson | `qd.parse` | `qd.decode + access content` | speedup vs. cjson |
 |---:|---:|---:|---:|---:|---:|
 |   2 KB | 106,646 | 137,427 | 135,296 |  97,574 |  1.3× /  0.9× |
 | 100 KB |   6,045 |  46,577 | 137,931 | 134,590 | 22.8× / 22.3× |
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index a92c99b..cbcd050 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -12,20 +12,14 @@ local function read_file(p)
     return s
 end
 
--- Shape: a multimodal chat-completion request with multiple historical
+-- Shape: a multimodal chat-completion request with one or more historical
 -- messages. Each message contains one small text part and one base64-encoded
 -- image part. The number of messages scales with payload size: a 10 MB request
 -- has roughly ten 1 MB image-bearing messages.
 --
--- Image sizes are drawn from a deterministic Park-Miller LCG (not math.random,
--- which delegates to libc rand() and varies across machines) so the same
--- target_bytes produces byte-identical output on any LuaJIT 2.1 host.
---
--- Size accuracy: the normal-branch upper is `min(500K, remaining)` so the
--- loop cannot overshoot during steady state. When fewer than 50 KB remain
--- the final image falls through to `math.max(1024, remaining)` — undershoot
--- is at most a few hundred bytes; worst-case overshoot is ~1 KB (only when
--- `remaining < 1024`, which the seed=42 walk does not hit for our ladder).
+-- Size accuracy: payload sizing is approximate. Message separators, role
+-- strings, and the 1 KB minimum image size can add small drift from
+-- `target_bytes` on tiny scenarios; larger scenarios stay close to target.
 -- GitHub-style payload: simulates /repos/{owner}/{repo}/issues response.
 -- Each issue has ~20 fields including nested user object, labels array,
 -- and realistic string lengths (URLs, timestamps, markdown body).
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 065e899..b387c9f 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -14,10 +14,10 @@ Lua-table baselines.
 
 | | |
 |---|---|
-| Host CPU | Intel Xeon (Skylake, IBRS), 4 cores |
+| Host CPU | Intel Core i5-9400, 6 cores, AVX2 + PCLMUL |
 | Memory | 15 GiB |
-| OS | Linux x86_64 |
-| Runtime | OpenResty `resty` 0.29 / LuaJIT 2.1.1723681758 |
+| OS | Ubuntu 24.04.4 LTS, Linux 6.8.0-110-generic, x86_64 |
+| Runtime | OpenResty `resty` 0.29 / OpenResty 1.21.4.4 / LuaJIT 2.1.1723681758 |
 | `quickdecode` | this repo, release build, AVX2 + PCLMUL scanner active |
 | `lua-cjson` | vendored `openresty/lua-cjson` |
 | `lua-resty-simdjson` | OpenResty lualib `resty.simdjson` |
@@ -33,10 +33,10 @@ The harness lives at `benches/lua_bench.lua`. For each scenario:
 3. 5 rounds × N iterations of the workload; report the **median** ops/s
    across rounds (mean + range also reported in the raw output).
 4. Final `collectgarbage("count")` to capture the post-run memory delta in
-   KB — measures GC-rooted state retained by the parser, not transient
-   per-call allocations.
+   KB. The harness does not force a final collection after timing, so
+   short-lived garbage from the last round may still be included.
 
-The payload is a synthetic multimodal chat-completion request with multiple
+The payload is a synthetic multimodal chat-completion request with one or more
 historical messages. Each message contains one small text part and one
 base64-encoded image part. Message count scales with payload size: the 10 MB
 scenario has roughly ten messages, each carrying one ~1 MB image, so the

From 1864716798b861e4fd0672dee29b486c0792069a Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 14:44:48 +0800
Subject: [PATCH 7/9] docs: remove stale roadmap reference

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cb277f8..a8cd2bd 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the
 
 ## Status
 
-Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson but tuning is pending — see `Roadmap / Deferred` below.
+Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson.
 
 ## Building
 

From 18bc50bbe2162f58048c07463e6f203bb85ea19b Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 14:46:07 +0800
Subject: [PATCH 8/9] docs: clarify content path wording

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a8cd2bd..d3bfad3 100644
--- a/README.md
+++ b/README.md
@@ -83,8 +83,8 @@ busted tests/lua --lpath='./lua/?.lua' --cpath='./target/release/lib?.so'
 ## Benchmarks
 
 `quickdecode` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal
-chat-completion payloads, "parse + access model, temperature, and every
-messages[*].content path" workload (median ops/s under OpenResty LuaJIT 2.1,
+chat-completion payloads, "parse + access model, temperature, and all
+messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1,
 Intel Core i5-9400; 5 rounds, deterministic payload):
 
 | Size | cjson | simdjson | `qd.parse` | `qd.decode + access content` | speedup vs. cjson |

From 9718a795b34232d1b578b76d9585e0410c93f7f6 Mon Sep 17 00:00:00 2001
From: Jarvis <jarvis@api7.ai>
Date: Mon, 18 May 2026 14:51:18 +0800
Subject: [PATCH 9/9] bench: handle optional simdjson setup

---
 Makefile              |  8 +++++---
 README.md             |  4 +++-
 benches/lua_bench.lua | 34 +++++++++++++++++++++++-----------
 docs/benchmarks.md    |  4 +++-
 4 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 97071b0..086b577 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,13 @@
 # Overridable: `make bench LUAJIT=/path/to/luajit RESTY=/path/to/resty LUA_CPATH='...'`
 OPENRESTY ?= /usr/local/openresty
-LUAJIT    ?= $(OPENRESTY)/luajit/bin/luajit
-RESTY     ?= $(OPENRESTY)/bin/resty
+OPENRESTY_LUAJIT := $(OPENRESTY)/luajit/bin/luajit
+OPENRESTY_RESTY  := $(OPENRESTY)/bin/resty
+LUAJIT    ?= $(shell if [ -x "$(OPENRESTY_LUAJIT)" ]; then echo "$(OPENRESTY_LUAJIT)"; else command -v luajit 2>/dev/null || echo luajit; fi)
+RESTY     ?= $(shell if [ -x "$(OPENRESTY_RESTY)" ]; then echo "$(OPENRESTY_RESTY)"; else command -v resty 2>/dev/null || echo resty; fi)
 LUA_PATH  ?= ./lua/?.lua;$(OPENRESTY)/lualib/?.lua;$(OPENRESTY)/lualib/?/init.lua;;
 LUA_CPATH ?= ./vendor/lua-cjson/?.so;./target/release/lib?.so;./?.so;$(OPENRESTY)/lualib/?.so;/usr/local/lib/lua/5.1/?.so;$(OPENRESTY)/luajit/lib/lua/5.1/?.so
 
-LUAJIT_PREFIX ?= $(shell dirname $$(dirname $$(command -v $(LUAJIT) 2>/dev/null || echo $(OPENRESTY)/luajit/bin/luajit)))
+LUAJIT_PREFIX ?= $(shell dirname $$(dirname $$(command -v $(LUAJIT) 2>/dev/null || echo $(OPENRESTY_LUAJIT))))
 LUAJIT_INC    ?= $(LUAJIT_PREFIX)/include/luajit-2.1
 
 LIB_DIR := $(CURDIR)/target/release
diff --git a/README.md b/README.md
index d3bfad3..18d93c8 100644
--- a/README.md
+++ b/README.md
@@ -102,7 +102,9 @@ and `simdjson` retain more Lua heap because they materialize the table tree.
 
 See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
 memory numbers, an "encode round-trip" row (passthrough emit via
-`memcpy`), and the exact methodology + reproduction command.
+`memcpy`), exact environment, and the reproduction command. `make bench`
+uses `lua-resty-simdjson` when `resty.simdjson` is available in the
+OpenResty environment; otherwise it skips the simdjson rows.
 
 ```sh
 make bench       # quickdecode vs cjson and lua-resty-simdjson
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index cbcd050..a1b26d4 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -3,7 +3,10 @@ package.cpath = package.cpath .. ";./target/release/lib?.so"
 
 local qd    = require("quickdecode")
 local cjson = require("cjson")
-local simdjson = require("resty.simdjson").new()
+local simdjson_ok, simdjson_or_err = pcall(function()
+    return require("resty.simdjson").new()
+end)
+local simdjson = simdjson_ok and simdjson_or_err or nil
 
 local function read_file(p)
     local f = assert(io.open(p, "rb"))
@@ -255,6 +258,11 @@ local scenarios = {
 local has_pooled_api = type(qd.new_decoder) == "function"
 local pooled_decoder = has_pooled_api and qd.new_decoder() or nil
 
+if not simdjson then
+    print("lua-resty-simdjson unavailable; skipping simdjson rows: "
+        .. tostring(simdjson_or_err))
+end
+
 for _, s in ipairs(scenarios) do
     print(string.format("=== %s (%d bytes) ===", s.name, #s.payload))
 
@@ -267,10 +275,12 @@ for _, s in ipairs(scenarios) do
         cjson_access(obj)
     end)
 
-    bench("simdjson.decode + access fields", s.iters, function()
-        local obj = simdjson:decode(s.payload)
-        cjson_access(obj)
-    end)
+    if simdjson then
+        bench("simdjson.decode + access fields", s.iters, function()
+            local obj = simdjson:decode(s.payload)
+            cjson_access(obj)
+        end)
+    end
 
     bench("quickdecode.parse + access fields", s.iters, function()
         local d = qd.parse(s.payload)
@@ -338,12 +348,14 @@ do
         default_cjson_access(obj)
     end)
 
-    next_p = make_cycler(interleaved)
-    bench("simdjson.decode + access fields", 400, function()
-        local p = next_p()
-        local obj = simdjson:decode(p)
-        default_cjson_access(obj)
-    end)
+    if simdjson then
+        next_p = make_cycler(interleaved)
+        bench("simdjson.decode + access fields", 400, function()
+            local p = next_p()
+            local obj = simdjson:decode(p)
+            default_cjson_access(obj)
+        end)
+    end
 
     next_p = make_cycler(interleaved)
     bench("quickdecode.parse + access fields", 400, function()
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index b387c9f..ef0dab9 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -20,7 +20,7 @@ Lua-table baselines.
 | Runtime | OpenResty `resty` 0.29 / OpenResty 1.21.4.4 / LuaJIT 2.1.1723681758 |
 | `quickdecode` | this repo, release build, AVX2 + PCLMUL scanner active |
 | `lua-cjson` | vendored `openresty/lua-cjson` |
-| `lua-resty-simdjson` | OpenResty lualib `resty.simdjson` |
+| `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib |
 
 ## Methodology
 
@@ -70,6 +70,8 @@ make bench
 This builds `quickdecode`, builds the vendored `lua-cjson` against OpenResty's
 LuaJIT, then invokes `benches/lua_bench.lua` through OpenResty's `resty` so
 `lua-resty-simdjson` runs in its normal `ngx` environment.
+If `resty.simdjson` cannot be loaded from `package.path` / `package.cpath` (or
+its `new()` call fails), the harness prints a skip message and omits the simdjson rows.
 
 Numbers below come from one such run.