diff --git a/CLAUDE.md b/CLAUDE.md
index cec0555..f2ab0c8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -14,7 +14,7 @@ The `Makefile` is the canonical entry point; `make help` lists targets.
 make build              # cargo build --release  → target/release/libquickdecode.so
 make test               # cargo test --release + busted Lua tests
 make lint               # cargo clippy -D warnings + cargo fmt --check
-make bench              # LuaJIT vs lua-cjson on benches/fixtures
+make bench              # OpenResty LuaJIT benchmark vs lua-cjson and simdjson
 ```
 
 Under the hood / for narrower invocations:
@@ -79,7 +79,7 @@ src/
 lua/quickdecode.lua    LuaJIT wrapper (ffi.cdef + Doc/Cursor metatables)
 include/lua_quick_decode.h  public C header
 tests/                Rust integration tests + tests/lua/ busted suite
-benches/              lua_bench.lua vs lua-cjson; fixtures/ has small_api.json + medium_resp.json
+benches/              lua_bench.lua vs lua-cjson/simdjson; fixtures/ has small_api.json + medium_resp.json
 ```
 
 The enum values in `src/error.rs` are duplicated in `include/lua_quick_decode.h` and `lua/quickdecode.lua` (the latter only encodes the `T_*` type tags and `NOT_FOUND = 2`). Keep all three in sync when adding/renumbering codes.
diff --git a/Makefile b/Makefile
index dcf93ee..086b577 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,20 @@
-# Overridable: `make bench LUAJIT=/path/to/luajit LUA_CPATH='...'`
-LUAJIT    ?= $(shell command -v luajit 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit)
-LUA_CPATH ?= ./vendor/lua-cjson/?.so;./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/openresty/luajit/lib/lua/5.1/?.so
-
-LUAJIT_PREFIX ?= $(shell dirname $$(dirname $$(command -v $(LUAJIT) 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit)))
+# Overridable: `make bench OPENRESTY=/opt/openresty RESTY=/path/to/resty LUAJIT=/path/to/luajit LUA_PATH='...' LUA_CPATH='...'`
+OPENRESTY ?= /usr/local/openresty
+OPENRESTY_LUAJIT := $(OPENRESTY)/luajit/bin/luajit
+OPENRESTY_RESTY  := $(OPENRESTY)/bin/resty
+LUAJIT    ?= $(shell if [ -x "$(OPENRESTY_LUAJIT)" ]; then echo "$(OPENRESTY_LUAJIT)"; else command -v luajit 2>/dev/null || echo luajit; fi)
+RESTY     ?= $(shell if [ -x "$(OPENRESTY_RESTY)" ]; then echo "$(OPENRESTY_RESTY)"; else command -v resty 2>/dev/null || echo resty; fi)
+LUA_PATH  ?= ./lua/?.lua;$(OPENRESTY)/lualib/?.lua;$(OPENRESTY)/lualib/?/init.lua;;
+LUA_CPATH ?= ./vendor/lua-cjson/?.so;./target/release/lib?.so;./?.so;$(OPENRESTY)/lualib/?.so;/usr/local/lib/lua/5.1/?.so;$(OPENRESTY)/luajit/lib/lua/5.1/?.so
+
+LUAJIT_PREFIX ?= $(shell dirname $$(dirname $$(command -v $(LUAJIT) 2>/dev/null || echo $(OPENRESTY_LUAJIT))))
 LUAJIT_INC    ?= $(LUAJIT_PREFIX)/include/luajit-2.1
 
 LIB_DIR := $(CURDIR)/target/release
 ifeq ($(shell uname),Darwin)
-LUA_ENV := DYLD_LIBRARY_PATH=$(LIB_DIR) LUA_CPATH='$(LUA_CPATH)'
+LUA_ENV := DYLD_LIBRARY_PATH=$(LIB_DIR) LUA_PATH='$(LUA_PATH)' LUA_CPATH='$(LUA_CPATH)'
 else
-LUA_ENV := LD_LIBRARY_PATH=$(LIB_DIR) LUA_CPATH='$(LUA_CPATH)'
+LUA_ENV := LD_LIBRARY_PATH=$(LIB_DIR) LUA_PATH='$(LUA_PATH)' LUA_CPATH='$(LUA_CPATH)'
 endif
 
 .PHONY: help build test lint bench clean
@@ -29,8 +34,8 @@ test: build ## Run cargo tests + busted Lua tests
 lint: ## Run clippy with -D warnings
 	cargo clippy --release --all-targets -- -D warnings
 
-bench: build vendor/lua-cjson/cjson.so ## Run the LuaJIT vs cjson benchmark
-	$(LUA_ENV) $(LUAJIT) benches/lua_bench.lua
+bench: build vendor/lua-cjson/cjson.so ## Run the OpenResty LuaJIT benchmark
+	$(LUA_ENV) $(RESTY) benches/lua_bench.lua
 
 vendor/lua-cjson/cjson.so: | vendor/lua-cjson/Makefile
 ifeq ($(shell uname),Darwin)
diff --git a/README.md b/README.md
index bc9d0c2..18d93c8 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the
 
 ## Status
 
-Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson but tuning is pending — see `Roadmap / Deferred` below.
+Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson.
 
 ## Building
 
@@ -83,38 +83,31 @@ busted tests/lua --lpath='./lua/?.lua' --cpath='./target/release/lib?.so'
 ## Benchmarks
 
 `quickdecode` vs. `lua-cjson` and `lua-resty-simdjson` on multimodal
-chat-completion payloads, "parse + access 3 fields" workload (median ops/s
-under LuaJIT 2.1, Skylake; 5 rounds, deterministic payload):
+chat-completion payloads, "parse + access model, temperature, and all
+messages[*].content paths" workload (median ops/s under OpenResty LuaJIT 2.1,
+Intel Core i5-9400; 5 rounds, deterministic payload):
 
-| Size | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson |
+| Size | cjson | simdjson | `qd.parse` | `qd.decode + access content` | speedup vs. cjson |
 |---:|---:|---:|---:|---:|---:|
-|   2 KB | 39,414 | 54,395 | 117,233 | 126,807 |  3.0× / 3.2× |
-| 100 KB |  2,589 | 19,944 |  72,202 |  61,162 | 27.9× / 23.6× |
-|   1 MB |    355 |  2,048 |  12,723 |  12,448 | 35.8× / 35.1× |
-|  10 MB |     32 |    128 |     537 |     609 | 16.8× / 19.0× |
+|   2 KB | 106,646 | 137,427 | 135,296 |  97,574 |  1.3× /  0.9× |
+| 100 KB |   6,045 |  46,577 | 137,931 | 134,590 | 22.8× / 22.3× |
+|   1 MB |     594 |   4,408 |  16,447 |  16,340 | 27.7× / 27.5× |
+|  10 MB |      59 |     356 |   1,035 |   1,028 | 17.5× / 17.4× |
 
 `qd.parse` wins because it skips building a Lua table for the parts you
 never read; `qd.decode + t.field` adds a cjson-shaped table proxy on top
 with similar throughput. Memory retention for `quickdecode` is essentially
-flat in payload size (a few KB for the reusable buffers), where `cjson`
-and `simdjson` retain ~1× the input size as live Lua-table state.
-
-ARM64 (Apple M4, NEON/PMULL scanner, same workload):
-
-| Size | cjson | `qd.parse` | `qd.decode + t.f x3` | speedup vs. cjson |
-|---:|---:|---:|---:|---:|
-|   2 KB | 237,124 | 705,000 | 390,000 |  3.0× /  1.6× |
-| 100 KB |  14,667 | 232,000 | 208,000 | 15.8× / 14.2× |
-|   1 MB |   1,494 |  33,700 |  33,000 | 22.6× / 22.1× |
-|  10 MB |     150 |   3,376 |   3,454 | 22.5× / 23.0× |
+flat in payload size (a few KB for the reusable buffers), while `cjson`
+and `simdjson` retain more Lua heap because they materialize the table tree.
 
 See [`docs/benchmarks.md`](docs/benchmarks.md) for the full size ladder,
 memory numbers, an "encode round-trip" row (passthrough emit via
-`memcpy`), the pure-decode (no-access) comparison, and the exact
-methodology + reproduction command.
+`memcpy`), exact environment, and the reproduction command. `make bench`
+uses `lua-resty-simdjson` when `resty.simdjson` is available in the
+OpenResty environment; otherwise it skips the simdjson rows.
 
 ```sh
-make bench       # quickdecode vs cjson
+make bench       # quickdecode vs cjson and lua-resty-simdjson
 ```
 
 ## RFC 8259 conformance
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 7f2c1de..a1b26d4 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -3,6 +3,10 @@ package.cpath = package.cpath .. ";./target/release/lib?.so"
 
 local qd    = require("quickdecode")
 local cjson = require("cjson")
+local simdjson_ok, simdjson_or_err = pcall(function()
+    return require("resty.simdjson").new()
+end)
+local simdjson = simdjson_ok and simdjson_or_err or nil
 
 local function read_file(p)
     local f = assert(io.open(p, "rb"))
@@ -11,19 +15,14 @@ local function read_file(p)
     return s
 end
 
--- Shape: a multimodal chat-completion request with one ~1.5K text question
--- and N base64-encoded image parts (each 50-500 KB) until the payload reaches
--- target_bytes. Mirrors the production case the bench is meant to reflect.
+-- Shape: a multimodal chat-completion request with one or more historical
+-- messages. Each message contains one small text part and one base64-encoded
+-- image part. The number of messages scales with payload size: a 10 MB request
+-- has roughly ten 1 MB image-bearing messages.
 --
--- Image sizes are drawn from a deterministic Park-Miller LCG (not math.random,
--- which delegates to libc rand() and varies across machines) so the same
--- target_bytes produces byte-identical output on any LuaJIT 2.1 host.
---
--- Size accuracy: the normal-branch upper is `min(500K, remaining)` so the
--- loop cannot overshoot during steady state. When fewer than 50 KB remain
--- the final image falls through to `math.max(1024, remaining)` — undershoot
--- is at most a few hundred bytes; worst-case overshoot is ~1 KB (only when
--- `remaining < 1024`, which the seed=42 walk does not hit for our ladder).
+-- Size accuracy: payload sizing is approximate. The 1 KB minimum image size
+-- can overshoot `target_bytes` on tiny scenarios; message separators and the
+-- longer "assistant" role add a few bytes per message on multi-message ones.
 -- GitHub-style payload: simulates /repos/{owner}/{repo}/issues response.
 -- Each issue has ~20 fields including nested user object, labels array,
 -- and realistic string lengths (URLs, timestamps, markdown body).
@@ -117,41 +116,28 @@ local function make_b64(size)
 end
 
 local function make_payload(target_bytes)
-    local rng_state = 42
-    local function rng_range(lo, hi)
-        -- Park-Miller minimal-standard LCG: a=48271, m=2^31-1. Multiplication
-        -- fits in double precision (48271 * 2^31 < 2^53).
-        rng_state = (rng_state * 48271) % 2147483647
-        return lo + (rng_state % (hi - lo + 1))
-    end
-
-    local text = string.rep("Q", 1500)
+    local message_count = math.max(1, math.ceil(target_bytes / (1024 * 1024)))
+    local envelope = '{"model":"gpt-4-vision","temperature":0.7,"messages":[]}'
+    local text = string.rep("Q", 256)
     local text_part = '{"type":"text","text":"' .. text .. '"}'
-    local parts = { text_part }
-    local current = 200 + #text_part  -- approx outer envelope overhead
-
-    while current < target_bytes do
-        local remaining = target_bytes - current
-        local img_size
-        if remaining < 50 * 1024 then
-            -- Final image: shrink below the 50 KB floor so the label matches
-            -- the actual payload size. Bench iters all see the same payload
-            -- regardless, so the smaller tail blob doesn't change what's
-            -- being measured.
-            img_size = math.max(1024, remaining)
-        else
-            local upper = math.min(500 * 1024, remaining)
-            img_size = rng_range(50 * 1024, upper)
-        end
-        local b64 = make_b64(img_size)
-        local img_part = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
-            .. b64 .. '"}}'
-        parts[#parts + 1] = img_part
-        current = current + #img_part + 1  -- +1 for comma
+    local image_prefix = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
+    local image_suffix = '"}}'
+    local message_overhead = #('{"role":"user","content":[,]}') + #text_part
+        + #image_prefix + #image_suffix
+    local remaining = target_bytes - #envelope - (message_count * message_overhead)
+    local image_size = math.max(1024, math.floor(remaining / message_count))
+
+    local messages = {}
+    for i = 1, message_count do
+        local role = i % 2 == 1 and "user" or "assistant"
+        local b64 = make_b64(image_size)
+        local image_part = image_prefix .. b64 .. image_suffix
+        messages[i] = '{"role":"' .. role .. '","content":['
+            .. text_part .. "," .. image_part .. ']}'
     end
 
-    return '{"model":"gpt-4-vision","temperature":0.7,"messages":'
-        .. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}'
+    return '{"model":"gpt-4-vision","temperature":0.7,"messages":['
+        .. table.concat(messages, ",") .. ']}'
 end
 
 local ROUNDS = 5
@@ -190,19 +176,48 @@ end
 local function default_cjson_access(obj)
     local _ = obj.model
     local _ = obj.temperature
-    local _ = obj.messages and obj.messages[1] and obj.messages[1].role
+    if obj.messages then
+        for _, msg in ipairs(obj.messages) do
+            local _ = msg.content
+        end
+    end
+end
+
+local content_paths_by_message_count = {}
+
+local function content_paths(n)
+    local paths = content_paths_by_message_count[n]
+    if paths then
+        return paths
+    end
+
+    paths = {}
+    for i = 0, n - 1 do
+        paths[i + 1] = "messages[" .. i .. "].content"
+    end
+    content_paths_by_message_count[n] = paths
+    return paths
 end
 
 local function default_qd_access(d)
     local _ = d:get_str("model")
     local _ = d:get_f64("temperature")
-    local _ = d:get_str("messages[0].role")
+    local n = d:len("messages") or 0
+    local paths = content_paths(n)
+    for i = 1, n do
+        local _ = d:typeof(paths[i])
+    end
 end
 
 local function default_table_access(t)
     local _ = t.model
     local _ = t.temperature
-    local _ = t.messages and t.messages[1] and t.messages[1].role
+    if t.messages then
+        for i = 1, qd.len(t.messages) do
+            local msg = t.messages[i]
+            local _ = msg.content
+        end
+    end
 end
 
 -- GitHub issues accessors: array of issues, access first issue's fields
@@ -243,6 +258,11 @@ local scenarios = {
 local has_pooled_api = type(qd.new_decoder) == "function"
 local pooled_decoder = has_pooled_api and qd.new_decoder() or nil
 
+if not simdjson then
+    print("lua-resty-simdjson unavailable; skipping simdjson rows: "
+        .. tostring(simdjson_or_err))
+end
+
 for _, s in ipairs(scenarios) do
     print(string.format("=== %s (%d bytes) ===", s.name, #s.payload))
 
@@ -250,18 +270,25 @@ for _, s in ipairs(scenarios) do
     local qd_access = s.qd_access or default_qd_access
     local table_access = s.table_access or default_table_access
 
-    bench("cjson.decode + access 3 fields", s.iters, function()
+    bench("cjson.decode + access fields", s.iters, function()
         local obj = cjson.decode(s.payload)
         cjson_access(obj)
     end)
 
-    bench("quickdecode.parse + access 3 fields", s.iters, function()
+    if simdjson then
+        bench("simdjson.decode + access fields", s.iters, function()
+            local obj = simdjson:decode(s.payload)
+            cjson_access(obj)
+        end)
+    end
+
+    bench("quickdecode.parse + access fields", s.iters, function()
         local d = qd.parse(s.payload)
         qd_access(d)
     end)
 
     if has_pooled_api then
-        bench("quickdecode pooled :parse + access 3 fields", s.iters, function()
+        bench("quickdecode pooled :parse + access fields", s.iters, function()
             local d = pooled_decoder:parse(s.payload)
             qd_access(d)
         end)
@@ -273,7 +300,7 @@ for _, s in ipairs(scenarios) do
         end)
     end
 
-    bench("qd.decode + t.field x3", s.iters, function()
+    bench("qd.decode + access content", s.iters, function()
         local t = qd.decode(s.payload)
         table_access(t)
     end)
@@ -315,41 +342,42 @@ print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ",
 
 do
     local next_p = make_cycler(interleaved)
-    bench("cjson.decode + access 3 fields", 400, function()
+    bench("cjson.decode + access fields", 400, function()
         local p = next_p()
         local obj = cjson.decode(p)
-        local _ = obj.model
-        local _ = obj.temperature
-        local _ = obj.messages and obj.messages[1] and obj.messages[1].role
+        default_cjson_access(obj)
     end)
 
+    if simdjson then
+        next_p = make_cycler(interleaved)
+        bench("simdjson.decode + access fields", 400, function()
+            local p = next_p()
+            local obj = simdjson:decode(p)
+            default_cjson_access(obj)
+        end)
+    end
+
     next_p = make_cycler(interleaved)
-    bench("quickdecode.parse + access 3 fields", 400, function()
+    bench("quickdecode.parse + access fields", 400, function()
         local p = next_p()
         local d = qd.parse(p)
-        local _ = d:get_str("model")
-        local _ = d:get_f64("temperature")
-        local _ = d:get_str("messages[0].role")
+        default_qd_access(d)
     end)
 
     if has_pooled_api then
         next_p = make_cycler(interleaved)
-        bench("quickdecode pooled :parse + access 3 fields", 400, function()
+        bench("quickdecode pooled :parse + access fields", 400, function()
             local p = next_p()
             local d = pooled_decoder:parse(p)
-            local _ = d:get_str("model")
-            local _ = d:get_f64("temperature")
-            local _ = d:get_str("messages[0].role")
+            default_qd_access(d)
         end)
     end
 
     next_p = make_cycler(interleaved)
-    bench("qd.decode + t.field x3", 400, function()
+    bench("qd.decode + access content", 400, function()
         local p = next_p()
         local t = qd.decode(p)
-        local _ = t.model
-        local _ = t.temperature
-        local _ = t.messages and t.messages[1] and t.messages[1].role
+        default_table_access(t)
     end)
 
     next_p = make_cycler(interleaved)
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index ea4eae9..ef0dab9 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -4,29 +4,23 @@ Throughput and memory comparison of `quickdecode` (this library) against
 `lua-cjson` and `lua-resty-simdjson` on a multimodal chat-completion payload
 ladder from 2 KB to 10 MB.
 
-`quickdecode` is optimized for *parse + read a small number of fields*; the
-data below quantifies how the lazy structural scan beats an eager build-the-
-whole-table parser, and where the gap narrows. `lua-cjson` is the baseline.
-`lua-resty-simdjson` (a Lua binding over the simdjson C++ library, eager) is
-included to show how much of the win comes from SIMD vs. from skipping the
-table build.
+`quickdecode` is optimized for *parse + read a small part of the document*;
+the data below quantifies how the lazy structural scan behaves when the caller
+reads request metadata plus every chat message `content`, without eagerly
+building the whole Lua table. `lua-cjson` and `lua-resty-simdjson` are eager
+Lua-table baselines.
 
 ## Environment
 
 | | |
 |---|---|
-| Host CPU | Intel Xeon (Skylake, IBRS), 4 cores |
-| Memory | 7.6 GiB |
-| OS | Linux x86_64 |
-| Runtime | OpenResty `resty` 0.29 / openresty 1.29.2.3 / LuaJIT 2.1 ROLLING |
+| Host CPU | Intel Core i5-9400, 6 cores, AVX2 + PCLMUL |
+| Memory | 15 GiB |
+| OS | Ubuntu 24.04.4 LTS, Linux 6.8.0-110-generic, x86_64 |
+| Runtime | OpenResty `resty` 0.29 / OpenResty 1.21.4.4 / LuaJIT 2.1.1723681758 |
 | `quickdecode` | this repo, release build, AVX2 + PCLMUL scanner active |
-| `lua-cjson` | bundled with OpenResty |
-| `lua-resty-simdjson` | upstream `main` build at `/tmp/lua-resty-simdjson`, simdjson C++ pinned by that repo |
-
-The bench uses the OpenResty `resty` CLI because `lua-resty-simdjson` pulls in
-`ngx.null` / `ngx.sleep` at load time and cannot run under bare LuaJIT
-without an OpenResty environment. `lua-cjson` and `quickdecode` themselves
-run fine under bare LuaJIT.
+| `lua-cjson` | vendored `openresty/lua-cjson` |
+| `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib |
 
 ## Methodology
 
@@ -39,14 +33,15 @@ The harness lives at `benches/lua_bench.lua`. For each scenario:
 3. 5 rounds × N iterations of the workload; report the **median** ops/s
    across rounds (mean + range also reported in the raw output).
 4. Final `collectgarbage("count")` to capture the post-run memory delta in
-   KB — measures GC-rooted state retained by the parser, not transient
-   per-call allocations.
+   KB. The harness does not force a final collection after timing, so
+   short-lived garbage from the last round may still be included.
 
-The payload is a synthetic multimodal chat-completion request — one
-~1.5 KB text part plus N base64-encoded image parts of 50–500 KB each
-until the target size is reached. The image size sequence comes from a
-Park–Miller LCG with `seed=42` rather than `math.random` so the payload is
-byte-identical across hosts.
+The payload is a synthetic multimodal chat-completion request with one or more
+historical messages. Each message contains one small text part and one
+base64-encoded image part. Message count scales with payload size: the 10 MB
+scenario has roughly ten messages, each carrying one ~1 MB image, so the
+access pattern matches request bodies where every historical message includes
+an image.
 
 A separate `github-100k` scenario simulates a GitHub Issues API response
 (`/repos/{owner}/{repo}/issues`) with ~100 KB of realistic REST API
@@ -58,92 +53,80 @@ parsing workloads with ~3-5% structural density.
 
 | Row | What it does | Notes |
 |---|---|---|
-| `cjson.decode + access 3 fields` | `cjson.decode(s)` then read 3 fields | Eager Lua table |
-| `resty.simdjson:decode + access 3 fields` | `parser:decode(s)` then read 3 fields | Eager Lua table; **parser instance is reused** across iterations (the upstream-recommended pattern) |
-| `quickdecode.parse + access 3 fields` | `qd.parse(s)` then `d:get_str/get_f64` × 3 | Lazy structural scan; explicit path-based reads |
-| `qd.decode + t.field x3` | `qd.decode(s)` then `t.model` / `t.temperature` / `t.messages[1].role` | Lazy table proxy; reads go through `__index` |
+| `cjson.decode + access fields` | `cjson.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table |
+| `simdjson.decode + access fields` | `parser:decode(s)` on one reused `resty.simdjson` parser, read `model` / `temperature`, then read every `messages[*].content` | Eager Lua table |
+| `quickdecode.parse + access fields` | `qd.parse(s)`, read `model` / `temperature`, then touch every `messages[*].content` path | Lazy structural scan; explicit path reads |
+| `qd.decode + access content` | `qd.decode(s)`, read `model` / `temperature`, then read every `messages[*].content` | Lazy table proxy; reads go through `__index` |
 | `qd.decode + qd.encode (unmodified)` | `qd.decode(s)` then re-emit as JSON | Substring fast path — no fields touched, so the proxy re-emits the original byte range via `memcpy` |
 
 ## Reproducing
 
-The straight comparison against `cjson` is one command:
+Run the full comparison with one command:
 
 ```sh
 make bench
 ```
 
-This invokes `benches/lua_bench.lua` with `LD_LIBRARY_PATH=target/release`
-and a `LUA_CPATH` that picks up `cjson` from the system locations. It does
-**not** include `lua-resty-simdjson`.
-
-To also include `lua-resty-simdjson` you need (1) the library installed
-somewhere `package.cpath` can reach the `.so`, (2) its Lua wrapper on
-`package.path`, and (3) the bench script patched to require it. The patch
-that adds the bench rows is a small `pcall(require, "resty.simdjson")` block;
-keep it local — it is not part of the upstream bench file. Run it through
-`resty` so the `ngx.*` symbols are available:
-
-```sh
-LD_LIBRARY_PATH=$PWD/target/release \
-LUA_CPATH='/path/to/lua-resty-simdjson/?.so;./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so' \
-LUA_PATH='/path/to/lua-resty-simdjson/lib/?.lua;/path/to/lua-resty-simdjson/lib/?/init.lua;./lua/?.lua;;' \
-/usr/local/openresty/bin/resty benches/lua_bench.lua
-```
+This builds `quickdecode`, builds the vendored `lua-cjson` against OpenResty's
+LuaJIT, then invokes `benches/lua_bench.lua` through OpenResty's `resty` so
+`lua-resty-simdjson` runs in its normal `ngx` environment.
+If `resty.simdjson` is not available on `package.path` / `package.cpath`, the
+harness prints a skip message and omits the simdjson rows.
 
 Numbers below come from one such run.
 
 ## Results — throughput (median ops/s)
 
-Each row is "parse + access 3 fields" on the named payload.
+Each row is "parse + access request fields" on the named payload.
 
-| Scenario | Size | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | `qd.decode + qd.encode` |
+| Scenario | Size | cjson | simdjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
 |---|---:|---:|---:|---:|---:|---:|
-| small      |   2.1 KB | 39,414 | 54,395 | 117,233 | 126,807 | 268,240 |
-| medium     |  60.4 KB |  5,600 | 40,180 |  90,074 | 120,627 | 126,263 |
-| github-100k |   100 KB |  5,373 |      — |  27,020 |  27,367 |  36,430 |
-| 100k       |   100 KB |  2,589 | 19,944 |  72,202 |  61,162 |  80,257 |
-| 200k       |   200 KB |  1,414 | 14,397 |  57,670 |  48,031 |  58,548 |
-| 500k       |   500 KB |    722 |  5,882 |  34,602 |  33,167 |  36,900 |
-| 1m         |  1.00 MB |    355 |  2,048 |  12,723 |  12,448 |  12,669 |
-| 2m         |  2.00 MB |    157 |    886 |   7,143 |   6,521 |   7,432 |
-| 5m         |  5.00 MB |     64 |    250 |   2,509 |   2,235 |   2,552 |
-| 10m        | 10.00 MB |     32 |    128 |     537 |     609 |     540 |
-| interleaved (100k/200k/500k/1m, cycled) |  — |    723 |  4,399 |  21,424 |  23,378 |  24,004 |
+| small      |   2.1 KB | 106,646 | 137,427 | 135,296 |  97,574 | 202,388 |
+| medium     |  60.4 KB |  10,086 |  86,029 | 189,970 | 198,098 | 175,562 |
+| github-100k |   100 KB |   2,208 |   2,880 |   4,496 |   4,479 |   4,809 |
+| 100k       |   100 KB |   6,045 |  46,577 | 137,931 | 134,590 | 153,139 |
+| 200k       |   200 KB |   3,025 |  22,563 |  78,247 |  75,873 |  81,433 |
+| 500k       |   500 KB |   1,216 |   9,128 |  33,058 |  32,680 |  34,188 |
+| 1m         |  1.00 MB |     594 |   4,408 |  16,447 |  16,340 |  16,722 |
+| 2m         |  2.00 MB |     296 |   1,966 |   8,247 |   8,224 |   8,055 |
+| 5m         |  5.00 MB |     118 |     600 |   2,869 |   2,945 |   2,992 |
+| 10m        | 10.00 MB |      59 |     356 |   1,035 |   1,028 |   1,050 |
+| interleaved (100k/200k/500k/1m, cycled) | — | 1,318 | 9,116 | 33,342 | 32,752 | 34,031 |
 
 ### Speed-up vs. baselines
 
-| Scenario | simdjson / cjson | `qd.parse` / cjson | `qd.parse` / simdjson | `qd.decode + access` / cjson |
+| Scenario | `qd.parse` / cjson | `qd.parse` / simdjson | `qd.decode + access content` / cjson | `qd.decode + access content` / simdjson |
 |---|---:|---:|---:|---:|
-| small  | 1.4× |  3.0× | 2.2× |  3.2× |
-| medium | 7.2× | 16.1× | 2.2× | 21.5× |
-| github-100k | — | 5.0× | — | 5.1× |
-| 100k   | 7.7× | 27.9× | 3.6× | 23.6× |
-| 200k   | 10.2× | 40.8× | 4.0× | 34.0× |
-| 500k   | 8.1× | 47.9× | 5.9× | 45.9× |
-| 1m     | 5.8× | 35.8× | 6.2× | 35.1× |
-| 2m     | 5.6× | 45.5× | 8.1× | 41.5× |
-| 5m     | 3.9× | 39.2× | 10.0× | 34.9× |
-| 10m    | 4.0× | 16.8× | 4.2× | 19.0× |
+| small  |  1.3× |  1.0× |  0.9× |  0.7× |
+| medium | 18.8× |  2.2× | 19.6× |  2.3× |
+| github-100k | 2.0× |  1.6× | 2.0× |  1.6× |
+| 100k   | 22.8× |  3.0× | 22.3× |  2.9× |
+| 200k   | 25.9× |  3.5× | 25.1× |  3.4× |
+| 500k   | 27.2× |  3.6× | 26.9× |  3.6× |
+| 1m     | 27.7× |  3.7× | 27.5× |  3.7× |
+| 2m     | 27.9× |  4.2× | 27.8× |  4.2× |
+| 5m     | 24.3× |  4.8× | 25.0× |  4.9× |
+| 10m    | 17.5× |  2.9× | 17.4× |  2.9× |
 
 ## Results — memory delta (KB retained after 5 rounds)
 
-Post-run `collectgarbage("count")` minus baseline. Captures GC-rooted state
-the parser retains across iterations; transient per-call allocations are
-collected before the snapshot.
+Post-run `collectgarbage("count")` minus baseline. Captures heap usage after
+the timing rounds without forcing a final collection, so short-lived garbage
+from the last round may still be included.
 
-| Scenario | cjson | simdjson | `qd.parse` | `qd.decode + t.f x3` | `qd.decode + qd.encode` |
+| Scenario | cjson | simdjson | `qd.parse` | `qd.decode + access content` | `qd.decode + qd.encode` |
 |---|---:|---:|---:|---:|---:|
-| small      | +15,881 | +16,284 | +1,338 | +4,337 | +11,140 |
-| medium     |  +1,955 |  +2,661 |    +66 |   +500 |  +1,120 |
-| github-100k | +12,867 |       — |    +19 |   +592 |    +273 |
-| 100k       |    +601 |    +950 |    +18 |   +429 |    +229 |
-| 200k       |    +505 |    +722 |     +7 |   +206 |    +112 |
-| 500k       |    +648 |    +757 |     +3 |    +83 |     +45 |
-| 1m         |  +1,151 |  +1,246 |     +2 |    +62 |     +34 |
-| 2m         |  +2,311 |  +2,510 |     +3 |    +82 |     +45 |
-| 5m         |  +5,723 |  +6,191 |     +3 |    +82 |     +45 |
-| 10m        | +11,262 | +12,053 |     +3 |    +83 |     +45 |
-| interleaved |  +4,509 |  +6,464 |    +53 | +1,671 |    +898 |
+| small      | +15,464 | +15,447 | +4,094 | +15,251 | +11,908 |
+| medium     |  +1,955 |  +2,660 |   +160 |  +1,210 |  +1,216 |
+| github-100k | +13,187 | +3,362 |   +29 |    +548 |    +242 |
+| 100k       |    +484 |   +748 |   +79 |    +704 |    +241 |
+| 200k       |    +392 |   +523 |   +40 |    +352 |    +124 |
+| 500k       |    +577 |   +630 |   +17 |    +142 |     +48 |
+| 1m         |  +1,082 | +1,121 |   +13 |    +107 |     +37 |
+| 2m         |  +1,155 | +1,248 |   +21 |    +211 |     +48 |
+| 5m         |  +1,316 | +1,538 |   +17 |    +403 |     +48 |
+| 10m        |  +1,583 | +2,014 |   +16 |    +844 |     +48 |
+| interleaved | +3,355 | +4,404 |  +314 |  +2,825 |    +945 |
 
 `qd.parse` retention is essentially constant across payload size: the only
 GC-rooted state is the reusable `indices: Vec<u32>` and `scratch` buffers.
@@ -152,62 +135,40 @@ lazy proxy and any cached child views — but still allocate one to two
 orders of magnitude less than the eager parsers, which materialize every
 key into the Lua table heap.
 
-## Pure-decode comparison (no field access)
-
-Where the rows above measure "decode + use a few fields", this isolates
-parse-time only. `cjson` and `simdjson` must still materialize a full Lua
-table (no API to stop short of that); `qd.parse` does only the structural
-scan and the skip-cache prep, deferring all per-field decode to whoever
-later asks. Captures the upper bound of the lazy win.
-
-| Scenario | cjson | simdjson | `qd.parse` | `qd.parse` / cjson | `qd.parse` / simdjson |
-|---|---:|---:|---:|---:|---:|
-| small  |  47,699 | 72,776 | 264,985 |  5.6× | 3.6× |
-| medium |   6,698 | 48,328 | 105,485 | 15.7× | 2.2× |
-| 100k   |   3,944 | 35,753 | 154,321 | 39.1× | 4.3× |
-| 200k   |   1,974 | 17,403 |  80,386 | 40.7× | 4.6× |
-| 500k   |     773 |  6,911 |  35,149 | 45.5× | 5.1× |
-| 1m     |     362 |  2,611 |  14,691 | 40.6× | 5.6× |
-| 2m     |     179 |  1,197 |   7,516 | 42.0× | 6.3× |
-| 5m     |      74 |    293 |   2,876 | 38.9× | 9.8× |
-| 10m    |      37 |    143 |     665 | 18.0× | 4.7× |
-
 ## Observations
 
-1. **`simdjson` is 4–10× faster than `cjson` in the medium-to-large range**;
-   the gap narrows at both ends — very small payloads are dominated by
-   fixed per-call overhead, very large ones become memory-bandwidth bound on
-   the Lua-table build.
-2. **`quickdecode` is 16–48× faster than `cjson` and 2–10× faster than
-   `simdjson`** on this workload. The win is not from SIMD — `simdjson`
-   already has that — but from never building a Lua table. Field reads pay
-   their own cost, but most fields are never read.
+1. **`quickdecode` is fastest once payloads move beyond tiny inputs.**
+   The small 2 KB row is dominated by fixed Lua/FFI overhead, but medium and
+   larger multimodal payloads show roughly 18–28× higher throughput than
+   `cjson` and roughly 2–5× higher throughput than `lua-resty-simdjson`
+   for request-field access.
+2. **Reading every `messages[*].content` is still access-light for large
+   multimodal bodies.** The benchmark touches the top-level request fields and
+   one `content` field per message; the payload size comes from image data
+   inside each message.
 3. **The win drops at 10 MB.** `qd.parse` is L3-bandwidth-bound at that
    size, and the `qd.decode` proxy's per-`__index` dispatch starts to
-   amortize less well against the cheaper structural scan. Other parsers
-   are still allocating into the table heap at that size, so they degrade
-   too, but the ratio compresses.
+   amortize less well against the cheaper structural scan. `cjson` is still
+   allocating into the table heap at that size, so the ratio remains large.
 4. **`qd.decode + qd.encode (unmodified)` is the headline number for
    passthrough workloads** — e.g. an LLM gateway re-emitting the original
    JSON after light-touch inspection. The substring fast path means
    re-emit is `memcpy`, not re-serialize, and the throughput tracks
    `qd.parse` very closely.
 5. **Memory retention** for `quickdecode` is essentially flat in payload
-   size; the eager parsers retain ~1× the input size after the first run
+   size; the eager parsers retain more Lua heap after the first run
    because the Lua table tree stays GC-rooted until the next collection.
-   The 10 MB case retains ~11 MB for `cjson` / `simdjson`, ~3 KB for
-   `qd.parse`.
-6. **REST API payloads (github-100k) show a 5× speedup** — lower than the
-   multimodal payloads because the structural density is higher (~3-5% vs
-   <0.1%). However, memory savings remain dramatic: 677× less retention
-   (12.8 MB → 19 KB) because `cjson` must materialize every nested object
-   and string into the Lua heap.
+   The 10 MB case retains ~1.5 MB for `cjson`, ~2.0 MB for `simdjson`,
+   and ~16 KB for `qd.parse`.
+6. **REST API payloads (github-100k) show a smaller speedup** because their
+   structural density is higher than the multimodal request ladder. Memory
+   savings remain dramatic because `cjson` must materialize every nested
+   object and string into the Lua heap.
 
 ## When to pick which
 
-- **Read most/all fields** → `cjson` or `simdjson`. `simdjson` is a near-
-  drop-in faster replacement (pool the parser).
-- **Parse, read a few fields, discard / re-emit** → `quickdecode`. The
+- **Read most/all fields** → `cjson` or `lua-resty-simdjson`; the latter is the faster eager decoder in every row above.
+- **Parse, read selected fields, discard / re-emit** → `quickdecode`. The
   bigger the payload and the smaller the read fraction, the larger the
   win. `qd.decode` / `qd.encode` gives a `cjson`-shaped surface; `qd.parse`
   + path getters is the lower-level API with slightly higher peak
@@ -222,9 +183,8 @@ later asks. Captures the upper bound of the lazy win.
   do, broadly.
 - Workload is biased toward string-heavy payloads (chat-completion image
   parts). Object-key-heavy JSON shifts the picture: more structural work
-  per byte and less raw `memcpy`, so the SIMD scanners (`simdjson`,
-  `quickdecode`'s AVX2 path) get further ahead of `cjson` and the
-  table-build cost on the eager side rises.
+  per byte and less raw `memcpy`, while the table-build cost on the eager
+  side rises.
 - `quickdecode` retains the source buffer on the `Doc`, so the input
   string stays alive for the document's lifetime. If you parse and
   immediately discard the JSON string in the caller, GC can still free