From 89250167a732782f1f5543c900a64ee11033d7ba Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Fri, 15 May 2026 23:23:34 +0000 Subject: [PATCH 1/2] perf(ffi): fold legacy qjd_parse into a single allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The decoder/document pooling refactor (commit 0721d7d) split the legacy qjd_parse path into two heap allocations (`qjd_decoder` + `qjd_doc`) and made every `qjd_get_*` walk a state-machine check via `check_doc_alive`. That made the new pooled API faster when the decoder is reused, but left the one-shot `qjd_parse()` users paying double-alloc / double-free per call plus a gen check per field access, with no upside since their doc cannot be stale. Co-locate `qjd_doc` and its private `qjd_decoder` in a single `#[repr(C)] OwnedDocBlock` allocation: - qjd_parse: one Box::new instead of two; parse in place to avoid a ~100-byte stack-to-heap memcpy of the freshly-constructed Decoder. - qjd_free: branch on owns_decoder and recover the block pointer via the offset-0 invariant for the legacy path. - check_doc_alive: legacy docs skip the state/gen check (unreachable by construction — the decoder is private) and reach the decoder by static offset from the doc pointer, not by loading doc.decoder. The pooled API (`qjd_decoder_new` + `qjd_decoder_parse`) is unchanged and still owns_decoder = false. All existing tests, including the count-allocs gate (`pooled < legacy / 2`), still pass. --- src/ffi.rs | 58 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/src/ffi.rs b/src/ffi.rs index 603615e..9aff4f7 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -73,13 +73,36 @@ pub struct qjd_doc { owns_decoder: bool, } +/// Heap layout for the one-shot [`qjd_parse`] path: `qjd_doc` and its +/// private `qjd_decoder` live in the same allocation. `#[repr(C)]` pins +/// `doc` at offset 0 so the public `*mut qjd_doc` returned to callers can +/// be cast back to `*mut OwnedDocBlock` on free. +#[repr(C)] +struct OwnedDocBlock { + doc: qjd_doc, + decoder: qjd_decoder, +} + // ── Entry-point safety helpers ────────────────────────────────────────────── /// Validate `doc` and return the live decoder. Order matters: a destroyed /// decoder is reported as `QJD_INVALID_ARG`, not `QJD_STALE_DOC`. +/// +/// Docs created by [`qjd_parse`] (`owns_decoder == true`) take a fast path: +/// the private decoder is unreachable to any of the `qjd_decoder_*` mutating +/// operations, so state and gen are pristine by construction. unsafe fn check_doc_alive(doc: *mut qjd_doc) -> Result<&'static Decoder, qjd_err> { if doc.is_null() { return Err(qjd_err::QJD_INVALID_ARG); } let d = &*doc; + if d.owns_decoder { + // Legacy: the decoder is the sibling field in the same OwnedDocBlock. + // Compute its address from the static struct offset — no pointer load, + // matching the pre-pool layout where the decoder sat directly inside + // the doc allocation. + let block_ptr = doc as *const OwnedDocBlock; + let dec: &Decoder = &(*block_ptr).decoder.0; + return Ok(std::mem::transmute::<&Decoder, &'static Decoder>(dec)); + } let dec: &Decoder = &d.decoder.as_ref().0; if matches!(dec.state, DecoderState::Destroyed) { return Err(qjd_err::QJD_INVALID_ARG); @@ -146,20 +169,27 @@ pub unsafe extern "C" fn qjd_parse( if !err_out.is_null() { *err_out = qjd_err::QJD_INVALID_ARG as c_int; } return ptr::null_mut(); } - let dec_ptr = Box::into_raw(Box::new(qjd_decoder(Decoder::new()))); + // Allocate the block first with a fresh Decoder, then parse in-place. + // This avoids the ~100-byte stack→heap memcpy a naive `let mut decoder + // = Decoder::new(); ... Box::new(...decoder)` path pays per parse. + let block_ptr = Box::into_raw(Box::new(OwnedDocBlock { + doc: qjd_doc { + decoder: NonNull::dangling(), + gen: 0, + owns_decoder: true, + }, + decoder: qjd_decoder(Decoder::new()), + })); + (*block_ptr).doc.decoder = NonNull::new_unchecked(&mut (*block_ptr).decoder); let slice: &[u8] = std::slice::from_raw_parts(buf, len); - match (*dec_ptr).0.parse(slice) { + match (*block_ptr).decoder.0.parse(slice) { Ok(()) => { + (*block_ptr).doc.gen = (*block_ptr).decoder.0.gen; *err_out = qjd_err::QJD_OK as c_int; - let doc = qjd_doc { - decoder: NonNull::new_unchecked(dec_ptr), - gen: (*dec_ptr).0.gen, - owns_decoder: true, - }; - Box::into_raw(Box::new(doc)) + &mut (*block_ptr).doc } Err(e) => { - let _ = Box::from_raw(dec_ptr); + let _ = Box::from_raw(block_ptr); *err_out = e as c_int; ptr::null_mut() } @@ -187,9 +217,13 @@ pub unsafe extern "C" fn qjd_parse( #[no_mangle] pub unsafe extern "C" fn qjd_free(doc: *mut qjd_doc) { if doc.is_null() { return; } - let d = Box::from_raw(doc); - if d.owns_decoder { - let _ = Box::from_raw(d.decoder.as_ptr()); + // Read owns_decoder before taking ownership: the legacy doc lives inside + // an OwnedDocBlock and must be freed through that layout, while a pool + // doc is a standalone Box. + if (*doc).owns_decoder { + let _ = Box::from_raw(doc as *mut OwnedDocBlock); + } else { + let _ = Box::from_raw(doc); } } From d3634fe3b5a374db07f45f4d86673e5f3ba7f0d9 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Fri, 15 May 2026 23:23:47 +0000 Subject: [PATCH 2/2] bench: warmup + median + interleaved + pooled/one-shot scenarios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The single-run-with-mean output the bench used to print swung 30-40% between invocations on noisy machines, making it hard to tell signal from noise when comparing perf commits. - bench() now runs a warmup pass (JIT trace compile, pool fill), then five timed rounds. Reports median and mean ops/s plus the round-by- round min..max range so reviewers can see whether a delta is real. - Add an `interleaved 100k,200k,500k,1m` scenario that rotates through four payload sizes, matching a server that handles varying request sizes back to back. The single-payload loops cannot exercise the doc pool the way real traffic does. - For each scenario, probe `qd.new_decoder` and run two extra qd variants when present: quickdecode pooled :parse — reused decoder across iters quickdecode new_decoder()+parse — one-shot per iter (no reuse) So a reader can directly compare the legacy qd.parse path, the pool-API-with-reuse path, and the realistic "user creates a fresh decoder per request" pattern in one bench run. Also ship benches/perf_probe.lua: a minimal hammer over qd.parse on a fixed payload for use under `perf record` when investigating FFI hot paths. Not invoked by Makefile targets. --- benches/lua_bench.lua | 118 ++++++++++++++++++++++++++++++++++++++--- benches/perf_probe.lua | 60 +++++++++++++++++++++ 2 files changed, 171 insertions(+), 7 deletions(-) create mode 100644 benches/perf_probe.lua diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua index 5293dbf..7f52d25 100644 --- a/benches/lua_bench.lua +++ b/benches/lua_bench.lua @@ -62,17 +62,36 @@ local function make_payload(target_bytes) .. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}' end +local ROUNDS = 5 + local function bench(name, iters, fn) + -- Warmup pass: lets JIT compile hot traces and any one-time pools fill + -- before measurement starts. Excluded from timing and memory delta. + local warmup = math.max(3, math.floor(iters / 5)) + for _ = 1, warmup do fn() end + collectgarbage("collect") local mem_before = collectgarbage("count") - local t0 = os.clock() - for _ = 1, iters do fn() end - local t1 = os.clock() + + local ops = {} + for r = 1, ROUNDS do + local t0 = os.clock() + for _ = 1, iters do fn() end + local t1 = os.clock() + ops[r] = iters / (t1 - t0) + end local mem_after = collectgarbage("count") - local elapsed = t1 - t0 - print(string.format("%-44s %7.2fms total %10.0f ops/s %+8.1fKB", - name, elapsed * 1000, iters / elapsed, - mem_after - mem_before)) + + table.sort(ops) + local median = ops[math.ceil(ROUNDS / 2)] + local lo, hi = ops[1], ops[ROUNDS] + local sum = 0 + for i = 1, ROUNDS do sum = sum + ops[i] end + local mean = sum / ROUNDS + + print(string.format( + "%-44s median %9.0f ops/s mean %9.0f range %7.0f..%-9.0f %+8.1fKB", + name, median, mean, lo, hi, mem_after - mem_before)) end local scenarios = { @@ -87,6 +106,11 @@ local scenarios = { {name = "10m", iters = 20, payload = make_payload(10 * 1024 * 1024)}, } +-- The pooled API (qd.new_decoder + :parse) only exists on commits that +-- landed the Decoder refactor. Probe so the bench still runs on older builds. +local has_pooled_api = type(qd.new_decoder) == "function" +local pooled_decoder = has_pooled_api and qd.new_decoder() or nil + for _, s in ipairs(scenarios) do print(string.format("=== %s (%d bytes) ===", s.name, #s.payload)) @@ -103,4 +127,84 @@ for _, s in ipairs(scenarios) do local _ = d:get_f64("temperature") local _ = d:get_str("messages[0].role") end) + + if has_pooled_api then + bench("quickdecode pooled :parse + access 3 fields", s.iters, function() + local d = pooled_decoder:parse(s.payload) + local _ = d:get_str("model") + local _ = d:get_f64("temperature") + local _ = d:get_str("messages[0].role") + end) + + -- One-shot-per-request pattern: each iter creates a fresh decoder, + -- parses once, and lets both decoder and doc fall to GC. No reuse. + -- This is the typical "user does not cache the decoder" path. + bench("quickdecode new_decoder()+parse (one-shot)", s.iters, function() + local dec = qd.new_decoder() + local d = dec:parse(s.payload) + local _ = d:get_str("model") + local _ = d:get_f64("temperature") + local _ = d:get_str("messages[0].role") + end) + end +end + +-- Interleaved scenario: cycle through several payloads of different sizes +-- back-to-back, mirroring a server processing variable-size requests. The +-- single-payload loops above hand the allocator the same block over and over +-- and have no allocation to amortize away — they cannot exercise the doc +-- pool. This scenario can. +local function scenario_by_name(n) + for _, s in ipairs(scenarios) do + if s.name == n then return s end + end + error("no scenario " .. n) +end + +local interleaved_names = {"100k", "200k", "500k", "1m"} +local interleaved = {} +for _, n in ipairs(interleaved_names) do + interleaved[#interleaved + 1] = scenario_by_name(n).payload +end + +local function make_cycler(items) + local i = 0 + local n = #items + return function() + i = i + 1 + return items[((i - 1) % n) + 1] + end +end + +print(string.format("=== interleaved %s ===", table.concat(interleaved_names, ","))) + +do + local next_p = make_cycler(interleaved) + bench("cjson.decode + access 3 fields", 400, function() + local p = next_p() + local obj = cjson.decode(p) + local _ = obj.model + local _ = obj.temperature + local _ = obj.messages and obj.messages[1] and obj.messages[1].role + end) + + next_p = make_cycler(interleaved) + bench("quickdecode.parse + access 3 fields", 400, function() + local p = next_p() + local d = qd.parse(p) + local _ = d:get_str("model") + local _ = d:get_f64("temperature") + local _ = d:get_str("messages[0].role") + end) + + if has_pooled_api then + next_p = make_cycler(interleaved) + bench("quickdecode pooled :parse + access 3 fields", 400, function() + local p = next_p() + local d = pooled_decoder:parse(p) + local _ = d:get_str("model") + local _ = d:get_f64("temperature") + local _ = d:get_str("messages[0].role") + end) + end end diff --git a/benches/perf_probe.lua b/benches/perf_probe.lua new file mode 100644 index 0000000..4b0231e --- /dev/null +++ b/benches/perf_probe.lua @@ -0,0 +1,60 @@ +-- Minimal probe for perf: hammers qd.parse on a fixed 100K payload so perf +-- samples concentrate on the FFI entry + parse hot path. Not a benchmark — +-- there is no timing or memory accounting here, just sustained work. + +package.path = package.path .. ";./lua/?.lua" +package.cpath = package.cpath .. ";./target/release/lib?.so" + +local qd = require("quickdecode") + +-- Same payload generator as lua_bench.lua so probe output corresponds to +-- the same shape the bench measures. Park-Miller LCG keeps it deterministic. +local function make_payload(target_bytes) + local rng_state = 42 + local function rng_range(lo, hi) + rng_state = (rng_state * 48271) % 2147483647 + return lo + (rng_state % (hi - lo + 1)) + end + + local text = string.rep("Q", 1500) + local text_part = '{"type":"text","text":"' .. text .. '"}' + local parts = { text_part } + local current = 200 + #text_part + + while current < target_bytes do + local remaining = target_bytes - current + local img_size + if remaining < 50 * 1024 then + img_size = math.max(1024, remaining) + else + local upper = math.min(500 * 1024, remaining) + img_size = rng_range(50 * 1024, upper) + end + local b64 = string.rep("A", img_size) + local img_part = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,' + .. b64 .. '"}}' + parts[#parts + 1] = img_part + current = current + #img_part + 1 + end + + return '{"model":"gpt-4-vision","temperature":0.7,"messages":' + .. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}' +end + +local payload = make_payload(100 * 1024) +local iters = tonumber(arg[1]) or 500000 + +-- Warmup so JIT traces compile before perf starts sampling steady state. +for _ = 1, 1000 do + local d = qd.parse(payload) + local _ = d:get_str("model") +end + +io.stderr:write(string.format("probe: %d bytes payload, %d iters\n", #payload, iters)) + +for _ = 1, iters do + local d = qd.parse(payload) + local _ = d:get_str("model") + local _ = d:get_f64("temperature") + local _ = d:get_str("messages[0].role") +end