Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Overridable: `make bench LUAJIT=/path/to/luajit LUA_CPATH='...'`
LUAJIT ?= $(shell command -v luajit 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit)
LUA_CPATH ?= ./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/openresty/luajit/lib/lua/5.1/?.so

LIB_DIR := $(CURDIR)/target/release
LUA_ENV := LD_LIBRARY_PATH=$(LIB_DIR) LUA_CPATH='$(LUA_CPATH)'

.PHONY: help build test lint bench clean

help: ## Show this help
@awk 'BEGIN {FS = ":.*## "} /^[a-zA-Z_-]+:.*## / {printf " \033[36m%-10s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

build: ## Build the release cdylib (target/release/libquickdecode.so)
cargo build --release

test: build ## Run cargo tests + busted Lua tests
cargo test --release
$(LUA_ENV) busted --lua=$(LUAJIT) tests/lua --lpath='./lua/?.lua'

lint: ## Run clippy (deny warnings) and rustfmt --check
cargo clippy --release --all-targets -- -D warnings
cargo fmt --check

bench: build ## Run the LuaJIT vs cjson benchmark
$(LUA_ENV) $(LUAJIT) benches/lua_bench.lua

clean: ## Remove build artifacts
cargo clean
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ cargo build --release
# Output: target/release/libquickdecode.so
```

A `Makefile` wraps the common workflows; run `make help` to see `build`, `test`, `lint`, `bench`, and `clean` targets. Override `LUAJIT` / `LUA_CPATH` per invocation if your environment differs from the defaults.

## Testing

```sh
Expand Down Expand Up @@ -67,5 +69,8 @@ Items intentionally pushed out of the first implementation. Each will be picked
- **Lossless 64-bit integer mode** — return cdata `int64_t` to LuaJIT to preserve precision > 2⁵³.
- **Skip-cache LRU eviction** — only if memory pressure on huge documents proves problematic in practice.
- **Path-position info on Phase 1 errors** — currently only an opaque `QJD_PARSE_ERROR`.
- **AVX2 tail-bypass optimization** — current implementation falls back to whole-buffer scalar when a tail exists; could be optimized by emitting tail structural offsets directly.
- **Large bench fixtures** — spec §9.3 lists `large_dump.json` (~20 MB) and `deep_nest.json` (depth stress test); not yet committed. Only `small_api.json` and `medium_resp.json` ship today.
- **`structural_mask_chunk` via shuffle-based set check** — the current AVX2 scanner does 7 `_mm256_cmpeq_epi8` + `_mm256_movemask_epi8` per chunk half (one per structural char in `{}[]:,"`). A single `_mm256_shuffle_epi8` against a 16-byte LUT plus one cmpeq can do the same set membership in 2-3 ops per half. Estimated 15-25% scanner speedup on dense-structural workloads. Not on the hot path for string-heavy payloads (those already short-circuit via the fast path).
- **Adaptive `out.reserve` in scanners** — `out.reserve(buf.len() / 6)` is calibrated for object-heavy JSON. On string-heavy multimodal payloads (one big content array, mostly base64) the actual emit rate is <1 structural per 1 KB, so we over-reserve by 100x+. Mainly a memory hygiene concern (mmap'd pages stay lazily faulted), <5% throughput effect.
- **AVX-512 scanner backend** — 64-byte → 128-byte chunks doubles scan throughput on CPUs with `avx512bw` + `vpclmulqdq` (Sapphire Rapids, Zen 4+). Estimated 2× on supporting silicon; no effect elsewhere.
- **`# Safety` docs on unsafe FFI exports** — `make lint` currently fails on 22 `missing_safety_doc` clippy warnings from the public `qjd_*` C-ABI functions. Tracked separately so the Makefile can ship with `-D warnings` already wired up.
59 changes: 45 additions & 14 deletions benches/lua_bench.lua
Original file line number Diff line number Diff line change
Expand Up @@ -11,38 +11,69 @@ local function read_file(p)
return s
end

-- Shape: a multimodal chat-completion request with one ~1.5K text question
-- and N base64-encoded image parts (each 50-500 KB) until the payload reaches
-- target_bytes. Mirrors the production case the bench is meant to reflect.
local function make_payload(target_bytes)
math.randomseed(42)
local text = string.rep("Q", 1500)
local text_part = '{"type":"text","text":"' .. text .. '"}'
local parts = { text_part }
local current = 200 + #text_part -- approx outer envelope overhead

while current < target_bytes do
local remaining = target_bytes - current
local upper = math.min(500 * 1024, math.max(50 * 1024, remaining + 50 * 1024))
local lower = math.min(50 * 1024, upper)
local img_size = math.random(lower, upper)
local b64 = string.rep("A", img_size)
local img_part = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
.. b64 .. '"}}'
parts[#parts + 1] = img_part
current = current + #img_part + 1 -- +1 for comma
end

return '{"model":"gpt-4-vision","temperature":0.7,"messages":'
.. '[{"role":"user","content":[' .. table.concat(parts, ",") .. ']}]}'
end

local function bench(name, iters, fn)
collectgarbage("collect")
local mem_before = collectgarbage("count")
local t0 = os.clock()
for _ = 1, iters do fn() end
local t1 = os.clock()
local mem_after = collectgarbage("count")
print(string.format("%-44s %7.2fms total %6.2fus/op %+8.1fKB",
name, (t1 - t0) * 1000, (t1 - t0) * 1e6 / iters,
local elapsed = t1 - t0
print(string.format("%-44s %7.2fms total %10.0f ops/s %+8.1fKB",
name, elapsed * 1000, iters / elapsed,
mem_after - mem_before))
end

local fixtures = {
small = read_file("benches/fixtures/small_api.json"),
medium = read_file("benches/fixtures/medium_resp.json"),
local scenarios = {
{name = "small", iters = 5000, payload = read_file("benches/fixtures/small_api.json")},
{name = "medium", iters = 500, payload = read_file("benches/fixtures/medium_resp.json")},
{name = "100k", iters = 100, payload = make_payload(100 * 1024)},
{name = "200k", iters = 50, payload = make_payload(200 * 1024)},
{name = "500k", iters = 20, payload = make_payload(500 * 1024)},
{name = "1m", iters = 15, payload = make_payload(1024 * 1024)},
{name = "2m", iters = 10, payload = make_payload(2 * 1024 * 1024)},
{name = "5m", iters = 10, payload = make_payload(5 * 1024 * 1024)},
{name = "10m", iters = 10, payload = make_payload(10 * 1024 * 1024)},
}

local iters_for = { small = 5000, medium = 500 }

for _, size in ipairs({"small", "medium"}) do
local payload = fixtures[size]
print(string.format("=== %s (%d bytes) ===", size, #payload))
for _, s in ipairs(scenarios) do
print(string.format("=== %s (%d bytes) ===", s.name, #s.payload))

bench("cjson.decode + access 3 fields", iters_for[size], function()
local obj = cjson.decode(payload)
bench("cjson.decode + access 3 fields", s.iters, function()
local obj = cjson.decode(s.payload)
local _ = obj.model
local _ = obj.temperature
local _ = obj.messages and obj.messages[1] and obj.messages[1].role
end)

bench("quickdecode.parse + access 3 fields", iters_for[size], function()
local d = qd.parse(payload)
bench("quickdecode.parse + access 3 fields", s.iters, function()
local d = qd.parse(s.payload)
local _ = d:get_str("model")
local _ = d:get_f64("temperature")
local _ = d:get_str("messages[0].role")
Expand Down
48 changes: 48 additions & 0 deletions docs/superpowers/specs/2026-05-15-makefile-design.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Makefile for lua-quick-decode

Add a root-level `Makefile` that wraps the common Rust + LuaJIT workflows so contributors don't have to remember the env-var dance for bench/test.

## Targets

| Target | Action |
|---|---|
| `help` (default) | Print each target and its `## ` doc-comment via awk |
| `build` | `cargo build --release` (produces `target/release/libquickdecode.so`) |
| `test` | Depends on `build`. Runs `cargo test --release`, then `busted tests/lua --lua=$(LUAJIT) --lpath='./lua/?.lua'` with `LD_LIBRARY_PATH=$(CURDIR)/target/release` and `LUA_CPATH=$(LUA_CPATH)` exported |
| `lint` | `cargo clippy --release --all-targets -- -D warnings` then `cargo fmt --check` |
| `bench` | Depends on `build`. Runs `$(LUAJIT) benches/lua_bench.lua` with the same env exports as `test` |
| `clean` | `cargo clean` |

All targets are `.PHONY`.

## Overridable variables

```make
LUAJIT ?= $(shell command -v luajit 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit)
LUA_CPATH ?= ./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/openresty/luajit/lib/lua/5.1/?.so
```

- `LUAJIT` autodetects: prefers `luajit` on `PATH` (apt/CI install), falls back to OpenResty's path (local dev box).
- `LUA_CPATH` includes OpenResty's `lualib` (where `cjson.so` lives on the local box) plus the standard LuaJIT 5.1 search paths. The default value is intentionally absolute, not appended to LuaJIT's built-in default, so the Makefile is reproducible regardless of which LuaJIT build is invoked.
- Users override per invocation: `make bench LUAJIT=/path/to/luajit`.

## Help format

Each target line carries a `## description` trailing comment. The `help` target greps targets with `## ` and pretty-prints `target — description` via awk. This pattern lets `help` stay in sync automatically when a new target is added.

## Out of scope

- **`luacheck` lint for Lua sources.** Neither the local box nor CI has it installed; adding it now would be dead code. Track in README if/when desired.
- **Separate `release` / `debug` build targets.** The repo only ships release artifacts (bench and Lua FFI tests both require release). Add later if a debug workflow appears.
- **Cross-target dependency on `test` from `bench`.** Bench depends only on `build`; running tests as part of bench would slow down iterative perf work.

## Non-goals

- Replacing CI. The Makefile mirrors CI commands but is not invoked by CI (CI keeps its explicit steps for cache-key clarity).
- Cross-platform. macOS/Windows are not supported; the OpenResty path defaults are Linux-specific. PRs welcome but not required for v1.

## Failure modes (intentional, loud)

- `busted` not installed → `test` fails with a clear `command not found`. Fix: `sudo luarocks install busted`.
- `luajit` not on PATH and OpenResty fallback missing → `bench` and `test` fail at the luajit invocation. Fix: install LuaJIT or pass `LUAJIT=...`.
- `target/release/libquickdecode.so` missing → impossible by construction; `bench` and `test` depend on `build`.
132 changes: 123 additions & 9 deletions src/scan/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
let escaped = find_escape_mask_with_carry(backslash, &mut bs_carry);
let real_quote = quote & !escaped;

// String-skip fast path: when the previous chunk left us inside a
// string and this chunk contains no unescaped quote, the entire
// chunk is string interior. No structural chars to emit and
// in_string stays 1; bs_carry was already updated above. Skip the
// 14 cmpeq / movemask ops in structural_mask_chunk plus the PCLMUL
// prefix-XOR — the dominant cost on string-heavy payloads.
if in_string != 0 && real_quote == 0 {
i += 64;
continue;
}

let (inside, new_in_string) = inside_string_mask(real_quote, in_string);
in_string = new_in_string;

Expand All @@ -43,17 +54,30 @@ unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
i += 64;
}

// Whenever there's a tail, fall back to scalar for the whole buffer.
// This is necessary because ScalarScanner validates bracket matching against
// its own stack; a tail containing `]` or `}` that closes a bracket opened
// in the AVX2-processed prefix would cause ScalarScanner::scan on the tail
// slice to return Err, silently dropping those structural chars.
// The common case (input length is a multiple of 64) is unaffected.
// Tail (<64 bytes): continue emit-only via scalar, carrying the
// in_string / bs_carry state from the last AVX2 chunk. Bracket pairing
// is checked once at the end on the merged indices.
//
// If bs_carry == 1 the byte at position `i` is escape-targeted by the
// trailing backslash run of the prior chunk; inside a string we must
// skip it (treat as an escaped data byte, not a structural). Outside
// a string backslashes are plain characters and bs_carry has no effect.
if i < buf.len() {
out.clear();
return super::ScalarScanner::scan(buf, out);
let scalar_start = if in_string != 0 && bs_carry != 0 {
i + 1
} else {
i
};
if scalar_start <= buf.len() {
super::scalar::scan_emit_resume(buf, scalar_start, in_string != 0, out)?;
} else if in_string != 0 {
return Err(buf.len());
}
} else if in_string != 0 {
return Err(buf.len());
}
Ok(())

super::validate_brackets(buf, out)
}

#[inline(always)]
Expand Down Expand Up @@ -239,6 +263,96 @@ mod tests {
parity(&buf);
}

/// Exercises the string-skip fast path: a string spanning many AVX2
/// chunks with no internal quotes. The fast-path branch must produce
/// the same emitted offsets as the slow path (which the parity check
/// against scalar implicitly verifies).
#[test]
fn long_string_engages_skip_fastpath() {
if !host_supports_avx2() { return; }
let mut buf = Vec::new();
buf.extend_from_slice(b"{\"k\":\"");
// ~10 KB of string interior — many chunks fully inside the string.
for _ in 0..10_000 { buf.push(b'a'); }
buf.push(b'"');
buf.push(b'}');
// Pad to 64-aligned to also exercise the no-tail branch.
while buf.len() % 64 != 0 { buf.push(b' '); }
parity(&buf);
}

/// String contains escaped quotes — the fast path must NOT fire when
/// `real_quote != 0` even though we may still be inside a string at
/// the chunk boundary.
#[test]
fn escaped_quotes_do_not_trip_fastpath() {
if !host_supports_avx2() { return; }
let mut buf = Vec::new();
buf.extend_from_slice(b"{\"k\":\"");
// Embed real escape sequences periodically so quotes appear in the
// raw bytes but are escaped; real_quote should still be 0 over
// these chunks, fast path should fire.
for _ in 0..50 {
buf.extend_from_slice(b"aaaaaaaa\\\""); // 8 a's + \"
}
buf.push(b'"');
buf.push(b'}');
while buf.len() % 64 != 0 { buf.push(b' '); }
parity(&buf);
}

/// AVX2 main loop + scalar tail: input length not a multiple of 64.
/// Exercises the path that used to bypass AVX2 entirely.
#[test]
fn unaligned_tail_parity() {
if !host_supports_avx2() { return; }
// Valid JSON of various non-64-aligned total lengths.
for tail_len in [1usize, 5, 17, 33, 63] {
let mut buf = Vec::new();
buf.extend_from_slice(b"{\"key\":\"");
while buf.len() < 60 { buf.push(b'x'); }
buf.extend_from_slice(b"abc\"}");
// buf now well-formed; pad with spaces after the closing `}`
// to land at 64 + tail_len total bytes.
let target = 64 + tail_len;
while buf.len() < target { buf.push(b' '); }
assert_eq!(buf.len(), target, "test setup");
parity(&buf);
}
}

/// String spans the 64-byte chunk boundary; the closing quote lives
/// in the scalar tail. Requires in_string state to carry correctly.
#[test]
fn string_crosses_avx2_boundary() {
if !host_supports_avx2() { return; }
let mut buf = Vec::new();
buf.extend_from_slice(b"{\"k\":\""); // 6 bytes, in_string from byte 5
while buf.len() < 80 { buf.push(b'a'); } // long string content past byte 64
buf.push(b'"');
buf.push(b'}');
parity(&buf);
}

/// Backslash at the LAST byte of the AVX2 chunk; the escaped target
/// is the FIRST byte of the scalar tail. Exercises bs_carry.
#[test]
fn backslash_at_chunk_boundary() {
if !host_supports_avx2() { return; }
// Bytes 0..63: `{"key":"` followed by 'x' padding ending with `\`.
// Byte 64 (first tail byte): an escaped `"` — must NOT close the string.
// Then real closing `"` and `}` follow.
let mut buf = Vec::new();
buf.extend_from_slice(b"{\"key\":\""); // 8 bytes
while buf.len() < 63 { buf.push(b'x'); } // pad to 63
buf.push(b'\\'); // byte 63: backslash
buf.push(b'"'); // byte 64: escaped quote (tail)
buf.push(b'y'); // byte 65
buf.push(b'"'); // byte 66: real string close
buf.push(b'}'); // byte 67
parity(&buf);
}

/// Verifies PCLMUL prefix-XOR produces correct inside-string mask
/// for multiple strings in a single 64-byte chunk.
#[test]
Expand Down
46 changes: 46 additions & 0 deletions src/scan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,49 @@ pub(crate) fn scan(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
});
f(buf, out)
}

/// Walk a sequence of already-emitted structural offsets and verify that
/// `{`/`}` and `[`/`]` are properly paired. String quotes toggle an
/// `in_string` flag and are otherwise skipped — well-formed emit paths
/// never push structural chars from inside strings, but the check is
/// defensive.
///
/// On the first mismatch, returns `Err(offset_in_buf)`. On unmatched
/// openers at end of input, returns `Err(buf.len())`.
pub(crate) fn validate_brackets(buf: &[u8], indices: &[u32]) -> Result<(), usize> {
let mut stack: Vec<u8> = Vec::with_capacity(32);
let mut in_string = false;

for &idx in indices {
let pos = idx as usize;
let b = buf[pos];

if b == b'"' {
in_string = !in_string;
continue;
}
if in_string {
continue;
}

match b {
b'{' | b'[' => stack.push(b),
b'}' => {
if stack.pop() != Some(b'{') {
return Err(pos);
}
}
b']' => {
if stack.pop() != Some(b'[') {
return Err(pos);
}
}
_ => {}
}
}

if !stack.is_empty() {
return Err(buf.len());
}
Ok(())
}
Loading
Loading