diff --git a/CLAUDE.md b/CLAUDE.md
index d5cffa5..e5039a8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,8 +6,6 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Rust JSON decoder (`cdylib` + `rlib`) exposed to LuaJIT via FFI. Optimized for parse-once / extract-a-few-fields / discard. The competitive edge over `lua-cjson` comes from **never building a Lua table** — Phase 1 records only structural offsets, Phase 2 lazily decodes the fields the caller actually asks for. Crate name in `Cargo.toml` is `lua-quick-decode`; the compiled artifact is `libquickdecode.so`.
 
-Authoritative design doc: `docs/superpowers/specs/2026-05-15-rust-quick-json-decode-design.md`.
-
 ## Common commands
 
 The `Makefile` is the canonical entry point; `make help` lists targets.
diff --git a/README.md b/README.md
index b2fdd3d..7aac859 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,6 @@
 
 Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the common case where a large JSON is parsed once and only a small number of fields are extracted before the document is discarded.
 
-Design document: `docs/superpowers/specs/2026-05-15-rust-quick-json-decode-design.md`.
-
 ## Status
 
 Initial implementation complete: scalar + AVX2/PCLMUL structural scanner, root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson but tuning is pending — see `Roadmap / Deferred` below.
@@ -38,6 +36,41 @@ local model = body:get_str("model")
 local temp  = body:get_f64("temperature")
 ```
 
+### Lazy table API (`qd.decode` / `qd.encode`)
+
+For callers migrating from `cjson`, an alternative API returns a table-shaped
+lazy view. Reads, iteration, and length all work like a `cjson.decode`'d
+table; writes materialize the affected level into a plain Lua table.
+
+```lua
+local qd    = require("quickdecode")
+local cjson = require("cjson")          -- optional; provides null / empty_array sentinels
+
+local t = qd.decode(json_str)
+
+print(t.model)
+for _, m in qd.ipairs(t.messages) do
+    print(m.role, m.content)
+end
+
+t.extra = "x"
+
+local s = qd.encode(t)                  -- drop-in replacement for cjson.encode
+```
+
+`qd.encode` works on lazy proxies (re-emitting unmodified subtrees as the
+original JSON bytes), real Lua tables (matching `cjson.encode` output), and
+mixed trees. Callers cannot pass a lazy proxy directly to `cjson.encode`
+(cjson bypasses metamethods in C); use `qd.encode` instead, or call
+`qd.materialize(t)` to get a plain Lua table that any third-party encoder
+can handle.
+
+**LuaJIT compat-52 caveat.** `for k, v in pairs/ipairs(t)` and `#t` on a lazy
+proxy rely on `__pairs` / `__ipairs` / `__len`, which LuaJIT only invokes when
+built with `LUAJIT_ENABLE_LUA52COMPAT` (OpenResty's default). On a stock LuaJIT
+build, use the explicit `qd.pairs(t)`, `qd.ipairs(t)`, and `qd.len(t)` helpers
+— they work on both builds.
+
 ## Testing — Lua
 
 Requires LuaJIT + busted + lua-cjson installed system-wide.
@@ -76,3 +109,17 @@ Items intentionally pushed out of the first implementation. Each will be picked
 - **`cargo fmt --check` not enforced** — `make lint` runs clippy only. The codebase uses intentional manual column alignment in struct definitions and compact single-line literals that default rustfmt would reflow. Skip rather than reformat until a project-wide style decision is made.
 - **`validate_brackets` fusion into scan emit loop** — surfaced by profiling: on structurally-dense workloads `validate_brackets` is 65% of parse time (second linear pass over emitted indices). Folding bracket pairing into the scan emit loop via an inline depth stack eliminates that pass. No effect on the current string-heavy bench (0.3% there); a win for config / JSONL / table-shape JSON.
 - **`memchr2` cross-chunk jump for very long string interiors** — the AVX2 in-string fast probe (issue #5) drops per-chunk cost from ~25 to ~10 ops but still pays ALU work for every 64-byte chunk in a string. A `memchr2(b'"', b'\\')` jump can approach memory bandwidth on multi-MB single-string payloads. Deferred until a workload that benefits clearly emerges; needs careful `bs_carry` reasoning across the jump.
+- **Stateful O(N) iterator FFI** — current `qd.pairs` and the `__newindex`
+  materialization path walk the object cursor from the start on every step,
+  giving O(N²) total cost for full enumeration. Acceptable for the "read a
+  few keys" use case the library is optimized for; full-iteration workloads
+  (e.g. encoding a deeply-keyed object that has been materialized) would
+  benefit from a `qjd_iter_init` / `qjd_iter_next` pair that holds position
+  state across calls.
+- **Lazy-table read overhead vs path API** — `qd.decode + t.field x3` lands
+  ~30–40% behind `qd.parse:get_str` on small-to-medium payloads, converging
+  to parity at multi-MB sizes. The gap is structural (per-access `__index`
+  metamethod dispatch + transient cdata allocation for nested wraps). Worth
+  narrowing that gap if a workload-driven need surfaces; measured cost is
+  still 14× faster than `cjson.decode` at 100 KB, so the lazy API is the
+  right default for migrating callers.
diff --git a/benches/lua_bench.lua b/benches/lua_bench.lua
index 7f52d25..29db44b 100644
--- a/benches/lua_bench.lua
+++ b/benches/lua_bench.lua
@@ -147,6 +147,18 @@ for _, s in ipairs(scenarios) do
             local _ = d:get_str("messages[0].role")
         end)
     end
+
+    bench("qd.decode + t.field x3", s.iters, function()
+        local t = qd.decode(s.payload)
+        local _ = t.model
+        local _ = t.temperature
+        local _ = t.messages and t.messages[1] and t.messages[1].role
+    end)
+
+    bench("qd.decode + qd.encode (unmodified)", s.iters, function()
+        local t = qd.decode(s.payload)
+        local _ = qd.encode(t)
+    end)
 end
 
 -- Interleaved scenario: cycle through several payloads of different sizes
@@ -207,4 +219,20 @@ do
             local _ = d:get_str("messages[0].role")
         end)
     end
+
+    next_p = make_cycler(interleaved)
+    bench("qd.decode + t.field x3", 400, function()
+        local p = next_p()
+        local t = qd.decode(p)
+        local _ = t.model
+        local _ = t.temperature
+        local _ = t.messages and t.messages[1] and t.messages[1].role
+    end)
+
+    next_p = make_cycler(interleaved)
+    bench("qd.decode + qd.encode (unmodified)", 400, function()
+        local p = next_p()
+        local t = qd.decode(p)
+        local _ = qd.encode(t)
+    end)
 end
diff --git a/docs/superpowers/plans/2026-05-15-rust-quick-json-decode.md b/docs/superpowers/plans/2026-05-15-rust-quick-json-decode.md
deleted file mode 100644
index 64a20c9..0000000
--- a/docs/superpowers/plans/2026-05-15-rust-quick-json-decode.md
+++ /dev/null
@@ -1,3875 +0,0 @@
-# Rust Quick JSON Decode — Implementation Plan (v1)
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Ship a Rust `cdylib` (`libquickdecode.so`) plus `lua/quickdecode.lua` wrapper that lets LuaJIT extract individual fields from large JSON documents faster than `lua-cjson`, by skipping the full Lua-table construction.
-
-**Architecture:** Two-phase decoder. Phase 1 is a single high-throughput structural scan (scalar fallback + AVX2 fast path with runtime dispatch) that records only byte offsets of structural characters. Phase 2 is lazy: paths are resolved by walking those offsets, with a per-container sibling-skip cache built on first access. String/number decode is deferred to the moment a typed getter is called.
-
-**Tech Stack:** Rust (stable), `cdylib` + `rlib`, `cargo`, intrinsics for AVX2/PCLMUL via `core::arch::x86_64`, `memchr`, `rustc-hash`, `once_cell`. Tests use `cargo test` + `proptest`. Lua side uses LuaJIT `ffi` and `busted` for tests.
-
-**Spec:** `docs/superpowers/specs/2026-05-15-rust-quick-json-decode-design.md` (commit `24990eb`).
-
----
-
-## File Structure
-
-Each row below is created or modified by exactly the tasks listed; if a task says "Create", the file does not exist yet at that point.
-
-```
-Cargo.toml                      — Task 1 (create)
-README.md                       — Tasks 1, 15, 17 (modify)
-include/lua_quick_decode.h      — Tasks 1 (skeleton), 14 (finalize)
-
-src/
-├── lib.rs                      — Task 1 (create)
-├── error.rs                    — Task 1 (create)
-├── ffi.rs                      — Tasks 3, 9, 10, 11, 12 (FFI surface)
-├── doc.rs                      — Task 3 (create), 6 (extend with cache)
-├── path.rs                     — Task 4 (create)
-├── cursor.rs                   — Task 5 (create), 6 (extend)
-├── skip_cache.rs               — Task 6 (create)
-├── scan/
-│   ├── mod.rs                  — Task 2 (create trait), 16 (add dispatch)
-│   ├── scalar.rs               — Task 2 (create)
-│   └── avx2.rs                 — Tasks 13, 14, 15, 16 (create + extend)
-└── decode/
-    ├── mod.rs                  — Task 7 (create)
-    ├── string.rs               — Task 7 (create)
-    └── number.rs               — Task 8 (create)
-
-tests/
-├── ffi_smoke.rs                — Task 3 (create)
-├── ffi_strings.rs              — Task 10 (create)
-├── ffi_numbers.rs              — Task 10 (create)
-├── ffi_cursor.rs               — Task 11 (create)
-├── ffi_panic_safety.rs         — Task 12 (create)
-├── scanner_crosscheck.rs       — Task 16 (create)
-└── lua/
-    ├── basic_spec.lua          — Task 17 (create)
-    ├── escape_spec.lua         — Task 17 (create)
-    └── cjson_compat_spec.lua   — Task 17 (create)
-
-lua/
-└── quickdecode.lua             — Task 15 (create)
-
-benches/
-├── lua_bench.lua               — Task 18 (create)
-└── fixtures/
-    ├── small_api.json          — Task 18 (create)
-    ├── medium_resp.json        — Task 18 (create)
-    └── large_dump.json         — Task 18 (create or generate)
-```
-
-The crate is a single package, not a workspace. Files are split by responsibility (scanner / decode / cursor / FFI / wrapper); each unit can be reasoned about without reading the others.
-
----
-
-## Task 1: Project scaffold + error codes
-
-**Files:**
-- Create: `Cargo.toml`
-- Create: `src/lib.rs`
-- Create: `src/error.rs`
-- Create: `include/lua_quick_decode.h` (skeleton)
-- Modify: `README.md`
-
-- [ ] **Step 1: Write `Cargo.toml`**
-
-```toml
-[package]
-name = "lua-quick-decode"
-version = "0.1.0"
-edition = "2021"
-publish = false
-
-[lib]
-name = "quickdecode"
-crate-type = ["cdylib", "rlib"]
-
-[dependencies]
-memchr = "2"
-rustc-hash = "2"
-once_cell = "1"
-
-[dev-dependencies]
-proptest = "1"
-
-[profile.release]
-opt-level = 3
-lto = "thin"
-codegen-units = 1
-panic = "abort"
-```
-
-- [ ] **Step 2: Write `src/error.rs`**
-
-```rust
-#![allow(non_camel_case_types)]
-
-#[repr(C)]
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum qjd_err {
-    QJD_OK              = 0,
-    QJD_PARSE_ERROR     = 1,
-    QJD_NOT_FOUND       = 2,
-    QJD_TYPE_MISMATCH   = 3,
-    QJD_OUT_OF_RANGE    = 4,
-    QJD_DECODE_FAILED   = 5,
-    QJD_INVALID_PATH    = 6,
-    QJD_INVALID_ARG     = 7,
-    QJD_OOM             = 8,
-}
-
-#[repr(C)]
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum qjd_type {
-    QJD_T_NULL = 0,
-    QJD_T_BOOL = 1,
-    QJD_T_NUM  = 2,
-    QJD_T_STR  = 3,
-    QJD_T_ARR  = 4,
-    QJD_T_OBJ  = 5,
-}
-
-pub fn strerror(code: qjd_err) -> &'static str {
-    match code {
-        qjd_err::QJD_OK            => "ok",
-        qjd_err::QJD_PARSE_ERROR   => "JSON parse error",
-        qjd_err::QJD_NOT_FOUND     => "path not found",
-        qjd_err::QJD_TYPE_MISMATCH => "type mismatch at path",
-        qjd_err::QJD_OUT_OF_RANGE  => "numeric out of range",
-        qjd_err::QJD_DECODE_FAILED => "decode failed",
-        qjd_err::QJD_INVALID_PATH  => "invalid path syntax",
-        qjd_err::QJD_INVALID_ARG   => "invalid argument",
-        qjd_err::QJD_OOM           => "out of memory",
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn strerror_covers_every_variant() {
-        for code in [
-            qjd_err::QJD_OK, qjd_err::QJD_PARSE_ERROR, qjd_err::QJD_NOT_FOUND,
-            qjd_err::QJD_TYPE_MISMATCH, qjd_err::QJD_OUT_OF_RANGE,
-            qjd_err::QJD_DECODE_FAILED, qjd_err::QJD_INVALID_PATH,
-            qjd_err::QJD_INVALID_ARG, qjd_err::QJD_OOM,
-        ] {
-            assert!(!strerror(code).is_empty());
-        }
-    }
-}
-```
-
-- [ ] **Step 3: Write `src/lib.rs`**
-
-```rust
-//! lua-quick-decode: Rust JSON decoder for LuaJIT FFI consumers.
-//! See docs/superpowers/specs/2026-05-15-rust-quick-json-decode-design.md
-
-pub mod error;
-```
-
-- [ ] **Step 4: Write `include/lua_quick_decode.h` skeleton**
-
-```c
-#ifndef LUA_QUICK_DECODE_H
-#define LUA_QUICK_DECODE_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
-    QJD_OK            = 0,
-    QJD_PARSE_ERROR   = 1,
-    QJD_NOT_FOUND     = 2,
-    QJD_TYPE_MISMATCH = 3,
-    QJD_OUT_OF_RANGE  = 4,
-    QJD_DECODE_FAILED = 5,
-    QJD_INVALID_PATH  = 6,
-    QJD_INVALID_ARG   = 7,
-    QJD_OOM           = 8
-} qjd_err;
-
-typedef enum {
-    QJD_T_NULL = 0, QJD_T_BOOL = 1, QJD_T_NUM = 2,
-    QJD_T_STR  = 3, QJD_T_ARR  = 4, QJD_T_OBJ = 5
-} qjd_type;
-
-const char* qjd_strerror(int code);
-
-/* Forward declarations; full prototypes filled in Task 14. */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-```
-
-- [ ] **Step 5: Update `README.md` with Building section**
-
-Insert under existing content:
-
-```markdown
-## Building
-
-```sh
-cargo build --release
-# Output: target/release/libquickdecode.so
-```
-
-## Testing
-
-```sh
-cargo test
-```
-```
-
-- [ ] **Step 6: Run tests**
-
-```sh
-cargo test
-```
-
-Expected: 1 test passes (`strerror_covers_every_variant`). Crate compiles as `cdylib` and `rlib`.
-
-- [ ] **Step 7: Commit**
-
-```sh
-git add Cargo.toml src/ include/ README.md
-git commit -m "Scaffold crate with error codes and C header skeleton"
-```
-
----
-
-## Task 2: ScalarScanner — Phase 1 structural scan
-
-**Files:**
-- Create: `src/scan/mod.rs`
-- Create: `src/scan/scalar.rs`
-- Modify: `src/lib.rs`
-
-- [ ] **Step 1: Write `src/scan/mod.rs`**
-
-```rust
-pub(crate) mod scalar;
-
-/// A structural scanner: given a JSON byte buffer, append the byte offset of
-/// every structural character (`{` `}` `[` `]` `:` `,` `"`) that is NOT inside
-/// a string literal to `out`. On shallow validation failure (unclosed string,
-/// unmatched bracket), returns `Err(offset)` where `offset` is the byte
-/// position the failure was detected at. The offset is informational and not
-/// exposed via FFI in v1.
-pub(crate) trait Scanner {
-    fn scan(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize>;
-}
-
-pub(crate) use scalar::ScalarScanner;
-```
-
-- [ ] **Step 2: Write failing tests in `src/scan/scalar.rs`**
-
-```rust
-use super::Scanner;
-
-pub(crate) struct ScalarScanner;
-
-impl Scanner for ScalarScanner {
-    fn scan(_buf: &[u8], _out: &mut Vec<u32>) -> Result<(), usize> {
-        unimplemented!()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn scan(input: &[u8]) -> Result<Vec<u32>, usize> {
-        let mut v = Vec::new();
-        ScalarScanner::scan(input, &mut v).map(|_| v)
-    }
-
-    #[test]
-    fn empty_object() {
-        assert_eq!(scan(b"{}"), Ok(vec![0, 1]));
-    }
-
-    #[test]
-    fn empty_array() {
-        assert_eq!(scan(b"[]"), Ok(vec![0, 1]));
-    }
-
-    #[test]
-    fn simple_object() {
-        // {"a":1}
-        //  ^   ^
-        //  012345 6
-        assert_eq!(scan(b"{\"a\":1}"), Ok(vec![0, 1, 3, 4, 6]));
-        //                            { " " : }
-    }
-
-    #[test]
-    fn nested_object() {
-        // {"a":{"b":2}}
-        //  0   4    9 10 11 12
-        let r = scan(b"{\"a\":{\"b\":2}}").unwrap();
-        // Positions of: { " " : { " " : } }
-        assert_eq!(r, vec![0, 1, 3, 4, 5, 6, 8, 9, 11, 12]);
-    }
-
-    #[test]
-    fn array_with_strings() {
-        // ["a","b"]
-        // 0 12 3 4 56 7 8
-        let r = scan(b"[\"a\",\"b\"]").unwrap();
-        assert_eq!(r, vec![0, 1, 3, 4, 5, 7, 8]);
-    }
-
-    #[test]
-    fn escape_double_quote_in_string() {
-        // {"a":"x\"y"}
-        // 0 12 3 4 5 678 9 10 11
-        let r = scan(b"{\"a\":\"x\\\"y\"}").unwrap();
-        assert_eq!(r, vec![0, 1, 3, 4, 5, 10, 11]);
-    }
-
-    #[test]
-    fn escape_backslash_then_quote() {
-        // {"a":"x\\"}     (string content is `x\`)
-        // 0 12 3 4 5 678 9 10
-        let r = scan(b"{\"a\":\"x\\\\\"}").unwrap();
-        assert_eq!(r, vec![0, 1, 3, 4, 5, 9, 10]);
-    }
-
-    #[test]
-    fn unclosed_string_is_error() {
-        assert!(scan(b"{\"a\":\"foo").is_err());
-    }
-
-    #[test]
-    fn unmatched_closer_is_error() {
-        assert!(scan(b"]").is_err());
-    }
-
-    #[test]
-    fn mismatched_bracket_type_is_error() {
-        assert!(scan(b"{]").is_err());
-    }
-
-    #[test]
-    fn deeply_nested() {
-        let mut buf = Vec::new();
-        for _ in 0..100 { buf.push(b'['); }
-        for _ in 0..100 { buf.push(b']'); }
-        let r = scan(&buf).unwrap();
-        assert_eq!(r.len(), 200);
-    }
-}
-```
-
-- [ ] **Step 3: Run tests to verify they fail (unimplemented)**
-
-```sh
-cargo test scan::scalar
-```
-
-Expected: tests panic with `unimplemented!()`.
-
-- [ ] **Step 4: Implement `ScalarScanner::scan`**
-
-Replace the body in `src/scan/scalar.rs`:
-
-```rust
-impl Scanner for ScalarScanner {
-    fn scan(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
-        out.reserve(buf.len() / 6);
-
-        let mut i = 0usize;
-        let mut in_str = false;
-        let mut stack: Vec<u8> = Vec::with_capacity(32);
-
-        while i < buf.len() {
-            let b = buf[i];
-
-            if in_str {
-                if b == b'\\' {
-                    // Skip the escape and the next byte unconditionally.
-                    // Anything in a string cannot be a structural char.
-                    i += 2;
-                    continue;
-                }
-                if b == b'"' {
-                    in_str = false;
-                    out.push(i as u32);
-                }
-                i += 1;
-                continue;
-            }
-
-            match b {
-                b'"' => {
-                    in_str = true;
-                    out.push(i as u32);
-                }
-                b'{' | b'[' => {
-                    stack.push(b);
-                    out.push(i as u32);
-                }
-                b'}' => {
-                    match stack.pop() {
-                        Some(b'{') => {}
-                        _ => return Err(i),
-                    }
-                    out.push(i as u32);
-                }
-                b']' => {
-                    match stack.pop() {
-                        Some(b'[') => {}
-                        _ => return Err(i),
-                    }
-                    out.push(i as u32);
-                }
-                b',' | b':' => out.push(i as u32),
-                _ => {}
-            }
-            i += 1;
-        }
-
-        if in_str { return Err(buf.len()); }
-        if !stack.is_empty() { return Err(buf.len()); }
-        Ok(())
-    }
-}
-```
-
-- [ ] **Step 5: Run tests to verify pass**
-
-```sh
-cargo test scan::scalar
-```
-
-Expected: all 10 tests pass.
-
-- [ ] **Step 6: Wire module into `src/lib.rs`**
-
-```rust
-pub mod error;
-mod scan;
-```
-
-- [ ] **Step 7: Commit**
-
-```sh
-git add src/lib.rs src/scan/
-git commit -m "Add ScalarScanner with shallow JSON validation"
-```
-
----
-
-## Task 3: Document + qjd_parse / qjd_free FFI
-
-**Files:**
-- Create: `src/doc.rs`
-- Create: `src/ffi.rs`
-- Create: `tests/ffi_smoke.rs`
-- Modify: `src/lib.rs`
-
-- [ ] **Step 1: Write `src/doc.rs`**
-
-```rust
-use crate::error::qjd_err;
-use crate::scan::{Scanner, ScalarScanner};
-
-pub struct Document<'a> {
-    pub(crate) buf:     &'a [u8],
-    pub(crate) indices: Vec<u32>,
-    pub(crate) scratch: Vec<u8>,
-}
-
-impl<'a> Document<'a> {
-    pub fn parse(buf: &'a [u8]) -> Result<Self, qjd_err> {
-        let mut indices = Vec::new();
-        ScalarScanner::scan(buf, &mut indices).map_err(|_| qjd_err::QJD_PARSE_ERROR)?;
-        // Sentinel simplifies boundary checks during Phase 2.
-        indices.push(u32::MAX);
-        Ok(Self { buf, indices, scratch: Vec::new() })
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn parses_simple_object() {
-        let doc = Document::parse(b"{\"a\":1}").unwrap();
-        assert!(doc.indices.len() >= 5);
-        assert_eq!(*doc.indices.last().unwrap(), u32::MAX);
-    }
-
-    #[test]
-    fn parse_error_on_malformed() {
-        assert!(Document::parse(b"{").is_err());
-    }
-}
-```
-
-- [ ] **Step 2: Write `src/ffi.rs`**
-
-```rust
-//! C ABI surface. Every public function is `unsafe extern "C"`.
-//! All public symbols use the `qjd_` prefix.
-
-#![allow(non_camel_case_types)]
-
-use std::os::raw::{c_char, c_int};
-use std::ptr;
-
-use crate::doc::Document;
-use crate::error::{qjd_err, strerror};
-
-/// Opaque type exported to C as `qjd_doc*`.
-#[allow(dead_code)]
-pub struct qjd_doc(Document<'static>);
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_strerror(code: c_int) -> *const c_char {
-    // Map int back to enum; fall back to OK message for unknown.
-    let e = match code {
-        0 => qjd_err::QJD_OK,
-        1 => qjd_err::QJD_PARSE_ERROR,
-        2 => qjd_err::QJD_NOT_FOUND,
-        3 => qjd_err::QJD_TYPE_MISMATCH,
-        4 => qjd_err::QJD_OUT_OF_RANGE,
-        5 => qjd_err::QJD_DECODE_FAILED,
-        6 => qjd_err::QJD_INVALID_PATH,
-        7 => qjd_err::QJD_INVALID_ARG,
-        8 => qjd_err::QJD_OOM,
-        _ => return c"unknown error code".as_ptr(),
-    };
-    // strerror returns a &'static str; we need NUL-terminated C strings.
-    // Hardcoded NUL-terminated map below to avoid runtime allocation.
-    match e {
-        qjd_err::QJD_OK            => c"ok".as_ptr(),
-        qjd_err::QJD_PARSE_ERROR   => c"JSON parse error".as_ptr(),
-        qjd_err::QJD_NOT_FOUND     => c"path not found".as_ptr(),
-        qjd_err::QJD_TYPE_MISMATCH => c"type mismatch at path".as_ptr(),
-        qjd_err::QJD_OUT_OF_RANGE  => c"numeric out of range".as_ptr(),
-        qjd_err::QJD_DECODE_FAILED => c"decode failed".as_ptr(),
-        qjd_err::QJD_INVALID_PATH  => c"invalid path syntax".as_ptr(),
-        qjd_err::QJD_INVALID_ARG   => c"invalid argument".as_ptr(),
-        qjd_err::QJD_OOM           => c"out of memory".as_ptr(),
-    }
-    // Touch strerror to keep it linked (used elsewhere later).
-    // let _ = strerror;
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_parse(
-    buf:     *const u8,
-    len:     usize,
-    err_out: *mut c_int,
-) -> *mut qjd_doc {
-    if buf.is_null() || err_out.is_null() {
-        if !err_out.is_null() { *err_out = qjd_err::QJD_INVALID_ARG as c_int; }
-        return ptr::null_mut();
-    }
-    let slice: &'static [u8] = std::slice::from_raw_parts(buf, len);
-    match Document::parse(slice) {
-        Ok(d) => {
-            *err_out = qjd_err::QJD_OK as c_int;
-            Box::into_raw(Box::new(qjd_doc(d)))
-        }
-        Err(e) => {
-            *err_out = e as c_int;
-            ptr::null_mut()
-        }
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_free(doc: *mut qjd_doc) {
-    if doc.is_null() { return; }
-    let _ = Box::from_raw(doc);
-}
-
-// Suppress dead_code warning during this task; later tasks consume it.
-#[allow(dead_code)]
-pub(crate) fn _link_strerror() { let _ = strerror; }
-```
-
-- [ ] **Step 3: Wire modules in `src/lib.rs`**
-
-```rust
-pub mod error;
-mod scan;
-mod doc;
-pub mod ffi;
-```
-
-- [ ] **Step 4: Write `tests/ffi_smoke.rs`**
-
-```rust
-use std::ffi::CStr;
-use std::os::raw::c_int;
-
-use quickdecode::ffi::{qjd_doc, qjd_free, qjd_parse, qjd_strerror};
-
-#[test]
-fn parse_and_free_roundtrip() {
-    let json = b"{\"a\":1}";
-    let mut err: c_int = -1;
-    let doc: *mut qjd_doc = unsafe { qjd_parse(json.as_ptr(), json.len(), &mut err) };
-    assert!(!doc.is_null());
-    assert_eq!(err, 0);
-    unsafe { qjd_free(doc); }
-}
-
-#[test]
-fn parse_error_returns_null() {
-    let bad = b"{";
-    let mut err: c_int = -1;
-    let doc = unsafe { qjd_parse(bad.as_ptr(), bad.len(), &mut err) };
-    assert!(doc.is_null());
-    assert_eq!(err, 1); // QJD_PARSE_ERROR
-}
-
-#[test]
-fn parse_null_buffer_returns_invalid_arg() {
-    let mut err: c_int = -1;
-    let doc = unsafe { qjd_parse(std::ptr::null(), 0, &mut err) };
-    assert!(doc.is_null());
-    assert_eq!(err, 7); // QJD_INVALID_ARG
-}
-
-#[test]
-fn free_null_is_safe() {
-    unsafe { qjd_free(std::ptr::null_mut()); }
-}
-
-#[test]
-fn strerror_returns_non_empty() {
-    for code in 0..=8 {
-        let p = unsafe { qjd_strerror(code) };
-        assert!(!p.is_null());
-        let s = unsafe { CStr::from_ptr(p) }.to_str().unwrap();
-        assert!(!s.is_empty(), "code {}", code);
-    }
-}
-```
-
-- [ ] **Step 5: Run tests**
-
-```sh
-cargo test
-```
-
-Expected: all tests pass (unit + integration). `target/release/libquickdecode.so` exports `qjd_parse`, `qjd_free`, `qjd_strerror`.
-
-- [ ] **Step 6: Commit**
-
-```sh
-git add src/doc.rs src/ffi.rs src/lib.rs tests/ffi_smoke.rs
-git commit -m "Add Document and qjd_parse/qjd_free/qjd_strerror FFI"
-```
-
----
-
-## Task 4: Path string parser
-
-**Files:**
-- Create: `src/path.rs`
-- Modify: `src/lib.rs`
-
-- [ ] **Step 1: Write failing tests in `src/path.rs`**
-
-```rust
-use crate::error::qjd_err;
-
-#[derive(Debug, PartialEq, Eq)]
-pub(crate) enum PathSeg<'a> {
-    Key(&'a [u8]),
-    Idx(u32),
-}
-
-pub(crate) struct PathIter<'a> {
-    rest: &'a [u8],
-}
-
-impl<'a> PathIter<'a> {
-    pub(crate) fn new(path: &'a [u8]) -> Self { Self { rest: path } }
-}
-
-impl<'a> Iterator for PathIter<'a> {
-    type Item = Result<PathSeg<'a>, qjd_err>;
-    fn next(&mut self) -> Option<Self::Item> { unimplemented!() }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn parse(p: &[u8]) -> Result<Vec<PathSeg>, qjd_err> {
-        PathIter::new(p).collect()
-    }
-
-    #[test]
-    fn empty_path_yields_no_segs() {
-        assert_eq!(parse(b""), Ok(vec![]));
-    }
-
-    #[test]
-    fn single_key() {
-        assert_eq!(parse(b"body"), Ok(vec![PathSeg::Key(b"body")]));
-    }
-
-    #[test]
-    fn dotted_keys() {
-        assert_eq!(
-            parse(b"body.model"),
-            Ok(vec![PathSeg::Key(b"body"), PathSeg::Key(b"model")]),
-        );
-    }
-
-    #[test]
-    fn array_index_after_key() {
-        assert_eq!(
-            parse(b"messages[0]"),
-            Ok(vec![PathSeg::Key(b"messages"), PathSeg::Idx(0)]),
-        );
-    }
-
-    #[test]
-    fn complex_path() {
-        assert_eq!(
-            parse(b"body.messages[42].role"),
-            Ok(vec![
-                PathSeg::Key(b"body"),
-                PathSeg::Key(b"messages"),
-                PathSeg::Idx(42),
-                PathSeg::Key(b"role"),
-            ]),
-        );
-    }
-
-    #[test]
-    fn consecutive_indices() {
-        assert_eq!(
-            parse(b"data[3][1]"),
-            Ok(vec![PathSeg::Key(b"data"), PathSeg::Idx(3), PathSeg::Idx(1)]),
-        );
-    }
-
-    #[test]
-    fn leading_index() {
-        assert_eq!(parse(b"[5]"), Ok(vec![PathSeg::Idx(5)]));
-    }
-
-    #[test]
-    fn unterminated_index_is_error() {
-        assert_eq!(parse(b"a[3"), Err(qjd_err::QJD_INVALID_PATH));
-    }
-
-    #[test]
-    fn non_digit_in_index_is_error() {
-        assert_eq!(parse(b"a[abc]"), Err(qjd_err::QJD_INVALID_PATH));
-    }
-
-    #[test]
-    fn trailing_dot_is_error() {
-        assert_eq!(parse(b"a."), Err(qjd_err::QJD_INVALID_PATH));
-    }
-}
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```sh
-cargo test path::tests
-```
-
-Expected: panic with `unimplemented!()`.
-
-- [ ] **Step 3: Implement `PathIter::next`**
-
-Replace the body in `src/path.rs`:
-
-```rust
-impl<'a> Iterator for PathIter<'a> {
-    type Item = Result<PathSeg<'a>, qjd_err>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.rest.is_empty() {
-            return None;
-        }
-
-        let first = self.rest[0];
-
-        if first == b'[' {
-            // Index segment: [digits]
-            let close = match self.rest.iter().position(|&c| c == b']') {
-                Some(p) => p,
-                None => return Some(Err(qjd_err::QJD_INVALID_PATH)),
-            };
-            let digits = &self.rest[1..close];
-            if digits.is_empty() || !digits.iter().all(|c| c.is_ascii_digit()) {
-                return Some(Err(qjd_err::QJD_INVALID_PATH));
-            }
-            let mut n: u32 = 0;
-            for &c in digits {
-                n = n.checked_mul(10)
-                    .and_then(|x| x.checked_add((c - b'0') as u32))
-                    .unwrap_or(u32::MAX);
-                if n == u32::MAX {
-                    return Some(Err(qjd_err::QJD_INVALID_PATH));
-                }
-            }
-            self.rest = &self.rest[close + 1..];
-            return Some(Ok(PathSeg::Idx(n)));
-        }
-
-        if first == b'.' {
-            // Separator before a key. Skip it then require a key.
-            self.rest = &self.rest[1..];
-            if self.rest.is_empty() {
-                return Some(Err(qjd_err::QJD_INVALID_PATH));
-            }
-            return self.next();
-        }
-
-        // Key segment: read until '.' or '[' or end.
-        let end = self.rest.iter()
-            .position(|&c| c == b'.' || c == b'[')
-            .unwrap_or(self.rest.len());
-        if end == 0 {
-            return Some(Err(qjd_err::QJD_INVALID_PATH));
-        }
-        let key = &self.rest[..end];
-        self.rest = &self.rest[end..];
-        Some(Ok(PathSeg::Key(key)))
-    }
-}
-```
-
-- [ ] **Step 4: Run tests to verify pass**
-
-```sh
-cargo test path::tests
-```
-
-Expected: all 10 tests pass.
-
-- [ ] **Step 5: Wire module into `src/lib.rs`**
-
-```rust
-pub mod error;
-mod scan;
-mod doc;
-mod path;
-pub mod ffi;
-```
-
-- [ ] **Step 6: Commit**
-
-```sh
-git add src/path.rs src/lib.rs
-git commit -m "Add zero-alloc PathIter for path string parsing"
-```
-
----
-
-## Task 5: Cursor core + brute-force resolve
-
-This task implements a working `Cursor::resolve` without any skip cache. Task 6 adds the cache on top.
-
-**Files:**
-- Create: `src/cursor.rs`
-- Modify: `src/lib.rs`
-
-- [ ] **Step 1: Write failing tests in `src/cursor.rs`**
-
-```rust
-use crate::doc::Document;
-use crate::error::qjd_err;
-use crate::path::{PathIter, PathSeg};
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub(crate) struct Cursor {
-    /// Position in doc.indices of the opening '{' or '[', or the value's
-    /// first-byte structural marker (e.g. opening '"' for a string).
-    pub(crate) idx_start: u32,
-    /// One past the closing '}' / ']' in doc.indices. For scalar values,
-    /// idx_end == idx_start + 1.
-    pub(crate) idx_end: u32,
-}
-
-impl Cursor {
-    pub(crate) fn root(doc: &Document) -> Self {
-        Cursor { idx_start: 0, idx_end: (doc.indices.len() as u32) - 1 }
-    }
-
-    pub(crate) fn resolve(self, doc: &Document, path: &[u8]) -> Result<Cursor, qjd_err> {
-        let mut cur = self;
-        for seg in PathIter::new(path) {
-            let seg = seg?;
-            cur = step(doc, cur, &seg)?;
-        }
-        Ok(cur)
-    }
-}
-
-fn step(_doc: &Document, _cur: Cursor, _seg: &PathSeg) -> Result<Cursor, qjd_err> {
-    unimplemented!()
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn doc_of(s: &[u8]) -> Document<'_> { Document::parse(s).unwrap() }
-
-    #[test]
-    fn root_path_returns_root() {
-        let d = doc_of(b"{\"a\":1}");
-        let c = Cursor::root(&d).resolve(&d, b"").unwrap();
-        assert_eq!(c, Cursor::root(&d));
-    }
-
-    #[test]
-    fn simple_key() {
-        let d = doc_of(b"{\"a\":1}");
-        let c = Cursor::root(&d).resolve(&d, b"a").unwrap();
-        // The value `1` is a bare scalar, so it has no structural marker of
-        // its own in doc.indices.
-        // The byte offsets recorded for {"a":1} are [0, 1, 3, 4, 6, MAX]:
-        //   {  "  "  :  }  sentinel
-        // i.e. indices positions 0..=4 hold the offsets of '{', the key's
-        // two quotes, ':' and '}', and position 5 holds the end sentinel.
-        // The scalar itself sits at byte 5, between ':' (byte 4) and '}'
-        // (byte 6).
-        // How a cursor encodes a scalar's span (which indices positions
-        // idx_start / idx_end take) is an internal convention that later
-        // steps may refine.
-        // This test therefore does not pin either field.
-        // It only requires that resolving "a" succeeds and that the
-        // resulting cursor is not the root cursor.
-        // Exact positions are exercised indirectly by the typed getters
-        // built on top of the cursor.
-        assert_ne!(c, Cursor::root(&d));
-    }
-
-    #[test]
-    fn nested_key() {
-        let d = doc_of(b"{\"a\":{\"b\":2}}");
-        let _ = Cursor::root(&d).resolve(&d, b"a.b").unwrap();
-    }
-
-    #[test]
-    fn missing_key_is_not_found() {
-        let d = doc_of(b"{\"a\":1}");
-        let r = Cursor::root(&d).resolve(&d, b"b");
-        assert_eq!(r, Err(qjd_err::QJD_NOT_FOUND));
-    }
-
-    #[test]
-    fn type_mismatch_on_index_into_object() {
-        let d = doc_of(b"{\"a\":1}");
-        let r = Cursor::root(&d).resolve(&d, b"[0]");
-        assert_eq!(r, Err(qjd_err::QJD_TYPE_MISMATCH));
-    }
-
-    #[test]
-    fn type_mismatch_on_key_into_array() {
-        let d = doc_of(b"[1,2,3]");
-        let r = Cursor::root(&d).resolve(&d, b"a");
-        assert_eq!(r, Err(qjd_err::QJD_TYPE_MISMATCH));
-    }
-
-    #[test]
-    fn array_index() {
-        let d = doc_of(b"[10,20,30]");
-        let _ = Cursor::root(&d).resolve(&d, b"[1]").unwrap();
-    }
-
-    #[test]
-    fn array_out_of_bounds() {
-        let d = doc_of(b"[10,20]");
-        let r = Cursor::root(&d).resolve(&d, b"[5]");
-        assert_eq!(r, Err(qjd_err::QJD_NOT_FOUND));
-    }
-}
-```
-
-Note: cursor representation is internal. The tests above verify behavior (success / error variant), not exact field values, so we can refine the representation in Task 6 without rewriting tests.
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```sh
-cargo test cursor::tests
-```
-
-Expected: tests panic with `unimplemented!()`.
-
-- [ ] **Step 3: Implement `step` and supporting helpers**
-
-Replace `step` and add helpers in `src/cursor.rs`:
-
-```rust
-/// Advances `cur` by one path segment.
-///
-/// A key segment needs an object cursor and an index segment needs an array
-/// cursor; any other pairing, including a cursor that is not a container,
-/// is QJD_TYPE_MISMATCH. The actual child lookup is done by
-/// `walk_children`.
-fn step(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result<Cursor, qjd_err> {
-    let segment_fits = match container_opener_byte(doc, cur) {
-        Some(b'{') => matches!(seg, PathSeg::Key(_)),
-        Some(b'[') => matches!(seg, PathSeg::Idx(_)),
-        _ => false,
-    };
-    if !segment_fits {
-        return Err(qjd_err::QJD_TYPE_MISMATCH);
-    }
-    walk_children(doc, cur, seg)
-}
-
-/// Opener byte (`{` or `[`) of the container that `cur` starts at.
-/// Returns None when `cur` starts at any other byte, or when its start
-/// position falls outside `doc.indices` / `doc.buf`.
-fn container_opener_byte(doc: &Document, cur: Cursor) -> Option<u8> {
-    let offset = *doc.indices.get(cur.idx_start as usize)? as usize;
-    doc.buf
-        .get(offset)
-        .copied()
-        .filter(|byte| matches!(byte, b'{' | b'['))
-}
-
-/// Find the child of the container at `cur` selected by `seg` and return a
-/// Cursor for that child's value.
-///
-/// `seg` is a Key for object containers and an Idx for array containers
-/// (`step` has already checked the pairing). `cur.idx_end` must be the
-/// indices position of the container's closer, which is what the revised
-/// `Cursor::root` and `find_value_end` produce.
-///
-/// Layout facts the walk relies on:
-///   - an object child starts at its key's opening '"', followed by the
-///     closing '"', the ':' and then the value's marker;
-///   - an array child starts at its value's marker;
-///   - a bare scalar (number / true / false / null) has no marker of its
-///     own, so its "marker" is the structural char that follows it, i.e. a
-///     ',' or the parent's closer.
-///
-/// Returns QJD_NOT_FOUND when no child matches and QJD_PARSE_ERROR when the
-/// structural layout is not that of a well-formed container.
-fn walk_children(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result<Cursor, qjd_err> {
-    /// Indices position of the structural char that follows the value
-    /// spanning `[vstart, vend]`. Containers and strings end on their own
-    /// closer / closing quote, so the follower is one slot further; a bare
-    /// scalar's `vstart` already is the follower.
-    fn follower(doc: &Document, vstart: u32, vend: u32) -> u32 {
-        let lead = doc
-            .indices
-            .get(vstart as usize)
-            .and_then(|&off| doc.buf.get(off as usize).copied());
-        match lead {
-            Some(b'{') | Some(b'[') | Some(b'"') => vend + 1,
-            _ => vstart,
-        }
-    }
-
-    let is_obj = matches!(seg, PathSeg::Key(_));
-    let end = cur.idx_end;                    // closer position in indices
-    let mut i = cur.idx_start + 1;            // skip opener
-    let mut arr_idx: u32 = 0;
-
-    if i >= end {
-        // No structural char between opener and closer. An object is then
-        // empty; an array is either empty or holds exactly one bare scalar,
-        // and only the raw bytes can tell those two apart.
-        let open_pos = doc.indices[cur.idx_start as usize] as usize;
-        let close_pos = *doc.indices.get(end as usize).ok_or(qjd_err::QJD_PARSE_ERROR)? as usize;
-        let has_payload = doc
-            .buf
-            .get(open_pos + 1..close_pos)
-            .map_or(false, |gap| gap.iter().any(|b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r')));
-        if is_obj || !has_payload {
-            return Err(qjd_err::QJD_NOT_FOUND);
-        }
-    }
-
-    loop {
-        // An object child needs its key quote strictly before the closer.
-        // An array child may sit on the closer itself: that is the marker
-        // of a trailing bare scalar.
-        if i > end || (is_obj && i == end) {
-            return Err(qjd_err::QJD_NOT_FOUND);
-        }
-
-        let child_matches = if is_obj {
-            // Key bytes live between the quotes at indices[i] and indices[i+1].
-            let key_open = doc.indices[i as usize] as usize;
-            let key_close = doc.indices[(i + 1) as usize] as usize;
-            if doc.buf.get(key_open).copied() != Some(b'"') {
-                return Err(qjd_err::QJD_PARSE_ERROR);
-            }
-            let key_bytes = doc
-                .buf
-                .get(key_open + 1..key_close)
-                .ok_or(qjd_err::QJD_PARSE_ERROR)?;
-            matches!(seg, PathSeg::Key(want) if key_bytes == *want)
-        } else {
-            matches!(seg, PathSeg::Idx(want) if arr_idx == *want)
-        };
-
-        // i: key '"' open; i+1: key '"' close; i+2: ':'; i+3: value marker.
-        let value_idx_start = if is_obj { i + 3 } else { i };
-        if value_idx_start > end {
-            return Err(qjd_err::QJD_PARSE_ERROR);
-        }
-        let value_idx_end = find_value_end(doc, value_idx_start)?;
-
-        if child_matches {
-            return Ok(Cursor { idx_start: value_idx_start, idx_end: value_idx_end });
-        }
-
-        // Step over this child. The follower is either the ',' before the
-        // next sibling or this container's closer.
-        let next = follower(doc, value_idx_start, value_idx_end);
-        if next >= end {
-            return Err(qjd_err::QJD_NOT_FOUND);
-        }
-        match doc.buf.get(doc.indices[next as usize] as usize).copied() {
-            Some(b',') => { i = next + 1; arr_idx += 1; }
-            Some(b'}') | Some(b']') => return Err(qjd_err::QJD_NOT_FOUND),
-            _ => return Err(qjd_err::QJD_PARSE_ERROR),
-        }
-    }
-}
-
-/// Given the indices position of a value's first marker, return the indices
-/// position at which that value ends:
-///   - for object/array values, the position of the matching closer
-///   - for string values, the position of the closing quote (`start + 1`)
-///   - for anything else, `start + 1`; callers pass the structural char
-///     that follows a bare scalar (number / true / false / null) here
-///
-/// Returns QJD_PARSE_ERROR when `start` is out of range, when the closer is
-/// of the wrong kind, or when no matching closer exists. The search stops
-/// at the end sentinel because its offset is not a valid `buf` position.
-fn find_value_end(doc: &Document, start: u32) -> Result<u32, qjd_err> {
-    // Byte in `buf` that indices[k] points at; None when k or the offset is
-    // out of range (the latter is how the u32::MAX sentinel shows up).
-    let byte_at = |k: u32| -> Option<u8> {
-        let off = *doc.indices.get(k as usize)? as usize;
-        doc.buf.get(off).copied()
-    };
-
-    let b = byte_at(start).ok_or(qjd_err::QJD_PARSE_ERROR)?;
-    match b {
-        b'{' | b'[' => {
-            // Brace-count to the matching closer.
-            let want_close = if b == b'{' { b'}' } else { b']' };
-            let mut depth: i32 = 1;
-            let mut k = start + 1;
-            while let Some(cb) = byte_at(k) {
-                match cb {
-                    b'{' | b'[' => depth += 1,
-                    b'}' | b']' => {
-                        depth -= 1;
-                        if depth == 0 {
-                            if cb != want_close { return Err(qjd_err::QJD_PARSE_ERROR); }
-                            return Ok(k);
-                        }
-                    }
-                    _ => {}
-                }
-                k += 1;
-            }
-            // Ran into the sentinel (or past the array) without balancing.
-            Err(qjd_err::QJD_PARSE_ERROR)
-        }
-        b'"' => {
-            // String value: indices holds both quotes, the closer is next.
-            Ok(start + 1)
-        }
-        _ => {
-            // Bare scalar: `start` is the structural char after it.
-            Ok(start + 1)
-        }
-    }
-}
-```
-
-Note: this implementation works for Cursor::root if we set `idx_start` to `0` (the outer opener) and `idx_end` to the matching closer's position in `indices`. Update `Cursor::root`:
-
-```rust
-impl Cursor {
-    /// Cursor for the outermost value of `doc`.
-    ///
-    /// `doc.indices` ends with a u32::MAX sentinel, so for a container (or
-    /// string) root the closer sits at position `len - 2`. A document that
-    /// is a single bare scalar has no structural chars at all (`indices`
-    /// is just the sentinel); the subtraction saturates to 0 there instead
-    /// of underflowing.
-    pub(crate) fn root(doc: &Document) -> Self {
-        let n = doc.indices.len() as u32;
-        Cursor { idx_start: 0, idx_end: n.saturating_sub(2) }
-    }
-}
-```
-
-- [ ] **Step 4: Run tests to verify pass**
-
-```sh
-cargo test cursor::tests
-```
-
-Expected: all 8 tests pass.
-
-- [ ] **Step 5: Wire module into `src/lib.rs`**
-
-```rust
-pub mod error;
-mod scan;
-mod doc;
-mod path;
-mod cursor;
-pub mod ffi;
-```
-
-- [ ] **Step 6: Commit**
-
-```sh
-git add src/cursor.rs src/lib.rs
-git commit -m "Add Cursor with brute-force path resolution"
-```
-
----
-
-## Task 6: SkipCache (lazy fill)
-
-This task adds the per-container sibling-skip cache that makes repeated access of the same container O(N_keys) instead of O(N_keys × subtree_size).
-
-**Files:**
-- Create: `src/skip_cache.rs`
-- Modify: `src/doc.rs` (add cache to Document)
-- Modify: `src/cursor.rs` (use cache in walk_children)
-- Modify: `src/lib.rs`
-
-- [ ] **Step 1: Write `src/skip_cache.rs`**
-
-```rust
-use rustc_hash::FxHashMap;
-
-/// Per-document cache of child positions, one slot per container that has
-/// been walked.
-pub(crate) struct SkipCache {
-    /// Slot 0 reserved as "no cache" marker.
-    slots: Vec<SkipSlot>,
-    /// Map from a container's opener position-in-indices (idx_start) to slot index.
-    /// Crate-visible so tests in other modules can inspect how many
-    /// containers have been registered.
-    pub(crate) by_opener: FxHashMap<u32, u32>,
-}
-
-pub(crate) struct SkipSlot {
-    /// child_starts[i] = position in doc.indices of the i-th child's leading
-    /// marker. For object children this is the key's opening '"'; for array
-    /// children, the value's first marker.
-    pub(crate) child_starts: Vec<u32>,
-}
-
-impl Default for SkipCache {
-    /// Same as `new()`. A derived Default would start with no slots at all
-    /// and so hand out the reserved slot 0 to the first container.
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl SkipCache {
-    /// Empty cache with the reserved slot 0 already in place.
-    pub(crate) fn new() -> Self {
-        Self { slots: vec![SkipSlot { child_starts: Vec::new() }], by_opener: FxHashMap::default() }
-    }
-
-    /// Returns `(slot, true)` when `opener_idx` already has a slot, otherwise
-    /// registers a fresh empty slot and returns `(slot, false)`.
-    pub(crate) fn get_or_insert(&mut self, opener_idx: u32) -> (u32, bool) {
-        if let Some(&slot) = self.by_opener.get(&opener_idx) {
-            return (slot, true);
-        }
-        let new = self.slots.len() as u32;
-        self.slots.push(SkipSlot { child_starts: Vec::new() });
-        self.by_opener.insert(opener_idx, new);
-        (new, false)
-    }
-
-    /// Mutable access to slot `n`; panics if `n` was never handed out.
-    pub(crate) fn slot_mut(&mut self, n: u32) -> &mut SkipSlot {
-        &mut self.slots[n as usize]
-    }
-
-    /// Shared access to slot `n`; panics if `n` was never handed out.
-    pub(crate) fn slot(&self, n: u32) -> &SkipSlot {
-        &self.slots[n as usize]
-    }
-}
-```
-
-- [ ] **Step 2: Add cache to `Document`**
-
-In `src/doc.rs`:
-
-```rust
-use crate::skip_cache::SkipCache;
-
-pub struct Document<'a> {  // parsed view over a borrowed JSON buffer
-    pub(crate) buf:     &'a [u8],  // the caller's JSON bytes, borrowed for 'a
-    pub(crate) indices: Vec<u32>,  // scanner output followed by a u32::MAX sentinel (pushed in `parse`)
-    pub(crate) scratch: Vec<u8>,  // starts empty; NOTE(review): presumably the escape-decode buffer — confirm
-    pub(crate) skip:    std::cell::RefCell<SkipCache>,  // RefCell so the cache can be filled through &Document
-}
-
-impl<'a> Document<'a> {
-    /// Scans `buf` and wraps the result in a Document.
-    ///
-    /// The scanner's output is terminated with a u32::MAX sentinel, the
-    /// scratch buffer starts empty and the skip cache starts fresh. Any
-    /// scanner failure is reported as QJD_PARSE_ERROR.
-    pub fn parse(buf: &'a [u8]) -> Result<Self, qjd_err> {
-        let mut offsets = Vec::new();
-        if ScalarScanner::scan(buf, &mut offsets).is_err() {
-            return Err(qjd_err::QJD_PARSE_ERROR);
-        }
-        offsets.push(u32::MAX);
-
-        let skip = std::cell::RefCell::new(SkipCache::new());
-        Ok(Self { buf, indices: offsets, scratch: Vec::new(), skip })
-    }
-}
-```
-
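-We use `RefCell` because path resolution only holds a `&Document` yet has to fill the cache, so the cache needs interior mutability. This makes `Document` `!Sync`, which is acceptable: a document is only ever used from one thread, and multi-threading is explicitly out of scope (spec §7.5).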
-
-- [ ] **Step 3: Modify `walk_children` to use the cache**
-
-Replace `walk_children` in `src/cursor.rs`:
-
-```rust
-/// Cache-backed child lookup for the container at `cur`.
-///
-/// The first successful walk of a container records the indices position
-/// of every child in the skip cache; later lookups on the same container
-/// go through `resolve_in_known_children` without re-walking subtrees.
-///
-/// `cur.idx_end` must be the indices position of the container's closer.
-/// A bare scalar has no marker of its own, so its child position is the
-/// structural char that follows it (a ',' or the closer).
-///
-/// The cache slot is only filled once the whole container has been
-/// enumerated. If a walk bails out with an error the slot stays empty, and
-/// an empty slot is treated as "not cached yet" so the next call walks
-/// again instead of reporting QJD_NOT_FOUND forever.
-fn walk_children(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result<Cursor, qjd_err> {
-    /// Indices position of the structural char that follows the value
-    /// spanning `[vstart, vend]`: one past a container's closer or a
-    /// string's closing quote, or `vstart` itself for a bare scalar.
-    fn follower(doc: &Document, vstart: u32, vend: u32) -> u32 {
-        let lead = doc
-            .indices
-            .get(vstart as usize)
-            .and_then(|&off| doc.buf.get(off as usize).copied());
-        match lead {
-            Some(b'{') | Some(b'[') | Some(b'"') => vend + 1,
-            _ => vstart,
-        }
-    }
-
-    let is_obj = matches!(seg, PathSeg::Key(_));
-    let mut cache = doc.skip.borrow_mut();
-    let (slot_n, was_cached) = cache.get_or_insert(cur.idx_start);
-
-    if was_cached {
-        // Fast path: iterate cached child_starts. The clone lets us release
-        // the RefCell borrow before resolving; optimization deferred.
-        let known = cache.slot(slot_n).child_starts.clone();
-        if !known.is_empty() {
-            drop(cache);
-            return resolve_in_known_children(doc, &known, is_obj, seg);
-        }
-        // Empty slot: a genuinely empty container or an aborted earlier
-        // walk. Re-walking is cheap in the first case, required in the second.
-    }
-
-    // Slow path: enumerate every child, remembering the first match.
-    let end = cur.idx_end;
-    let mut starts: Vec<u32> = Vec::new();
-    let mut found: Option<Cursor> = None;
-    let mut i = cur.idx_start + 1;
-    let mut arr_idx: u32 = 0;
-
-    // With no structural char between opener and closer an object is empty;
-    // an array may still hold one bare scalar, visible only in the bytes.
-    let has_children = if i < end {
-        true
-    } else if is_obj {
-        false
-    } else {
-        let open_pos = doc.indices[cur.idx_start as usize] as usize;
-        let close_pos = *doc.indices.get(end as usize).ok_or(qjd_err::QJD_PARSE_ERROR)? as usize;
-        doc.buf
-            .get(open_pos + 1..close_pos)
-            .map_or(false, |gap| gap.iter().any(|b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r')))
-    };
-
-    if has_children {
-        loop {
-            // Object keys must start before the closer; an array child may
-            // sit on the closer (trailing bare scalar).
-            if i > end || (is_obj && i == end) {
-                break;
-            }
-            starts.push(i);
-
-            let value_idx_start = if is_obj { i + 3 } else { i };
-            if value_idx_start > end {
-                return Err(qjd_err::QJD_PARSE_ERROR);
-            }
-            let value_idx_end = find_value_end(doc, value_idx_start)?;
-
-            if found.is_none() {
-                let matched = if is_obj {
-                    let key_open = doc.indices[i as usize] as usize;
-                    let key_close = doc.indices[(i + 1) as usize] as usize;
-                    let key_bytes = doc
-                        .buf
-                        .get(key_open + 1..key_close)
-                        .ok_or(qjd_err::QJD_PARSE_ERROR)?;
-                    matches!(seg, PathSeg::Key(want) if key_bytes == *want)
-                } else {
-                    matches!(seg, PathSeg::Idx(want) if arr_idx == *want)
-                };
-                if matched {
-                    found = Some(Cursor { idx_start: value_idx_start, idx_end: value_idx_end });
-                }
-            }
-
-            // Keep going after a match so the slot covers all siblings.
-            let next = follower(doc, value_idx_start, value_idx_end);
-            if next >= end {
-                break;
-            }
-            match doc.buf.get(doc.indices[next as usize] as usize).copied() {
-                Some(b',') => { i = next + 1; arr_idx += 1; }
-                Some(b'}') | Some(b']') => break,
-                _ => return Err(qjd_err::QJD_PARSE_ERROR),
-            }
-        }
-    }
-
-    cache.slot_mut(slot_n).child_starts = starts;
-    found.ok_or(qjd_err::QJD_NOT_FOUND)
-}
-
-/// Looks up `seg` among children whose indices positions are already known.
-///
-/// `starts[k]` is the position of the k-th child: the key's opening quote
-/// for objects (`is_obj`), the value's marker for arrays. Returns the
-/// cursor of the first matching child's value, or QJD_NOT_FOUND.
-fn resolve_in_known_children(
-    doc: &Document, starts: &[u32], is_obj: bool, seg: &PathSeg,
-) -> Result<Cursor, qjd_err> {
-    let hit = starts.iter().enumerate().find(|&(ordinal, &entry)| {
-        if is_obj {
-            let open = doc.indices[entry as usize] as usize;
-            let close = doc.indices[(entry + 1) as usize] as usize;
-            let key = &doc.buf[open + 1 .. close];
-            matches!(seg, PathSeg::Key(want) if key == *want)
-        } else {
-            matches!(seg, PathSeg::Idx(want) if (ordinal as u32) == *want)
-        }
-    });
-
-    match hit {
-        Some((_, &entry)) => {
-            // Object values sit three slots after the key's opening quote.
-            let value_first = if is_obj { entry + 3 } else { entry };
-            let value_last = find_value_end(doc, value_first)?;
-            Ok(Cursor { idx_start: value_first, idx_end: value_last })
-        }
-        None => Err(qjd_err::QJD_NOT_FOUND),
-    }
-}
-```
-
-- [ ] **Step 4: Wire skip_cache into `src/lib.rs`**
-
-```rust
-pub mod error;
-mod scan;
-mod skip_cache;
-mod doc;
-mod path;
-mod cursor;
-pub mod ffi;
-```
-
-- [ ] **Step 5: Add a test that exercises the cache hit path**
-
-Append to `src/cursor.rs` tests:
-
-```rust
-#[test]
-fn cache_hit_on_repeated_access() {
-    let d = doc_of(b"{\"a\":1,\"b\":2,\"c\":3}");
-    let r1 = Cursor::root(&d).resolve(&d, b"a").unwrap();
-    let r2 = Cursor::root(&d).resolve(&d, b"b").unwrap();
-    let r3 = Cursor::root(&d).resolve(&d, b"c").unwrap();
-    // Only success and distinctness are asserted; exact cursor positions are an internal detail.
-    assert_ne!(r1, r2);
-    assert_ne!(r2, r3);
-    // All three lookups went through the root container, so exactly one opener must be registered.
-    let cache = d.skip.borrow();
-    // by_opener only maps registered openers; the reserved slot 0 lives in `slots` and is not counted here.
-    assert_eq!(cache.by_opener.len(), 1);
-}
-```
-
-- [ ] **Step 6: Run tests**
-
-```sh
-cargo test
-```
-
-Expected: all previous tests + new cache test pass.
-
-- [ ] **Step 7: Commit**
-
-```sh
-git add src/skip_cache.rs src/doc.rs src/cursor.rs src/lib.rs
-git commit -m "Add lazy sibling-skip cache for cursor path resolution"
-```
-
----
-
-## Task 7: String escape decode
-
-**Files:**
-- Create: `src/decode/mod.rs`
-- Create: `src/decode/string.rs`
-- Modify: `src/lib.rs`
-
-- [ ] **Step 1: Write `src/decode/mod.rs`**
-
-```rust
-pub(crate) mod string;
-pub(crate) mod number;
-```
-
-- [ ] **Step 2: Write failing tests in `src/decode/string.rs`**
-
-```rust
-use crate::error::qjd_err;
-
-/// Decode the JSON string body `buf[start..end]` (the bytes between, not
-/// including, the surrounding quotes), writing into `scratch` only if escapes are present.
-/// Returns (ptr, len) pointing into either `buf` (no escapes) or `scratch`.
-pub(crate) fn decode_string(
-    buf: &[u8], start: usize, end: usize, scratch: &mut Vec<u8>,
-) -> Result<(*const u8, usize), qjd_err> {
-    let _ = (buf, start, end, scratch);  // silence unused-parameter warnings in the stub
-    unimplemented!()  // replaced by the real decoder in Step 4
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn d(s: &[u8]) -> Result<Vec<u8>, qjd_err> {  // decodes all of `s` and returns an owned copy
-        let mut scratch = Vec::new();
-        let (p, n) = decode_string(s, 0, s.len(), &mut scratch)?;
-        Ok(unsafe { std::slice::from_raw_parts(p, n) }.to_vec())  // copied while `s` and `scratch` are still alive
-    }
-
-    #[test]
-    fn no_escape_returns_input() {
-        assert_eq!(d(b"hello").unwrap(), b"hello".to_vec());
-    }
-
-    #[test]
-    fn escaped_quote() {
-        assert_eq!(d(b"a\\\"b").unwrap(), b"a\"b".to_vec());
-    }
-
-    #[test]
-    fn escaped_backslash() {
-        assert_eq!(d(b"a\\\\b").unwrap(), b"a\\b".to_vec());
-    }
-
-    #[test]
-    fn escaped_newline() {
-        assert_eq!(d(b"a\\nb").unwrap(), b"a\nb".to_vec());
-    }
-
-    #[test]
-    fn escaped_tab() {
-        assert_eq!(d(b"a\\tb").unwrap(), b"a\tb".to_vec());
-    }
-
-    #[test]
-    fn escaped_unicode_ascii() {
-        // The JSON escape \u0041 is 'A' (one UTF-8 byte).
-        assert_eq!(d(b"a\\u0041b").unwrap(), b"aAb".to_vec());
-    }
-
-    #[test]
-    fn escaped_unicode_2byte() {
-        // The JSON escape \u00e9 is 'é', encoded in UTF-8 as 0xC3 0xA9.
-        assert_eq!(d(b"\\u00e9").unwrap(), vec![0xC3, 0xA9]);
-    }
-
-    #[test]
-    fn escaped_unicode_3byte() {
-        // The JSON escape \u4e2d is '中', encoded in UTF-8 as 0xE4 0xB8 0xAD.
-        assert_eq!(d(b"\\u4e2d").unwrap(), vec![0xE4, 0xB8, 0xAD]);
-    }
-
-    #[test]
-    fn surrogate_pair() {
-        // The pair \uD83D \uDE00 is '😀' = U+1F600, encoded in UTF-8 as 0xF0 0x9F 0x98 0x80.
-        assert_eq!(
-            d(b"\\uD83D\\uDE00").unwrap(),
-            vec![0xF0, 0x9F, 0x98, 0x80],
-        );
-    }
-
-    #[test]
-    fn lone_high_surrogate_fails() {
-        assert_eq!(d(b"\\uD83D").unwrap_err(), qjd_err::QJD_DECODE_FAILED);
-    }
-
-    #[test]
-    fn invalid_hex_in_unicode_fails() {
-        assert_eq!(d(b"\\uZZZZ").unwrap_err(), qjd_err::QJD_DECODE_FAILED);
-    }
-
-    #[test]
-    fn unknown_escape_fails() {
-        assert_eq!(d(b"\\q").unwrap_err(), qjd_err::QJD_DECODE_FAILED);
-    }
-
-    #[test]
-    fn dangling_backslash_fails() {
-        assert_eq!(d(b"a\\").unwrap_err(), qjd_err::QJD_DECODE_FAILED);
-    }
-}
-```
-
-- [ ] **Step 3: Run tests to verify they fail**
-
-```sh
-cargo test decode::string
-```
-
-Expected: panics with `unimplemented!()`.
-
-- [ ] **Step 4: Implement `decode_string`**
-
-Replace in `src/decode/string.rs`:
-
-```rust
-/// Decodes the JSON string body `buf[start..end]` (quotes excluded).
-///
-/// When the body contains no backslash the returned (ptr, len) points
-/// straight into `buf` and `scratch` is left untouched. Otherwise `scratch`
-/// is cleared, filled with the unescaped bytes, and the result points into
-/// `scratch`. Every malformed escape yields QJD_DECODE_FAILED: a dangling
-/// backslash, an unknown escape letter, a short or non-hex `\u`, a high
-/// surrogate without a following low surrogate, or a lone low surrogate.
-pub(crate) fn decode_string(
-    buf: &[u8], start: usize, end: usize, scratch: &mut Vec<u8>,
-) -> Result<(*const u8, usize), qjd_err> {
-    let raw = &buf[start..end];
-    if memchr::memchr(b'\\', raw).is_none() {
-        // Nothing to unescape: hand back the input bytes as they are.
-        return Ok((raw.as_ptr(), raw.len()));
-    }
-
-    scratch.clear();
-    scratch.reserve(raw.len());
-
-    let mut pos = 0;
-    while pos < raw.len() {
-        let byte = raw[pos];
-        if byte != b'\\' {
-            scratch.push(byte);
-            pos += 1;
-            continue;
-        }
-
-        let esc = match raw.get(pos + 1) {
-            Some(&e) => e,
-            None => return Err(qjd_err::QJD_DECODE_FAILED),
-        };
-
-        // Two-byte escapes map to exactly one output byte; `\u` is handled below.
-        let single = match esc {
-            b'"' => Some(b'"'),
-            b'\\' => Some(b'\\'),
-            b'/' => Some(b'/'),
-            b'b' => Some(0x08),
-            b'f' => Some(0x0C),
-            b'n' => Some(b'\n'),
-            b'r' => Some(b'\r'),
-            b't' => Some(b'\t'),
-            b'u' => None,
-            _ => return Err(qjd_err::QJD_DECODE_FAILED),
-        };
-        if let Some(out) = single {
-            scratch.push(out);
-            pos += 2;
-            continue;
-        }
-
-        // `\uXXXX`: backslash, 'u' and four hex digits.
-        let hex = raw.get(pos + 2..pos + 6).ok_or(qjd_err::QJD_DECODE_FAILED)?;
-        let hi = parse_hex4(hex)?;
-        pos += 6;
-
-        let code_point = match hi {
-            0xD800..=0xDBFF => {
-                // High surrogate: a `\uDC00`..`\uDFFF` escape must follow immediately.
-                if pos + 6 > raw.len() || &raw[pos..pos + 2] != b"\\u" {
-                    return Err(qjd_err::QJD_DECODE_FAILED);
-                }
-                let lo = parse_hex4(&raw[pos + 2..pos + 6])?;
-                if !(0xDC00..=0xDFFF).contains(&lo) {
-                    return Err(qjd_err::QJD_DECODE_FAILED);
-                }
-                pos += 6;
-                0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00)
-            }
-            // A low surrogate with no preceding high surrogate.
-            0xDC00..=0xDFFF => return Err(qjd_err::QJD_DECODE_FAILED),
-            plain => plain,
-        };
-        encode_utf8(code_point, scratch);
-    }
-
-    Ok((scratch.as_ptr(), scratch.len()))
-}
-
-/// Reads `bytes` as hexadecimal digits (either case), most significant
-/// first, and returns the accumulated value. Any byte that is not a hex
-/// digit yields QJD_DECODE_FAILED.
-fn parse_hex4(bytes: &[u8]) -> Result<u32, qjd_err> {
-    bytes.iter().try_fold(0u32, |acc, &b| {
-        let nibble = match b {
-            b'0'..=b'9' => b - b'0',
-            b'a'..=b'f' => b - b'a' + 10,
-            b'A'..=b'F' => b - b'A' + 10,
-            _ => return Err(qjd_err::QJD_DECODE_FAILED),
-        };
-        Ok((acc << 4) | u32::from(nibble))
-    })
-}
-
-/// Appends the UTF-8 encoding of code point `cp` to `out`, using one to
-/// four bytes depending on the magnitude of `cp`. No validity check is
-/// made on `cp`; callers pass already-vetted code points.
-fn encode_utf8(cp: u32, out: &mut Vec<u8>) {
-    // Continuation byte carrying the six bits of `cp` that start at `shift`.
-    let cont = |shift: u32| 0x80 | ((cp >> shift) & 0x3F) as u8;
-
-    match cp {
-        0..=0x7F => out.push(cp as u8),
-        0x80..=0x7FF => out.extend_from_slice(&[0xC0 | (cp >> 6) as u8, cont(0)]),
-        0x800..=0xFFFF => {
-            out.extend_from_slice(&[0xE0 | (cp >> 12) as u8, cont(6), cont(0)])
-        }
-        _ => out.extend_from_slice(&[0xF0 | (cp >> 18) as u8, cont(12), cont(6), cont(0)]),
-    }
-}
-```
-
-- [ ] **Step 5: Run tests to verify pass**
-
-```sh
-cargo test decode::string
-```
-
-Expected: all 13 tests pass.
-
-- [ ] **Step 6: Wire module into `src/lib.rs`**
-
-```rust
-pub mod error;
-mod scan;
-mod skip_cache;
-mod doc;
-mod path;
-mod cursor;
-mod decode;
-pub mod ffi;
-```
-
-- [ ] **Step 7: Commit**
-
-```sh
-git add src/decode/ src/lib.rs
-git commit -m "Add lazy string escape decode with surrogate-pair handling"
-```
-
----
-
-## Task 8: Number decode (i64 and f64)
-
-**Files:**
-- Create: `src/decode/number.rs`
-
-- [ ] **Step 1: Write failing tests in `src/decode/number.rs`**
-
-```rust
-use crate::error::qjd_err;
-
-pub(crate) fn parse_i64(bytes: &[u8]) -> Result<i64, qjd_err> {  // stub: signature only, so the tests below compile
-    let _ = bytes; unimplemented!()  // panics until Step 3 supplies the real parser
-}
-
-pub(crate) fn parse_f64(bytes: &[u8]) -> Result<f64, qjd_err> {  // stub: signature only, so the tests below compile
-    let _ = bytes; unimplemented!()  // panics until Step 3 supplies the real parser
-}
-
-#[cfg(test)]
-mod tests {  // contract tests for parse_i64 / parse_f64
-    use super::*;
-
-    #[test] fn i64_zero()       { assert_eq!(parse_i64(b"0"),  Ok(0)); }
-    #[test] fn i64_positive()   { assert_eq!(parse_i64(b"42"), Ok(42)); }
-    #[test] fn i64_negative()   { assert_eq!(parse_i64(b"-7"), Ok(-7)); }
-    #[test] fn i64_max() { assert_eq!(parse_i64(b"9223372036854775807"), Ok(i64::MAX)); }
-    #[test] fn i64_min() { assert_eq!(parse_i64(b"-9223372036854775808"), Ok(i64::MIN)); }  // magnitude exceeds i64::MAX, so negation after parsing would not work
-
-    #[test]
-    fn i64_overflow() {
-        assert_eq!(parse_i64(b"9223372036854775808"), Err(qjd_err::QJD_OUT_OF_RANGE));  // i64::MAX + 1
-    }
-
-    #[test]
-    fn i64_rejects_decimal() {
-        assert_eq!(parse_i64(b"1.5"), Err(qjd_err::QJD_TYPE_MISMATCH));  // a number, but not an integer
-    }
-
-    #[test]
-    fn i64_rejects_exponent() {
-        assert_eq!(parse_i64(b"1e5"), Err(qjd_err::QJD_TYPE_MISMATCH));  // exponent form counts as non-integer
-    }
-
-    #[test]
-    fn i64_rejects_empty() {
-        assert_eq!(parse_i64(b""), Err(qjd_err::QJD_DECODE_FAILED));
-    }
-
-    #[test] fn f64_zero()    { assert_eq!(parse_f64(b"0.0").unwrap(),  0.0); }
-    #[test] fn f64_pi()      { assert!((parse_f64(b"3.14").unwrap() - 3.14).abs() < 1e-12); }  // tolerance rather than exact equality
-    #[test] fn f64_negative(){ assert_eq!(parse_f64(b"-1.5").unwrap(), -1.5); }
-    #[test] fn f64_exponent(){ assert_eq!(parse_f64(b"1e2").unwrap(),  100.0); }
-
-    #[test]
-    fn f64_rejects_garbage() {
-        assert_eq!(parse_f64(b"hello"), Err(qjd_err::QJD_DECODE_FAILED));
-    }
-}
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```sh
-cargo test decode::number
-```
-
-Expected: panics with `unimplemented!()`.
-
-- [ ] **Step 3: Implement `parse_i64` and `parse_f64`**
-
-Replace in `src/decode/number.rs`:
-
-```rust
-/// Parses a JSON integer literal into an i64.
-///
-/// Errors, checked in this order:
-///   - empty input: QJD_DECODE_FAILED
-///   - contains '.', 'e' or 'E' anywhere: QJD_TYPE_MISMATCH
-///   - no digits after the optional '-', or any non-digit: QJD_DECODE_FAILED
-///   - value does not fit in i64: QJD_OUT_OF_RANGE
-pub(crate) fn parse_i64(bytes: &[u8]) -> Result<i64, qjd_err> {
-    let first = match bytes.first() {
-        Some(&b) => b,
-        None => return Err(qjd_err::QJD_DECODE_FAILED),
-    };
-    // Reject non-integer JSON numbers.
-    if bytes.iter().any(|b| matches!(b, b'.' | b'e' | b'E')) {
-        return Err(qjd_err::QJD_TYPE_MISMATCH);
-    }
-
-    let negative = first == b'-';
-    let digits = if negative { &bytes[1..] } else { bytes };
-    if digits.is_empty() || digits.iter().any(|c| !c.is_ascii_digit()) {
-        return Err(qjd_err::QJD_DECODE_FAILED);
-    }
-
-    // Accumulate towards the sign of the result so that i64::MIN, whose
-    // magnitude does not fit in a positive i64, still parses.
-    digits
-        .iter()
-        .try_fold(0i64, |acc, &c| {
-            let digit = (c - b'0') as i64;
-            let scaled = acc.checked_mul(10)?;
-            if negative { scaled.checked_sub(digit) } else { scaled.checked_add(digit) }
-        })
-        .ok_or(qjd_err::QJD_OUT_OF_RANGE)
-}
-
-/// Parses a JSON number literal into an f64.
-///
-/// The input is first checked against the JSON number shape
-/// `-? digits ('.' digits)? ([eE] [+-]? digits)?` because Rust's own f64
-/// parser also accepts tokens that are not JSON ("inf", "nan", a leading
-/// '+', ".5", "1."). Leading zeros are tolerated, matching `parse_i64`.
-///
-/// Returns QJD_DECODE_FAILED for empty input and for anything that does
-/// not have that shape.
-pub(crate) fn parse_f64(bytes: &[u8]) -> Result<f64, qjd_err> {
-    /// Skips a non-empty run of ASCII digits; None if there is no digit.
-    fn digits(s: &[u8]) -> Option<&[u8]> {
-        let n = s.iter().take_while(|b| b.is_ascii_digit()).count();
-        if n == 0 { None } else { Some(&s[n..]) }
-    }
-
-    /// True when `bytes` is exactly one number of the shape described above.
-    fn is_json_number(bytes: &[u8]) -> bool {
-        let unsigned = match bytes.split_first() {
-            Some((&b'-', rest)) => rest,
-            _ => bytes,
-        };
-        let mut rest = match digits(unsigned) {
-            Some(r) => r,
-            None => return false,
-        };
-        if let Some((&b'.', frac)) = rest.split_first() {
-            rest = match digits(frac) {
-                Some(r) => r,
-                None => return false,
-            };
-        }
-        if let Some((&marker, exp)) = rest.split_first() {
-            if marker == b'e' || marker == b'E' {
-                let exp = match exp.split_first() {
-                    Some((&b'+', r)) | Some((&b'-', r)) => r,
-                    _ => exp,
-                };
-                rest = match digits(exp) {
-                    Some(r) => r,
-                    None => return false,
-                };
-            }
-        }
-        rest.is_empty()
-    }
-
-    if !is_json_number(bytes) {
-        return Err(qjd_err::QJD_DECODE_FAILED);
-    }
-    // The shape check only lets ASCII through, so this cannot fail in
-    // practice; the error mapping is kept for safety.
-    let s = std::str::from_utf8(bytes).map_err(|_| qjd_err::QJD_DECODE_FAILED)?;
-    s.parse::<f64>().map_err(|_| qjd_err::QJD_DECODE_FAILED)
-}
-```
-
-- [ ] **Step 4: Run tests to verify pass**
-
-```sh
-cargo test decode::number
-```
-
-Expected: all 14 tests pass.
-
-- [ ] **Step 5: Commit**
-
-```sh
-git add src/decode/number.rs
-git commit -m "Add lazy i64/f64 number decode with overflow checking"
-```
-
----
-
-## Task 9: qjd_typeof, qjd_is_null, qjd_len
-
-**Files:**
-- Modify: `src/ffi.rs`
-- Modify: `src/doc.rs` (add helper for cursor → value type)
-- Create: `tests/ffi_typeof.rs`
-
-- [ ] **Step 1: Add helper in `src/doc.rs`**
-
-Append to `src/doc.rs`:
-
-```rust
-use crate::cursor::Cursor;
-use crate::error::qjd_type;
-
-impl<'a> Document<'a> {
-    /// Determine the JSON type of the value `cur` points at.
-    ///
-    /// Strings, objects and arrays are recognised by the structural byte at
-    /// `cur.idx_start`. For a bare scalar `cur.idx_start` points at the
-    /// structural char that follows it, so the scalar's own first byte is
-    /// located with `find_scalar_start` and classified from that.
-    /// Returns QJD_PARSE_ERROR when the cursor is out of range or the
-    /// scalar does not start like true / false / null / a number.
-    pub(crate) fn type_of(&self, cur: Cursor) -> Result<qjd_type, qjd_err> {
-        let pos = *self.indices.get(cur.idx_start as usize)
-            .ok_or(qjd_err::QJD_PARSE_ERROR)? as usize;
-        let lead = self.buf.get(pos).copied().ok_or(qjd_err::QJD_PARSE_ERROR)?;
-        match lead {
-            b'"' => Ok(qjd_type::QJD_T_STR),
-            b'{' => Ok(qjd_type::QJD_T_OBJ),
-            b'[' => Ok(qjd_type::QJD_T_ARR),
-            // Otherwise this index points at a separator following a scalar.
-            _ => {
-                let scalar_start = self.find_scalar_start(cur.idx_start)?;
-                match self.buf.get(scalar_start).copied() {
-                    Some(b't') | Some(b'f') => Ok(qjd_type::QJD_T_BOOL),
-                    Some(b'n')              => Ok(qjd_type::QJD_T_NULL),
-                    Some(b'-') | Some(b'0'..=b'9') => Ok(qjd_type::QJD_T_NUM),
-                    _ => Err(qjd_err::QJD_PARSE_ERROR),
-                }
-            }
-        }
-    }
-
-    /// Byte offset of the first non-whitespace byte after the structural
-    /// char at indices position `idx - 1` (typically ':' , ',' or an opening
-    /// bracket), i.e. where the scalar that ends at `idx` begins.
-    /// Returns QJD_PARSE_ERROR when there is no previous indices entry.
-    pub(crate) fn find_scalar_start(&self, idx: u32) -> Result<usize, qjd_err> {
-        if idx == 0 { return Err(qjd_err::QJD_PARSE_ERROR); }
-        let prev = *self.indices.get((idx - 1) as usize)
-            .ok_or(qjd_err::QJD_PARSE_ERROR)? as usize;
-        let mut p = prev + 1;
-        while p < self.buf.len() && matches!(self.buf[p], b' '|b'\t'|b'\n'|b'\r') {
-            p += 1;
-        }
-        Ok(p)
-    }
-
-    /// Number of direct children of the container at `cur`.
-    ///
-    /// `cur.idx_end` must be the indices position of the container's
-    /// closer. Bare scalars leave no entry in `indices`, so children cannot
-    /// be counted by their markers; instead the commas at nesting depth 0
-    /// are counted and one is added when the container holds anything at
-    /// all. Returns QJD_TYPE_MISMATCH when `cur` is not a container and
-    /// QJD_PARSE_ERROR when the cursor does not describe a valid span.
-    pub(crate) fn cursor_len(&self, cur: Cursor) -> Result<usize, qjd_err> {
-        let offset_at = |k: u32| self.indices.get(k as usize).map(|&o| o as usize);
-
-        let open_pos = offset_at(cur.idx_start).ok_or(qjd_err::QJD_PARSE_ERROR)?;
-        match self.buf.get(open_pos).copied() {
-            Some(b'{') | Some(b'[') => {}
-            _ => return Err(qjd_err::QJD_TYPE_MISMATCH),
-        }
-        let close_pos = offset_at(cur.idx_end).ok_or(qjd_err::QJD_PARSE_ERROR)?;
-
-        // Only whitespace between opener and closer means no children.
-        let inner = self.buf.get(open_pos + 1..close_pos)
-            .ok_or(qjd_err::QJD_PARSE_ERROR)?;
-        if inner.iter().all(|b| matches!(b, b' '|b'\t'|b'\n'|b'\r')) {
-            return Ok(0);
-        }
-
-        // Commas that separate direct children are those seen while not
-        // inside any nested container.
-        let mut depth = 0i32;
-        let mut separators = 0usize;
-        for k in cur.idx_start + 1..cur.idx_end {
-            let b = offset_at(k)
-                .and_then(|o| self.buf.get(o).copied())
-                .ok_or(qjd_err::QJD_PARSE_ERROR)?;
-            match b {
-                b'{' | b'[' => depth += 1,
-                b'}' | b']' => depth -= 1,
-                b',' if depth == 0 => separators += 1,
-                _ => {}
-            }
-        }
-        Ok(separators + 1)
-    }
-}
-```
-
-Note: the `cursor_len` implementation above is approximate; refine in this task until tests pass. The reference behavior: count direct children of the container.
-
-- [ ] **Step 2: Write `tests/ffi_typeof.rs`**
-
-```rust
-use std::os::raw::c_int;
-use quickdecode::ffi::*;
-
-/// Parses `s` through the FFI entry point and returns the document handle.
-/// Panics if the parser hands back a null pointer. The caller is
-/// responsible for releasing the handle with `qjd_free`.
-fn parse(s: &[u8]) -> *mut qjd_doc {
-    let mut status: c_int = -1;
-    let handle = unsafe { qjd_parse(s.as_ptr(), s.len(), &mut status) };
-    assert!(!handle.is_null());
-    handle
-}
-
-/// A string value reports type code 3.
-#[test]
-fn typeof_string() {
-    let d = parse(b"{\"a\":\"hi\"}");
-    let mut t: c_int = -1;
-    let p = b"a";
-    // `.cast()` takes the pointee type from the callee's signature instead
-    // of hard-coding i8, which is not the C char type on every target.
-    let rc = unsafe { qjd_typeof(d, p.as_ptr().cast(), p.len(), &mut t) };
-    assert_eq!(rc, 0);
-    assert_eq!(t, 3); // QJD_T_STR
-    unsafe { qjd_free(d) };
-}
-
-/// A numeric value reports type code 2.
-#[test]
-fn typeof_number() {
-    let d = parse(b"{\"a\":42}");
-    let mut t: c_int = -1;
-    let p = b"a";
-    // `.cast()` takes the pointee type from the callee's signature instead
-    // of hard-coding i8, which is not the C char type on every target.
-    let rc = unsafe { qjd_typeof(d, p.as_ptr().cast(), p.len(), &mut t) };
-    assert_eq!(rc, 0);
-    assert_eq!(t, 2); // QJD_T_NUM
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn typeof_bool() {
-    let d = parse(b"{\"a\":true}");
-    let mut t: c_int = -1;
-    let p = b"a";
-    let rc = unsafe { qjd_typeof(d, p.as_ptr() as *const i8, p.len(), &mut t) };
-    assert_eq!(rc, 0);
-    assert_eq!(t, 1);
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn typeof_null() {
-    let d = parse(b"{\"a\":null}");
-    let mut t: c_int = -1;
-    let p = b"a";
-    let rc = unsafe { qjd_typeof(d, p.as_ptr() as *const i8, p.len(), &mut t) };
-    assert_eq!(rc, 0);
-    assert_eq!(t, 0);
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn is_null_true() {
-    let d = parse(b"{\"a\":null}");
-    let mut b: c_int = -1;
-    let p = b"a";
-    let rc = unsafe { qjd_is_null(d, p.as_ptr() as *const i8, p.len(), &mut b) };
-    assert_eq!(rc, 0);
-    assert_ne!(b, 0);
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn len_object() {
-    let d = parse(b"{\"a\":1,\"b\":2,\"c\":3}");
-    let mut n: usize = 0;
-    let p = b"";
-    let rc = unsafe { qjd_len(d, p.as_ptr() as *const i8, p.len(), &mut n) };
-    assert_eq!(rc, 0);
-    assert_eq!(n, 3);
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn len_array() {
-    let d = parse(b"[10,20,30,40]");
-    let mut n: usize = 0;
-    let p = b"";
-    let rc = unsafe { qjd_len(d, p.as_ptr() as *const i8, p.len(), &mut n) };
-    assert_eq!(rc, 0);
-    assert_eq!(n, 4);
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn typeof_not_found() {
-    let d = parse(b"{\"a\":1}");
-    let mut t: c_int = -1;
-    let p = b"b";
-    let rc = unsafe { qjd_typeof(d, p.as_ptr() as *const i8, p.len(), &mut t) };
-    assert_eq!(rc, 2); // NOT_FOUND
-    unsafe { qjd_free(d) };
-}
-```
-
-- [ ] **Step 3: Run tests to see them fail (undeclared symbols)**
-
-```sh
-cargo test ffi_typeof
-```
-
-Expected: link errors for `qjd_typeof`, `qjd_is_null`, `qjd_len`.
-
-- [ ] **Step 4: Add FFI exports in `src/ffi.rs`**
-
-Append:
-
-```rust
-use crate::cursor::Cursor;
-use crate::error::qjd_type;
-
-unsafe fn resolve_root_path(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize,
-) -> Result<(&'static Document<'static>, Cursor), qjd_err> {
-    if doc.is_null() || (path.is_null() && path_len != 0) {
-        return Err(qjd_err::QJD_INVALID_ARG);
-    }
-    let d: &Document = &(*doc).0;
-    let p: &[u8] = if path.is_null() {
-        &[]
-    } else {
-        std::slice::from_raw_parts(path as *const u8, path_len)
-    };
-    let cur = Cursor::root(d).resolve(d, p)?;
-    // SAFETY: caller holds doc alive; we re-erase lifetime for return.
-    Ok((std::mem::transmute(d), cur))
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_typeof(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize, type_out: *mut c_int,
-) -> c_int {
-    if type_out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    match resolve_root_path(doc, path, path_len) {
-        Ok((d, cur)) => match d.type_of(cur) {
-            Ok(t) => { *type_out = t as c_int; qjd_err::QJD_OK as c_int }
-            Err(e) => e as c_int,
-        },
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_is_null(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize, out: *mut c_int,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    match resolve_root_path(doc, path, path_len) {
-        Ok((d, cur)) => match d.type_of(cur) {
-            Ok(qjd_type::QJD_T_NULL) => { *out = 1; qjd_err::QJD_OK as c_int }
-            Ok(_)                    => { *out = 0; qjd_err::QJD_OK as c_int }
-            Err(e) => e as c_int,
-        },
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_len(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize, out: *mut usize,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    match resolve_root_path(doc, path, path_len) {
-        Ok((d, cur)) => match d.cursor_len(cur) {
-            Ok(n) => { *out = n; qjd_err::QJD_OK as c_int }
-            Err(e) => e as c_int,
-        },
-        Err(e) => e as c_int,
-    }
-}
-```
-
-- [ ] **Step 5: Run tests**
-
-```sh
-cargo test ffi_typeof
-```
-
-Expected: all 8 tests pass.
-
-- [ ] **Step 6: Commit**
-
-```sh
-git add src/ffi.rs src/doc.rs tests/ffi_typeof.rs
-git commit -m "Add qjd_typeof / qjd_is_null / qjd_len FFI"
-```
-
----
-
-## Task 10: Typed root-path getters (str/i64/f64/bool)
-
-**Files:**
-- Modify: `src/ffi.rs`
-- Create: `tests/ffi_strings.rs`
-- Create: `tests/ffi_numbers.rs`
-
-- [ ] **Step 1: Write failing tests in `tests/ffi_strings.rs`**
-
-```rust
-use std::ffi::CStr;
-use std::os::raw::c_int;
-use quickdecode::ffi::*;
-
-fn parse(s: &[u8]) -> *mut qjd_doc {
-    let mut err: c_int = -1;
-    let d = unsafe { qjd_parse(s.as_ptr(), s.len(), &mut err) };
-    assert!(!d.is_null());
-    d
-}
-
-#[test]
-fn get_str_simple() {
-    let d = parse(b"{\"a\":\"hello\"}");
-    let mut p: *const u8 = std::ptr::null();
-    let mut n: usize = 0;
-    let path = b"a";
-    let rc = unsafe { qjd_get_str(d, path.as_ptr() as *const i8, path.len(), &mut p, &mut n) };
-    assert_eq!(rc, 0);
-    let s = unsafe { std::slice::from_raw_parts(p, n) };
-    assert_eq!(s, b"hello");
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn get_str_with_escape() {
-    let d = parse(b"{\"a\":\"he\\nlo\"}");
-    let mut p: *const u8 = std::ptr::null();
-    let mut n: usize = 0;
-    let path = b"a";
-    let rc = unsafe { qjd_get_str(d, path.as_ptr() as *const i8, path.len(), &mut p, &mut n) };
-    assert_eq!(rc, 0);
-    let s = unsafe { std::slice::from_raw_parts(p, n) };
-    assert_eq!(s, b"he\nlo");
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn get_str_type_mismatch() {
-    let d = parse(b"{\"a\":42}");
-    let mut p: *const u8 = std::ptr::null();
-    let mut n: usize = 0;
-    let path = b"a";
-    let rc = unsafe { qjd_get_str(d, path.as_ptr() as *const i8, path.len(), &mut p, &mut n) };
-    assert_eq!(rc, 3); // TYPE_MISMATCH
-    unsafe { qjd_free(d) };
-}
-```
-
-- [ ] **Step 2: Write failing tests in `tests/ffi_numbers.rs`**
-
-```rust
-use std::os::raw::c_int;
-use quickdecode::ffi::*;
-
-fn parse(s: &[u8]) -> *mut qjd_doc {
-    let mut err: c_int = -1;
-    let d = unsafe { qjd_parse(s.as_ptr(), s.len(), &mut err) };
-    assert!(!d.is_null());
-    d
-}
-
-#[test]
-fn get_i64_basic() {
-    let d = parse(b"{\"a\":42}");
-    let mut v: i64 = 0;
-    let p = b"a";
-    let rc = unsafe { qjd_get_i64(d, p.as_ptr() as *const i8, p.len(), &mut v) };
-    assert_eq!(rc, 0);
-    assert_eq!(v, 42);
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn get_i64_negative() {
-    let d = parse(b"{\"a\":-7}");
-    let mut v: i64 = 0;
-    let p = b"a";
-    unsafe { qjd_get_i64(d, p.as_ptr() as *const i8, p.len(), &mut v) };
-    assert_eq!(v, -7);
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn get_i64_overflow() {
-    let d = parse(b"{\"a\":99999999999999999999}");
-    let mut v: i64 = 0;
-    let p = b"a";
-    let rc = unsafe { qjd_get_i64(d, p.as_ptr() as *const i8, p.len(), &mut v) };
-    assert_eq!(rc, 4); // OUT_OF_RANGE
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn get_f64_basic() {
-    let d = parse(b"{\"a\":3.14}");
-    let mut v: f64 = 0.0;
-    let p = b"a";
-    unsafe { qjd_get_f64(d, p.as_ptr() as *const i8, p.len(), &mut v) };
-    assert!((v - 3.14).abs() < 1e-12);
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn get_bool() {
-    let d = parse(b"{\"a\":true,\"b\":false}");
-    let mut v: c_int = -1;
-    let p = b"a";
-    unsafe { qjd_get_bool(d, p.as_ptr() as *const i8, p.len(), &mut v) };
-    assert_ne!(v, 0);
-    let p = b"b";
-    unsafe { qjd_get_bool(d, p.as_ptr() as *const i8, p.len(), &mut v) };
-    assert_eq!(v, 0);
-    unsafe { qjd_free(d) };
-}
-```
-
-- [ ] **Step 3: Run tests to verify they fail**
-
-```sh
-cargo test ffi_strings ffi_numbers
-```
-
-Expected: link errors for `qjd_get_str`, `qjd_get_i64`, `qjd_get_f64`, `qjd_get_bool`.
-
-- [ ] **Step 4: Implement getters in `src/ffi.rs`**
-
-Append:
-
-```rust
-use crate::decode::{number, string};
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_get_str(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize,
-    out_ptr: *mut *const u8, out_len: *mut usize,
-) -> c_int {
-    if out_ptr.is_null() || out_len.is_null() {
-        return qjd_err::QJD_INVALID_ARG as c_int;
-    }
-    let (d, cur) = match resolve_root_path(doc, path, path_len) {
-        Ok(x) => x, Err(e) => return e as c_int,
-    };
-    let pos = d.indices[cur.idx_start as usize] as usize;
-    if d.buf.get(pos).copied() != Some(b'"') {
-        return qjd_err::QJD_TYPE_MISMATCH as c_int;
-    }
-    // String ends at next index: indices[cur.idx_start + 1].
-    let close = d.indices[(cur.idx_start + 1) as usize] as usize;
-
-    // SAFETY: scratch is part of doc which the caller pins via Lua reference.
-    // We need mutable access to scratch; transmute to &mut.
-    let scratch = &mut *(&d.scratch as *const _ as *mut Vec<u8>);
-    match string::decode_string(d.buf, pos + 1, close, scratch) {
-        Ok((p, n)) => { *out_ptr = p; *out_len = n; qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_get_i64(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize, out: *mut i64,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match resolve_root_path(doc, path, path_len) {
-        Ok(x) => x, Err(e) => return e as c_int,
-    };
-    let bytes = match scalar_bytes(d, cur) {
-        Ok(b) => b, Err(e) => return e as c_int,
-    };
-    match number::parse_i64(bytes) {
-        Ok(v) => { *out = v; qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_get_f64(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize, out: *mut f64,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match resolve_root_path(doc, path, path_len) {
-        Ok(x) => x, Err(e) => return e as c_int,
-    };
-    let bytes = match scalar_bytes(d, cur) {
-        Ok(b) => b, Err(e) => return e as c_int,
-    };
-    match number::parse_f64(bytes) {
-        Ok(v) => { *out = v; qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_get_bool(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize, out: *mut c_int,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match resolve_root_path(doc, path, path_len) {
-        Ok(x) => x, Err(e) => return e as c_int,
-    };
-    let bytes = match scalar_bytes(d, cur) {
-        Ok(b) => b, Err(e) => return e as c_int,
-    };
-    match bytes {
-        b"true"  => { *out = 1; qjd_err::QJD_OK as c_int }
-        b"false" => { *out = 0; qjd_err::QJD_OK as c_int }
-        _ => qjd_err::QJD_TYPE_MISMATCH as c_int,
-    }
-}
-
-/// Return the byte slice for a scalar value (number, true, false, null).
-unsafe fn scalar_bytes<'d>(d: &'d Document, cur: Cursor) -> Result<&'d [u8], qjd_err> {
-    // Scalar's start: find first non-whitespace after previous index.
-    let start = d.find_scalar_start(cur.idx_start)?;
-    // Scalar's end: position of structural char at cur.idx_start.
-    let end = d.indices[cur.idx_start as usize] as usize;
-    if end < start { return Err(qjd_err::QJD_PARSE_ERROR); }
-    let mut e = end;
-    while e > start && matches!(d.buf[e - 1], b' '|b'\t'|b'\n'|b'\r') { e -= 1; }
-    Ok(&d.buf[start..e])
-}
-```
-
-- [ ] **Step 5: Run tests to verify pass**
-
-```sh
-cargo test ffi_strings ffi_numbers
-```
-
-Expected: all 8 tests pass.
-
-- [ ] **Step 6: Commit**
-
-```sh
-git add src/ffi.rs tests/ffi_strings.rs tests/ffi_numbers.rs
-git commit -m "Add qjd_get_str / get_i64 / get_f64 / get_bool FFI getters"
-```
-
----
-
-## Task 11: Cursor C ABI (qjd_open + qjd_cursor_*)
-
-**Files:**
-- Modify: `src/ffi.rs`
-- Create: `tests/ffi_cursor.rs`
-
-- [ ] **Step 1: Define `qjd_cursor` in `src/ffi.rs`**
-
-Append:
-
-```rust
-#[repr(C)]
-#[derive(Copy, Clone)]
-pub struct qjd_cursor {
-    pub doc:        *const qjd_doc,
-    pub idx_start:  u32,
-    pub idx_end:    u32,
-    pub cache_slot: u32,
-    pub _pad:       u32,
-}
-
-unsafe fn cursor_to_internal(c: *const qjd_cursor) -> Result<(&'static Document<'static>, Cursor), qjd_err> {
-    if c.is_null() { return Err(qjd_err::QJD_INVALID_ARG); }
-    let cc = &*c;
-    if cc.doc.is_null() { return Err(qjd_err::QJD_INVALID_ARG); }
-    let d: &Document = &(*(cc.doc as *mut qjd_doc)).0;
-    Ok((std::mem::transmute(d), Cursor { idx_start: cc.idx_start, idx_end: cc.idx_end }))
-}
-
-fn internal_to_cursor(doc: *const qjd_doc, cur: Cursor) -> qjd_cursor {
-    qjd_cursor {
-        doc, idx_start: cur.idx_start, idx_end: cur.idx_end,
-        cache_slot: 0, _pad: 0,
-    }
-}
-```
-
-- [ ] **Step 2: Add cursor open + getters**
-
-Append:
-
-```rust
-#[no_mangle]
-pub unsafe extern "C" fn qjd_open(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize, out: *mut qjd_cursor,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    match resolve_root_path(doc, path, path_len) {
-        Ok((_, cur)) => { *out = internal_to_cursor(doc as *const qjd_doc, cur); qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_open(
-    c: *const qjd_cursor, path: *const c_char, path_len: usize, out: *mut qjd_cursor,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let p: &[u8] = if path.is_null() { &[] } else {
-        std::slice::from_raw_parts(path as *const u8, path_len)
-    };
-    match cur.resolve(d, p) {
-        Ok(child) => { *out = internal_to_cursor((*c).doc, child); qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_field(
-    c: *const qjd_cursor, key: *const c_char, key_len: usize, out: *mut qjd_cursor,
-) -> c_int {
-    if out.is_null() || (key.is_null() && key_len != 0) {
-        return qjd_err::QJD_INVALID_ARG as c_int;
-    }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let k = if key.is_null() { &[][..] } else { std::slice::from_raw_parts(key as *const u8, key_len) };
-    // Use PathSeg::Key directly via walk_children, but our public surface is
-    // resolve. Emulate single-segment key via path that has no separators.
-    // For keys containing '.' or '[', this is the intended escape hatch.
-    let child = match crate::cursor::resolve_single_key(d, cur, k) {
-        Ok(x) => x, Err(e) => return e as c_int,
-    };
-    *out = internal_to_cursor((*c).doc, child);
-    qjd_err::QJD_OK as c_int
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_index(
-    c: *const qjd_cursor, i: usize, out: *mut qjd_cursor,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    if i > u32::MAX as usize { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let child = match crate::cursor::resolve_single_idx(d, cur, i as u32) {
-        Ok(x) => x, Err(e) => return e as c_int,
-    };
-    *out = internal_to_cursor((*c).doc, child);
-    qjd_err::QJD_OK as c_int
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_get_str(
-    c: *const qjd_cursor, path: *const c_char, path_len: usize,
-    out_ptr: *mut *const u8, out_len: *mut usize,
-) -> c_int {
-    if out_ptr.is_null() || out_len.is_null() {
-        return qjd_err::QJD_INVALID_ARG as c_int;
-    }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let p: &[u8] = if path.is_null() { &[] } else {
-        std::slice::from_raw_parts(path as *const u8, path_len)
-    };
-    let cur = match cur.resolve(d, p) { Ok(x) => x, Err(e) => return e as c_int };
-    let pos = d.indices[cur.idx_start as usize] as usize;
-    if d.buf.get(pos).copied() != Some(b'"') {
-        return qjd_err::QJD_TYPE_MISMATCH as c_int;
-    }
-    let close = d.indices[(cur.idx_start + 1) as usize] as usize;
-    let scratch = &mut *(&d.scratch as *const _ as *mut Vec<u8>);
-    match string::decode_string(d.buf, pos + 1, close, scratch) {
-        Ok((p, n)) => { *out_ptr = p; *out_len = n; qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-// Mirror qjd_cursor_get_i64 / get_f64 / get_bool / typeof / len following the
-// same pattern: resolve, then dispatch on value byte.
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_get_i64(
-    c: *const qjd_cursor, path: *const c_char, path_len: usize, out: *mut i64,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let p: &[u8] = if path.is_null() { &[] } else {
-        std::slice::from_raw_parts(path as *const u8, path_len)
-    };
-    let cur = match cur.resolve(d, p) { Ok(x) => x, Err(e) => return e as c_int };
-    let bytes = match scalar_bytes(d, cur) { Ok(b) => b, Err(e) => return e as c_int };
-    match number::parse_i64(bytes) {
-        Ok(v) => { *out = v; qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_get_f64(
-    c: *const qjd_cursor, path: *const c_char, path_len: usize, out: *mut f64,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let p: &[u8] = if path.is_null() { &[] } else {
-        std::slice::from_raw_parts(path as *const u8, path_len)
-    };
-    let cur = match cur.resolve(d, p) { Ok(x) => x, Err(e) => return e as c_int };
-    let bytes = match scalar_bytes(d, cur) { Ok(b) => b, Err(e) => return e as c_int };
-    match number::parse_f64(bytes) {
-        Ok(v) => { *out = v; qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_get_bool(
-    c: *const qjd_cursor, path: *const c_char, path_len: usize, out: *mut c_int,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let p: &[u8] = if path.is_null() { &[] } else {
-        std::slice::from_raw_parts(path as *const u8, path_len)
-    };
-    let cur = match cur.resolve(d, p) { Ok(x) => x, Err(e) => return e as c_int };
-    let bytes = match scalar_bytes(d, cur) { Ok(b) => b, Err(e) => return e as c_int };
-    match bytes {
-        b"true"  => { *out = 1; qjd_err::QJD_OK as c_int }
-        b"false" => { *out = 0; qjd_err::QJD_OK as c_int }
-        _ => qjd_err::QJD_TYPE_MISMATCH as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_typeof(
-    c: *const qjd_cursor, path: *const c_char, path_len: usize, type_out: *mut c_int,
-) -> c_int {
-    if type_out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let p: &[u8] = if path.is_null() { &[] } else {
-        std::slice::from_raw_parts(path as *const u8, path_len)
-    };
-    let cur = match cur.resolve(d, p) { Ok(x) => x, Err(e) => return e as c_int };
-    match d.type_of(cur) {
-        Ok(t) => { *type_out = t as c_int; qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-
-#[no_mangle]
-pub unsafe extern "C" fn qjd_cursor_len(
-    c: *const qjd_cursor, path: *const c_char, path_len: usize, out: *mut usize,
-) -> c_int {
-    if out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-    let (d, cur) = match cursor_to_internal(c) { Ok(x) => x, Err(e) => return e as c_int };
-    let p: &[u8] = if path.is_null() { &[] } else {
-        std::slice::from_raw_parts(path as *const u8, path_len)
-    };
-    let cur = match cur.resolve(d, p) { Ok(x) => x, Err(e) => return e as c_int };
-    match d.cursor_len(cur) {
-        Ok(n) => { *out = n; qjd_err::QJD_OK as c_int }
-        Err(e) => e as c_int,
-    }
-}
-```
-
-- [ ] **Step 3: Add `resolve_single_key` / `resolve_single_idx` helpers in `src/cursor.rs`**
-
-Append to `src/cursor.rs`:
-
-```rust
-pub(crate) fn resolve_single_key(doc: &Document, cur: Cursor, key: &[u8]) -> Result<Cursor, qjd_err> {
-    step(doc, cur, &PathSeg::Key(key))
-}
-
-pub(crate) fn resolve_single_idx(doc: &Document, cur: Cursor, idx: u32) -> Result<Cursor, qjd_err> {
-    step(doc, cur, &PathSeg::Idx(idx))
-}
-```
-
-- [ ] **Step 4: Write `tests/ffi_cursor.rs`**
-
-```rust
-use std::os::raw::c_int;
-use quickdecode::ffi::*;
-
-fn parse(s: &[u8]) -> *mut qjd_doc {
-    let mut err: c_int = -1;
-    let d = unsafe { qjd_parse(s.as_ptr(), s.len(), &mut err) };
-    assert!(!d.is_null());
-    d
-}
-
-#[test]
-fn open_object_then_get_field() {
-    let d = parse(b"{\"body\":{\"model\":\"gpt\",\"temperature\":0.5}}");
-    let mut c = std::mem::MaybeUninit::<qjd_cursor>::uninit();
-    let p = b"body";
-    let rc = unsafe { qjd_open(d, p.as_ptr() as *const i8, p.len(), c.as_mut_ptr()) };
-    assert_eq!(rc, 0);
-    let c = unsafe { c.assume_init() };
-
-    let mut pp: *const u8 = std::ptr::null();
-    let mut nn: usize = 0;
-    let k = b"model";
-    let rc = unsafe { qjd_cursor_get_str(&c, k.as_ptr() as *const i8, k.len(), &mut pp, &mut nn) };
-    assert_eq!(rc, 0);
-    let s = unsafe { std::slice::from_raw_parts(pp, nn) };
-    assert_eq!(s, b"gpt");
-
-    let mut f: f64 = 0.0;
-    let k = b"temperature";
-    let rc = unsafe { qjd_cursor_get_f64(&c, k.as_ptr() as *const i8, k.len(), &mut f) };
-    assert_eq!(rc, 0);
-    assert!((f - 0.5).abs() < 1e-12);
-
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn cursor_index_array() {
-    let d = parse(b"[\"a\",\"b\",\"c\"]");
-    let mut c = std::mem::MaybeUninit::<qjd_cursor>::uninit();
-    let p = b"";
-    unsafe { qjd_open(d, p.as_ptr() as *const i8, 0, c.as_mut_ptr()) };
-    let c = unsafe { c.assume_init() };
-
-    let mut sub = std::mem::MaybeUninit::<qjd_cursor>::uninit();
-    let rc = unsafe { qjd_cursor_index(&c, 1, sub.as_mut_ptr()) };
-    assert_eq!(rc, 0);
-    let sub = unsafe { sub.assume_init() };
-
-    let mut pp: *const u8 = std::ptr::null();
-    let mut nn: usize = 0;
-    let empty = b"";
-    let rc = unsafe { qjd_cursor_get_str(&sub, empty.as_ptr() as *const i8, 0, &mut pp, &mut nn) };
-    assert_eq!(rc, 0);
-    assert_eq!(unsafe { std::slice::from_raw_parts(pp, nn) }, b"b");
-
-    unsafe { qjd_free(d) };
-}
-
-#[test]
-fn cursor_field_with_dotted_key() {
-    let d = parse(b"{\"a.b\":42}");
-    let mut c = std::mem::MaybeUninit::<qjd_cursor>::uninit();
-    let p = b"";
-    unsafe { qjd_open(d, p.as_ptr() as *const i8, 0, c.as_mut_ptr()) };
-    let c = unsafe { c.assume_init() };
-
-    let mut sub = std::mem::MaybeUninit::<qjd_cursor>::uninit();
-    let key = b"a.b";
-    let rc = unsafe { qjd_cursor_field(&c, key.as_ptr() as *const i8, key.len(), sub.as_mut_ptr()) };
-    assert_eq!(rc, 0);
-
-    let sub = unsafe { sub.assume_init() };
-    let mut v: i64 = 0;
-    let empty = b"";
-    let rc = unsafe { qjd_cursor_get_i64(&sub, empty.as_ptr() as *const i8, 0, &mut v) };
-    assert_eq!(rc, 0);
-    assert_eq!(v, 42);
-
-    unsafe { qjd_free(d) };
-}
-```
-
-- [ ] **Step 5: Run tests to verify pass**
-
-```sh
-cargo test ffi_cursor
-```
-
-Expected: all 3 tests pass.
-
-- [ ] **Step 6: Commit**
-
-```sh
-git add src/ffi.rs src/cursor.rs tests/ffi_cursor.rs
-git commit -m "Add qjd_cursor type and qjd_open / qjd_cursor_* FFI"
-```
-
----
-
-## Task 12: panic::catch_unwind boundary
-
-**Files:**
-- Modify: `src/ffi.rs`
-- Create: `tests/ffi_panic_safety.rs`
-
-- [ ] **Step 1: Create a wrapper macro**
-
-In `src/ffi.rs`, add at the top:
-
-```rust
-macro_rules! ffi_catch {
-    ($body:block) => {{
-        let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| $body));
-        match r {
-            Ok(code) => code,
-            Err(_)   => qjd_err::QJD_OOM as c_int,
-        }
-    }};
-}
-```
-
-- [ ] **Step 2: Wrap every `pub unsafe extern "C" fn` returning `c_int`**
-
-For each of `qjd_typeof`, `qjd_is_null`, `qjd_len`, `qjd_get_str`, `qjd_get_i64`, `qjd_get_f64`, `qjd_get_bool`, `qjd_open`, `qjd_cursor_*`, wrap their bodies:
-
-```rust
-#[no_mangle]
-pub unsafe extern "C" fn qjd_typeof(
-    doc: *mut qjd_doc, path: *const c_char, path_len: usize, type_out: *mut c_int,
-) -> c_int {
-    ffi_catch!({
-        if type_out.is_null() { return qjd_err::QJD_INVALID_ARG as c_int; }
-        // ... existing body ...
-    })
-}
-```
-
-Apply the same transformation to every FFI function returning `c_int`. Functions returning `*mut qjd_doc` or `*const c_char` are not wrapped (they cannot return error codes the same way; for `qjd_parse` we keep the existing error-out parameter and just wrap separately):
-
-```rust
-#[no_mangle]
-pub unsafe extern "C" fn qjd_parse(
-    buf: *const u8, len: usize, err_out: *mut c_int,
-) -> *mut qjd_doc {
-    let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
-        // ... existing body ...
-    }));
-    match r {
-        Ok(p) => p,
-        Err(_) => {
-            if !err_out.is_null() { *err_out = qjd_err::QJD_OOM as c_int; }
-            std::ptr::null_mut()
-        }
-    }
-}
-```
-
-- [ ] **Step 3: Add a Rust-only panic injection for testing**
-
-Add to `src/ffi.rs` (only when `cfg(test)` or behind a feature):
-
-```rust
-#[cfg(test)]
-#[no_mangle]
-pub unsafe extern "C" fn qjd_test_panic() -> c_int {
-    ffi_catch!({
-        panic!("forced panic for test");
-    })
-}
-```
-
-- [ ] **Step 4: Write `tests/ffi_panic_safety.rs`**
-
-```rust
-#[cfg(test)]
-use quickdecode::ffi::qjd_test_panic;
-
-#[test]
-fn panic_does_not_unwind_through_ffi() {
-    let rc = unsafe { qjd_test_panic() };
-    assert_eq!(rc, 8); // QJD_OOM
-}
-```
-
-- [ ] **Step 5: Run tests**
-
-```sh
-cargo test
-```
-
-Expected: all previously passing tests + panic-safety test pass. No tests panic-unwind through the FFI boundary.
-
-- [ ] **Step 6: Commit**
-
-```sh
-git add src/ffi.rs tests/ffi_panic_safety.rs
-git commit -m "Wrap FFI entry points in catch_unwind to prevent UB on panic"
-```
-
----
-
-## Task 13: Avx2Scanner — structural mask only (no string handling)
-
-This task establishes the AVX2 scaffolding and produces a correct scanner for input that contains **no strings**. Task 14 adds quote-and-escape handling. Task 15 adds the PCLMUL inside-string mask. Task 16 wires up multi-chunk state and registers in the dispatch.
-
-The implementation follows the simdjson approach. Reference: `simdjson/src/generic/stage1/json_structural_indexer.h` and `simdjson/src/generic/stage1/buf_block_reader.h` for the chunking and bitmask emission patterns.
-
-**Files:**
-- Create: `src/scan/avx2.rs`
-- Modify: `src/scan/mod.rs`
-
-- [ ] **Step 1: Create skeleton in `src/scan/avx2.rs`**
-
-```rust
-#![cfg(target_arch = "x86_64")]
-
-use core::arch::x86_64::*;
-use super::Scanner;
-
-pub(crate) struct Avx2Scanner;
-
-impl Scanner for Avx2Scanner {
-    fn scan(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
-        if buf.is_empty() { return Ok(()); }
-        out.reserve(buf.len() / 6);
-        unsafe { scan_avx2_impl(buf, out) }
-    }
-}
-
-#[target_feature(enable = "avx2,pclmulqdq")]
-unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
-    // Task 13: structural mask only; assumes no strings/escapes.
-    let mut i: usize = 0;
-    while i + 64 <= buf.len() {
-        let chunk_lo = _mm256_loadu_si256(buf.as_ptr().add(i)        as *const __m256i);
-        let chunk_hi = _mm256_loadu_si256(buf.as_ptr().add(i + 32) as *const __m256i);
-
-        let struct_mask = structural_mask_chunk(chunk_lo, chunk_hi);
-        emit_bits(struct_mask, i as u32, out);
-
-        i += 64;
-    }
-
-    // Tail: scalar fallback for the remainder.
-    super::ScalarScanner::scan(&buf[i..], &mut Vec::new()).ok();
-    // Append tail offsets (offset by i).
-    let mut tail = Vec::new();
-    super::ScalarScanner::scan(&buf[i..], &mut tail).map_err(|p| p + i)?;
-    out.extend(tail.into_iter().map(|p| p + i as u32));
-    Ok(())
-}
-
-#[inline(always)]
-unsafe fn structural_mask_chunk(lo: __m256i, hi: __m256i) -> u64 {
-    // For each byte, set 1 if byte is one of: { } [ ] : , "
-    // We use byte-wise equality compares OR'd together.
-    let chars = [b'{', b'}', b'[', b']', b':', b',', b'"'];
-    let mut mask_lo: i32 = 0;
-    let mut mask_hi: i32 = 0;
-    for c in chars {
-        let v = _mm256_set1_epi8(c as i8);
-        let eq_lo = _mm256_cmpeq_epi8(lo, v);
-        let eq_hi = _mm256_cmpeq_epi8(hi, v);
-        mask_lo |= _mm256_movemask_epi8(eq_lo);
-        mask_hi |= _mm256_movemask_epi8(eq_hi);
-    }
-    (mask_lo as u32 as u64) | ((mask_hi as u32 as u64) << 32)
-}
-
-#[inline(always)]
-fn emit_bits(mut mask: u64, base: u32, out: &mut Vec<u32>) {
-    while mask != 0 {
-        let tz = mask.trailing_zeros();
-        out.push(base + tz);
-        mask &= mask - 1; // clear lowest bit
-    }
-}
-```
-
-- [ ] **Step 2: Add a unit test in `src/scan/avx2.rs`**
-
-```rust
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::scan::{Scanner, ScalarScanner};
-
-    fn parity(input: &[u8]) {
-        let mut a = Vec::new();
-        let mut b = Vec::new();
-        ScalarScanner::scan(input, &mut a).unwrap();
-        Avx2Scanner::scan(input, &mut b).unwrap();
-        assert_eq!(a, b, "mismatch on input {:?}", std::str::from_utf8(input));
-    }
-
-    #[test]
-    fn no_strings_matches_scalar() {
-        // Pure structural inputs (no strings) — Task 13 only handles these correctly.
-        parity(b"{}");
-        parity(b"[]");
-        parity(b"[{}]");
-        parity(b"[[[]]]");
-        parity(b"[1,2,3,4,5,6,7,8,9,0]");
-        parity(b"{1:2,3:4,5:6,7:8,9:0,1:2}"); // illegal JSON keys, but valid scan
-    }
-}
-```
-
-- [ ] **Step 3: Run tests**
-
-```sh
-cargo test scan::avx2
-```
-
-Expected: tests pass on AVX2-capable hosts. Will fail to compile on non-x86_64 (gated).
-
-- [ ] **Step 4: Wire module into `src/scan/mod.rs`**
-
-```rust
-#[cfg(target_arch = "x86_64")]
-pub(crate) mod avx2;
-```
-
-- [ ] **Step 5: Commit**
-
-```sh
-git add src/scan/avx2.rs src/scan/mod.rs
-git commit -m "Add AVX2 scanner skeleton with structural mask kernel"
-```
-
----
-
-## Task 14: Avx2Scanner — quote and escape masks
-
-This task adds correct handling of strings inside the AVX2 kernel via the simdjson backslash-escape algorithm. After this task, the AVX2 scanner produces correct results on any input that fits in a single 64-byte chunk **plus** correctly handles within-chunk strings; multi-chunk state carry is added in Task 16.
-
-**Files:**
-- Modify: `src/scan/avx2.rs`
-
-- [ ] **Step 1: Add escape-aware quote mask helper**
-
-Append to `src/scan/avx2.rs`:
-
-```rust
-/// Compute the mask of escaped bytes inside a 64-byte chunk: for any backslash
-/// followed by a byte, that byte is "escaped". Consecutive backslashes alternate.
-/// Algorithm: identify odd-length backslash runs; the byte immediately after
-/// each odd run is escaped.
-#[inline(always)]
-unsafe fn find_escape_mask(backslash_mask: u64) -> u64 {
-    // From simdjson: identify starts of backslash runs.
-    let starts = backslash_mask & !(backslash_mask << 1);
-    // For each start, determine parity by xor-running. Encode start position
-    // parity via odd/even bit patterns of the runs.
-    // For chunk-local computation (no cross-chunk carry yet):
-    let even_bits: u64 = 0x5555_5555_5555_5555;
-    let odd_bits:  u64 = 0xAAAA_AAAA_AAAA_AAAA;
-    let even_starts = starts & even_bits;
-    let odd_starts  = starts & odd_bits;
-    // Use carry arithmetic to find run ends.
-    let even_carries = backslash_mask.wrapping_add(even_starts);
-    let odd_carries  = backslash_mask.wrapping_add(odd_starts);
-    let even_carry_ends = even_carries & !backslash_mask;
-    let odd_carry_ends  = odd_carries  & !backslash_mask;
-    // odd-length runs starting at even position end at odd parity;
-    // odd-length runs starting at odd position end at even parity.
-    let odd_run_ends = (even_carry_ends & odd_bits) | (odd_carry_ends & even_bits);
-    // Each odd_run_end bit is the position right after an odd-length run; the
-    // byte at that position is escaped.
-    odd_run_ends
-}
-```
-
-This computation is the standard simdjson kernel; see `find_escape_mask` in simdjson's source for the canonical derivation. For chunk-local correctness it's enough; cross-chunk carry comes in Task 16.
-
-- [ ] **Step 2: Update `scan_avx2_impl` to use quote mask**
-
-Replace `scan_avx2_impl`:
-
-```rust
-#[target_feature(enable = "avx2,pclmulqdq")]
-unsafe fn scan_avx2_impl(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
-    let mut i: usize = 0;
-    let mut in_string: u64 = 0; // 1 if chunk-start is inside a string
-
-    while i + 64 <= buf.len() {
-        let chunk_lo = _mm256_loadu_si256(buf.as_ptr().add(i)        as *const __m256i);
-        let chunk_hi = _mm256_loadu_si256(buf.as_ptr().add(i + 32) as *const __m256i);
-
-        let backslash = byte_mask(chunk_lo, chunk_hi, b'\\');
-        let quote     = byte_mask(chunk_lo, chunk_hi, b'"');
-        let escaped   = find_escape_mask(backslash);
-        let real_quote = quote & !escaped;
-
-        // inside_string_mask is computed in Task 15 via PCLMUL. For Task 14
-        // we just emit structural chars excluding bytes inside strings using
-        // a scalar in-string flag carried across this chunk only.
-        // This is a placeholder bridge — Task 15 replaces it with PCLMUL.
-        let mut inside: u64 = if in_string != 0 { !0u64 } else { 0 };
-        let mut bit = 1u64;
-        let mut in_str_cur = in_string != 0;
-        for _ in 0..64 {
-            if (real_quote & bit) != 0 {
-                in_str_cur = !in_str_cur;
-                inside ^= bit;
-            }
-            if in_str_cur { inside |= bit; } else { inside &= !bit; }
-            bit <<= 1;
-            if bit == 0 { break; }
-        }
-        in_string = if in_str_cur { 1 } else { 0 };
-
-        let struct_mask = structural_mask_chunk(chunk_lo, chunk_hi);
-        let final_mask = (struct_mask & !inside) | real_quote;
-        emit_bits(final_mask, i as u32, out);
-
-        i += 64;
-    }
-
-    // Tail
-    let mut tail = Vec::new();
-    super::ScalarScanner::scan(&buf[i..], &mut tail).map_err(|p| p + i)?;
-    out.extend(tail.into_iter().map(|p| p + i as u32));
-    Ok(())
-}
-
-#[inline(always)]
-unsafe fn byte_mask(lo: __m256i, hi: __m256i, c: u8) -> u64 {
-    let v = _mm256_set1_epi8(c as i8);
-    let eq_lo = _mm256_cmpeq_epi8(lo, v);
-    let eq_hi = _mm256_cmpeq_epi8(hi, v);
-    let mlo = _mm256_movemask_epi8(eq_lo) as u32 as u64;
-    let mhi = _mm256_movemask_epi8(eq_hi) as u32 as u64;
-    mlo | (mhi << 32)
-}
-```
-
-- [ ] **Step 3: Extend the parity tests**
-
-In the test module:
-
-```rust
-#[test]
-fn within_chunk_strings_match_scalar() {
-    parity(b"{\"a\":\"hello\"}");
-    parity(b"{\"a\":\"he\\nlo\"}");
-    parity(b"{\"a\":\"he\\\"lo\"}");
-    parity(b"[\"x\",\"y\",\"z\"]");
-}
-```
-
-- [ ] **Step 4: Run tests**
-
-```sh
-cargo test scan::avx2
-```
-
-Expected: pass on AVX2 hosts. Inputs spanning more than 64 bytes may still mismatch — that's covered in Task 16.
-
-- [ ] **Step 5: Commit**
-
-```sh
-git add src/scan/avx2.rs
-git commit -m "AVX2 scanner: chunk-local quote and escape masks"
-```
-
----
-
-## Task 15: Avx2Scanner — PCLMUL inside-string mask
-
-Replace the scalar in-string flag bridge with the PCLMUL prefix-XOR algorithm that simdjson uses. This makes the kernel branchless per chunk.
-
-**Files:**
-- Modify: `src/scan/avx2.rs`
-
-- [ ] **Step 1: Add the PCLMUL helper**
-
-In `src/scan/avx2.rs`:
-
-```rust
-/// Given the mask of true (non-escaped) quotes and the prior in-string state,
-/// return (inside_string_mask, new_in_string).
-/// Algorithm: prefix XOR via carry-less multiplication.
-#[inline(always)]
-#[target_feature(enable = "avx2,pclmulqdq")]
-unsafe fn inside_string_mask(real_quote: u64, prev_in_string: u64) -> (u64, u64) {
-    // Prefix XOR: for each bit, the result bit equals the XOR of all bits up
-    // to and including this position. Carry-less multiply by all-ones produces
-    // exactly this prefix XOR.
-    let ones = _mm_set1_epi64x(-1i64);
-    let q = _mm_set_epi64x(0, real_quote as i64);
-    let prefix = _mm_clmulepi64_si128::<0>(q, ones);
-    let mut mask = _mm_cvtsi128_si64(prefix) as u64;
-    // XOR with prior state so that a chunk starting inside a string flips polarity.
-    if prev_in_string != 0 { mask = !mask; }
-    let new_state = mask >> 63;
-    (mask, new_state & 1)
-}
-```
-
-- [ ] **Step 2: Replace the scalar bridge in `scan_avx2_impl`**
-
-Replace the `// inside_string_mask is computed in Task 15` block with:
-
-```rust
-        let (inside, new_in_string) = inside_string_mask(real_quote, in_string);
-        in_string = new_in_string;
-```
-
-(Remove the bit-by-bit scalar loop.)
-
-- [ ] **Step 3: Add tests with multi-quote strings**
-
-```rust
-#[test]
-fn pclmul_inside_string_correct() {
-    parity(b"{\"a\":\"foo\",\"b\":\"bar\"}");
-    parity(b"[\"a\",\"b\",\"c\",\"d\",\"e\"]");
-    // Adversarial: nested escapes
-    parity(b"{\"a\":\"\\\\\\\\\\\"\"}");
-}
-```
-
-- [ ] **Step 4: Run tests**
-
-```sh
-cargo test scan::avx2
-```
-
-Expected: pass on AVX2+PCLMUL hosts.
-
-- [ ] **Step 5: Commit**
-
-```sh
-git add src/scan/avx2.rs
-git commit -m "AVX2 scanner: PCLMUL prefix-XOR for inside-string mask"
-```
-
----
-
-## Task 16: Multi-chunk state, runtime dispatch, proptest cross-check
-
-**Files:**
-- Modify: `src/scan/avx2.rs`
-- Modify: `src/scan/mod.rs`
-- Modify: `src/doc.rs`
-- Create: `tests/scanner_crosscheck.rs`
-
-- [ ] **Step 1: Add cross-chunk carry to backslash escape**
-
-The chunk-local `find_escape_mask` is incorrect at chunk boundaries when a backslash run crosses the boundary. Fix:
-
-```rust
-#[inline(always)]
-unsafe fn find_escape_mask_with_carry(
-    backslash_mask: u64, prev_carry: &mut u64,
-) -> u64 {
-    let bs = backslash_mask;
-    let starts = bs & !(bs << 1 | *prev_carry);
-    let even_bits: u64 = 0x5555_5555_5555_5555;
-    let odd_bits:  u64 = 0xAAAA_AAAA_AAAA_AAAA;
-    let even_starts = starts & even_bits;
-    let odd_starts  = starts & odd_bits;
-    let even_carries = bs.wrapping_add(even_starts);
-    let odd_carries  = bs.wrapping_add(odd_starts).wrapping_add(*prev_carry);
-    let even_carry_ends = even_carries & !bs;
-    let odd_carry_ends  = odd_carries  & !bs;
-    let odd_run_ends = (even_carry_ends & odd_bits) | (odd_carry_ends & even_bits);
-    // Update carry for next chunk: 1 if the chunk ended mid-run with odd parity.
-    *prev_carry = (bs >> 63) & 1;
-    odd_run_ends
-}
-```
-
-Update `scan_avx2_impl` to keep `let mut bs_carry: u64 = 0;` across iterations and call `find_escape_mask_with_carry(backslash, &mut bs_carry)` instead of `find_escape_mask`.
-
-- [ ] **Step 2: Set up runtime dispatch in `src/scan/mod.rs`**
-
-```rust
-use once_cell::sync::OnceCell;
-
-static SCAN_FN: OnceCell<fn(&[u8], &mut Vec<u32>) -> Result<(), usize>>
-    = OnceCell::new();
-
-pub(crate) fn scan(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize> {
-    let f = *SCAN_FN.get_or_init(|| {
-        #[cfg(target_arch = "x86_64")]
-        {
-            if std::is_x86_feature_detected!("avx2")
-                && std::is_x86_feature_detected!("pclmulqdq")
-            {
-                return avx2::Avx2Scanner::scan;
-            }
-        }
-        ScalarScanner::scan
-    });
-    f(buf, out)
-}
-```
-
-- [ ] **Step 3: Wire `Document::parse` to call `scan::scan` instead of `ScalarScanner::scan`**
-
-In `src/doc.rs`:
-
-```rust
-pub fn parse(buf: &'a [u8]) -> Result<Self, qjd_err> {
-    let mut indices = Vec::new();
-    crate::scan::scan(buf, &mut indices).map_err(|_| qjd_err::QJD_PARSE_ERROR)?;
-    indices.push(u32::MAX);
-    Ok(Self {
-        buf,
-        indices,
-        scratch: Vec::new(),
-        skip: std::cell::RefCell::new(crate::skip_cache::SkipCache::new()),
-    })
-}
-```
-
-- [ ] **Step 4: Write `tests/scanner_crosscheck.rs`**
-
-```rust
-use proptest::prelude::*;
-use quickdecode::error::qjd_err;
-
-// We need access to the internal scanners. Expose them via a test-only path
-// through pub(crate). Easiest: add a pub-test export.
-//
-// In src/lib.rs add:
-//   #[doc(hidden)] pub mod __test_api {
-//       pub use crate::scan::{ScalarScanner, Scanner};
-//       #[cfg(target_arch="x86_64")] pub use crate::scan::avx2::Avx2Scanner;
-//   }
-use quickdecode::__test_api::{Scanner, ScalarScanner};
-#[cfg(target_arch = "x86_64")]
-use quickdecode::__test_api::Avx2Scanner;
-
-#[cfg(target_arch = "x86_64")]
-proptest! {
-    #![proptest_config(ProptestConfig::with_cases(2000))]
-
-    #[test]
-    fn scalar_avx2_bit_identical(input in valid_jsonish()) {
-        if !std::is_x86_feature_detected!("avx2")
-            || !std::is_x86_feature_detected!("pclmulqdq") {
-            return Ok(());
-        }
-        let mut a = Vec::new();
-        let mut b = Vec::new();
-        let ra = ScalarScanner::scan(input.as_bytes(), &mut a);
-        let rb = Avx2Scanner::scan(input.as_bytes(), &mut b);
-        prop_assert_eq!(ra.is_err(), rb.is_err(),
-            "scalar/avx2 disagree on validity for {:?}", input);
-        if ra.is_ok() {
-            prop_assert_eq!(a, b, "mismatch on {:?}", input);
-        }
-    }
-}
-
-/// Generate strings that exercise structural and quote/escape edge cases.
-fn valid_jsonish() -> impl Strategy<Value = String> {
-    // Mix of structural bytes, escape sequences, multi-byte UTF-8.
-    proptest::collection::vec(
-        prop_oneof![
-            Just("{".to_string()),
-            Just("}".to_string()),
-            Just("[".to_string()),
-            Just("]".to_string()),
-            Just(",".to_string()),
-            Just(":".to_string()),
-            Just("\"a\"".to_string()),
-            Just("\"\\\\\"".to_string()),
-            Just("\"\\\"\"".to_string()),
-            Just("\"\\u00e9\"".to_string()),
-            Just("\"中文\"".to_string()),
-            Just("123".to_string()),
-        ],
-        0..200,
-    ).prop_map(|v| v.concat())
-}
-```
-
-Also add to `src/lib.rs`:
-
-```rust
-#[doc(hidden)]
-pub mod __test_api {
-    pub use crate::scan::{ScalarScanner, Scanner};
-    #[cfg(target_arch = "x86_64")]
-    pub use crate::scan::avx2::Avx2Scanner;
-}
-```
-
-- [ ] **Step 5: Run cross-check**
-
-```sh
-cargo test scanner_crosscheck --release
-```
-
-Expected: 2000 proptest cases pass with no scalar/AVX2 divergence.
-
-- [ ] **Step 6: Commit**
-
-```sh
-git add src/scan/avx2.rs src/scan/mod.rs src/doc.rs src/lib.rs tests/scanner_crosscheck.rs
-git commit -m "AVX2 scanner cross-chunk carry, runtime dispatch, proptest cross-check"
-```
-
----
-
-## Task 17: Public C header (finalize) + LuaJIT wrapper
-
-**Files:**
-- Modify: `include/lua_quick_decode.h`
-- Create: `lua/quickdecode.lua`
-- Modify: `README.md`
-
-- [ ] **Step 1: Finalize `include/lua_quick_decode.h`**
-
-Replace placeholder with full prototypes matching the FFI surface:
-
-```c
-#ifndef LUA_QUICK_DECODE_H
-#define LUA_QUICK_DECODE_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
-    QJD_OK            = 0,
-    QJD_PARSE_ERROR   = 1,
-    QJD_NOT_FOUND     = 2,
-    QJD_TYPE_MISMATCH = 3,
-    QJD_OUT_OF_RANGE  = 4,
-    QJD_DECODE_FAILED = 5,
-    QJD_INVALID_PATH  = 6,
-    QJD_INVALID_ARG   = 7,
-    QJD_OOM           = 8
-} qjd_err;
-
-typedef enum {
-    QJD_T_NULL = 0, QJD_T_BOOL = 1, QJD_T_NUM = 2,
-    QJD_T_STR  = 3, QJD_T_ARR  = 4, QJD_T_OBJ = 5
-} qjd_type;
-
-typedef struct qjd_doc qjd_doc;
-
-typedef struct {
-    const qjd_doc* doc;
-    uint32_t       idx_start;
-    uint32_t       idx_end;
-    uint32_t       cache_slot;
-    uint32_t       _pad;
-} qjd_cursor;
-
-const char* qjd_strerror(int code);
-
-qjd_doc* qjd_parse(const uint8_t* buf, size_t len, int* err_out);
-void     qjd_free (qjd_doc* doc);
-
-int qjd_get_str  (qjd_doc*, const char* path, size_t path_len,
-                  const uint8_t** out_ptr, size_t* out_len);
-int qjd_get_i64  (qjd_doc*, const char* path, size_t path_len, int64_t* out);
-int qjd_get_f64  (qjd_doc*, const char* path, size_t path_len, double*  out);
-int qjd_get_bool (qjd_doc*, const char* path, size_t path_len, int*     out);
-int qjd_is_null  (qjd_doc*, const char* path, size_t path_len, int*     out);
-int qjd_typeof   (qjd_doc*, const char* path, size_t path_len, int*     type_out);
-int qjd_len      (qjd_doc*, const char* path, size_t path_len, size_t*  out);
-
-int qjd_open            (qjd_doc*, const char* path, size_t path_len, qjd_cursor* out);
-int qjd_cursor_open     (const qjd_cursor*, const char* path, size_t path_len, qjd_cursor* out);
-int qjd_cursor_field    (const qjd_cursor*, const char* key,  size_t key_len, qjd_cursor* out);
-int qjd_cursor_index    (const qjd_cursor*, size_t i, qjd_cursor* out);
-
-int qjd_cursor_get_str  (const qjd_cursor*, const char* path, size_t path_len,
-                         const uint8_t** out_ptr, size_t* out_len);
-int qjd_cursor_get_i64  (const qjd_cursor*, const char* path, size_t path_len, int64_t* out);
-int qjd_cursor_get_f64  (const qjd_cursor*, const char* path, size_t path_len, double*  out);
-int qjd_cursor_get_bool (const qjd_cursor*, const char* path, size_t path_len, int*     out);
-int qjd_cursor_typeof   (const qjd_cursor*, const char* path, size_t path_len, int*     out);
-int qjd_cursor_len      (const qjd_cursor*, const char* path, size_t path_len, size_t*  out);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-```
-
-- [ ] **Step 2: Create `lua/quickdecode.lua`**
-
-```lua
-local ffi = require("ffi")
-
-ffi.cdef[[
-typedef enum {
-    QJD_OK = 0, QJD_PARSE_ERROR = 1, QJD_NOT_FOUND = 2,
-    QJD_TYPE_MISMATCH = 3, QJD_OUT_OF_RANGE = 4, QJD_DECODE_FAILED = 5,
-    QJD_INVALID_PATH = 6, QJD_INVALID_ARG = 7, QJD_OOM = 8
-} qjd_err;
-
-typedef struct qjd_doc qjd_doc;
-typedef struct {
-    const qjd_doc* doc;
-    uint32_t idx_start, idx_end, cache_slot, _pad;
-} qjd_cursor;
-
-const char* qjd_strerror(int code);
-qjd_doc* qjd_parse(const uint8_t* buf, size_t len, int* err_out);
-void qjd_free(qjd_doc* doc);
-
-int qjd_get_str (qjd_doc*, const char* path, size_t path_len, const uint8_t** p, size_t* n);
-int qjd_get_i64 (qjd_doc*, const char* path, size_t path_len, int64_t* out);
-int qjd_get_f64 (qjd_doc*, const char* path, size_t path_len, double*  out);
-int qjd_get_bool(qjd_doc*, const char* path, size_t path_len, int*     out);
-int qjd_is_null (qjd_doc*, const char* path, size_t path_len, int*     out);
-int qjd_typeof  (qjd_doc*, const char* path, size_t path_len, int*     out);
-int qjd_len     (qjd_doc*, const char* path, size_t path_len, size_t*  out);
-
-int qjd_open        (qjd_doc*, const char* path, size_t path_len, qjd_cursor* out);
-int qjd_cursor_open (const qjd_cursor*, const char* path, size_t path_len, qjd_cursor* out);
-int qjd_cursor_field(const qjd_cursor*, const char* key,  size_t key_len, qjd_cursor* out);
-int qjd_cursor_index(const qjd_cursor*, size_t i, qjd_cursor* out);
-
-int qjd_cursor_get_str (const qjd_cursor*, const char*, size_t, const uint8_t**, size_t*);
-int qjd_cursor_get_i64 (const qjd_cursor*, const char*, size_t, int64_t*);
-int qjd_cursor_get_f64 (const qjd_cursor*, const char*, size_t, double*);
-int qjd_cursor_get_bool(const qjd_cursor*, const char*, size_t, int*);
-int qjd_cursor_typeof  (const qjd_cursor*, const char*, size_t, int*);
-int qjd_cursor_len     (const qjd_cursor*, const char*, size_t, size_t*);
-]]
-
-local C = ffi.load("quickdecode")
-
-local err_box  = ffi.new("int[1]")
-local i64_box  = ffi.new("int64_t[1]")
-local f64_box  = ffi.new("double[1]")
-local bool_box = ffi.new("int[1]")
-local size_box = ffi.new("size_t[1]")
-local type_box = ffi.new("int[1]")
-local strp_box = ffi.new("const uint8_t*[1]")
-local cur_box  = ffi.new("qjd_cursor[1]")
-
-local NOT_FOUND = 2
-
-local _M = {
-    T_NULL = 0, T_BOOL = 1, T_NUM = 2,
-    T_STR  = 3, T_ARR  = 4, T_OBJ = 5,
-}
-
-local Doc    = {}; Doc.__index    = Doc
-local Cursor = {}; Cursor.__index = Cursor
-
-local function check_err(rc)
-    if rc == 0 then return true end
-    if rc == NOT_FOUND then return false end
-    error("quickdecode: " .. ffi.string(C.qjd_strerror(rc)))
-end
-
-function _M.parse(json_str)
-    local ptr = C.qjd_parse(json_str, #json_str, err_box)
-    if ptr == nil then
-        error("quickdecode: " .. ffi.string(C.qjd_strerror(err_box[0])))
-    end
-    return setmetatable({
-        _ptr  = ffi.gc(ptr, C.qjd_free),
-        _hold = json_str,
-    }, Doc)
-end
-
-function Doc:get_str(path)
-    local rc = C.qjd_get_str(self._ptr, path, #path, strp_box, size_box)
-    if not check_err(rc) then return nil end
-    return ffi.string(strp_box[0], size_box[0])
-end
-
-function Doc:get_i64(path)
-    local rc = C.qjd_get_i64(self._ptr, path, #path, i64_box)
-    if not check_err(rc) then return nil end
-    return tonumber(i64_box[0])
-end
-
-function Doc:get_f64(path)
-    local rc = C.qjd_get_f64(self._ptr, path, #path, f64_box)
-    if not check_err(rc) then return nil end
-    return f64_box[0]
-end
-
-function Doc:get_bool(path)
-    local rc = C.qjd_get_bool(self._ptr, path, #path, bool_box)
-    if not check_err(rc) then return nil end
-    return bool_box[0] ~= 0
-end
-
-function Doc:is_null(path)
-    local rc = C.qjd_is_null(self._ptr, path, #path, bool_box)
-    if not check_err(rc) then return nil end
-    return bool_box[0] ~= 0
-end
-
-function Doc:typeof(path)
-    local rc = C.qjd_typeof(self._ptr, path, #path, type_box)
-    if not check_err(rc) then return nil end
-    return type_box[0]
-end
-
-function Doc:len(path)
-    local rc = C.qjd_len(self._ptr, path, #path, size_box)
-    if not check_err(rc) then return nil end
-    return tonumber(size_box[0])
-end
-
-function Doc:open(path)
-    local rc = C.qjd_open(self._ptr, path, #path, cur_box)
-    if not check_err(rc) then return nil end
-    return setmetatable({ _cur = cur_box[0], _doc = self }, Cursor)
-end
-
-function Cursor:get_str(path)
-    local rc = C.qjd_cursor_get_str(self._cur, path or "", path and #path or 0, strp_box, size_box)
-    if not check_err(rc) then return nil end
-    return ffi.string(strp_box[0], size_box[0])
-end
-
-function Cursor:get_i64(path)
-    local rc = C.qjd_cursor_get_i64(self._cur, path or "", path and #path or 0, i64_box)
-    if not check_err(rc) then return nil end
-    return tonumber(i64_box[0])
-end
-
-function Cursor:get_f64(path)
-    local rc = C.qjd_cursor_get_f64(self._cur, path or "", path and #path or 0, f64_box)
-    if not check_err(rc) then return nil end
-    return f64_box[0]
-end
-
-function Cursor:get_bool(path)
-    local rc = C.qjd_cursor_get_bool(self._cur, path or "", path and #path or 0, bool_box)
-    if not check_err(rc) then return nil end
-    return bool_box[0] ~= 0
-end
-
-function Cursor:typeof(path)
-    local rc = C.qjd_cursor_typeof(self._cur, path or "", path and #path or 0, type_box)
-    if not check_err(rc) then return nil end
-    return type_box[0]
-end
-
-function Cursor:len(path)
-    local rc = C.qjd_cursor_len(self._cur, path or "", path and #path or 0, size_box)
-    if not check_err(rc) then return nil end
-    return tonumber(size_box[0])
-end
-
-function Cursor:open(path)
-    local out = ffi.new("qjd_cursor[1]")
-    local rc = C.qjd_cursor_open(self._cur, path, #path, out)
-    if not check_err(rc) then return nil end
-    return setmetatable({ _cur = out[0], _doc = self._doc }, Cursor)
-end
-
-function Cursor:field(key)
-    local out = ffi.new("qjd_cursor[1]")
-    local rc = C.qjd_cursor_field(self._cur, key, #key, out)
-    if not check_err(rc) then return nil end
-    return setmetatable({ _cur = out[0], _doc = self._doc }, Cursor)
-end
-
-function Cursor:index(i)
-    local out = ffi.new("qjd_cursor[1]")
-    local rc = C.qjd_cursor_index(self._cur, i, out)
-    if not check_err(rc) then return nil end
-    return setmetatable({ _cur = out[0], _doc = self._doc }, Cursor)
-end
-
-return _M
-```
-
-- [ ] **Step 3: Update README with LuaJIT usage example**
-
-Append to `README.md`:
-
-```markdown
-## LuaJIT Usage
-
-```lua
-local qd = require("quickdecode")
-local doc = qd.parse(json_str)
-
--- Root-path getter:
-local model = doc:get_str("body.model")
-
--- Cursor (avoid re-walking shared prefix):
-local body = doc:open("body")
-local model = body:get_str("model")
-local temp  = body:get_f64("temperature")
-```
-```
-
-- [ ] **Step 4: Commit**
-
-```sh
-git add include/lua_quick_decode.h lua/quickdecode.lua README.md
-git commit -m "Finalize C header and add LuaJIT wrapper module"
-```
-
----
-
-## Task 18: Lua integration tests (busted) + benchmark vs lua-cjson
-
-**Files:**
-- Create: `tests/lua/basic_spec.lua`
-- Create: `tests/lua/escape_spec.lua`
-- Create: `tests/lua/cjson_compat_spec.lua`
-- Create: `benches/lua_bench.lua`
-- Create: `benches/fixtures/small_api.json`
-- Create: `benches/fixtures/medium_resp.json`
-- Modify: `README.md`
-
-- [ ] **Step 1: Write `tests/lua/basic_spec.lua`**
-
-```lua
-local qd = require("quickdecode")
-
-describe("quickdecode basic", function()
-    it("parses an object and gets a string field", function()
-        local d = qd.parse('{"a":"hello"}')
-        assert.are.equal("hello", d:get_str("a"))
-    end)
-
-    it("returns nil on missing path", function()
-        local d = qd.parse('{"a":1}')
-        assert.is_nil(d:get_str("b"))
-    end)
-
-    it("errors on type mismatch", function()
-        local d = qd.parse('{"a":1}')
-        assert.has_error(function() d:get_str("a") end)
-    end)
-
-    it("supports nested paths", function()
-        local d = qd.parse('{"body":{"model":"gpt"}}')
-        assert.are.equal("gpt", d:get_str("body.model"))
-    end)
-
-    it("supports array indexing", function()
-        local d = qd.parse('{"xs":[10,20,30]}')
-        assert.are.equal(20, d:get_i64("xs[1]"))
-    end)
-
-    it("cursor reuses shared prefix", function()
-        local d = qd.parse('{"body":{"a":1,"b":"two"}}')
-        local b = d:open("body")
-        assert.are.equal(1, b:get_i64("a"))
-        assert.are.equal("two", b:get_str("b"))
-    end)
-
-    it("typeof reports correct types", function()
-        local d = qd.parse('{"s":"x","n":1,"f":1.5,"b":true,"z":null,"a":[],"o":{}}')
-        assert.are.equal(qd.T_STR,  d:typeof("s"))
-        assert.are.equal(qd.T_NUM,  d:typeof("n"))
-        assert.are.equal(qd.T_NUM,  d:typeof("f"))
-        assert.are.equal(qd.T_BOOL, d:typeof("b"))
-        assert.are.equal(qd.T_NULL, d:typeof("z"))
-        assert.are.equal(qd.T_ARR,  d:typeof("a"))
-        assert.are.equal(qd.T_OBJ,  d:typeof("o"))
-    end)
-
-    it("len for objects and arrays", function()
-        local d = qd.parse('{"o":{"a":1,"b":2,"c":3},"a":[1,2,3,4]}')
-        assert.are.equal(3, d:len("o"))
-        assert.are.equal(4, d:len("a"))
-    end)
-end)
-```
-
-- [ ] **Step 2: Write `tests/lua/escape_spec.lua`**
-
-```lua
-local qd = require("quickdecode")
-
-describe("quickdecode strings", function()
-    it("decodes simple escape", function()
-        local d = qd.parse('{"a":"he\\nlo"}')
-        assert.are.equal("he\nlo", d:get_str("a"))
-    end)
-
-    it("decodes unicode escape", function()
-        local d = qd.parse('{"a":"\\u00e9"}')
-        assert.are.equal("\xc3\xa9", d:get_str("a"))
-    end)
-
-    it("decodes surrogate pair", function()
-        local d = qd.parse('{"a":"\\uD83D\\uDE00"}')
-        assert.are.equal("\xF0\x9F\x98\x80", d:get_str("a"))
-    end)
-
-    it("zero-copy for unescaped strings", function()
-        local d = qd.parse('{"a":"plain"}')
-        assert.are.equal("plain", d:get_str("a"))
-    end)
-end)
-```
-
-- [ ] **Step 3: Write `tests/lua/cjson_compat_spec.lua`**
-
-```lua
-local qd    = require("quickdecode")
-local cjson = require("cjson")
-
-local function expect_eq(qd_doc, cjson_obj, paths)
-    for _, p in ipairs(paths) do
-        local got = qd_doc:get_str(p) or qd_doc:get_f64(p) or qd_doc:get_bool(p)
-        -- Walk cjson result.
-        local want = cjson_obj
-        for seg in p:gmatch("[^%.]+") do
-            want = want[seg] or want[tonumber(seg)]
-        end
-        assert.are.equal(want, got, "path " .. p)
-    end
-end
-
-describe("quickdecode vs lua-cjson", function()
-    it("agrees on simple object fields", function()
-        local s = '{"a":"x","b":42,"c":1.5,"d":true}'
-        expect_eq(qd.parse(s), cjson.decode(s), {"a","b","c","d"})
-    end)
-end)
-```
-
-- [ ] **Step 4: Write `benches/fixtures/small_api.json`**
-
-A representative ~5KB JSON. Concrete content (a single LLM API request shape):
-
-```json
-{
-  "model": "gpt-4",
-  "temperature": 0.7,
-  "max_tokens": 1024,
-  "messages": [
-    {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": "Hello, how are you?"}
-  ],
-  "metadata": {
-    "user_id": "u_123",
-    "session_id": "s_abc",
-    "tags": ["a", "b", "c"]
-  }
-}
-```
-
-(Real fixture should be ~5KB; pad messages content or add more keys to reach ~5KB. Same shape for medium_resp.json at ~200KB with more messages.)
-
-- [ ] **Step 5: Write `benches/lua_bench.lua`**
-
-```lua
-package.path = package.path .. ";./lua/?.lua"
-package.cpath = package.cpath .. ";./target/release/lib?.so"
-
-local qd    = require("quickdecode")
-local cjson = require("cjson")
-
-local function read_file(p)
-    local f = assert(io.open(p, "rb"))
-    local s = f:read("*a")
-    f:close()
-    return s
-end
-
-local function bench(name, iters, fn)
-    collectgarbage("collect")
-    local mem_before = collectgarbage("count")
-    local t0 = os.clock()
-    for i = 1, iters do fn() end
-    local t1 = os.clock()
-    local mem_after = collectgarbage("count")
-    print(string.format("%-40s  %.2fms total  %.2fµs/op  +%.1fKB",
-        name, (t1 - t0) * 1000, (t1 - t0) * 1e6 / iters,
-        mem_after - mem_before))
-end
-
-local fixtures = {
-    small  = read_file("benches/fixtures/small_api.json"),
-    medium = read_file("benches/fixtures/medium_resp.json"),
-}
-
-for size, payload in pairs(fixtures) do
-    print("=== " .. size .. " (" .. #payload .. " bytes) ===")
-
-    bench("cjson.decode  + access 3 fields", 1000, function()
-        local obj = cjson.decode(payload)
-        local _ = obj.model
-        local _ = obj.temperature
-        local _ = obj.messages[1].role
-    end)
-
-    bench("quickdecode.parse + access 3 fields", 1000, function()
-        local d = qd.parse(payload)
-        local _ = d:get_str("model")
-        local _ = d:get_f64("temperature")
-        local _ = d:get_str("messages[0].role")
-    end)
-end
-```
-
-- [ ] **Step 6: Update README with how to run tests/benchmarks**
-
-Append:
-
-```markdown
-## Testing
-
-```sh
-cargo test                                    # Rust unit + integration
-cargo build --release                          # build the .so
-busted tests/lua --lpath='./lua/?.lua' \
-       --cpath='./target/release/lib?.so'      # Lua-side tests
-```
-
-## Benchmarking vs lua-cjson
-
-```sh
-cargo build --release
-luajit benches/lua_bench.lua
-```
-
-Expected: quickdecode is 3-10× faster than lua-cjson on the "decode + extract few fields" pattern. See spec §9.3 for targets.
-```
-
-- [ ] **Step 7: Run all tests**
-
-```sh
-cargo build --release
-busted tests/lua --lpath='./lua/?.lua' --cpath='./target/release/lib?.so'
-luajit benches/lua_bench.lua
-```
-
-Expected: all Lua tests pass; benchmark shows quickdecode beating cjson.
-
-- [ ] **Step 8: Commit**
-
-```sh
-git add tests/lua/ benches/ README.md
-git commit -m "Add Lua integration tests and lua-cjson benchmark"
-```
-
----
-
-## Self-Review
-
-**Spec coverage** (against `2026-05-15-rust-quick-json-decode-design.md`):
-
-| Spec section | Task(s) |
-|---|---|
-| §3.1 Module layout | Tasks 1-7, 13, 15, 17 |
-| §3.2 Data flow | Tasks 3-11 |
-| §3.3 Invariants | Tasks 6, 10 (scratch invalidate), 16 |
-| §4 C ABI types & errors | Tasks 1, 3 |
-| §4.3 qjd_parse / qjd_free | Task 3 |
-| §4.4 Root-path getters | Tasks 9, 10 |
-| §4.5 Cursor API | Task 11 |
-| §4.6 Path syntax | Task 4 |
-| §4.7 String pointer lifetime | Task 10 (scratch handling) |
-| §5 ScalarScanner | Task 2 |
-| §5 Avx2Scanner | Tasks 13-16 |
-| §5.4 Runtime dispatch | Task 16 |
-| §5.6 Shallow validation | Task 2 |
-| §6 Cursor + skip cache | Tasks 5, 6 |
-| §6.5 String escape decode | Task 7 |
-| §6.6 Number decode | Task 8 |
-| §7.3 catch_unwind | Task 12 |
-| §8 Lua wrapper | Task 17 |
-| §9 Tests / Benchmark | Tasks 16 (proptest), 18 |
-
-**Items not covered by individual tasks (acknowledged):**
-- §5.5 SmallVec fast path for <4KB — deferred per spec Roadmap.
-- §6.5 SIMD backslash search — deferred per spec Roadmap.
-- §6.6 `lexical` fast float parser — deferred per spec Roadmap.
-- §7.4 NEON backend — deferred per spec Roadmap.
-- CI workflow (GitHub Actions) — handled in deployment; not in V1 implementation plan.
-
-**Type consistency:**
-- `qjd_cursor.cache_slot` (C side) matches `SkipCache.slots` indexing in Rust (Task 6, 11).
-- `Cursor::idx_start` / `idx_end` consistent across Tasks 5, 6, 9, 10, 11.
-- FFI symbol names match header in Task 17.
-
-**No placeholders:** every step has runnable code or exact commands. AVX2 tasks (13-16) reference simdjson algorithms by name with full kernel code shown.
-
----
-
-Plan complete and saved to `docs/superpowers/plans/2026-05-15-rust-quick-json-decode.md`. Two execution options:
-
-1. **Subagent-Driven (recommended)** — I dispatch a fresh subagent per task, review between tasks, fast iteration.
-2. **Inline Execution** — Execute tasks in this session using executing-plans, batch execution with checkpoints.
-
-Which approach?
diff --git a/docs/superpowers/specs/2026-05-15-avx2-memchr-string-skip-design.md b/docs/superpowers/specs/2026-05-15-avx2-memchr-string-skip-design.md
deleted file mode 100644
index d125cc9..0000000
--- a/docs/superpowers/specs/2026-05-15-avx2-memchr-string-skip-design.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# AVX2 scanner: cheaper in-string fast path
-
-**Status**: design approved, ready for implementation plan
-**Issue**: [#5 perf(scan): memchr-based fast path for in-string content](https://github.com/membphis/lua-quick-decode/issues/5)
-**Touches**: `src/scan/avx2.rs`, `benches/lua_bench.lua`, `README.md` (Roadmap / Deferred)
-
-## Problem
-
-The AVX2 scanner's current in-string fast path (`src/scan/avx2.rs:34-43`, added in PR #3) detects when a 64-byte chunk lies fully inside a string and skips the structural-mask + PCLMUL prefix-XOR work. The condition is `in_string != 0 && real_quote == 0`, which still requires computing both the backslash mask and the escape mask before it can fire.
-
-Per-chunk cost when the current fast path *fires*:
-
-- 2 × `loadu` (free, needed for any path)
-- `backslash` byte mask: ~6 ops
-- `quote` byte mask: ~6 ops
-- `find_escape_mask_with_carry`: ~10 scalar ALU ops + several branches
-- final `real_quote == 0` test
-
-≈ 25 ops per "skip" chunk. On string-heavy payloads — e.g. a multimodal-shaped JSON whose `data` field is ~10 MB of base64 — ~95% of chunks hit this path, making it the dominant scanner cost.
-
-## Goal
-
-Lower per-chunk cost on string-interior chunks from ~25 ops to ~10 ops, by replacing the current fast-path *condition* with a cheaper probe that detects "chunk has no `"` and no `\`" directly, before computing the escape mask.
-
-Estimated speedup on a 10 MB string-heavy payload: ~3× scan-phase throughput (op-count analysis; the implementation will validate via `make bench` against a synthetic fixture).
-
-This proposal is the chunk-granularity step (Option 1 in brainstorming). Cross-chunk `memchr2` jumps for very long string interiors are deferred (see Roadmap / Deferred).
-
-## Non-goals
-
-- Touching the scalar scanner (`src/scan/scalar.rs`). The hot path for the targeted workloads is the AVX2 backend.
-- Changing validation semantics. Every byte still gets scanned for well-formedness; bracket balance still validated at end.
-- Adding a new cargo feature. The change rides on the existing `avx2` feature.
-- Cross-chunk jumps (`memchr2` jump path). Deferred — see Roadmap / Deferred.
-
-## Design
-
-### Code change
-
-Single file: `src/scan/avx2.rs::scan_avx2_impl`. The chunk loop body becomes:
-
-```rust
-while i + 64 <= buf.len() {
-    let chunk_lo = _mm256_loadu_si256(buf.as_ptr().add(i)      as *const __m256i);
-    let chunk_hi = _mm256_loadu_si256(buf.as_ptr().add(i + 32) as *const __m256i);
-
-    // in_string fast-probe: only enter when previous chunk left us inside
-    // a string. Cheap quote-or-backslash mask; if zero, the chunk is pure
-    // string interior and we can skip ALL mask computation including the
-    // escape-run scan.
-    if in_string != 0 {
-        let interesting = quote_or_backslash_mask(chunk_lo, chunk_hi);
-        if interesting == 0 {
-            // No `"` or `\` in chunk → no escapes can originate here, so
-            // bs_carry must be 0 leaving this chunk. in_string stays 1.
-            bs_carry = 0;
-            i += 64;
-            continue;
-        }
-    }
-
-    // Slow path unchanged below.
-    let backslash = byte_mask(chunk_lo, chunk_hi, b'\\');
-    let quote     = byte_mask(chunk_lo, chunk_hi, b'"');
-    let escaped   = find_escape_mask_with_carry(backslash, &mut bs_carry);
-    let real_quote = quote & !escaped;
-
-    let (inside, new_in_string) = inside_string_mask(real_quote, in_string);
-    in_string = new_in_string;
-
-    let struct_mask = structural_mask_chunk(chunk_lo, chunk_hi);
-    let final_mask = (struct_mask & !inside) | real_quote;
-
-    emit_bits(final_mask, i as u32, out);
-
-    i += 64;
-}
-```
-
-The current fast-path branch (`if in_string != 0 && real_quote == 0 { i += 64; continue; }`) is **removed**. The new probe's trigger condition is a strict subset of the old one (proof in §"Correctness" (d)), so the only chunks that lose a fast path are in-string chunks that contain backslashes but no unescaped quote; those now take the slow path, and the loop body reads more linearly.
-
-### New helper
-
-```rust
-#[inline(always)]
-unsafe fn quote_or_backslash_mask(lo: __m256i, hi: __m256i) -> u64 {
-    let vq = _mm256_set1_epi8(b'"' as i8);
-    let vb = _mm256_set1_epi8(b'\\' as i8);
-    let lo_or = _mm256_or_si256(_mm256_cmpeq_epi8(lo, vq), _mm256_cmpeq_epi8(lo, vb));
-    let hi_or = _mm256_or_si256(_mm256_cmpeq_epi8(hi, vq), _mm256_cmpeq_epi8(hi, vb));
-    let mlo = _mm256_movemask_epi8(lo_or) as u32 as u64;
-    let mhi = _mm256_movemask_epi8(hi_or) as u32 as u64;
-    mlo | (mhi << 32)
-}
-```
-
-Matches the style of existing helpers (`byte_mask`, `structural_mask_chunk`): `#[inline(always)] unsafe fn` with no explicit `#[target_feature]` annotation — the caller `scan_avx2_impl` carries `#[target_feature(enable = "avx2,pclmulqdq")]` and inlining propagates the feature set.
-
-Op count: 4 `cmpeq` + 2 vector `or` + 2 `movemask` (8 vector ops), plus 1 scalar shift + 1 scalar `or` to combine the two 32-bit halves = ~10 ops total, no branches.
-
-### Op-count comparison
-
-| chunk shape | current path | new path | delta |
-|---|---|---|---|
-| not in_string | full mask path (~25 ops, no fast path) | unchanged | 0 |
-| in_string, chunk pure string interior | ~25 ops (current fast path) | ~10 ops (new probe) | **−60%** |
-| in_string, chunk has `\` or `"` | ~25 ops slow path | ~10 ops probe + ~25 slow = ~35 | +40% |
-
-Net effect on a 10 MB base64-style payload (~95% pure-interior chunks): probe-hit case dominates; expected ~3× scan throughput. Mixed payloads with frequent escapes inside strings see a smaller win or slight regression on the in-string-with-escapes chunks; bench will measure the crossover.
-
-## Correctness
-
-The new fast path fires when `in_string == 1 ∧ chunk contains no '"' and no '\'`. We must prove that taking the branch (skip 64 bytes, set `bs_carry = 0`, keep `in_string = 1`) produces output identical to letting the slow path run.
-
-### (a) `bs_carry` leaves the chunk as 0
-
-`bs_carry` represents whether the trailing backslash run of the current chunk has odd parity (and thus escapes byte 0 of the next chunk). With `backslash == 0`:
-
-- `trailing_bs = 0` in `find_escape_mask_with_carry`
-- Falls into the `else` branch: `new_carry = 0 & 1 = 0`
-
-So slow-path `bs_carry` after this chunk is 0, regardless of incoming `bs_carry`. Setting it to 0 explicitly is equivalent.
-
-### (b) `in_string` stays 1
-
-With `real_quote == 0` (which follows from `quote == 0`), `inside_string_mask` computes:
-
-- `q = 0`, prefix-XOR via `_mm_clmulepi64_si128` = 0
-- If `prev_in_string != 0`, `mask = !0 = u64::MAX`
-- `new_state = (u64::MAX >> 63) & 1 = 1`
-
-Slow path leaves `in_string = 1`. Explicit retention is equivalent.
-
-### (c) No structural offsets are emitted for this chunk
-
-Slow path: `final_mask = (struct_mask & !inside) | real_quote`. With the whole chunk inside the string (`inside = u64::MAX`) and `real_quote = 0`, `final_mask = 0`. Zero offsets emitted. Skipping the chunk emits nothing. Equivalent.
-
-### (d) New condition is strictly narrower than current fast path
-
-Current condition `in_string != 0 ∧ real_quote == 0` fires when `quote & !escaped == 0`. New condition fires when `in_string != 0 ∧ quote == 0 ∧ backslash == 0`. Since `quote == 0 ⇒ real_quote == 0`, any chunk hit by the new path was also hit by the current fast path. The reverse is not true: an in-string chunk that contains backslashes but no unescaped quote (for example, one where every quote bit is escaped by an odd backslash run) hits the current fast path but not the new one. Those chunks now go through the slow path — correctness is unchanged, but they pay the probe plus the full slow path instead of the old late fast path (the `+40%` row in the op-count comparison).
-
-### Edge cases
-
-| scenario | behavior |
-|---|---|
-| Entering chunk with `bs_carry == 1`, chunk byte 0 is `\` | `backslash != 0` → probe miss → slow path → `pc=1` handled by `find_escape_mask_with_carry` as before |
-| Entering chunk with `bs_carry == 1`, chunk has no `"` or `\` | Probe hit → `bs_carry := 0`, equivalent to slow path's `else` branch returning `new_carry = 0` |
-| 64-aligned input ending mid-string | Unchanged — main loop exits with `i == buf.len()`, existing post-loop `if i < buf.len() ... else if in_string != 0 { return Err(buf.len()) }` still flags unterminated |
-| Non-aligned tail with `bs_carry=1` from probe-hit chunk | `bs_carry = 0` after probe hit, so `scalar_start = i` (existing logic), correct |
-
-## Bench fixture
-
-`benches/lua_bench.lua` gains a synthetic "string-heavy" scenario. **Fixture is generated at run time, not committed.**
-
-- Top-level shape: `{"id": "...", "ts": <int>, "data": "<base64-ish>"}`
-- `data` value: `QJD_BENCH_BIG_MB` MB (default 10) of characters drawn from `A-Za-z0-9+/`. Guaranteed no `"` or `\` in the payload. Deterministic seed for reproducibility.
-- Bench reports fixture size + three-run median for:
-  - `lua-cjson` full parse
-  - `quickdecode` parse + single-field extract on `data`
-
-Bench is a manual `make bench` target. **Not a CI gate.** Its output goes into the PR description and a Performance section update in `README.md`.
-
-## Tests
-
-Rust unit tests in `src/scan/avx2.rs::tests`. The host-AVX2 guard pattern (`if !host_supports_avx2() { return; }`) is preserved.
-
-| test | new / modified | purpose |
-|---|---|---|
-| `long_string_engages_skip_fastpath` | modified | bump from ~10 KB to ≥1 MB string interior — multiple probe-hit chunks in a row |
-| `long_string_with_periodic_backslash` | **new** | every ~5 chunks inject `\\n` / `\\\"` escape sequences; alternates probe-hit and slow path, asserts parity with scalar |
-| `bs_carry_one_at_pure_string_chunk_boundary` | **new** | construct prior chunk ending in odd-length backslash run (`bs_carry=1`), next chunk fully pure string interior with no `"`/`\`; assert parity (verifies §(a)) |
-| `escaped_quotes_remain_correct_with_fastpath` | unchanged | existing test, still passes |
-| `scanner_crosscheck` (proptest, `tests/scanner_crosscheck.rs`) | unchanged | 2000-case property test; if shrinking finds a regression case, `.proptest-regressions` gets committed |
-
-## CI matrix
-
-Unchanged. No new cargo features, no new test binaries.
-
-1. `cargo test --release` — exercises new path (host AVX2 required)
-2. `cargo test --release --no-default-features` — scalar-only, new code excluded by `#![cfg(target_arch = "x86_64")]` + feature gate
-3. `cargo test --features test-panic --release` — FFI panic barrier unchanged
-4. Lua busted suite under LuaJIT — unchanged
-
-## Roadmap / Deferred
-
-After landing, add to `README.md` under Roadmap / Deferred:
-
-> - **memchr2 jump for ≥N consecutive in-string chunks** — current chunk-per-chunk probe leaves ~10 vector ops/chunk on the table for very large string-interior runs (≥1 MB single string). A `memchr2(b'"', b'\\')` jump path can approach memory bandwidth; deferred until a workload that benefits clearly emerges.
-
-## Out of scope
-
-- Scalar scanner changes.
-- Auto-tuning the probe threshold or making the probe optional.
-- Reworking `find_escape_mask_with_carry` (its cost is paid only on slow-path chunks now).
-- Cross-chunk `memchr2` jumps (Option 2 from brainstorming; tracked in Roadmap).
diff --git a/docs/superpowers/specs/2026-05-15-makefile-design.md b/docs/superpowers/specs/2026-05-15-makefile-design.md
deleted file mode 100644
index c0c01a5..0000000
--- a/docs/superpowers/specs/2026-05-15-makefile-design.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# Makefile for lua-quick-decode
-
-Add a root-level `Makefile` that wraps the common Rust + LuaJIT workflows so contributors don't have to remember the env-var dance for bench/test.
-
-## Targets
-
-| Target | Action |
-|---|---|
-| `help` (default) | Print each target and its `## ` doc-comment via awk |
-| `build` | `cargo build --release` (produces `target/release/libquickdecode.so`) |
-| `test` | Depends on `build`. Runs `cargo test --release`, then `busted tests/lua --lua=$(LUAJIT) --lpath='./lua/?.lua'` with `LD_LIBRARY_PATH=$(CURDIR)/target/release` and `LUA_CPATH=$(LUA_CPATH)` exported |
-| `lint` | `cargo clippy --release --all-targets -- -D warnings` then `cargo fmt --check` |
-| `bench` | Depends on `build`. Runs `$(LUAJIT) benches/lua_bench.lua` with the same env exports as `test` |
-| `clean` | `cargo clean` |
-
-All targets are `.PHONY`.
-
-## Overridable variables
-
-```make
-LUAJIT    ?= $(shell command -v luajit 2>/dev/null || echo /usr/local/openresty/luajit/bin/luajit)
-LUA_CPATH ?= ./?.so;/usr/local/openresty/lualib/?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/openresty/luajit/lib/lua/5.1/?.so
-```
-
-- `LUAJIT` autodetects: prefers `luajit` on `PATH` (apt/CI install), falls back to OpenResty's path (local dev box).
-- `LUA_CPATH` includes OpenResty's `lualib` (where `cjson.so` lives on the local box) plus the standard LuaJIT 5.1 search paths. The default value is intentionally absolute, not appended to LuaJIT's built-in default, so the Makefile is reproducible regardless of which LuaJIT build is invoked.
-- Users override per invocation: `make bench LUAJIT=/path/to/luajit`.
-
-## Help format
-
-Each target line carries a `## description` trailing comment. The `help` target greps targets with `## ` and pretty-prints `target — description` via awk. This pattern lets `help` stay in sync automatically when a new target is added.
-
-## Out of scope
-
-- **`luacheck` lint for Lua sources.** Neither the local box nor CI has it installed; adding it now would be dead code. Track in README if/when desired.
-- **Separate `release` / `debug` build targets.** The repo only ships release artifacts (bench and Lua FFI tests both require release). Add later if a debug workflow appears.
-- **Cross-target dependency on `test` from `bench`.** Bench depends only on `build`; running tests as part of bench would slow down iterative perf work.
-
-## Non-goals
-
-- Replacing CI. The Makefile mirrors CI commands but is not invoked by CI (CI keeps its explicit steps for cache-key clarity).
-- Cross-platform. macOS/Windows are not supported; the OpenResty path defaults are Linux-specific. PRs welcome but not required for v1.
-
-## Failure modes (intentional, loud)
-
-- `busted` not installed → `test` fails with a clear `command not found`. Fix: `sudo luarocks install busted`.
-- `luajit` not on PATH and OpenResty fallback missing → `bench` and `test` fail at the luajit invocation. Fix: install LuaJIT or pass `LUAJIT=...`.
-- `target/release/libquickdecode.so` missing → impossible by construction; `bench` and `test` depend on `build`.
diff --git a/docs/superpowers/specs/2026-05-15-rust-quick-json-decode-design.md b/docs/superpowers/specs/2026-05-15-rust-quick-json-decode-design.md
deleted file mode 100644
index 196e9f1..0000000
--- a/docs/superpowers/specs/2026-05-15-rust-quick-json-decode-design.md
+++ /dev/null
@@ -1,623 +0,0 @@
-# Rust Quick JSON Decode — Design (v1)
-
-**Date:** 2026-05-15
-**Status:** Design approved, awaiting implementation plan
-**Project:** `lua-quick-decode`
-
----
-
-## 1. Purpose & Non-Goals
-
-### Purpose
-
-A Rust-implemented JSON decoder exposed to LuaJIT via FFI, optimized for the case where:
-
-- A large-ish JSON (1 KB – 64 MB) is parsed **once**.
-- The caller extracts a **small number of fields** (typically 5–20) via dynamic paths.
-- The document is then **discarded**.
-
-The library's competitive advantage over `lua-cjson` is that it skips:
-
-- Constructing a full Lua `table` for the parsed document.
-- Copying / interning every string value.
-- Allocating GC objects for every nested object/array.
-
-It does so by performing a **single fast SIMD structural scan** in Phase 1 (only recording byte offsets of structural characters) and then **lazily decoding** only the fields the caller actually requests in Phase 2.
-
-### Non-Goals
-
-- Full JSON RFC 8259 validation. We perform shallow structural validation only; value-level errors (invalid escapes, malformed numbers, invalid UTF-8 in `\u` sequences) are deferred to lazy decode and surfaced only if the offending field is accessed.
-- Building a Lua table representation. The library never produces a Lua table from JSON; callers must request fields explicitly.
-- Streaming / incremental parse. The whole input must be available as a contiguous `&[u8]` before parsing begins.
-- Thread safety. A `qjd_doc` is single-threaded. Document objects must not be shared across threads.
-- JSON encoding / serialization. Decode only.
-
----
-
-## 2. Confirmed Decisions
-
-| Aspect | Decision |
-|---|---|
-| Output artifact | Rust `cdylib` → `.so`, plus `rlib` for Rust-side tests/integration |
-| Caller binding | LuaJIT via `ffi.cdef` + `ffi.load`; **no** dependency on `lua.h` |
-| Access pattern | Fully dynamic, runtime path resolution |
-| Access frequency | One parse, few accesses, then discard |
-| Input size | 1 KB – 64 MB (32-bit offsets sufficient) |
-| Portability | Scalar fallback required; SIMD as runtime-detected acceleration |
-| Validation level | Shallow (brace/quote/structure only); value validity deferred to lazy decode |
-| Buffer ownership | Borrow `&[u8]`; `Document` holds a reference for its lifetime |
-| Field API | Both root-path (`get_str(doc, "body.model")`) and cursor (`open(doc, "body")` → sub-cursor) |
-| Error model | `errcode` return + output pointer; static `qjd_strerror(code)` for descriptions |
-| Lua wrapper | Full `quickdecode.lua` shipped as deliverable |
-| Benchmark targets | 100 KB – 1 MB / 5-20 fields; 10 MB – 64 MB / few fields |
-| Backends in v1 | `ScalarScanner` (fallback) + `Avx2Scanner` (x86_64); NEON deferred |
-
----
-
-## 3. Architecture
-
-### 3.1 Module Layout
-
-```
-src/
-├── lib.rs               — crate root, re-exports
-├── ffi.rs               — pub extern "C" symbols (C ABI layer)
-├── doc.rs               — Document type (Phase 1 + container helpers)
-├── cursor.rs            — Cursor, path resolution, skip-cache walk
-├── path.rs              — path string parse (zero-alloc iterator)
-├── error.rs             — error / type enums
-├── scan/
-│   ├── mod.rs           — Scanner trait + runtime dispatch (OnceCell-cached)
-│   ├── scalar.rs        — scalar fallback
-│   └── avx2.rs          — x86_64 AVX2 + PCLMUL (gated by `avx2` feature)
-├── decode/
-│   ├── mod.rs
-│   ├── number.rs        — lazy i64/f64 parse
-│   └── string.rs        — lazy escape decode + UTF-8 check on \u
-└── skip_cache.rs        — Phase 2 sibling-skip cache
-
-lua/
-└── quickdecode.lua      — LuaJIT wrapper module
-
-tests/
-├── integration.rs       — Rust-side C ABI tests
-└── lua/                 — busted Lua tests
-
-benches/
-├── rust_bench.rs        — Rust criterion benches
-├── lua_bench.lua        — Lua bench vs lua-cjson
-└── fixtures/            — JSON fixtures
-
-include/
-└── lua_quick_decode.h   — public C header
-
-docs/
-└── superpowers/specs/
-    └── 2026-05-15-rust-quick-json-decode-design.md   (this file)
-```
-
-### 3.2 Layered Data Flow
-
-```
-caller buf:&[u8]
-        │
-        ▼ ffi::qjd_parse
-   Document::parse
-        │
-        ▼ scan::dispatch (cached function pointer)
-   { ScalarScanner | Avx2Scanner }
-        │
-        ▼
-   indices: Vec<u32>  (Phase 1 complete)
-        │
-caller: doc:get_str("body.model")
-        │
-        ▼ path::parse  (zero-alloc iterator)
-   Cursor::resolve
-        │      │
-        │      └─► skip_cache (lazy fill)
-        ▼
-   decode::string
-        │
-        ▼
-   (ptr, len) → LuaJIT side ffi.string()
-```
-
-### 3.3 Key Invariants
-
-- `Document<'a>` borrows `'a` from the caller's input buffer; the FFI layer erases `'a` to `'static`, and the LuaJIT wrapper enforces lifetime via Lua-side strong references.
-- `indices: Vec<u32>` is write-once: filled during Phase 1, read-only thereafter.
-- `skip_cache` lives on `Document`; populated lazily during Phase 2 access.
-- `scratch: Vec<u8>` (for escape decode) lives on `Document`. **Invariant: only the most recent `get_str` result's pointer is valid.** The LuaJIT wrapper calls `ffi.string(ptr, len)` immediately to copy into a Lua string.
-- `indices` records only byte offsets, **not** token types. Type is recovered from `buf[indices[i]]`. This saves 25% memory vs storing a type tag.
-
----
-
-## 4. C ABI
-
-Public header: `include/lua_quick_decode.h`. Symbols all `extern "C"`, `#[no_mangle]`.
-
-### 4.1 Types
-
-```c
-typedef struct qjd_doc qjd_doc;   /* opaque */
-
-typedef struct {
-    const qjd_doc* doc;
-    uint32_t       idx_start;     /* opener position in doc.indices */
-    uint32_t       idx_end;       /* one past closer */
-    uint32_t       _reserved0;    /* reserved for future fast-path */
-    uint32_t       _reserved1;    /* reserved / padding */
-} qjd_cursor;   /* 24 bytes, by-value, no allocation */
-```
-
-### 4.2 Error Codes
-
-```c
-typedef enum {
-    QJD_OK              = 0,
-    QJD_PARSE_ERROR     = 1,   /* Phase 1 structural failure */
-    QJD_NOT_FOUND       = 2,   /* path does not exist */
-    QJD_TYPE_MISMATCH   = 3,   /* path target is wrong JSON type for getter */
-    QJD_OUT_OF_RANGE    = 4,   /* numeric overflow for requested integer type */
-    QJD_DECODE_FAILED   = 5,   /* malformed escape / UTF-8 / number */
-    QJD_INVALID_PATH    = 6,   /* path string syntax error */
-    QJD_INVALID_ARG     = 7,   /* NULL pointer etc. */
-    QJD_OOM             = 8,
-} qjd_err;
-
-const char* qjd_strerror(int code);   /* static; caller must not free */
-```
-
-### 4.3 Phase 1
-
-```c
-qjd_doc* qjd_parse(const uint8_t* buf, size_t len, int* err_out);
-void     qjd_free (qjd_doc* doc);
-```
-
-Returns NULL on failure with `*err_out` set. `qjd_free(NULL)` is a no-op. The caller must keep `buf` valid for the lifetime of the returned `qjd_doc`.
-
-### 4.4 Phase 2 — Root-Path API
-
-```c
-int qjd_get_str  (qjd_doc*, const char* path, size_t path_len,
-                  const uint8_t** out_ptr, size_t* out_len);
-int qjd_get_i64  (qjd_doc*, const char* path, size_t path_len, int64_t* out);
-int qjd_get_f64  (qjd_doc*, const char* path, size_t path_len, double*  out);
-int qjd_get_bool (qjd_doc*, const char* path, size_t path_len, int*     out);
-int qjd_is_null  (qjd_doc*, const char* path, size_t path_len, int*     out);
-
-typedef enum {
-    QJD_T_NULL = 0, QJD_T_BOOL = 1, QJD_T_NUM = 2,
-    QJD_T_STR  = 3, QJD_T_ARR  = 4, QJD_T_OBJ = 5,
-} qjd_type;
-int qjd_typeof   (qjd_doc*, const char* path, size_t path_len, int* type_out);
-int qjd_len      (qjd_doc*, const char* path, size_t path_len, size_t* out);
-```
-
-### 4.5 Phase 2 — Cursor API
-
-```c
-int qjd_open            (qjd_doc*, const char* path, size_t path_len, qjd_cursor* out);
-
-int qjd_cursor_get_str  (qjd_cursor*, const char* path, size_t path_len,
-                         const uint8_t** out_ptr, size_t* out_len);
-int qjd_cursor_get_i64  (qjd_cursor*, const char* path, size_t path_len, int64_t* out);
-int qjd_cursor_get_f64  (qjd_cursor*, const char* path, size_t path_len, double*  out);
-int qjd_cursor_get_bool (qjd_cursor*, const char* path, size_t path_len, int*     out);
-int qjd_cursor_typeof   (qjd_cursor*, const char* path, size_t path_len, int* out);
-int qjd_cursor_len      (qjd_cursor*, const char* path, size_t path_len, size_t* out);
-
-/* sub-cursor; key/index avoids path-string composition */
-int qjd_cursor_open     (qjd_cursor*, const char* path, size_t path_len, qjd_cursor* out);
-int qjd_cursor_field    (qjd_cursor*, const char* key,  size_t key_len, qjd_cursor* out);
-int qjd_cursor_index    (qjd_cursor*, size_t i, qjd_cursor* out);
-```
-
-### 4.6 Path Syntax
-
-```
-path     := segment ( '.' segment | '[' digit+ ']' )*
-segment  := key | '[' digit+ ']'
-key      := characters not containing '.' or '['
-```
-
-Empty path / NULL path = root.
-
-Keys containing `.` or `[` are **not supported** via path strings — use `qjd_cursor_field()` instead. Attempting to parse such a path returns `QJD_INVALID_PATH`.
-
-### 4.7 String Output Pointer Lifetime
-
-The `out_ptr` returned by `qjd_get_str` / `qjd_cursor_get_str` points to either:
-
-1. The original input buffer (when the string contains no escape sequences); or
-2. A document-internal scratch buffer (when escape decode was required).
-
-**The caller must consume the result before the next call to any `*_get_str` function on the same document.** Any subsequent `get_str` may invalidate prior pointers. The LuaJIT wrapper handles this by calling `ffi.string(ptr, len)` immediately, copying into a Lua string.
-
----
-
-## 5. Phase 1 — Structural Scan
-
-### 5.1 Goal
-
-Given `buf: &[u8]`, produce `indices: Vec<u32>` listing the byte offset of every structural character (`{`, `}`, `[`, `]`, `:`, `,`, `"`) that is **not inside a string literal**.
-
-### 5.2 Quote Handling
-
-The hard part is correctly identifying which `"` characters open/close strings versus being escaped. We use the classical SIMD algorithm (simdjson):
-
-For each 64-byte chunk:
-
-1. Build `quote_mask` (bit per byte = `"`)
-2. Build `backslash_mask` (bit per byte = `\`)
-3. Build `structural_mask` (bit per byte = one of `{}[]:,`)
-4. Compute `escaped_quote_mask` from `backslash_mask` using bit arithmetic that accounts for consecutive backslash runs (odd-length run = next char escaped; even-length = next char literal).
-5. `real_quote_mask = quote_mask & ~escaped_quote_mask`
-6. Use PCLMUL (or scalar prefix XOR on fallback) to turn `real_quote_mask` into `inside_string_mask` (1 between consecutive quote pairs).
-7. `output_mask = structural_mask & ~inside_string_mask`, plus `real_quote_mask` itself (strings' boundaries are also structural).
-8. Iterate set bits in `output_mask` and append byte offsets to `indices`.
-
-Two pieces of state carry over from one chunk to the next: whether the chunk begins inside a string, and whether the previous chunk ended in an odd-length backslash run (which escapes the first byte of this chunk).
-
-### 5.3 Backend Trait
-
-```rust
-pub(crate) trait StructScanner {
-    /// Scan `buf`, appending offsets to `out`.
-    /// On shallow validation failure (unclosed string, unmatched bracket),
-    /// returns `Err(byte_offset)` (offset not exposed in v1 errors).
-    fn scan(buf: &[u8], out: &mut Vec<u32>) -> Result<(), usize>;
-}
-
-pub(crate) struct ScalarScanner;
-#[cfg(target_arch = "x86_64")] pub(crate) struct Avx2Scanner;
-```
-
-### 5.4 Runtime Dispatch
-
-```rust
-static SCAN_FN: OnceCell<fn(&[u8], &mut Vec<u32>) -> Result<(), usize>>
-    = OnceCell::new();
-
-fn dispatch() -> fn(&[u8], &mut Vec<u32>) -> Result<(), usize> {
-    *SCAN_FN.get_or_init(|| {
-        #[cfg(target_arch = "x86_64")]
-        if is_x86_feature_detected!("avx2")
-            && is_x86_feature_detected!("pclmulqdq")
-        {
-            return Avx2Scanner::scan;
-        }
-        ScalarScanner::scan
-    })
-}
-```
-
-First call detects CPU features; subsequent calls use a cached function pointer (no `cpuid` overhead).
-
-### 5.5 Indices Capacity
-
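-Initial capacity = `buf.len() / 6` entries, i.e. it assumes roughly one structural character per 6 input bytes (≈17 %). Empirically structural characters make up 5–25 % of a typical JSON. Under-allocation triggers `Vec` doubling, costing one realloc; over-allocation wastes at most `buf.len() / 6` unused `u32` slots (4 bytes each, so up to ≈67 % of the input size).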
-
-For very small documents (< 4 KB), the wasted bytes are negligible. A stack-allocated SmallVec fast path is **deferred to Roadmap**.
-
-### 5.6 Shallow Validation Coverage
-
-Phase 1 detects and rejects:
-
-- Unclosed string at end of buffer
-- Mismatched bracket types (`{` paired with `]` etc.)
-- Unbalanced closers (more `}` than `{` etc.)
-
-Phase 1 does **not** check:
-
-- Semantic position of `:` `,` (extraneous commas, missing colons)
-- Escape sequence validity inside strings
-- UTF-8 validity (multi-byte UTF-8 cannot be confused with ASCII structural chars)
-- Number format validity
-- Duplicate keys
-
-### 5.7 Expected Throughput
-
-| Backend | Target |
-|---|---|
-| Scalar | 500 MB/s – 1 GB/s |
-| AVX2 (+ PCLMUL) | 3 – 6 GB/s |
-
----
-
-## 6. Phase 2 — Path Resolution & Cursor
-
-### 6.1 Cursor Internal Representation
-
-```rust
-#[derive(Copy, Clone)]
-pub(crate) struct Cursor<'d> {
-    doc:        &'d Document<'d>,
-    /// Slice of doc.indices covered by this cursor.
-    /// idx_start points at '{' or '['; idx_end points one past matching '}' / ']'.
-    idx_start:  u32,
-    idx_end:    u32,
-}
-```
-
-The published `qjd_cursor` carries two `_reservedN` slots beyond `idx_start`/`idx_end`; they are unused in v1 but reserved so a future per-cursor skip-cache fast-path can be added without breaking the ABI.
-
-`Cursor` is `Copy` and never allocates. `open()`, `field()`, `index()` return new cursors by value.
-
-### 6.2 Resolution Algorithm
-
-```text
-for seg in path:
-    Confirm cursor points at correct container type:
-        seg=Key  → require '{' at cursor opener; else TYPE_MISMATCH
-        seg=Idx  → require '[' at cursor opener; else TYPE_MISMATCH
-
-    Walk children of the container:
-        - If cache_slot is populated: directly read child_starts[i] /
-          probe child_starts for matching key.
-        - Otherwise: brace-counting scan from opener+1 to find each child,
-          populating cache_slot as we go (incremental fill).
-
-    On match: advance cursor to child's [idx_start, idx_end).
-    On exhaustion: NOT_FOUND.
-```
-
-### 6.3 Sibling-Skip Cache
-
-```rust
-pub(crate) struct SkipCache {
-    slots:     Vec<SkipSlot>,                  // slot 0 reserved
-    by_opener: rustc_hash::FxHashMap<u32, u32>,// opener idx → slot number
-}
-
-pub(crate) struct SkipSlot {
-    /// child_starts[i] = position in doc.indices where i-th child begins
-    /// (for object: pointing at the key's opening '"';
-    ///  for array: pointing at the value's first token).
-    child_starts: Vec<u32>,
-    /// child_ends[i] = idx_end for a Cursor pointing at the i-th child's value.
-    /// Storing this lets cache-hit lookups skip the brace-counting walk.
-    child_ends:   Vec<u32>,
-}
-```
-
-**Build-on-first-access:** when a container is entered for the first time, its `SkipSlot` is built incrementally as the resolver walks its children. The walk uses brace-counting (the cheap operation on the `indices` array, not on the original buffer). Subsequent accesses to the same container are O(N_keys) field comparisons with no brace counting.
-
-**Memory cost analysis:** worst case is when the caller enters every child of a large array (e.g. iterates 100 `messages[i]` and descends into each). Each entered container costs roughly `8 * num_children` bytes. For a 1 MB / 100-message JSON this stays below 5 MB total — acceptable. No LRU eviction in v1.
-
-### 6.4 Field-Type Dispatch
-
-Typed getters (`get_str`, `get_i64`, ...) inspect `buf[doc.indices[cursor.idx_start]]` after path resolution:
-
-| First byte | Inferred type | Behavior |
-|---|---|---|
-| `"` | string | `get_str` → decode; `get_i64`/`get_f64`/`get_bool` → TYPE_MISMATCH |
-| `0`-`9`, `-` | number | `get_i64`/`get_f64` → parse; others → TYPE_MISMATCH |
-| `t`, `f` | bool | `get_bool` → parse; others → TYPE_MISMATCH |
-| `n` | null | `is_null` → true; others → TYPE_MISMATCH |
-| `{` | object | `typeof` → OBJ; getters → TYPE_MISMATCH |
-| `[` | array | `typeof` → ARR; getters → TYPE_MISMATCH |
-
-`qjd_typeof` only inspects the first byte; no value decoding.
-
-`qjd_typeof` on a non-existent path returns `QJD_NOT_FOUND`, **not** `QJD_T_NULL`. The two are distinct.
-
-### 6.5 String Escape Decode
-
-```rust
-fn decode_string(
-    buf: &[u8], start: usize, end: usize,
-    scratch: &mut Vec<u8>,
-) -> Result<(*const u8, usize), qjd_err> {
-    // Fast path: no backslash in range → return original slice.
-    if memchr::memchr(b'\\', &buf[start..end]).is_none() {
-        return Ok((buf.as_ptr().wrapping_add(start), end - start));
-    }
-    // Slow path: decode into scratch.
-    scratch.clear();
-    // Handle: \" \\ \/ \b \f \n \r \t \u XXXX with surrogate pair join
-    // ...
-    Ok((scratch.as_ptr(), scratch.len()))
-}
-```
-
-UTF-8 validity of `\u XXXX` sequences (correct surrogate pairing) is checked here and surfaced as `QJD_DECODE_FAILED`. Other bytes are passed through without UTF-8 validation, consistent with our shallow-validation policy.
-
-A SIMD-accelerated backslash search in the fast path is **deferred to Roadmap**.
-
-### 6.6 Number Decode
-
-- `get_i64`: hand-written fast parse, accepts JSON-number integer form (`-?[0-9]+`), rejects `.`, `e`, `E`. Overflow → `QJD_OUT_OF_RANGE`.
-- `get_f64`: `core::str::FromStr` on a verified-ASCII slice. If first benchmark shows this dominating, switch to `lexical` — **deferred to Roadmap**.
-- Integers > 2⁵³ requested via `get_f64` will return with precision loss per IEEE 754 (no error). Integers > i64 range via `get_i64` return `QJD_OUT_OF_RANGE`.
-
-A "lossless integer" mode returning `int64_t` as cdata (preserving full precision on the Lua side) is **deferred to Roadmap**.
-
----
-
-## 7. Memory Management & Safety
-
-### 7.1 Document Layout
-
-```rust
-pub struct Document<'a> {
-    buf:     &'a [u8],
-    indices: Vec<u32>,      // appended sentinel u32::MAX at end
-    scratch: Vec<u8>,       // lazy; populated on first escape-decode
-    skip:    SkipCache,     // lazy; populated on first Phase 2 access
-}
-```
-
-### 7.2 Allocation Budget
-
-| Phase | Item | Count |
-|---|---|---|
-| Phase 1 | `Box<Document>` | 1 |
-| Phase 1 | `indices` initial reserve | 1 |
-| Phase 1 | `indices` doubling (worst case) | 0–2 |
-| Phase 2 | `scratch` first escape | 0 or 1 |
-| Phase 2 | `skip.slots[i].child_starts` per first-entered container | 1 each |
-| Phase 2 | path parse / cursor ops | 0 |
-
-### 7.3 FFI Safety
-
-All FFI entry points:
-
-- Reject NULL pointers with `QJD_INVALID_ARG` (no panic, no UB).
-- Trust `len` (cannot validate at runtime).
-- Wrap their body in `std::panic::catch_unwind` to prevent unwinding across the C boundary. Internal panics convert to `QJD_OOM`.
-- Use `unsafe extern "C"`.
-
-Rust internal code is panic-free in steady state: no `.unwrap()`, no `.expect()`, no array indexing where bounds aren't pre-validated. Errors propagate via `Result<_, qjd_err>` to the FFI layer.
-
-### 7.4 Lifetime Erasure
-
-The FFI layer materializes a `Document<'static>` from a `&'static [u8]` made via `slice::from_raw_parts`. The actual lifetime equals the caller's input buffer, which Rust cannot enforce. The LuaJIT wrapper (§8) enforces it by holding a strong reference to the original Lua string.
-
-### 7.5 Threading
-
-Single-threaded per `qjd_doc`. No internal locking. Documented in the public header.
-
----
-
-## 8. LuaJIT Wrapper (`lua/quickdecode.lua`)
-
-### 8.1 Responsibilities
-
-1. Declare the C ABI via `ffi.cdef`.
-2. Load the shared library via `ffi.load("quickdecode")`.
-3. Wrap raw C calls into OO-style methods on `Doc` and `Cursor`.
-4. **Strong-hold the original JSON string** to prevent GC while the document is alive.
-5. Register `qjd_free` via `ffi.gc` for automatic cleanup.
-6. Translate `QJD_NOT_FOUND` to Lua `nil`; other errors to `error(qjd_strerror(code))`.
-7. Call `ffi.string(ptr, len)` immediately on string results, eliminating the scratch-invalidate hazard.
-
-### 8.2 API Surface
-
-`Doc` methods: `get_str`, `get_i64`, `get_f64`, `get_bool`, `is_null`, `typeof`, `len`, `open(path)`.
-
-`Cursor` methods: same set + `open(path)`, `field(key)`, `index(i)`.
-
-`#cursor` via `__len` is **not** implemented (Lua 5.1 / LuaJIT compatibility). Use `cursor:len("")`.
-
-### 8.3 Output-Box Reuse
-
-Module-level pre-allocated `ffi.new` buffers (`err_box`, `i64_box`, `strp_box`, `cur_box`, ...) are reused across all calls. New cdata allocation in the hot path would abort LuaJIT traces.
-
-### 8.4 Lifetime Holding
-
-```lua
-function _M.parse(json_str)
-    local err = err_box
-    local ptr = C.qjd_parse(json_str, #json_str, err)
-    if ptr == nil then error(...) end
-    return setmetatable({
-        _ptr  = ffi.gc(ptr, C.qjd_free),
-        _hold = json_str,            -- strong ref keeps buffer alive
-    }, Doc)
-end
-```
-
-Cursors hold a back-reference to their `Doc` to prevent the `Doc` (and therefore the buffer) from being collected while cursors exist.
-
-### 8.5 Integer Precision Caveat
-
-`tonumber(int64_t)` truncates to double; values exceeding 2⁵³ lose precision silently. Documented in the wrapper. A lossless-integer mode returning cdata is on the Roadmap.
-
----
-
-## 9. Testing & Benchmarking
-
-### 9.1 Test Layers
-
-| Layer | Framework | Approx Cases |
-|---|---|---|
-| Rust unit (`#[cfg(test)]`) | `cargo test` | ~100 |
-| Rust integration (`tests/integration.rs`) | `cargo test` | ~30 |
-| Property / fuzz | `proptest`, `cargo-fuzz` | ongoing |
-| Lua integration (`tests/lua/`) | `busted` | ~50 |
-
-### 9.2 Critical Test Matrix
-
-**Phase 1 correctness:**
-- ScalarScanner vs Avx2Scanner produce **bit-identical** `indices` on the same input. Enforced by proptest cross-check.
-- Buffer length boundaries: `len % 64 ∈ {0, 1, 31, 32, 33, 63}`.
-- Pure ASCII vs multi-byte UTF-8 content in strings.
-- Adversarial escape patterns: `\\\"`, `\\\\\"`, `\\\\\\"`, long runs of backslashes.
-- Extreme depth (stack tolerance).
-- Extreme width (10K+ keys / array elements).
-
-**Phase 2 correctness:**
-- Path syntax variants and parsing failures.
-- Non-existence at each path depth.
-- Type-mismatch at each typed getter.
-- Full escape-decode coverage including surrogate pairs (`😀`).
-- Numeric boundaries: `INT64_MIN`, `INT64_MAX`, `2^63`, `1.7e308`, JSON-illegal forms.
-- Wide objects (5K keys) → skip-cache correctness.
-
-**FFI boundary:**
-- Every entry point handles NULL pointers gracefully.
-- `qjd_free(NULL)` is a no-op.
-- `qjd_parse` failure path correctly populates `err_out`.
-- Internal panic surfaces as `QJD_OOM`, not unwinding.
-
-**Lua wrapper (busted):**
-- `nil` on `NOT_FOUND`, `error()` on other failures.
-- GC of `Doc` triggers `qjd_free`.
-- Original JSON string is held against premature GC.
-- Same-fixture value-equivalence with `lua-cjson`.
-
-### 9.3 Benchmark
-
-`benches/lua_bench.lua` directly compares against `lua-cjson` on the same fixtures using `os.clock()` and `collectgarbage('count')` for allocation pressure. No busted involvement (busted overhead is unsuitable for microbenchmarks).
-
-**Fixtures:**
-- `small_api.json` (~5 KB, LLM API request shape)
-- `medium_resp.json` (~200 KB)
-- `large_dump.json` (~20 MB)
-- `deep_nest.json` (depth stress test)
-
-**Acceptance targets (first cut; revise after measurement):**
-
-| Scenario | Target | vs lua-cjson |
-|---|---|---|
-| 200 KB / 5 fields | Phase 1 ≥ 800 MB/s | 3-5× faster |
-| 20 MB / 5 fields | Phase 1 ≥ 2 GB/s (AVX2) | 5-10× faster |
-| Cursor repeated access | < 200 ns / get_str (AVX2) | — |
-
-### 9.4 CI
-
-- `cargo test --features default` (scalar + AVX2)
-- `cargo test --no-default-features` (scalar only, simulates non-AVX2 host)
-- `busted tests/lua/` after building the `.so`
-- Short fuzz runs (1–5 min) per push
-
----
-
-## 10. Roadmap / Deferred
-
-Tracked in `README.md` and to be picked up individually. Items deferred from this design:
-
-- **ARM64 NEON scanner backend** — for Apple Silicon, Graviton, 鲲鹏.
-- **SmallVec fast path for small documents** (< 4 KB) — avoid heap allocation for `indices` on tiny inputs.
-- **SIMD-accelerated backslash search** in the `decode_string` fast path.
-- **`lexical` fast float parser** if `<f64>::from_str` benchmarks as a bottleneck.
-- **Lossless 64-bit integer mode** — return cdata `int64_t` to preserve precision > 2⁵³.
-- **Skip-cache LRU eviction** — only if memory pressure on huge documents proves problematic in practice.
-- **Path-position info on Phase 1 errors** — currently only an opaque `QJD_PARSE_ERROR`.
-
----
-
-## 11. Open Questions for Implementation Plan
-
-The implementation plan (next phase) should resolve:
-
-1. Exact crate features and Cargo.toml shape (workspace vs single crate? feature flags for scalar-only builds?).
-2. Choice of `proptest` vs `quickcheck`.
-3. Whether to vendor `memchr` and `rustc-hash` or add as direct dependencies.
-4. Whether `cargo fuzz` integration runs in CI or only on-demand.
-5. Build flow for LuaJIT tests (must build `.so` first; how to chain `cargo build` → `busted`).
-
-These are tactical decisions deferred to the implementation plan.
diff --git a/include/lua_quick_decode.h b/include/lua_quick_decode.h
index 0c0c0a0..e3aeab2 100644
--- a/include/lua_quick_decode.h
+++ b/include/lua_quick_decode.h
@@ -61,6 +61,10 @@ int qjd_cursor_get_f64  (const qjd_cursor*, const char* path, size_t path_len, d
 int qjd_cursor_get_bool (const qjd_cursor*, const char* path, size_t path_len, int*     out);
 int qjd_cursor_typeof   (const qjd_cursor*, const char* path, size_t path_len, int*     out);
 int qjd_cursor_len      (const qjd_cursor*, const char* path, size_t path_len, size_t*  out);
+int qjd_cursor_bytes    (const qjd_cursor*, size_t* byte_start, size_t* byte_end);
+int qjd_cursor_object_entry_at(const qjd_cursor*, size_t i,
+                                const uint8_t** key_ptr, size_t* key_len,
+                                qjd_cursor* value_out);
 
 #ifdef __cplusplus
 }
diff --git a/lua/quickdecode.lua b/lua/quickdecode.lua
index 9675231..0851895 100644
--- a/lua/quickdecode.lua
+++ b/lua/quickdecode.lua
@@ -30,6 +30,10 @@ int qjd_cursor_get_f64 (const qjd_cursor*, const char*, size_t, double*);
 int qjd_cursor_get_bool(const qjd_cursor*, const char*, size_t, int*);
 int qjd_cursor_typeof  (const qjd_cursor*, const char*, size_t, int*);
 int qjd_cursor_len     (const qjd_cursor*, const char*, size_t, size_t*);
+int qjd_cursor_bytes(const qjd_cursor*, size_t* byte_start, size_t* byte_end);
+int qjd_cursor_object_entry_at(const qjd_cursor*, size_t i,
+                                const uint8_t** key_ptr, size_t* key_len,
+                                qjd_cursor* value_out);
 ]]
 
 local C = ffi.load("quickdecode")
@@ -178,4 +182,17 @@ function Cursor:index(i)
     return setmetatable({ _cur = cur_box[0], _doc = self._doc }, Cursor)
 end
 
+-- Lazy table API (cjson-shaped surface). See lua/quickdecode/table.lua.
+local _lazy = require("quickdecode.table")
+_M.decode         = _lazy.decode
+_M.encode         = _lazy.encode
+_M.materialize    = _lazy.materialize
+_M.pairs          = _lazy.pairs
+_M.ipairs         = _lazy.ipairs
+_M.len            = _lazy.len
+_M.null           = _lazy.null
+_M.empty_array_mt = _lazy.empty_array_mt
+_M._LazyObject    = _lazy._LazyObject
+_M._LazyArray     = _lazy._LazyArray
+
 return _M
diff --git a/lua/quickdecode/table.lua b/lua/quickdecode/table.lua
new file mode 100644
index 0000000..9a8b93d
--- /dev/null
+++ b/lua/quickdecode/table.lua
@@ -0,0 +1,556 @@
+-- Lazy table view + cjson-compatible encoder for quickdecode.
+--
+-- This module relies on the FFI cdef set up by `lua/quickdecode.lua`, so
+-- callers must `require("quickdecode")` (transitively or directly) before
+-- they require this module.
+
+local ffi = require("ffi")
+local C   = ffi.load("quickdecode")
+-- Defer the require to avoid a circular dependency when quickdecode.lua
+-- re-exports this module.  By the time _M.decode is called, quickdecode
+-- is already registered in package.loaded.
+local function get_qd()
+    return require("quickdecode")
+end
+
+-- Optional cjson bridge: reuse its sentinels when available so callers'
+-- `v == cjson.null` comparisons keep working unchanged.
+local has_cjson, cjson = pcall(require, "cjson")
+
+local _M = {}
+
+-- Resolve each sentinel independently: stock lua-cjson exposes `null` but not
+-- `empty_array_mt` (an OpenResty-fork addition), and a nil metatable sentinel
+-- would make `getmetatable(t) == _M.empty_array_mt` true for every plain table.
+_M.null = has_cjson and cjson.null
+    or setmetatable({}, { __tostring = function() return "null" end })
+_M.empty_array_mt = has_cjson and cjson.empty_array_mt
+    or { __jsontype = "array" }
+
+-- Box scratch used for one-shot FFI returns. Reused across calls to avoid
+-- per-call allocation; safe because the parent Doc / lazy view holds the
+-- buffer alive and these are read-and-copy.
+local err_box  = ffi.new("int[1]")
+local i64_box  = ffi.new("int64_t[1]")
+local f64_box  = ffi.new("double[1]")
+local bool_box = ffi.new("int[1]")
+local size_box = ffi.new("size_t[1]")
+local type_box = ffi.new("int[1]")
+local strp_box = ffi.new("const uint8_t*[1]")
+local cur_box   = ffi.new("qjd_cursor[1]")
+local child_box = ffi.new("qjd_cursor[1]")
+local sz_a      = ffi.new("size_t[1]")
+local sz_b      = ffi.new("size_t[1]")
+
+local QJD_OK        = 0
+local QJD_NOT_FOUND = 2
+local T_NULL = 0
+local T_BOOL = 1
+local T_NUM  = 2
+local T_STR  = 3
+local T_ARR  = 4
+local T_OBJ  = 5
+
+local function check(rc)
+    if rc == QJD_OK then return true end
+    if rc == QJD_NOT_FOUND then return false end
+    error("quickdecode: " .. ffi.string(C.qjd_strerror(rc)))
+end
+
+local LazyObject = {}
+local LazyArray  = {}
+
+-- Build a lazy view for a child container. src_box is a `qjd_cursor[1]`; its
+-- cursor is copied into a fresh per-view box so the view's _cur survives later
+-- overwrites of src_box. Raises rather than store a stale span from sz_a/sz_b.
+local function wrap_child(parent_view, src_box)
+    assert(check(C.qjd_cursor_bytes(src_box[0], sz_a, sz_b)),
+           "quickdecode: child byte-span failed")
+    local own_box = ffi.new("qjd_cursor[1]")
+    ffi.copy(own_box, src_box, ffi.sizeof("qjd_cursor"))
+    return {
+        _doc     = parent_view._doc,
+        _cur_box = own_box,        -- keep cdata alive
+        _cur     = own_box[0],     -- stable reference into own_box
+        _bs      = tonumber(sz_a[0]),
+        _be      = tonumber(sz_b[0]),
+    }
+end
+
+-- Decode the value at src_box[0] into a Lua value.
+-- src_box is a `qjd_cursor[1]`; for container types, a new view is created
+-- via wrap_child so the caller's box can be freely reused afterwards.
+local function decode_cursor(parent_view, src_box)
+    local trc = C.qjd_cursor_typeof(src_box[0], "", 0, type_box)
+    if not check(trc) then return nil end
+    local t = type_box[0]
+    if t == T_STR then
+        local rrc = C.qjd_cursor_get_str(src_box[0], "", 0, strp_box, size_box)
+        if not check(rrc) then return nil end
+        return ffi.string(strp_box[0], size_box[0])
+    elseif t == T_NUM then
+        local rrc = C.qjd_cursor_get_f64(src_box[0], "", 0, f64_box)
+        if not check(rrc) then return nil end
+        return f64_box[0]
+    elseif t == T_BOOL then
+        local rrc = C.qjd_cursor_get_bool(src_box[0], "", 0, bool_box)
+        if not check(rrc) then return nil end
+        return bool_box[0] ~= 0
+    elseif t == T_NULL then
+        return _M.null
+    elseif t == T_OBJ then
+        return setmetatable(wrap_child(parent_view, src_box), LazyObject)
+    elseif t == T_ARR then
+        return setmetatable(wrap_child(parent_view, src_box), LazyArray)
+    end
+    return nil
+end
+
+-- Resolve a child cursor at `key` (object) and decode it into a Lua value.
+-- Returns nil for missing keys (cjson semantics).
+-- Container results (lazy proxies) are rawset-cached into `self` so that
+-- subsequent accesses return the same Lua table object. This is required for
+-- `t.a.x = v` to propagate back: __newindex materializes `t.a` in-place, and
+-- the next `t.a` lookup retrieves the already-materialized table from the
+-- raw table rather than creating a fresh proxy.
+local function read_object_field(self, key)
+    if type(key) ~= "string" then return nil end
+    -- Use child_box so the lookup result does not alias self._cur (which is
+    -- itself stored in root_box's backing memory in the decode caller).
+    local rc = C.qjd_cursor_field(self._cur, key, #key, child_box)
+    if not check(rc) then return nil end
+    local v = decode_cursor(self, child_box)
+    -- Cache containers so identity is stable and materialization sticks.
+    if type(v) == "table" then rawset(self, key, v) end
+    return v
+end
+
+LazyObject.__index = read_object_field
+
+-- Resolve a child cursor at integer index `key` (1-based) and decode it.
+-- Returns nil for missing/out-of-range indices and non-integer keys.
+-- Container results are rawset-cached for the same identity-stability reason
+-- as read_object_field.
+local function read_array_index(self, key)
+    if type(key) ~= "number" then return nil end
+    -- 1-based external, 0-based internal
+    local i = key - 1
+    if i < 0 or i ~= math.floor(i) then return nil end
+    local rc = C.qjd_cursor_index(self._cur, i, child_box)
+    if not check(rc) then return nil end
+    local v = decode_cursor(self, child_box)
+    -- Cache containers so identity is stable and materialization sticks.
+    if type(v) == "table" then rawset(self, key, v) end
+    return v
+end
+
+LazyArray.__index = read_array_index
+
+-- pairs() step: yields object entries in source order by entry index. Container
+-- values reuse/populate the rawset cache so writes through them are not lost.
+local function lazy_object_iter(state, _prev_key)
+    local view, i = state.view, state.i
+    state.i = i + 1
+    local rc = C.qjd_cursor_object_entry_at(view._cur, i, strp_box, size_box, child_box)
+    if not check(rc) then return nil end  -- NOT_FOUND: past the last entry
+    local k = ffi.string(strp_box[0], size_box[0])
+    local hit = rawget(view, k)
+    if type(hit) == "table" and k ~= "_doc" then return k, hit end  -- "_doc": internal
+    local v = decode_cursor(view, child_box)
+    if type(v) == "table" and hit == nil and v ~= _M.null then rawset(view, k, v) end
+    return k, v
+end
+
+function LazyObject.__pairs(t)
+    return lazy_object_iter, { view = t, i = 0 }, nil
+end
+
+-- ipairs() step. Indexing the view hits the rawset cache first, else __index
+-- decodes and caches containers, so writes through yielded children persist.
+local function lazy_array_iter(state, _prev_i)
+    local key = state.i + 1
+    local v = state.view[key]
+    if v == nil then return nil end
+    state.i = key
+    return key, v
+end
+
+function LazyArray.__ipairs(t)
+    return lazy_array_iter, { view = t, i = 0 }, 0
+end
+
+function _M.ipairs(t)
+    local mt = getmetatable(t)
+    if mt == LazyArray then
+        return LazyArray.__ipairs(t)
+    end
+    return ipairs(t)
+end
+
+function _M.pairs(t)
+    local mt = getmetatable(t)
+    if mt == LazyObject then
+        return LazyObject.__pairs(t)
+    elseif mt == LazyArray then
+        return _M.ipairs(t)
+    end
+    return pairs(t)
+end
+
+local function lazy_len(self)
+    local rc = C.qjd_cursor_len(self._cur, "", 0, size_box)
+    check(rc)
+    return tonumber(size_box[0])
+end
+
+LazyObject.__len = lazy_len
+LazyArray.__len  = lazy_len
+
+-- Public fallback for `#t` on a lazy proxy. Stock LuaJIT (Lua 5.1 semantics)
+-- only invokes __len on userdata, not tables, unless built with
+-- LUAJIT_ENABLE_LUA52COMPAT (OpenResty's default). Callers on a non-compat
+-- LuaJIT must use qd.len(t), just as qd.pairs / qd.ipairs stand in for __pairs / __ipairs.
+function _M.len(t)
+    local mt = getmetatable(t)
+    if mt == LazyObject or mt == LazyArray then
+        return lazy_len(t)
+    end
+    return #t
+end
+
+-- Materialize all key/value pairs from a LazyObject view into a plain list.
+-- Returns a sequence of {k, v} pairs. The view is not mutated here; mutation
+-- happens in __newindex after the walk completes successfully.
+local function materialize_object_contents(view)
+    local i = 0
+    local pairs_out = {}
+    while true do
+        local rc = C.qjd_cursor_object_entry_at(view._cur, i, strp_box, size_box, child_box)
+        if rc == QJD_NOT_FOUND then break end
+        check(rc)
+        local k = ffi.string(strp_box[0], size_box[0])
+        local v = decode_cursor(view, child_box)
+        pairs_out[#pairs_out+1] = {k, v}
+        i = i + 1
+    end
+    return pairs_out
+end
+
+-- Materialize all elements from a LazyArray view into a plain sequence.
+-- Returns a sequence indexed 1..n. The view is not mutated here.
+local function materialize_array_contents(view)
+    local i = 0
+    local out = {}
+    while true do
+        local rc = C.qjd_cursor_index(view._cur, i, child_box)
+        if rc == QJD_NOT_FOUND then break end
+        check(rc)
+        out[i + 1] = decode_cursor(view, child_box)
+        i = i + 1
+    end
+    return out
+end
+
+-- The set of keys reserved by the lazy view bookkeeping; user-supplied JSON
+-- keys with these names would collide (minor, deferred). Centralized here so
+-- the dirty check and __newindex can share the list.
+local INTERNAL_KEYS = {
+    _doc = true, _cur_box = true, _cur = true, _bs = true, _be = true,
+}
+
+-- On first write, walk all existing key/value pairs into a plain table,
+-- strip the lazy metatable, then apply the new assignment. Any FFI error
+-- during the walk leaves `t` in its original lazy state.
+-- Existing rawget-cached entries (e.g. previously returned child proxies)
+-- are preserved so callers' references remain valid.
+LazyObject.__newindex = function(t, k, v)
+    local contents = materialize_object_contents(t)
+    -- Snapshot user-key cache BEFORE nilling internals.
+    -- Use next() for raw iteration: pairs() invokes __pairs on lazy tables,
+    -- walking the full JSON via FFI instead of the Lua-side rawget cache.
+    local cache = {}
+    local ck, cv = next(t)
+    while ck ~= nil do
+        if not INTERNAL_KEYS[ck] then
+            cache[ck] = cv
+        end
+        ck, cv = next(t, ck)
+    end
+    t._doc, t._cur_box, t._cur, t._bs, t._be = nil, nil, nil, nil, nil
+    setmetatable(t, nil)
+    for _, kv in ipairs(contents) do
+        rawset(t, kv[1], cache[kv[1]] or kv[2])
+    end
+    rawset(t, k, v)
+end
+
+-- On first write, walk all existing elements into a plain sequence,
+-- switch to empty_array_mt (no lazy machinery), then apply the assignment.
+-- Existing rawget-cached entries are preserved so callers' references remain valid.
+LazyArray.__newindex = function(t, k, v)
+    local contents = materialize_array_contents(t)
+    -- Snapshot integer-key cache BEFORE nilling internals.
+    -- Use next() for raw iteration: pairs() would invoke __pairs on lazy arrays,
+    -- walking the full JSON via FFI instead of the Lua-side rawget cache.
+    local cache = {}
+    local ck, cv = next(t)
+    while ck ~= nil do
+        if type(ck) == "number" then
+            cache[ck] = cv
+        end
+        ck, cv = next(t, ck)
+    end
+    t._doc, t._cur_box, t._cur, t._bs, t._be = nil, nil, nil, nil, nil
+    setmetatable(t, _M.empty_array_mt)
+    for i, x in ipairs(contents) do
+        rawset(t, i, cache[i] or x)
+    end
+    rawset(t, k, v)
+end
+
+function _M.decode(json_str)
+    -- Reuse the existing qd.parse path to get a Doc with stable buffer hold.
+    local doc = get_qd().parse(json_str)
+    -- Open the root cursor into cur_box, then copy into a dedicated box owned
+    -- by the view so that later child lookups (which reuse child_box) do not
+    -- alias the root cursor's backing storage.
+    local rc = C.qjd_open(doc._ptr, "", 0, cur_box)
+    if not check(rc) then
+        error("quickdecode: open root failed")
+    end
+    local root_box = ffi.new("qjd_cursor[1]")
+    ffi.copy(root_box, cur_box, ffi.sizeof("qjd_cursor"))
+    -- Determine root container kind (object/array) and wrap accordingly.
+    -- Both have meaningful byte spans for encode.
+    local trc = C.qjd_cursor_typeof(root_box[0], "", 0, type_box)
+    if not check(trc) then
+        error("quickdecode: root typeof failed")
+    end
+    local rt = type_box[0]
+    local brc = C.qjd_cursor_bytes(root_box[0], sz_a, sz_b)
+    if not check(brc) then
+        error("quickdecode: root byte-span failed")
+    end
+    local view = {
+        _doc     = doc,
+        _cur_box = root_box,   -- keep the box alive; _cur is a stable reference
+        _cur     = root_box[0],
+        _bs      = tonumber(sz_a[0]),
+        _be      = tonumber(sz_b[0]),
+    }
+    if rt == T_OBJ then
+        return setmetatable(view, LazyObject)
+    elseif rt == T_ARR then
+        return setmetatable(view, LazyArray)
+    else
+        error("quickdecode: top-level JSON value is not an object or array")
+    end
+end
+
+-- Deep-convert a lazy view into plain Lua tables; non-lazy values are returned
+-- as-is. Rawset-cached children win over fresh proxies so earlier writes survive.
+local function materialize(v)
+    local mt = (type(v) == "table") and getmetatable(v) or nil
+    if mt == LazyObject then
+        local out = {}
+        for _, kv in ipairs(materialize_object_contents(v)) do
+            local hit = not INTERNAL_KEYS[kv[1]] and rawget(v, kv[1])
+            out[kv[1]] = materialize(hit or kv[2])
+        end
+        return out
+    elseif mt == LazyArray then
+        local out = {}
+        for i, x in ipairs(materialize_array_contents(v)) do
+            out[i] = materialize(rawget(v, i) or x)
+        end
+        if #out == 0 then setmetatable(out, _M.empty_array_mt) end
+        return out
+    end
+    return v
+end
+
+_M.materialize = materialize
+
+local string_byte = string.byte
+local string_format = string.format
+
+-- Minimal JSON string escaper: escapes '"', '\' and every byte below 0x20
+-- (short forms for \n \r \t \b \f, \u00XX otherwise); other bytes pass through.
+-- A single gsub pass replaces the per-byte loop, so a string with nothing to
+-- escape no longer allocates one table slot per byte.
+local SHORT_ESCAPES = {
+    ['"']  = '\\"',  ['\\'] = '\\\\',
+    ['\n'] = '\\n',  ['\r'] = '\\r',  ['\t'] = '\\t',
+    ['\b'] = '\\b',  ['\f'] = '\\f',
+}
+
+-- Replacement callback: `c` is one matched byte that needs escaping.
+local function escape_byte(c)
+    return SHORT_ESCAPES[c] or string_format('\\u%04x', string_byte(c))
+end
+
+local function encode_string(s)
+    -- %z is NUL (Lua 5.1 / LuaJIT pattern class); \1-\031 covers 0x01..0x1F.
+    return '"' .. s:gsub('[%z\1-\031"\\]', escape_byte) .. '"'
+end
+
+-- Encode a finite number. Integral values up to 2^53 (every integer a double
+-- holds exactly) print as plain digits; anything else uses cjson's "%.14g".
+local function encode_number(n)
+    if n ~= n or n == math.huge or n == -math.huge then
+        error("qd.encode: cannot encode non-finite number")
+    end
+    local integral = n == math.floor(n) and math.abs(n) <= 9007199254740992
+    return string_format(integral and "%d" or "%.14g", n)
+end
+
+-- A lazy subtree is "dirty" if any cached descendant has been materialized
+-- (no longer carries a Lazy* metatable); only rawset-cached entries are walked.
+-- The null sentinel is a plain table when cjson is absent and is never dirty.
+local function is_dirty(v)
+    if type(v) ~= "table" or rawequal(v, _M.null) then return false end
+    local mt = getmetatable(v)
+    if mt ~= LazyObject and mt ~= LazyArray then
+        return true  -- materialized
+    end
+    -- Use next() for raw table iteration: pairs() would invoke __pairs on
+    -- lazy tables, walking the full JSON via FFI instead of the Lua cache.
+    local k, child = next(v)
+    while k ~= nil do
+        if not INTERNAL_KEYS[k] then
+            if is_dirty(child) then return true end
+        end
+        k, child = next(v, k)
+    end
+    return false
+end
+
+-- Forward declaration so encode_lazy_object_walking, encode_lazy_array_walking,
+-- and encode_array/encode_object can reference encode before its definition is
+-- complete (Lua resolves upvalues at call time, but the slot must be declared first).
+local encode
+
+-- Walk a dirty LazyObject and emit JSON, preferring cached children (which
+-- may be materialized) over freshly resolved cursors. Non-cached children
+-- emit through a fresh proxy and naturally fast-path their unmodified subtree.
+local function encode_lazy_object_walking(t)
+    local parts = {}
+    local i = 0
+    while true do
+        local rc = C.qjd_cursor_object_entry_at(t._cur, i, strp_box, size_box, child_box)
+        if rc == QJD_NOT_FOUND then break end
+        check(rc)
+        local k = ffi.string(strp_box[0], size_box[0])
+        local v
+        local cached = rawget(t, k)
+        if cached ~= nil and not INTERNAL_KEYS[k] then
+            v = cached
+        else
+            v = decode_cursor(t, child_box)
+        end
+        parts[#parts + 1] = encode_string(k) .. ":" .. encode(v)
+        i = i + 1
+    end
+    return "{" .. table.concat(parts, ",") .. "}"
+end
+
+local function encode_lazy_array_walking(t)
+    local parts = {}
+    local rc = C.qjd_cursor_len(t._cur, "", 0, size_box)
+    check(rc)
+    local n = tonumber(size_box[0])
+    for i = 0, n - 1 do
+        local irc = C.qjd_cursor_index(t._cur, i, child_box)
+        check(irc)
+        local cached = rawget(t, i + 1)
+        local v
+        if cached ~= nil then
+            v = cached
+        else
+            v = decode_cursor(t, child_box)
+        end
+        parts[#parts + 1] = encode(v)
+    end
+    return "[" .. table.concat(parts, ",") .. "]"
+end
+
+local function encode_proxy(t)
+    if not is_dirty(t) then
+        -- Fast path: no mutations — slice the original buffer bytes.
+        return t._doc._hold:sub(t._bs + 1, t._be)
+    end
+    if getmetatable(t) == LazyObject then
+        return encode_lazy_object_walking(t)
+    end
+    return encode_lazy_array_walking(t)
+end
+
+-- True when `t` carries a non-nil empty_array_mt, or is non-empty with keys exactly 1..#t.
+local function is_array(t)
+    local mt = getmetatable(t)
+    if mt ~= nil and mt == _M.empty_array_mt then return true end
+    local n = #t
+    if n == 0 then return false end
+    local count = 0
+    for k in pairs(t) do
+        if type(k) ~= "number" or k < 1 or k > n or k ~= math.floor(k) then return false end
+        count = count + 1
+    end
+    return count == n
+end
+
+local function encode_array(t)
+    local parts = {}
+    for i = 1, #t do
+        parts[i] = encode(t[i])
+    end
+    return "[" .. table.concat(parts, ",") .. "]"
+end
+
+local function encode_object(t)
+    local parts = {}
+    for k, v in pairs(t) do
+        if type(k) ~= "string" then
+            error("qd.encode: object key must be a string, got " .. type(k))
+        end
+        parts[#parts+1] = encode_string(k) .. ":" .. encode(v)
+    end
+    return "{" .. table.concat(parts, ",") .. "}"
+end
+
+encode = function(v)
+    if rawequal(v, _M.null) then
+        return "null"
+    end
+    local tv = type(v)
+    if tv == "string" then
+        return encode_string(v)
+    elseif tv == "number" then
+        return encode_number(v)
+    elseif tv == "boolean" then
+        return v and "true" or "false"
+    elseif tv == "table" then
+        local mt = getmetatable(v)
+        if mt == LazyObject or mt == LazyArray then
+            return encode_proxy(v)
+        end
+        if is_array(v) then
+            return encode_array(v)
+        end
+        return encode_object(v)
+    end
+    error("qd.encode: unsupported value type: " .. tv)
+end
+
+_M.encode = encode
+
+-- Debug convenience: tostring(lazy_view) returns the view's JSON text (the
+-- original bytes while unmodified). Callers should still use qd.encode for output.
+LazyObject.__tostring = encode_proxy
+LazyArray.__tostring  = encode_proxy
+
+-- Test-only exports for metatable identity checks.
+_M._LazyObject = LazyObject
+_M._LazyArray  = LazyArray
+
+return _M
diff --git a/src/cursor.rs b/src/cursor.rs
index cc21556..b2cd890 100644
--- a/src/cursor.rs
+++ b/src/cursor.rs
@@ -77,7 +77,27 @@ fn walk_children(doc: &Document, cur: Cursor, seg: &PathSeg) -> Result<Cursor, q
     let mut arr_idx: u32 = 0;
     let mut result: Option<Cursor> = None;
 
-    while i < end {
+    // Empty-container guard: if the byte immediately after the opener (ignoring
+    // whitespace) is the closing bracket/brace, there are no children.
+    {
+        let opener_byte_pos = doc.indices[cur.idx_start as usize] as usize;
+        let closer_byte_pos = doc.indices[end as usize] as usize;
+        let mut p = opener_byte_pos + 1;
+        while p < closer_byte_pos && matches!(doc.buf[p], b' ' | b'\t' | b'\n' | b'\r') {
+            p += 1;
+        }
+        if p == closer_byte_pos {
+            let slot = cache.slot_mut(slot_n);
+            slot.child_starts = starts;
+            slot.child_ends   = ends;
+            return Err(qjd_err::QJD_NOT_FOUND);
+        }
+    }
+
+    // Use `<= end` so trailing scalar elements (which have no structural
+    // marker of their own — `indices[end]` is the parent closer) are visited.
+    // The `b'}' | b']' => break` arm below terminates the loop after them.
+    while i <= end {
         starts.push(i);
 
         let value_idx_start = if is_obj { i + 3 } else { i };
diff --git a/src/doc.rs b/src/doc.rs
index 7c6ecda..707bb44 100644
--- a/src/doc.rs
+++ b/src/doc.rs
@@ -66,6 +66,53 @@ impl<'a> Document<'a> {
         Ok(p)
     }
 
+    /// Find the `n`-th (0-based) key/value entry of an object cursor. Returns
+    /// the `indices` position of the key's opening quote (so the caller can
+    /// decode it via the existing string-decode path) and the value's `Cursor`.
+    ///
+    /// Returns `QJD_TYPE_MISMATCH` for non-object cursors, `QJD_NOT_FOUND` when
+    /// `n` is past the end, and `QJD_PARSE_ERROR` on malformed structure.
+    pub(crate) fn nth_object_entry(&self, cur: Cursor, n: usize) -> Result<(u32, Cursor), qjd_err> {
+        let pos = self.indices[cur.idx_start as usize] as usize;
+        let b = *self.buf.get(pos).ok_or(qjd_err::QJD_PARSE_ERROR)?;
+        if b != b'{' {
+            return Err(qjd_err::QJD_TYPE_MISMATCH);
+        }
+        // Mirror cursor_len's walk, but stop at the n-th child rather than counting.
+        let closer_pos = self.indices[cur.idx_end as usize] as usize;
+        let mut p = pos + 1;
+        while p < closer_pos && matches!(self.buf[p], b' '|b'\t'|b'\n'|b'\r') {
+            p += 1;
+        }
+        if p == closer_pos {
+            return Err(qjd_err::QJD_NOT_FOUND); // empty object: only whitespace before the closer
+        }
+        let mut i = cur.idx_start + 1;
+        let end = cur.idx_end;
+        let mut count: usize = 0;
+        loop {
+            // For objects, the key occupies indices[i..=i+1] (open & close quote);
+            // the value cursor starts at i+3 (after the colon at i+2).
+            let key_idx_start = i;
+            let value_idx_start = i + 3;
+            let (cursor_end, skip_end) = crate::cursor::find_value_span(self, value_idx_start)?;
+            if count == n {
+                return Ok((key_idx_start, Cursor { idx_start: value_idx_start, idx_end: cursor_end }));
+            }
+            count += 1;
+            let after_pos = self.indices[skip_end as usize] as usize;
+            if after_pos >= self.buf.len() { return Err(qjd_err::QJD_PARSE_ERROR); }
+            match self.buf[after_pos] {
+                b',' => {
+                    i = skip_end + 1; // next key's opening quote follows the comma
+                    if i > end { return Err(qjd_err::QJD_NOT_FOUND); }
+                }
+                b'}' => return Err(qjd_err::QJD_NOT_FOUND), // hit the closer before entry `n`
+                _ => return Err(qjd_err::QJD_PARSE_ERROR),
+            }
+        }
+    }
+
     /// Count direct children of the container at `cur`.
     /// Returns QJD_TYPE_MISMATCH for non-container cursors.
     pub(crate) fn cursor_len(&self, cur: Cursor) -> Result<usize, qjd_err> {
diff --git a/src/ffi.rs b/src/ffi.rs
index 09d4094..b5110b1 100644
--- a/src/ffi.rs
+++ b/src/ffi.rs
@@ -341,19 +341,26 @@ pub unsafe extern "C" fn qjd_get_bool(
     })
 }
 
-/// Return the byte slice for a scalar value (number, true, false, null).
-/// Uses the cursor convention: cur.idx_start is the position in indices of
-/// the structural char AFTER the scalar (a separator or closer).
-unsafe fn scalar_bytes<'d>(d: &'d Document<'d>, cur: Cursor) -> Result<&'d [u8], qjd_err> {
-    // First byte: just after the previous structural char (skip whitespace).
+/// Compute the byte range of a scalar value (number / true / false / null).
+/// Uses the cursor convention: `cur.idx_start` is the position in indices
+/// of the structural char AFTER the scalar (a separator or closer); the
+/// scalar's bytes sit between `find_scalar_start(cur.idx_start)` and that
+/// structural char, with trailing whitespace stripped.
+unsafe fn scalar_byte_range(d: &Document<'_>, cur: Cursor) -> Result<(usize, usize), qjd_err> {
     let start = d.find_scalar_start(cur.idx_start)?;
-    // End byte: position of the structural char at cur.idx_start (exclusive).
     let end = d.indices[cur.idx_start as usize] as usize;
     if end < start { return Err(qjd_err::QJD_PARSE_ERROR); }
-    // Strip trailing whitespace.
     let mut e = end;
     while e > start && matches!(d.buf[e - 1], b' '|b'\t'|b'\n'|b'\r') { e -= 1; }
-    Ok(&d.buf[start..e])
+    Ok((start, e))
+}
+
+/// Return the byte slice for a scalar value (number, true, false, null).
+/// Uses the cursor convention: cur.idx_start is the position in indices of
+/// the structural char AFTER the scalar (a separator or closer).
+unsafe fn scalar_bytes<'d>(d: &'d Document<'d>, cur: Cursor) -> Result<&'d [u8], qjd_err> {
+    let (s, e) = scalar_byte_range(d, cur)?;
+    Ok(&d.buf[s..e])
 }
 
 // ── qjd_cursor type and cursor-based FFI ────────────────────────────────────
@@ -668,6 +675,105 @@ pub unsafe extern "C" fn qjd_cursor_len(
     })
 }
 
+/// Write the byte range `[byte_start, byte_end)` of the original buffer that
+/// the cursor's value occupies. For containers and strings the range runs from
+/// the opener (`{`, `[`, `"`) through the matching closer, so `byte_end` is one
+/// past the close char. For scalars, surrounding whitespace and separators
+/// are excluded (see `scalar_byte_range`). NULL out pointers → `QJD_INVALID_ARG`.
+///
+/// # Safety
+///
+/// See the module-level [shared safety contract](self#shared-safety-contract).
+/// `c` must point to a cursor produced by an earlier `qjd_*` call whose
+/// document is still alive; `byte_start` and `byte_end` must be non-NULL
+/// and writable.
+#[no_mangle]
+pub unsafe extern "C" fn qjd_cursor_bytes(
+    c: *const qjd_cursor, byte_start: *mut usize, byte_end: *mut usize,
+) -> c_int {
+    ffi_catch!({
+        if byte_start.is_null() || byte_end.is_null() {
+            return qjd_err::QJD_INVALID_ARG as c_int;
+        }
+        let (d, cur) = match cursor_to_internal(c) {
+            Ok(x) => x, Err(e) => return e as c_int,
+        };
+        let pos = d.indices[cur.idx_start as usize] as usize; // byte offset of the cursor's first structural char
+        let lead = match d.buf.get(pos) {
+            Some(b) => *b,
+            None => return qjd_err::QJD_PARSE_ERROR as c_int,
+        };
+        match lead {
+            b'{' | b'[' | b'"' => {
+                // Container or string: span runs from opener to the matching
+                // closer, inclusive.
+                let end = d.indices[cur.idx_end as usize] as usize;
+                if end >= d.buf.len() {
+                    return qjd_err::QJD_PARSE_ERROR as c_int;
+                }
+                *byte_start = pos;
+                *byte_end = end + 1; // exclusive end: one past the closer
+                qjd_err::QJD_OK as c_int
+            }
+            _ => {
+                // Scalar: `lead` is the separator/closer after it; use scalar_byte_range.
+                let (s, e) = match scalar_byte_range(d, cur) {
+                    Ok(x) => x, Err(e) => return e as c_int,
+                };
+                *byte_start = s;
+                *byte_end = e;
+                qjd_err::QJD_OK as c_int
+            }
+        }
+    })
+}
+
+/// Write the i-th object entry's key (decoded into the doc's scratch
+/// buffer) and value cursor into the out parameters.
+///
+/// Returns `QJD_INVALID_ARG` for a NULL out pointer, `QJD_TYPE_MISMATCH` when
+/// the cursor is not an object, or `QJD_NOT_FOUND` when `i` (0-based) is past the end.
+///
+/// # Safety
+///
+/// See the module-level [shared safety contract](self#shared-safety-contract).
+/// `c` must point to a live cursor; `key_ptr`, `key_len`, and `value_out`
+/// must be non-NULL and writable. The `(*key_ptr, *key_len)` pair is
+/// invalidated by the next `qjd_get_str` / `qjd_cursor_get_str` /
+/// `qjd_cursor_object_entry_at` call on the same document (scratch reuse).
+#[no_mangle]
+pub unsafe extern "C" fn qjd_cursor_object_entry_at(
+    c: *const qjd_cursor, i: usize,
+    key_ptr: *mut *const u8, key_len: *mut usize,
+    value_out: *mut qjd_cursor,
+) -> c_int {
+    ffi_catch!({
+        if key_ptr.is_null() || key_len.is_null() || value_out.is_null() {
+            return qjd_err::QJD_INVALID_ARG as c_int;
+        }
+        let (d, cur) = match cursor_to_internal(c) {
+            Ok(x) => x, Err(e) => return e as c_int,
+        };
+        let (key_idx_start, value_cur) = match d.nth_object_entry(cur, i) {
+            Ok(x) => x, Err(e) => return e as c_int,
+        };
+        // Decode the key: it sits at indices[key_idx_start..=key_idx_start+1]
+        // — open quote at key_idx_start, close quote at key_idx_start+1.
+        let open_pos = d.indices[key_idx_start as usize] as usize;
+        let close_pos = d.indices[(key_idx_start + 1) as usize] as usize;
+        let mut scratch = d.scratch.borrow_mut();
+        match string::decode_string(d.buf, open_pos + 1, close_pos, &mut scratch) {
+            Ok((p, n)) => { // out parameters are written only on success
+                *key_ptr = p;
+                *key_len = n;
+                *value_out = internal_to_cursor((*c).doc, value_cur);
+                qjd_err::QJD_OK as c_int
+            }
+            Err(e) => e as c_int,
+        }
+    })
+}
+
 /// Test-only export that forces a Rust panic to verify the FFI panic barrier
 /// converts it to `QJD_OOM` instead of unwinding across the boundary.
 ///
diff --git a/src/lib.rs b/src/lib.rs
index 43a07ca..90215cd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,4 @@
 //! lua-quick-decode: Rust JSON decoder for LuaJIT FFI consumers.
-//! See docs/superpowers/specs/2026-05-15-rust-quick-json-decode-design.md
 
 pub mod error;
 pub(crate) mod scan;
diff --git a/tests/ffi_cursor.rs b/tests/ffi_cursor.rs
index dde276d..722bc42 100644
--- a/tests/ffi_cursor.rs
+++ b/tests/ffi_cursor.rs
@@ -79,3 +79,53 @@ fn cursor_field_with_dotted_key() {
 
     unsafe { qjd_free(d) };
 }
+
+// Regression: walk_children must visit the trailing scalar (last element has no
+// structural marker of its own — `indices[end]` is the parent closer).
+// Before the fix, `while i < end` stopped one step early and index 2 returned
+// QJD_NOT_FOUND for a 3-element all-scalar array.
+#[test]
+fn walk_children_trailing_scalar_integer() {
+    let d = parse(b"[10,20,30]");
+    let mut c = std::mem::MaybeUninit::<qjd_cursor>::uninit();
+    let empty = b"";
+    let rc = unsafe { qjd_open(d, empty.as_ptr() as *const i8, 0, c.as_mut_ptr()) };
+    assert_eq!(rc, 0, "qjd_open must succeed before assume_init (else `c` is uninitialized)");
+    let c = unsafe { c.assume_init() };
+    // Index 2 is the trailing element `30`.
+    let mut sub = std::mem::MaybeUninit::<qjd_cursor>::uninit();
+    let rc = unsafe { qjd_cursor_index(&c, 2, sub.as_mut_ptr()) };
+    assert_eq!(rc, 0, "qjd_cursor_index([2]) must succeed");
+    let sub = unsafe { sub.assume_init() };
+
+    let mut v: i64 = 0;
+    let rc = unsafe { qjd_cursor_get_i64(&sub, empty.as_ptr() as *const i8, 0, &mut v) };
+    assert_eq!(rc, 0, "qjd_cursor_get_i64 on trailing element must succeed");
+    assert_eq!(v, 30);
+
+    unsafe { qjd_free(d) };
+}
+
+// Regression: trailing scalar with non-numeric type — ensures walk_children
+// visits it and type information is correct (not silently skipped).
+#[test]
+fn walk_children_trailing_scalar_bool() {
+    let d = parse(b"[1,\"x\",true]");
+    let mut c = std::mem::MaybeUninit::<qjd_cursor>::uninit();
+    let empty = b"";
+    let rc = unsafe { qjd_open(d, empty.as_ptr() as *const i8, 0, c.as_mut_ptr()) };
+    assert_eq!(rc, 0, "qjd_open must succeed before assume_init (else `c` is uninitialized)");
+    let c = unsafe { c.assume_init() };
+    // Index 2 is the trailing element `true`.
+    let mut sub = std::mem::MaybeUninit::<qjd_cursor>::uninit();
+    let rc = unsafe { qjd_cursor_index(&c, 2, sub.as_mut_ptr()) };
+    assert_eq!(rc, 0, "qjd_cursor_index([2]) must succeed");
+    let sub = unsafe { sub.assume_init() };
+
+    let mut b: c_int = -1;
+    let rc = unsafe { qjd_cursor_get_bool(&sub, empty.as_ptr() as *const i8, 0, &mut b) };
+    assert_eq!(rc, 0, "qjd_cursor_get_bool on trailing `true` must succeed");
+    assert_eq!(b, 1);
+
+    unsafe { qjd_free(d) };
+}
diff --git a/tests/ffi_cursor_bytes.rs b/tests/ffi_cursor_bytes.rs
new file mode 100644
index 0000000..c7d4821
--- /dev/null
+++ b/tests/ffi_cursor_bytes.rs
@@ -0,0 +1,90 @@
+use std::os::raw::c_int;
+use std::ptr;
+
+use quickdecode::error::qjd_err;
+use quickdecode::ffi::{
+    qjd_cursor, qjd_cursor_bytes, qjd_cursor_field, qjd_doc, qjd_free, qjd_open, qjd_parse,
+};
+
+unsafe fn open_root(json: &[u8]) -> (*mut qjd_doc, qjd_cursor) {
+    let mut err: c_int = -1;
+    let doc = qjd_parse(json.as_ptr(), json.len(), &mut err);
+    assert!(!doc.is_null());
+    let mut cur: qjd_cursor = std::mem::zeroed();
+    let rc = qjd_open(doc, ptr::null(), 0, &mut cur);
+    assert_eq!(rc, 0);
+    (doc, cur)
+}
+
+#[test]
+fn bytes_of_root_object_covers_full_json() {
+    let json = br#"{"a":1,"b":[2,3]}"#;
+    unsafe {
+        let (doc, cur) = open_root(json);
+        let mut bs: usize = 0;
+        let mut be: usize = 0;
+        let rc = qjd_cursor_bytes(&cur, &mut bs, &mut be);
+        assert_eq!(rc, 0);
+        assert_eq!(&json[bs..be], json.as_ref());
+        qjd_free(doc);
+    }
+}
+
+#[test]
+fn bytes_of_string_value_is_quoted_span() {
+    let json = br#"{"k":"hello"}"#;
+    unsafe {
+        let (doc, root) = open_root(json);
+        let mut child: qjd_cursor = std::mem::zeroed();
+        let rc = qjd_cursor_field(&root, b"k".as_ptr() as *const i8, 1, &mut child);
+        assert_eq!(rc, 0);
+        let mut bs: usize = 0;
+        let mut be: usize = 0;
+        let rc = qjd_cursor_bytes(&child, &mut bs, &mut be);
+        assert_eq!(rc, 0);
+        assert_eq!(&json[bs..be], br#""hello""#);
+        qjd_free(doc);
+    }
+}
+
+#[test]
+fn bytes_of_number_value_strips_separators() {
+    let json = br#"{"k": 42 ,"x":1}"#;
+    unsafe {
+        let (doc, root) = open_root(json);
+        let mut child: qjd_cursor = std::mem::zeroed();
+        let rc = qjd_cursor_field(&root, b"k".as_ptr() as *const i8, 1, &mut child);
+        assert_eq!(rc, 0);
+        let mut bs: usize = 0;
+        let mut be: usize = 0;
+        let rc = qjd_cursor_bytes(&child, &mut bs, &mut be);
+        assert_eq!(rc, 0);
+        assert_eq!(&json[bs..be], b"42");
+        qjd_free(doc);
+    }
+}
+
+#[test]
+fn bytes_with_null_out_pointer_returns_invalid_arg() {
+    let json = br#"{"a":1}"#;
+    unsafe {
+        let (doc, root) = open_root(json);
+        let rc = qjd_cursor_bytes(&root, ptr::null_mut(), ptr::null_mut());
+        assert_eq!(rc, qjd_err::QJD_INVALID_ARG as c_int);
+        qjd_free(doc);
+    }
+}
+
+#[test]
+fn bytes_of_root_array_covers_full_json() {
+    let json = br#"[1,"two",true]"#;
+    unsafe {
+        let (doc, cur) = open_root(json);
+        let mut bs: usize = 0;
+        let mut be: usize = 0;
+        let rc = qjd_cursor_bytes(&cur, &mut bs, &mut be);
+        assert_eq!(rc, 0);
+        assert_eq!(&json[bs..be], json.as_ref());
+        qjd_free(doc);
+    }
+}
diff --git a/tests/ffi_object_iter.rs b/tests/ffi_object_iter.rs
new file mode 100644
index 0000000..622abf8
--- /dev/null
+++ b/tests/ffi_object_iter.rs
@@ -0,0 +1,81 @@
+use std::os::raw::c_int;
+use std::ptr;
+
+use quickdecode::ffi::{
+    qjd_cursor, qjd_cursor_object_entry_at, qjd_doc, qjd_free, qjd_open, qjd_parse,
+};
+
+unsafe fn open_root(json: &[u8]) -> (*mut qjd_doc, qjd_cursor) {
+    let mut err: c_int = -1;
+    let doc = qjd_parse(json.as_ptr(), json.len(), &mut err);
+    assert!(!doc.is_null(), "qjd_parse failed with err={}", err);
+    let mut cur: qjd_cursor = std::mem::zeroed();
+    assert_eq!(qjd_open(doc, ptr::null(), 0, &mut cur), 0, "qjd_open(root) failed"); // never hand back a cursor that was not opened
+    (doc, cur)
+}
+
+unsafe fn entry_at(root: &qjd_cursor, i: usize) -> (String, qjd_cursor) {
+    let mut kp: *const u8 = ptr::null();
+    let mut kn: usize = 0;
+    let mut vc: qjd_cursor = std::mem::zeroed();
+    let rc = qjd_cursor_object_entry_at(root, i, &mut kp, &mut kn, &mut vc);
+    assert_eq!(rc, 0, "entry_at({}) failed with rc={}", i, rc);
+    let key = std::slice::from_raw_parts(kp, kn);
+    (String::from_utf8(key.to_vec()).unwrap(), vc)
+}
+
+#[test]
+fn three_keys_in_order() {
+    let json = br#"{"a":1,"b":"x","c":[2,3]}"#;
+    unsafe {
+        let (doc, root) = open_root(json);
+        let (k0, _) = entry_at(&root, 0);
+        let (k1, _) = entry_at(&root, 1);
+        let (k2, _) = entry_at(&root, 2);
+        assert_eq!(k0, "a");
+        assert_eq!(k1, "b");
+        assert_eq!(k2, "c");
+        qjd_free(doc);
+    }
+}
+
+#[test]
+fn key_with_escape_decodes() {
+    // The key `"a\nb"` (3 chars: a, newline, b) — verifies the FFI runs the
+    // string-decode scratch path rather than handing back raw escaped bytes.
+    let json = b"{\"a\\nb\":1}";
+    unsafe {
+        let (doc, root) = open_root(json);
+        let (k0, _) = entry_at(&root, 0);
+        assert_eq!(k0, "a\nb");
+        qjd_free(doc);
+    }
+}
+
+#[test]
+fn out_of_range_returns_not_found() {
+    let json = br#"{"a":1}"#;
+    unsafe {
+        let (doc, root) = open_root(json);
+        let mut kp: *const u8 = ptr::null();
+        let mut kn: usize = 0;
+        let mut vc: qjd_cursor = std::mem::zeroed();
+        let rc = qjd_cursor_object_entry_at(&root, 5, &mut kp, &mut kn, &mut vc);
+        assert_eq!(rc, quickdecode::error::qjd_err::QJD_NOT_FOUND as c_int); // named code, not a magic number
+        qjd_free(doc);
+    }
+}
+
+#[test]
+fn array_cursor_returns_type_mismatch() {
+    let json = br#"[1,2,3]"#;
+    unsafe {
+        let (doc, root) = open_root(json);
+        let mut kp: *const u8 = ptr::null();
+        let mut kn: usize = 0;
+        let mut vc: qjd_cursor = std::mem::zeroed();
+        let rc = qjd_cursor_object_entry_at(&root, 0, &mut kp, &mut kn, &mut vc);
+        assert_eq!(rc, quickdecode::error::qjd_err::QJD_TYPE_MISMATCH as c_int); // named code, not a magic number
+        qjd_free(doc);
+    }
+}
diff --git a/tests/lua/lazy_table_spec.lua b/tests/lua/lazy_table_spec.lua
new file mode 100644
index 0000000..b0bf909
--- /dev/null
+++ b/tests/lua/lazy_table_spec.lua
@@ -0,0 +1,394 @@
+local qd = require("quickdecode")
+local qt = qd                 -- keep tests reading naturally
+
+describe("LazyObject __index — scalars", function()
+    it("reads a string field", function()
+        local t = qt.decode('{"k":"hello"}')
+        assert.are.equal("hello", t.k)
+    end)
+
+    it("reads a number field", function()
+        local t = qt.decode('{"n":42.5}')
+        assert.are.equal(42.5, t.n)
+    end)
+
+    it("reads a boolean field", function()
+        local t = qt.decode('{"b":true,"c":false}')
+        assert.is_true(t.b)
+        assert.is_false(t.c)
+    end)
+
+    it("returns nil for missing key", function()
+        local t = qt.decode('{"a":1}')
+        assert.is_nil(t.missing)
+    end)
+end)
+
+describe("LazyObject __index — nested containers", function()
+    it("returns a LazyObject for a nested object", function()
+        local t = qt.decode('{"a":{"b":"x"}}')
+        local inner = t.a
+        assert.is_table(inner)
+        assert.are.equal("x", inner.b)
+    end)
+
+    it("returns a LazyArray for a nested array", function()
+        local t = qt.decode('{"xs":[10,20]}')
+        local xs = t.xs
+        assert.is_table(xs)
+        assert.are.equal(qt._LazyArray, getmetatable(xs))  -- identity via the test-only export
+        assert.are.same({10, 20}, {xs[1], xs[2]})          -- elements are readable through the proxy
+    end)
+end)
+
+describe("LazyArray __index", function()
+    it("reads scalar elements by integer index (1-based)", function()
+        local t = qt.decode('[10,"x",true,null]')
+        assert.are.equal(10, t[1])
+        assert.are.equal("x", t[2])
+        assert.is_true(t[3])
+        assert.are.equal(qt.null, t[4])
+    end)
+
+    it("returns nil for out-of-range index", function()
+        local t = qt.decode('[1,2,3]')
+        assert.is_nil(t[0])
+        assert.is_nil(t[4])
+    end)
+
+    it("returns nil for non-integer key", function()
+        local t = qt.decode('[1,2,3]')
+        assert.is_nil(t.foo)
+        assert.is_nil(t[1.5])
+    end)
+
+    it("returns a nested LazyObject", function()
+        local t = qt.decode('[{"a":1},{"a":2}]')
+        assert.are.equal(1, t[1].a)
+        assert.are.equal(2, t[2].a)
+    end)
+end)
+
+-- LuaJIT 5.1 only invokes __len on userdata; it ignores the metamethod on
+-- tables unless built with LUAJIT_ENABLE_LUA52COMPAT (OpenResty's default).
+-- Probe once so the `#t` cases only run where they can pass; qt.len(t) is
+-- the supported path everywhere.
+local LJ52_LEN = (#setmetatable({}, {__len = function() return 99 end}) == 99)
+
+describe("qt.len", function()
+    it("counts object keys", function()
+        local t = qt.decode('{"a":1,"b":2,"c":3}')
+        assert.are.equal(3, qt.len(t))
+    end)
+
+    it("counts array elements", function()
+        local t = qt.decode('[10,20,30,40]')
+        assert.are.equal(4, qt.len(t))
+    end)
+
+    it("returns 0 for empty containers", function()
+        assert.are.equal(0, qt.len(qt.decode('{}')))
+        assert.are.equal(0, qt.len(qt.decode('[]')))
+    end)
+
+    it("falls back to # on a plain table", function()
+        assert.are.equal(3, qt.len({10, 20, 30}))
+    end)
+end)
+
+describe("__len (LJ52 only)", function()
+    it("counts object keys via #t", function()
+        if not LJ52_LEN then return pending("LuaJIT built without LUAJIT_ENABLE_LUA52COMPAT") end
+        local t = qt.decode('{"a":1,"b":2,"c":3}')
+        assert.are.equal(3, #t)
+    end)
+
+    it("counts array elements via #t", function()
+        if not LJ52_LEN then return pending("LuaJIT built without LUAJIT_ENABLE_LUA52COMPAT") end
+        local t = qt.decode('[10,20,30,40]')
+        assert.are.equal(4, #t)
+    end)
+
+    it("returns 0 for empty containers via #t", function()
+        if not LJ52_LEN then return pending("LuaJIT built without LUAJIT_ENABLE_LUA52COMPAT") end
+        assert.are.equal(0, #qt.decode('{}'))
+        assert.are.equal(0, #qt.decode('[]'))
+    end)
+end)
+
+describe("__pairs / qd.pairs over LazyObject", function()
+    it("iterates string keys in source order", function()
+        local t = qt.decode('{"a":1,"b":2,"c":3}')
+        local keys = {}
+        local values = {}
+        for k, v in qt.pairs(t) do
+            keys[#keys+1] = k
+            values[#values+1] = v
+        end
+        assert.are.same({"a","b","c"}, keys)
+        assert.are.same({1, 2, 3}, values)
+    end)
+
+    it("returns nested containers as lazy proxies, not materialized", function()
+        local t = qt.decode('{"a":{"x":1}}')
+        for _, v in qt.pairs(t) do
+            assert.is_table(v)
+            assert.are.equal(1, v.x)
+        end
+    end)
+
+    it("handles empty object", function()
+        local count = 0
+        for _ in qt.pairs(qt.decode('{}')) do count = count + 1 end
+        assert.are.equal(0, count)
+    end)
+end)
+
+describe("__ipairs / qd.ipairs over LazyArray", function()
+    it("iterates elements 1..n in order", function()
+        local t = qt.decode('[10,20,30]')
+        local got = {}
+        for i, v in qt.ipairs(t) do got[i] = v end
+        assert.are.same({10,20,30}, got)
+    end)
+
+    it("yields lazy proxies for nested containers", function()
+        local t = qt.decode('[{"a":1},{"a":2}]')
+        local seen = {}
+        for _, v in qt.ipairs(t) do
+            assert.is_table(v)
+            seen[#seen+1] = v.a
+        end
+        assert.are.same({1, 2}, seen)
+    end)
+
+    it("handles empty array", function()
+        local count = 0
+        for _ in qt.ipairs(qt.decode('[]')) do count = count + 1 end
+        assert.are.equal(0, count)
+    end)
+end)
+
+describe("__newindex — first-write materialization", function()
+    it("converts LazyObject into a plain table preserving existing keys", function()
+        local t = qt.decode('{"a":1,"b":2}')
+        t.c = 3
+        assert.is_nil(getmetatable(t))
+        assert.are.equal(1, t.a)
+        assert.are.equal(2, t.b)
+        assert.are.equal(3, t.c)
+    end)
+
+    it("nested containers remain lazy after parent materialization", function()
+        local t = qt.decode('{"inner":{"x":1}}')
+        t.extra = "y"
+        assert.is_nil(getmetatable(t))
+        local inner = t.inner
+        assert.are.equal(qt._LazyObject, getmetatable(inner))
+        assert.are.equal(1, inner.x)
+    end)
+
+    it("LazyArray materializes preserving empty_array_mt", function()
+        local t = qt.decode('[]')
+        t[1] = "x"
+        assert.are.equal(qt.empty_array_mt, getmetatable(t))
+        assert.are.equal("x", t[1])
+    end)
+
+    it("simple write leaves other keys intact", function()
+        local t = qt.decode('{"a":1}')
+        t.b = 2
+        assert.are.equal(1, t.a)
+        assert.are.equal(2, t.b)
+    end)
+end)
+
+describe("qt.materialize", function()
+    it("converts a LazyObject and its nested containers into real tables", function()
+        local m = qt.materialize(qt.decode('{"a":1,"b":{"c":[10,20]}}'))
+        assert.is_nil(getmetatable(m))
+        assert.are.equal(1, m.a)
+        assert.is_nil(getmetatable(m.b))
+        assert.are.equal(10, m.b.c[1])
+        assert.are.equal(20, m.b.c[2])
+    end)
+
+    it("tags empty arrays with empty_array_mt", function()
+        local m = qt.materialize(qt.decode('[]'))
+        assert.are.equal(qt.empty_array_mt, getmetatable(m))
+    end)
+
+    it("preserves cjson.null", function()
+        local m = qt.materialize(qt.decode('{"x":null}'))
+        assert.are.equal(qt.null, m.x)
+    end)
+
+    it("passes through scalars and plain tables unchanged", function()
+        assert.are.equal(42, qt.materialize(42))
+        assert.are.equal("hi", qt.materialize("hi"))
+        local raw = {1, 2, 3}
+        assert.are.equal(raw, qt.materialize(raw))
+    end)
+end)
+
+describe("qd.encode — lazy proxy substring fast path", function()
+    it("re-emits the original JSON for an unmodified LazyObject", function()
+        local src = '{"a":1,"b":[2,3],"c":"x"}'
+        local t = qt.decode(src)
+        assert.are.equal(src, qt.encode(t))
+    end)
+
+    it("re-emits the original JSON for an unmodified LazyArray", function()
+        local src = '[10,20,{"k":"v"}]'
+        local t = qt.decode(src)
+        assert.are.equal(src, qt.encode(t))
+    end)
+
+    it("trims leading/trailing whitespace at the boundary", function()
+        local src = '  {"a":1}  '
+        local t = qt.decode(src)
+        -- byte span is the value, not its outer whitespace.
+        assert.are.equal('{"a":1}', qt.encode(t))
+    end)
+end)
+
+describe("qd.encode — scalars", function()
+    it("encodes strings with JSON escapes", function()
+        assert.are.equal('"hello"', qt.encode("hello"))
+        assert.are.equal('"a\\nb"', qt.encode("a\nb"))
+        assert.are.equal('"a\\"b"', qt.encode('a"b'))
+        assert.are.equal('"a\\\\b"', qt.encode("a\\b"))
+    end)
+
+    it("encodes booleans", function()
+        assert.are.equal("true", qt.encode(true))
+        assert.are.equal("false", qt.encode(false))
+    end)
+
+    it("encodes numbers", function()
+        assert.are.equal("42", qt.encode(42))
+        assert.are.equal("-3.14", qt.encode(-3.14))
+    end)
+
+    it("encodes qt.null as JSON null", function()
+        assert.are.equal("null", qt.encode(qt.null))
+    end)
+
+    it("errors on unsupported values", function()
+        assert.has_error(function() qt.encode(function() end) end)
+    end)
+end)
+
+describe("qd.encode — real and mixed tables", function()
+    it("encodes a real Lua object", function()
+        local cjson = require("cjson")
+        local s = qt.encode({a = 1, b = "x"})
+        assert.are.same({a = 1, b = "x"}, cjson.decode(s))
+    end)
+
+    it("encodes a real Lua array", function()
+        assert.are.equal("[1,2,3]", qt.encode({1,2,3}))
+    end)
+
+    it("encodes a hand-built empty array with empty_array_mt", function()
+        local arr = setmetatable({}, qt.empty_array_mt)
+        assert.are.equal("[]", qt.encode(arr))
+    end)
+
+    it("encodes mixed lazy + materialized", function()
+        local t = qt.decode('{"keep":{"x":1},"changed":{"y":2}}')
+        t.changed = "now a string"
+        local out = qt.encode(t)
+        local cjson = require("cjson")
+        local parsed = cjson.decode(out)
+        assert.are.same({x=1}, parsed.keep)
+        assert.are.equal("now a string", parsed.changed)
+    end)
+end)
+
+local cjson = require("cjson")
+
+-- Key/value deep-equal. Sentinels like cjson.null (which qd aliases) match via ==; metatables such as empty_array_mt are NOT compared.
+local function deep_equal(a, b)
+    if a == b then return true end  -- equal scalars, same sentinel, or the very same table
+    if type(a) ~= "table" or type(b) ~= "table" then return false end
+    for k, v in pairs(a) do
+        if not deep_equal(v, b[k]) then return false end
+    end
+    for k in pairs(b) do
+        if a[k] == nil then return false end  -- b must not carry keys that a lacks
+    end
+    return true
+end
+
+describe("cjson round-trip equivalence", function()
+    local fixtures = {
+        '{"a":1,"b":"x","c":null,"d":true,"e":false,"f":[1,2,3],"g":{"h":4.5}}',
+        '[1,"x",true,null,{},[]]',
+        '{"messages":[{"role":"user","content":"hi"},{"role":"assistant","content":"hello"}]}',
+        '{}',
+        '[]',
+        '{"escapes":"a\\nb\\tc\\\"d\\\\e"}',
+    }
+    for _, src in ipairs(fixtures) do
+        it("materialize matches cjson.decode for: " .. src:sub(1, 40), function()
+            local from_qd = qd.materialize(qd.decode(src))
+            local from_cj = cjson.decode(src)
+            assert.is_true(deep_equal(from_qd, from_cj))
+        end)
+
+        it("encode round-trips for: " .. src:sub(1, 40), function()
+            local out = qd.encode(qd.decode(src))
+            local back_qd = cjson.decode(out)
+            local back_cj = cjson.decode(src)
+            assert.is_true(deep_equal(back_qd, back_cj))
+        end)
+    end
+end)
+
+describe("sentinel handling", function()
+    it("JSON null reads as qd.null and encodes back", function()
+        local t = qd.decode('{"x":null}')
+        assert.are.equal(qd.null, t.x)
+        assert.are.equal('{"x":null}', qd.encode(t))
+    end)
+
+    it("empty array stays an array through materialize and encode", function()
+        local t = qd.decode('{"xs":[]}')
+        local m = qd.materialize(t)
+        assert.are.equal(qd.empty_array_mt, getmetatable(m.xs))
+        assert.are.equal('{"xs":[]}', qd.encode(t))
+    end)
+end)
+
+describe("qd.encode — nested mutations propagate", function()
+    it("emits nested object mutation, not original bytes", function()
+        local cjson = require("cjson")
+        local t = qd.decode('{"a":{"b":{"c":1}},"d":2}')
+        t.a.b.c = 999
+        local out = qd.encode(t)
+        local parsed = cjson.decode(out)
+        assert.are.equal(999, parsed.a.b.c)
+        assert.are.equal(2, parsed.d)
+    end)
+
+    it("emits nested array mutation", function()
+        local cjson = require("cjson")
+        local t = qd.decode('{"xs":[10,20,30]}')
+        t.xs[2] = 999
+        local out = qd.encode(t)
+        local parsed = cjson.decode(out)
+        assert.are.equal(10, parsed.xs[1])
+        assert.are.equal(999, parsed.xs[2])
+        assert.are.equal(30, parsed.xs[3])
+    end)
+
+    it("preserves cached proxy identity across parent materialization", function()
+        local t = qd.decode('{"a":{"x":1}}')
+        local inner = t.a
+        t.c = 3
+        assert.are.equal(inner, t.a)
+        inner.x = 99
+        assert.are.equal(99, t.a.x)
+    end)
+end)