From 1c3da2374c39ee23ee50d9352772a9b1ef3e5f4e Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 15:29:16 +0000 Subject: [PATCH 01/21] feat(error): add 6 RFC 8259 audit error codes synced across Rust/C/Lua --- include/lua_quick_decode.h | 24 +++++++++++------- lua/quickdecode.lua | 20 +++++++++++++++ src/error.rs | 51 ++++++++++++++++++++++++-------------- src/ffi.rs | 26 +++++++++++-------- 4 files changed, 84 insertions(+), 37 deletions(-) diff --git a/include/lua_quick_decode.h b/include/lua_quick_decode.h index e3aeab2..54d28b6 100644 --- a/include/lua_quick_decode.h +++ b/include/lua_quick_decode.h @@ -9,15 +9,21 @@ extern "C" { #endif typedef enum { - QJD_OK = 0, - QJD_PARSE_ERROR = 1, - QJD_NOT_FOUND = 2, - QJD_TYPE_MISMATCH = 3, - QJD_OUT_OF_RANGE = 4, - QJD_DECODE_FAILED = 5, - QJD_INVALID_PATH = 6, - QJD_INVALID_ARG = 7, - QJD_OOM = 8 + QJD_OK = 0, + QJD_PARSE_ERROR = 1, + QJD_NOT_FOUND = 2, + QJD_TYPE_MISMATCH = 3, + QJD_OUT_OF_RANGE = 4, + QJD_DECODE_FAILED = 5, + QJD_INVALID_PATH = 6, + QJD_INVALID_ARG = 7, + QJD_OOM = 8, + QJD_NESTING_TOO_DEEP = 9, + QJD_TRAILING_CONTENT = 10, + QJD_NUMBER_OUT_OF_RANGE = 11, + QJD_INVALID_NUMBER = 12, + QJD_INVALID_STRING = 13, + QJD_INVALID_UTF8 = 14 } qjd_err; typedef enum { diff --git a/lua/quickdecode.lua b/lua/quickdecode.lua index 0851895..c4e0c6f 100644 --- a/lua/quickdecode.lua +++ b/lua/quickdecode.lua @@ -48,11 +48,31 @@ local strp_box = ffi.new("const uint8_t*[1]") local cur_box = ffi.new("qjd_cursor[1]") local NOT_FOUND = 2 +-- Error codes mirrored from include/lua_quick_decode.h. Kept in sync manually; +-- src/error.rs has the authoritative numbering. 
+local ERR = { + OK = 0, + PARSE_ERROR = 1, + NOT_FOUND = 2, + TYPE_MISMATCH = 3, + OUT_OF_RANGE = 4, + DECODE_FAILED = 5, + INVALID_PATH = 6, + INVALID_ARG = 7, + OOM = 8, + NESTING_TOO_DEEP = 9, + TRAILING_CONTENT = 10, + NUMBER_OUT_OF_RANGE = 11, + INVALID_NUMBER = 12, + INVALID_STRING = 13, + INVALID_UTF8 = 14, +} local _M = { T_NULL = 0, T_BOOL = 1, T_NUM = 2, T_STR = 3, T_ARR = 4, T_OBJ = 5, } +_M.ERR = ERR local Doc = {}; Doc.__index = Doc local Cursor = {}; Cursor.__index = Cursor diff --git a/src/error.rs b/src/error.rs index 270ea10..72ff3e9 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,15 +3,21 @@ #[repr(C)] #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum qjd_err { - QJD_OK = 0, - QJD_PARSE_ERROR = 1, - QJD_NOT_FOUND = 2, - QJD_TYPE_MISMATCH = 3, - QJD_OUT_OF_RANGE = 4, - QJD_DECODE_FAILED = 5, - QJD_INVALID_PATH = 6, - QJD_INVALID_ARG = 7, - QJD_OOM = 8, + QJD_OK = 0, + QJD_PARSE_ERROR = 1, + QJD_NOT_FOUND = 2, + QJD_TYPE_MISMATCH = 3, + QJD_OUT_OF_RANGE = 4, + QJD_DECODE_FAILED = 5, + QJD_INVALID_PATH = 6, + QJD_INVALID_ARG = 7, + QJD_OOM = 8, + QJD_NESTING_TOO_DEEP = 9, + QJD_TRAILING_CONTENT = 10, + QJD_NUMBER_OUT_OF_RANGE = 11, + QJD_INVALID_NUMBER = 12, + QJD_INVALID_STRING = 13, + QJD_INVALID_UTF8 = 14, } #[repr(C)] @@ -27,15 +33,21 @@ pub enum qjd_type { pub fn strerror(code: qjd_err) -> &'static str { match code { - qjd_err::QJD_OK => "ok", - qjd_err::QJD_PARSE_ERROR => "JSON parse error", - qjd_err::QJD_NOT_FOUND => "path not found", - qjd_err::QJD_TYPE_MISMATCH => "type mismatch at path", - qjd_err::QJD_OUT_OF_RANGE => "numeric out of range", - qjd_err::QJD_DECODE_FAILED => "decode failed", - qjd_err::QJD_INVALID_PATH => "invalid path syntax", - qjd_err::QJD_INVALID_ARG => "invalid argument", - qjd_err::QJD_OOM => "out of memory", + qjd_err::QJD_OK => "ok", + qjd_err::QJD_PARSE_ERROR => "JSON parse error", + qjd_err::QJD_NOT_FOUND => "path not found", + qjd_err::QJD_TYPE_MISMATCH => "type mismatch at path", + 
qjd_err::QJD_OUT_OF_RANGE => "numeric out of range", + qjd_err::QJD_DECODE_FAILED => "decode failed", + qjd_err::QJD_INVALID_PATH => "invalid path syntax", + qjd_err::QJD_INVALID_ARG => "invalid argument", + qjd_err::QJD_OOM => "out of memory", + qjd_err::QJD_NESTING_TOO_DEEP => "nesting depth exceeds limit", + qjd_err::QJD_TRAILING_CONTENT => "trailing content after root value", + qjd_err::QJD_NUMBER_OUT_OF_RANGE => "number out of representable range", + qjd_err::QJD_INVALID_NUMBER => "invalid number format (RFC 8259)", + qjd_err::QJD_INVALID_STRING => "invalid string content (unescaped control char)", + qjd_err::QJD_INVALID_UTF8 => "invalid UTF-8 in string", } } @@ -50,6 +62,9 @@ mod tests { qjd_err::QJD_TYPE_MISMATCH, qjd_err::QJD_OUT_OF_RANGE, qjd_err::QJD_DECODE_FAILED, qjd_err::QJD_INVALID_PATH, qjd_err::QJD_INVALID_ARG, qjd_err::QJD_OOM, + qjd_err::QJD_NESTING_TOO_DEEP, qjd_err::QJD_TRAILING_CONTENT, + qjd_err::QJD_NUMBER_OUT_OF_RANGE, qjd_err::QJD_INVALID_NUMBER, + qjd_err::QJD_INVALID_STRING, qjd_err::QJD_INVALID_UTF8, ] { assert!(!strerror(code).is_empty()); } diff --git a/src/ffi.rs b/src/ffi.rs index b5110b1..bdf9038 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -55,16 +55,22 @@ pub struct qjd_doc(pub(crate) Document<'static>); pub unsafe extern "C" fn qjd_strerror(code: c_int) -> *const c_char { // Hardcoded NUL-terminated map; avoids runtime allocation and lifetime issues. 
let s: &'static [u8] = match code { - 0 => b"ok\0", - 1 => b"JSON parse error\0", - 2 => b"path not found\0", - 3 => b"type mismatch at path\0", - 4 => b"numeric out of range\0", - 5 => b"decode failed\0", - 6 => b"invalid path syntax\0", - 7 => b"invalid argument\0", - 8 => b"out of memory\0", - _ => b"unknown error code\0", + 0 => b"ok\0", + 1 => b"JSON parse error\0", + 2 => b"path not found\0", + 3 => b"type mismatch at path\0", + 4 => b"numeric out of range\0", + 5 => b"decode failed\0", + 6 => b"invalid path syntax\0", + 7 => b"invalid argument\0", + 8 => b"out of memory\0", + 9 => b"nesting depth exceeds limit\0", + 10 => b"trailing content after root value\0", + 11 => b"number out of representable range\0", + 12 => b"invalid number format (RFC 8259)\0", + 13 => b"invalid string content (unescaped control char)\0", + 14 => b"invalid UTF-8 in string\0", + _ => b"unknown error code\0", }; s.as_ptr() as *const c_char } From a872d5cd146ff5da21eb475f15ec7c84f08a69ef Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 15:35:12 +0000 Subject: [PATCH 02/21] feat(options): introduce Options + Document::parse_with_options scaffold --- src/doc.rs | 26 +++++++++++++++++++++++- src/lib.rs | 1 + src/options.rs | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 src/options.rs diff --git a/src/doc.rs b/src/doc.rs index 707bb44..faed1ec 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -12,9 +12,18 @@ pub struct Document<'a> { impl<'a> Document<'a> { pub fn parse(buf: &'a [u8]) -> Result { + Self::parse_with_options(buf, &crate::options::Options::default()) + } + + pub fn parse_with_options( + buf: &'a [u8], + _opts: &crate::options::Options, + ) -> Result { + // TODO(Task 6+): plug in validate_depth / validate_trailing / + // validate_eager_values. For now this is a structural-only parse + // matching the historical `parse` behavior. 
let mut indices = Vec::new(); crate::scan::scan(buf, &mut indices).map_err(|_| qjd_err::QJD_PARSE_ERROR)?; - // Sentinel simplifies boundary checks during Phase 2. indices.push(u32::MAX); Ok(Self { buf, @@ -169,4 +178,19 @@ mod tests { fn parse_error_on_malformed() { assert!(Document::parse(b"{").is_err()); } + + #[test] + fn parse_with_options_defaults_match_parse() { + let opts = crate::options::Options::default(); + let a = Document::parse(b"{\"a\":1}").unwrap(); + let b = Document::parse_with_options(b"{\"a\":1}", &opts).unwrap(); + assert_eq!(a.indices, b.indices); + } + + #[test] + fn parse_with_lazy_skips_eager_validation() { + // Trailing content is an eager-only check; lazy must accept it. + let opts = crate::options::Options { mode: crate::options::QJD_MODE_LAZY, max_depth: 0 }; + assert!(Document::parse_with_options(b"{}garbage", &opts).is_ok()); + } } diff --git a/src/lib.rs b/src/lib.rs index 83f161b..d9c29b7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ //! lua-quick-decode: Rust JSON decoder for LuaJIT FFI consumers. pub mod error; +pub mod options; pub(crate) mod scan; mod skip_cache; mod doc; diff --git a/src/options.rs b/src/options.rs new file mode 100644 index 0000000..3c1241c --- /dev/null +++ b/src/options.rs @@ -0,0 +1,55 @@ +#![allow(non_camel_case_types)] + +pub const QJD_MODE_EAGER: u32 = 0; +pub const QJD_MODE_LAZY: u32 = 1; +pub const QJD_DEFAULT_MAX_DEPTH: u32 = 1024; +pub const QJD_MAX_MAX_DEPTH: u32 = 4096; + +/// Caller-visible parse options. Layout is FFI-stable: kept in sync with +/// `qjd_options` in `include/lua_quick_decode.h`. +#[repr(C)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct Options { + /// `QJD_MODE_EAGER` (0) — full RFC 8259 validation during parse. + /// `QJD_MODE_LAZY` (1) — structural-only; defer value errors to access. + pub mode: u32, + /// Max bracket nesting depth. `0` selects `QJD_DEFAULT_MAX_DEPTH` (1024). + /// Values >`QJD_MAX_MAX_DEPTH` are clamped to that ceiling. 
+ pub max_depth: u32, +} + +impl Default for Options { + fn default() -> Self { + Self { mode: QJD_MODE_EAGER, max_depth: 0 } + } +} + +#[allow(dead_code)] // used in Task 6+ validators +impl Options { + pub(crate) fn effective_max_depth(&self) -> u32 { + let d = if self.max_depth == 0 { QJD_DEFAULT_MAX_DEPTH } else { self.max_depth }; + d.min(QJD_MAX_MAX_DEPTH) + } + + pub(crate) fn is_eager(&self) -> bool { + self.mode == QJD_MODE_EAGER + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] fn default_is_eager() { assert!(Options::default().is_eager()); } + + #[test] + fn zero_max_depth_falls_back_to_default() { + assert_eq!(Options::default().effective_max_depth(), QJD_DEFAULT_MAX_DEPTH); + } + + #[test] + fn huge_max_depth_is_clamped() { + let o = Options { mode: 0, max_depth: u32::MAX }; + assert_eq!(o.effective_max_depth(), QJD_MAX_MAX_DEPTH); + } +} From d5aaaec4e2ff9d407920cdbf2a047ab6e0243d8e Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 15:46:45 +0000 Subject: [PATCH 03/21] feat(ffi): add qjd_parse_ex symbol with qjd_options struct --- include/lua_quick_decode.h | 11 ++++++++++ src/doc.rs | 1 + src/ffi.rs | 32 +++++++++++++++++++++++---- tests/ffi_options_smoke.rs | 44 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 tests/ffi_options_smoke.rs diff --git a/include/lua_quick_decode.h b/include/lua_quick_decode.h index 54d28b6..f920ab1 100644 --- a/include/lua_quick_decode.h +++ b/include/lua_quick_decode.h @@ -31,6 +31,15 @@ typedef enum { QJD_T_STR = 3, QJD_T_ARR = 4, QJD_T_OBJ = 5 } qjd_type; +#define QJD_MODE_EAGER 0u +#define QJD_MODE_LAZY 1u +#define QJD_DEFAULT_MAX_DEPTH 1024u + +typedef struct { + uint32_t mode; /* QJD_MODE_EAGER (0) or QJD_MODE_LAZY (1) */ + uint32_t max_depth; /* 0 = use QJD_DEFAULT_MAX_DEPTH */ +} qjd_options; + typedef struct qjd_doc qjd_doc; typedef struct { @@ -44,6 +53,8 @@ typedef struct { const char* qjd_strerror(int code); qjd_doc* 
qjd_parse(const uint8_t* buf, size_t len, int* err_out); +qjd_doc* qjd_parse_ex(const uint8_t* buf, size_t len, + const qjd_options* opts, int* err_out); void qjd_free (qjd_doc* doc); int qjd_get_str (qjd_doc*, const char* path, size_t path_len, diff --git a/src/doc.rs b/src/doc.rs index faed1ec..d6790ac 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -11,6 +11,7 @@ pub struct Document<'a> { } impl<'a> Document<'a> { + #[allow(dead_code)] // public convenience API; used in tests and external crates pub fn parse(buf: &'a [u8]) -> Result { Self::parse_with_options(buf, &crate::options::Options::default()) } diff --git a/src/ffi.rs b/src/ffi.rs index bdf9038..f5cf5a9 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -92,20 +92,44 @@ pub unsafe extern "C" fn qjd_parse( buf: *const u8, len: usize, err_out: *mut c_int, +) -> *mut qjd_doc { + let default = crate::options::Options::default(); + qjd_parse_ex(buf, len, &default as *const _, err_out) +} + +/// Parse with caller-supplied options. `opts` may be NULL to mean defaults +/// (eager mode, default max_depth). +/// +/// # Safety +/// +/// Same as `qjd_parse`, with the additional contract that `opts`, when +/// non-NULL, points to a readable `qjd_options` for the duration of the call +/// (the struct is copied internally). 
+#[no_mangle] +pub unsafe extern "C" fn qjd_parse_ex( + buf: *const u8, + len: usize, + opts: *const crate::options::Options, + err_out: *mut c_int, ) -> *mut qjd_doc { let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - if buf.is_null() || err_out.is_null() { + if buf.is_null() { if !err_out.is_null() { *err_out = qjd_err::QJD_INVALID_ARG as c_int; } return ptr::null_mut(); } + let opts_owned = if opts.is_null() { + crate::options::Options::default() + } else { + *opts + }; let slice: &'static [u8] = std::slice::from_raw_parts(buf, len); - match Document::parse(slice) { + match Document::parse_with_options(slice, &opts_owned) { Ok(d) => { - *err_out = qjd_err::QJD_OK as c_int; + if !err_out.is_null() { *err_out = qjd_err::QJD_OK as c_int; } Box::into_raw(Box::new(qjd_doc(d))) } Err(e) => { - *err_out = e as c_int; + if !err_out.is_null() { *err_out = e as c_int; } ptr::null_mut() } } diff --git a/tests/ffi_options_smoke.rs b/tests/ffi_options_smoke.rs new file mode 100644 index 0000000..83d942d --- /dev/null +++ b/tests/ffi_options_smoke.rs @@ -0,0 +1,44 @@ +//! Smoke test for qjd_parse_ex and qjd_options C ABI. 
+ +use std::os::raw::c_int; + +use quickdecode::ffi::{qjd_doc, qjd_free, qjd_parse, qjd_parse_ex}; +use quickdecode::options::Options; + +#[test] +fn parse_ex_default_options_matches_parse() { + let buf = b"{\"a\":1}"; + let mut err: c_int = -1; + let d1: *mut qjd_doc = unsafe { qjd_parse(buf.as_ptr(), buf.len(), &mut err) }; + assert!(!d1.is_null()); + assert_eq!(err, 0); + + let opts = Options { mode: 0, max_depth: 0 }; + let mut err2: c_int = -1; + let d2: *mut qjd_doc = unsafe { qjd_parse_ex(buf.as_ptr(), buf.len(), &opts, &mut err2) }; + assert!(!d2.is_null()); + assert_eq!(err2, 0); + + unsafe { qjd_free(d1); qjd_free(d2); } +} + +#[test] +fn parse_ex_null_opts_uses_defaults() { + let buf = b"{}"; + let mut err: c_int = -1; + let d: *mut qjd_doc = unsafe { + qjd_parse_ex(buf.as_ptr(), buf.len(), std::ptr::null(), &mut err) + }; + assert!(!d.is_null()); + assert_eq!(err, 0); + unsafe { qjd_free(d) }; +} + +#[test] +fn parse_ex_null_err_returns_null_on_bad_buf() { + let opts = Options { mode: 0, max_depth: 0 }; + let d: *mut qjd_doc = unsafe { + qjd_parse_ex(std::ptr::null(), 0, &opts, std::ptr::null_mut()) + }; + assert!(d.is_null()); +} From 80c13587f00a989edeb061a6ef8e54d59e3902ee Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 15:53:33 +0000 Subject: [PATCH 04/21] docs(ffi): clarify qjd_parse err_out contract and dead_code rationale --- src/doc.rs | 2 +- src/ffi.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/doc.rs b/src/doc.rs index d6790ac..edc51b4 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -11,7 +11,7 @@ pub struct Document<'a> { } impl<'a> Document<'a> { - #[allow(dead_code)] // public convenience API; used in tests and external crates + #[allow(dead_code)] // suppressed until mod doc is re-exported (Task 5) pub fn parse(buf: &'a [u8]) -> Result { Self::parse_with_options(buf, &crate::options::Options::default()) } diff --git a/src/ffi.rs b/src/ffi.rs index f5cf5a9..4cecaef 100644 --- a/src/ffi.rs +++ 
b/src/ffi.rs @@ -81,8 +81,8 @@ pub unsafe extern "C" fn qjd_strerror(code: c_int) -> *const c_char { /// /// - `buf` must point to `len` readable bytes, or be NULL (in which case the /// function returns NULL with `*err_out = QJD_INVALID_ARG`). -/// - `err_out` must point to a writable `int`, or be NULL (in which case the -/// function returns NULL with no error code written). +/// - `err_out` may be NULL. When non-NULL it receives `QJD_OK` on success or +/// an error code on failure. /// - The buffer must remain valid and unmodified for the lifetime of the /// returned `qjd_doc*`; the document borrows it. /// - On success, the returned pointer must be freed exactly once with From 1e8b55b38c505ecf78765cc0588671d5901cd992 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 15:55:38 +0000 Subject: [PATCH 05/21] feat(lua): accept opts table in qd.parse(json, { lazy, max_depth }) --- lua/quickdecode.lua | 38 ++++++++++++++++++++++++++++++++++---- tests/lua/options_spec.lua | 26 ++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 tests/lua/options_spec.lua diff --git a/lua/quickdecode.lua b/lua/quickdecode.lua index c4e0c6f..b8ad662 100644 --- a/lua/quickdecode.lua +++ b/lua/quickdecode.lua @@ -7,9 +7,16 @@ typedef struct { uint32_t idx_start, idx_end, _reserved0, _reserved1; } qjd_cursor; +typedef struct { + uint32_t mode; + uint32_t max_depth; +} qjd_options; + const char* qjd_strerror(int code); -qjd_doc* qjd_parse(const uint8_t* buf, size_t len, int* err_out); -void qjd_free(qjd_doc* doc); +qjd_doc* qjd_parse (const uint8_t* buf, size_t len, int* err_out); +qjd_doc* qjd_parse_ex(const uint8_t* buf, size_t len, + const qjd_options* opts, int* err_out); +void qjd_free (qjd_doc* doc); int qjd_get_str (qjd_doc*, const char* path, size_t path_len, const uint8_t** p, size_t* n); int qjd_get_i64 (qjd_doc*, const char* path, size_t path_len, int64_t* out); @@ -83,8 +90,31 @@ local function check_err(rc) 
error("quickdecode: " .. ffi.string(C.qjd_strerror(rc))) end -function _M.parse(json_str) - local ptr = C.qjd_parse(json_str, #json_str, err_box) +local opts_box = ffi.new("qjd_options[1]") + +local MODE_EAGER = 0 +local MODE_LAZY = 1 + +function _M.parse(json_str, opts) + local ptr + if opts == nil then + ptr = C.qjd_parse(json_str, #json_str, err_box) + else + if type(opts) ~= "table" then + error("quickdecode.parse: opts must be a table") + end + local lazy = opts.lazy + if lazy ~= nil and type(lazy) ~= "boolean" then + error("quickdecode.parse: opts.lazy must be a boolean") + end + local max_depth = opts.max_depth or 0 + if type(max_depth) ~= "number" or max_depth < 0 then + error("quickdecode.parse: opts.max_depth must be a non-negative integer") + end + opts_box[0].mode = lazy and MODE_LAZY or MODE_EAGER + opts_box[0].max_depth = max_depth + ptr = C.qjd_parse_ex(json_str, #json_str, opts_box, err_box) + end if ptr == nil then error("quickdecode: " .. ffi.string(C.qjd_strerror(err_box[0]))) end diff --git a/tests/lua/options_spec.lua b/tests/lua/options_spec.lua new file mode 100644 index 0000000..786b1b1 --- /dev/null +++ b/tests/lua/options_spec.lua @@ -0,0 +1,26 @@ +local qd = require "quickdecode" + +describe("parse with options", function() + it("accepts no second arg (default eager)", function() + assert.is_not_nil(qd.parse('{"a":1}')) + end) + + it("accepts an empty opts table", function() + assert.is_not_nil(qd.parse('{"a":1}', {})) + end) + + it("accepts lazy=true and tolerates trailing content", function() + -- Trailing content is eager-only; lazy must parse OK. 
+ assert.is_not_nil(qd.parse('{}garbage', { lazy = true })) + end) + + it("accepts max_depth", function() + assert.is_not_nil(qd.parse('[[[1]]]', { max_depth = 1024 })) + end) + + it("rejects invalid mode key value", function() + assert.has_error(function() + qd.parse('{}', { lazy = "yes please" }) + end) + end) +end) From 16a149bff38ea0f6aa80b80a371d657399eb44c4 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 15:58:18 +0000 Subject: [PATCH 06/21] fix(lua): reject fractional max_depth; add combined-opts test --- lua/quickdecode.lua | 2 +- tests/lua/options_spec.lua | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lua/quickdecode.lua b/lua/quickdecode.lua index b8ad662..5ab6c5f 100644 --- a/lua/quickdecode.lua +++ b/lua/quickdecode.lua @@ -108,7 +108,7 @@ function _M.parse(json_str, opts) error("quickdecode.parse: opts.lazy must be a boolean") end local max_depth = opts.max_depth or 0 - if type(max_depth) ~= "number" or max_depth < 0 then + if type(max_depth) ~= "number" or max_depth < 0 or max_depth ~= math.floor(max_depth) then error("quickdecode.parse: opts.max_depth must be a non-negative integer") end opts_box[0].mode = lazy and MODE_LAZY or MODE_EAGER diff --git a/tests/lua/options_spec.lua b/tests/lua/options_spec.lua index 786b1b1..c689d2a 100644 --- a/tests/lua/options_spec.lua +++ b/tests/lua/options_spec.lua @@ -23,4 +23,14 @@ describe("parse with options", function() qd.parse('{}', { lazy = "yes please" }) end) end) + + it("accepts lazy=true and max_depth combined", function() + assert.is_not_nil(qd.parse('[[1]]', { lazy = true, max_depth = 256 })) + end) + + it("rejects fractional max_depth", function() + assert.has_error(function() + qd.parse('{}', { max_depth = 1.5 }) + end) + end) end) From c8dfd8446324e2689ed012e85cd4604bb86e4054 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 16:01:40 +0000 Subject: [PATCH 07/21] test(rfc8259): scaffold compliance suite with cross-mode helper 
macros --- src/doc.rs | 1 - src/lib.rs | 2 +- tests/rfc8259_compliance.rs | 79 +++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 tests/rfc8259_compliance.rs diff --git a/src/doc.rs b/src/doc.rs index edc51b4..faed1ec 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -11,7 +11,6 @@ pub struct Document<'a> { } impl<'a> Document<'a> { - #[allow(dead_code)] // suppressed until mod doc is re-exported (Task 5) pub fn parse(buf: &'a [u8]) -> Result { Self::parse_with_options(buf, &crate::options::Options::default()) } diff --git a/src/lib.rs b/src/lib.rs index d9c29b7..80c0d5c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ pub mod error; pub mod options; pub(crate) mod scan; mod skip_cache; -mod doc; +pub mod doc; mod path; mod cursor; mod decode; diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs new file mode 100644 index 0000000..f982581 --- /dev/null +++ b/tests/rfc8259_compliance.rs @@ -0,0 +1,79 @@ +//! RFC 8259 conformance suite. +//! +//! Cross-mode contract: +//! * `y_*` inputs MUST parse successfully in both EAGER and LAZY modes, +//! and any specified field-level access MUST return the expected value. +//! * `n_*` inputs MUST fail to parse in EAGER mode, and MUST either +//! fail to parse OR fail on the documented field access in LAZY mode. +//! * `i_*` inputs document our current behavior; we assert what we do +//! today (so regressions surface), referencing JSONTestSuite naming. +//! +//! RFC 8259 references are in section-paragraph form, e.g. RFC8259 §6 for +//! the number grammar. + +use quickdecode::doc::Document; +use quickdecode::options::{Options, QJD_MODE_EAGER, QJD_MODE_LAZY}; + +fn eager() -> Options { Options { mode: QJD_MODE_EAGER, max_depth: 0 } } +fn lazy() -> Options { Options { mode: QJD_MODE_LAZY, max_depth: 0 } } + +/// Asserts the input is accepted in both modes. +/// +/// Usage: `assert_accepts!("[]");` +#[macro_export] +macro_rules! 
assert_accepts { + ($input:expr) => {{ + let buf: &[u8] = $input.as_ref(); + let r_eager = Document::parse_with_options(buf, &eager()); + assert!(r_eager.is_ok(), + "EAGER unexpectedly rejected {:?}: {:?}", $input, r_eager.err()); + let r_lazy = Document::parse_with_options(buf, &lazy()); + assert!(r_lazy.is_ok(), + "LAZY unexpectedly rejected {:?}: {:?}", $input, r_lazy.err()); + }}; +} + +/// Asserts the input is REJECTED by eager parse. +/// +/// Usage: `assert_rejects_eager!("01", QJD_INVALID_NUMBER);` +#[macro_export] +macro_rules! assert_rejects_eager { + ($input:expr, $expected_err:path) => {{ + use quickdecode::error::qjd_err; + let buf: &[u8] = $input.as_ref(); + match Document::parse_with_options(buf, &eager()) { + Err($expected_err) => {} + Err(other) => panic!( + "EAGER rejected {:?} with {:?}, expected {:?}", + $input, other, qjd_err::$expected_err), + Ok(_) => panic!("EAGER unexpectedly accepted {:?}", $input), + } + }}; +} + +/// Asserts the input is rejected at parse time in BOTH modes (structural). +#[macro_export] +macro_rules! assert_rejects_both { + ($input:expr) => {{ + let buf: &[u8] = $input.as_ref(); + assert!(Document::parse_with_options(buf, &eager()).is_err(), + "EAGER unexpectedly accepted {:?}", $input); + assert!(Document::parse_with_options(buf, &lazy()).is_err(), + "LAZY unexpectedly accepted {:?}", $input); + }}; +} + +// ───────────────────────────────────────────────────────────── +// Scaffold smoke tests — replaced by Task 11 with full corpus. 
+// ───────────────────────────────────────────────────────────── + +#[test] +fn smoke_accepts_empty_object() { assert_accepts!("{}"); } + +#[test] +fn smoke_accepts_empty_array() { assert_accepts!("[]"); } + +#[test] +fn smoke_rejects_unmatched_brace_both_modes() { + assert_rejects_both!("{"); +} From 75c7244251b1a97b6bf8e17339471dda46dc09d8 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 16:04:56 +0000 Subject: [PATCH 08/21] fix(test): assert_rejects_eager macro now actually matches by variant Switch the fragment specifier from :path to :ident so the variant name can be used in a qjd_err:: path, and replace the pattern arm with a runtime guard (if e == expected) to avoid the binding-vs-pattern ambiguity. Add macro_rejects_wrong_error_code as a regression canary. --- tests/rfc8259_compliance.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index f982581..f1d4f7f 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -38,14 +38,15 @@ macro_rules! assert_accepts { /// Usage: `assert_rejects_eager!("01", QJD_INVALID_NUMBER);` #[macro_export] macro_rules! 
assert_rejects_eager { - ($input:expr, $expected_err:path) => {{ + ($input:expr, $expected_err:ident) => {{ use quickdecode::error::qjd_err; let buf: &[u8] = $input.as_ref(); + let expected = qjd_err::$expected_err; match Document::parse_with_options(buf, &eager()) { - Err($expected_err) => {} + Err(e) if e == expected => {} Err(other) => panic!( "EAGER rejected {:?} with {:?}, expected {:?}", - $input, other, qjd_err::$expected_err), + $input, other, expected), Ok(_) => panic!("EAGER unexpectedly accepted {:?}", $input), } }}; @@ -77,3 +78,13 @@ fn smoke_accepts_empty_array() { assert_accepts!("[]"); } fn smoke_rejects_unmatched_brace_both_modes() { assert_rejects_both!("{"); } + +#[test] +#[should_panic(expected = "expected QJD_INVALID_NUMBER")] +fn macro_rejects_wrong_error_code() { + // Sanity: passing the wrong expected variant must panic. + // `{` is rejected as QJD_PARSE_ERROR, NOT QJD_INVALID_NUMBER. + // With the buggy macro, this test would NOT panic (false positive + // — the macro would silently bind whatever Err came back). + assert_rejects_eager!("{", QJD_INVALID_NUMBER); +} From 1b86918a4f3f70236c48dda2e22574e7fcad9365 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 16:10:38 +0000 Subject: [PATCH 09/21] feat(validate): enforce max_depth in both eager and lazy modes --- src/doc.rs | 10 +++--- src/lib.rs | 1 + src/validate.rs | 64 +++++++++++++++++++++++++++++++++++++ tests/rfc8259_compliance.rs | 44 +++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 4 deletions(-) create mode 100644 src/validate.rs diff --git a/src/doc.rs b/src/doc.rs index faed1ec..3a07760 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -17,14 +17,16 @@ impl<'a> Document<'a> { pub fn parse_with_options( buf: &'a [u8], - _opts: &crate::options::Options, + opts: &crate::options::Options, ) -> Result { - // TODO(Task 6+): plug in validate_depth / validate_trailing / - // validate_eager_values. 
For now this is a structural-only parse - // matching the historical `parse` behavior. + let max_depth = opts.effective_max_depth(); let mut indices = Vec::new(); crate::scan::scan(buf, &mut indices).map_err(|_| qjd_err::QJD_PARSE_ERROR)?; indices.push(u32::MAX); + + crate::validate::validate_depth(buf, &indices, max_depth)?; + // TODO(Task 7+): trailing-content and eager value validators. + Ok(Self { buf, indices, diff --git a/src/lib.rs b/src/lib.rs index 80c0d5c..87f5c6a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,7 @@ pub mod doc; mod path; mod cursor; mod decode; +mod validate; pub mod ffi; #[doc(hidden)] diff --git a/src/validate.rs b/src/validate.rs new file mode 100644 index 0000000..38f0181 --- /dev/null +++ b/src/validate.rs @@ -0,0 +1,64 @@ +//! Post-scan validators invoked by Document::parse_with_options. +//! +//! Walking the already-emitted `indices` array is intentionally +//! decoupled from the SIMD/scalar scanner paths so the structural +//! scanner code stays untouched. + +use crate::error::qjd_err; + +/// Verify that the maximum bracket-stack depth implied by `indices` +/// does not exceed `max_depth`. Walks indices once; assumes scan() has +/// already validated bracket pairing. +/// +/// `indices` is the post-scan vector with the trailing u32::MAX sentinel. +pub(crate) fn validate_depth( + buf: &[u8], + indices: &[u32], + max_depth: u32, +) -> Result<(), qjd_err> { + let mut depth: u32 = 0; + for &idx in indices { + if idx == u32::MAX { break; } + match buf[idx as usize] { + b'{' | b'[' => { + depth += 1; + if depth > max_depth { + return Err(qjd_err::QJD_NESTING_TOO_DEEP); + } + } + b'}' | b']' => { + // Cannot underflow: scan() already validated pairing. 
+ depth -= 1; + } + _ => {} + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ix(buf: &[u8]) -> Vec { + let mut v = Vec::new(); + crate::scan::scan(buf, &mut v).unwrap(); + v.push(u32::MAX); + v + } + + #[test] + fn under_limit_ok() { + let buf = b"[[1]]"; + assert!(validate_depth(buf, &ix(buf), 2).is_ok()); + } + + #[test] + fn over_limit_rejected() { + let buf = b"[[[1]]]"; + assert_eq!( + validate_depth(buf, &ix(buf), 2), + Err(qjd_err::QJD_NESTING_TOO_DEEP), + ); + } +} diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index f1d4f7f..b193c94 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -88,3 +88,47 @@ fn macro_rejects_wrong_error_code() { // — the macro would silently bind whatever Err came back). assert_rejects_eager!("{", QJD_INVALID_NUMBER); } + +// ── Phase 3: nesting depth ─────────────────────────────────── + +#[test] +fn rejects_deeply_nested_at_default_limit() { + use quickdecode::error::qjd_err; + let mut buf = String::new(); + for _ in 0..1100 { buf.push('['); } + for _ in 0..1100 { buf.push(']'); } + match Document::parse_with_options(buf.as_bytes(), &eager()) { + Err(qjd_err::QJD_NESTING_TOO_DEEP) => {} + other => panic!("expected QJD_NESTING_TOO_DEEP, got {:?}", other.err()), + } +} + +#[test] +fn lazy_mode_also_enforces_max_depth() { + use quickdecode::error::qjd_err; + let mut buf = String::new(); + for _ in 0..1100 { buf.push('['); } + for _ in 0..1100 { buf.push(']'); } + assert_eq!( + Document::parse_with_options(buf.as_bytes(), &lazy()).err().unwrap(), + qjd_err::QJD_NESTING_TOO_DEEP, + ); +} + +#[test] +fn accepts_nested_at_configured_limit() { + let mut buf = String::new(); + for _ in 0..256 { buf.push('['); } + for _ in 0..256 { buf.push(']'); } + let opts = Options { mode: QJD_MODE_EAGER, max_depth: 256 }; + assert!(Document::parse_with_options(buf.as_bytes(), &opts).is_ok()); +} + +#[test] +fn rejects_when_one_past_configured_limit() { + let mut buf = 
String::new(); + for _ in 0..33 { buf.push('['); } + for _ in 0..33 { buf.push(']'); } + let opts = Options { mode: QJD_MODE_EAGER, max_depth: 32 }; + assert!(Document::parse_with_options(buf.as_bytes(), &opts).is_err()); +} From 1f93104d4f54a0ef5c29a2fb784ce4ab74084945 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 16:17:35 +0000 Subject: [PATCH 10/21] feat(validate): reject trailing content after root value (eager only) --- src/doc.rs | 6 ++- src/validate.rs | 89 +++++++++++++++++++++++++++++++++++++ tests/rfc8259_compliance.rs | 40 +++++++++++++++++ 3 files changed, 134 insertions(+), 1 deletion(-) diff --git a/src/doc.rs b/src/doc.rs index 3a07760..75deb8d 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -25,7 +25,11 @@ impl<'a> Document<'a> { indices.push(u32::MAX); crate::validate::validate_depth(buf, &indices, max_depth)?; - // TODO(Task 7+): trailing-content and eager value validators. + + if opts.is_eager() { + crate::validate::validate_trailing(buf, &indices)?; + // TODO(Task 10): validate_eager_values + } Ok(Self { buf, diff --git a/src/validate.rs b/src/validate.rs index 38f0181..db1fc73 100644 --- a/src/validate.rs +++ b/src/validate.rs @@ -36,6 +36,59 @@ pub(crate) fn validate_depth( Ok(()) } +/// Verify there is no non-whitespace content after the root value. +/// +/// The root value's closer is the last non-sentinel structural offset +/// in `indices` for a container, or the start of the scalar's trailing +/// whitespace for a top-level scalar value. We locate the position +/// `end_of_root` past which only whitespace is allowed. +pub(crate) fn validate_trailing( + buf: &[u8], + indices: &[u32], +) -> Result<(), qjd_err> { + // Find the last real offset (skip the u32::MAX sentinel). + let last = indices.iter().rev() + .find(|&&i| i != u32::MAX) + .copied(); + + let root_end = match last { + // No structural chars at all: input is whitespace or a bare scalar. + // Bare scalar: locate the end by scanning until whitespace or EOF. 
+ None => { + // Strip leading whitespace, then find the scalar's terminator. + let mut p = 0; + while p < buf.len() && is_ws(buf[p]) { p += 1; } + let start = p; + // Scan until next whitespace (end of scalar token). + while p < buf.len() && !is_ws(buf[p]) { p += 1; } + if start == p { return Ok(()); } // input was only whitespace + // Advance past trailing whitespace so `42 ` is accepted. + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + // Structural close (`}` or `]`) of root container, OR root quote + // close, OR last structural (`,`/`:`/`{`/`[`) — in which case the + // parse should already have failed at scan(). The only "valid root + // ending in a structural" cases are a closing `}` / `]` / `"`. + Some(last_idx) => { + let mut p = last_idx as usize + 1; + // Advance past any trailing whitespace. + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + }; + + if root_end < buf.len() { + return Err(qjd_err::QJD_TRAILING_CONTENT); + } + Ok(()) +} + +#[inline(always)] +fn is_ws(b: u8) -> bool { + matches!(b, b' ' | b'\t' | b'\n' | b'\r') +} + #[cfg(test)] mod tests { use super::*; @@ -61,4 +114,40 @@ mod tests { Err(qjd_err::QJD_NESTING_TOO_DEEP), ); } + + #[test] + fn trailing_clean_container() { + let buf = b"{}"; + assert!(validate_trailing(buf, &ix(buf)).is_ok()); + } + + #[test] + fn trailing_whitespace_accepted() { + let buf = b"{} \n\t"; + assert!(validate_trailing(buf, &ix(buf)).is_ok()); + } + + #[test] + fn trailing_garbage_rejected() { + let buf = b"{}garbage"; + assert_eq!( + validate_trailing(buf, &ix(buf)), + Err(qjd_err::QJD_TRAILING_CONTENT), + ); + } + + #[test] + fn bare_scalar_trailing_ws_accepted() { + let buf = b"42 \n\t"; + assert!(validate_trailing(buf, &ix(buf)).is_ok()); + } + + #[test] + fn two_root_scalars_rejected() { + let buf = b"1 2"; + assert_eq!( + validate_trailing(buf, &ix(buf)), + Err(qjd_err::QJD_TRAILING_CONTENT), + ); + } } diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index 
b193c94..b07f7ab 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -132,3 +132,43 @@ fn rejects_when_one_past_configured_limit() { let opts = Options { mode: QJD_MODE_EAGER, max_depth: 32 }; assert!(Document::parse_with_options(buf.as_bytes(), &opts).is_err()); } + +// ── Phase 6: trailing content ──────────────────────────────── + +#[test] +fn eager_rejects_trailing_content() { + use quickdecode::error::qjd_err; + assert_eq!( + Document::parse_with_options(b"{}garbage", &eager()).err().unwrap(), + qjd_err::QJD_TRAILING_CONTENT, + ); +} + +#[test] +fn eager_rejects_multiple_root_values() { + use quickdecode::error::qjd_err; + assert_eq!( + Document::parse_with_options(b"1 2", &eager()).err().unwrap(), + qjd_err::QJD_TRAILING_CONTENT, + ); + assert_eq!( + Document::parse_with_options(b"true false", &eager()).err().unwrap(), + qjd_err::QJD_TRAILING_CONTENT, + ); +} + +#[test] +fn eager_accepts_trailing_whitespace() { + assert_accepts!("{} \n\t"); +} + +#[test] +fn eager_accepts_top_level_scalar_with_trailing_whitespace() { + assert_accepts!("42 \n\t"); +} + +#[test] +fn lazy_accepts_trailing_garbage() { + // Lazy preserves historical behavior: trailing bytes are ignored. 
+ assert!(Document::parse_with_options(b"{}garbage", &lazy()).is_ok()); +} From e9a2b57c4168f11895582c767cd829a6140dea70 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 16:25:35 +0000 Subject: [PATCH 11/21] feat(validate): strict RFC 8259 number ABNF (lazy decode + lazy entry points) --- src/decode/number.rs | 24 ++++---- src/{validate.rs => validate/mod.rs} | 3 + src/validate/number.rs | 91 ++++++++++++++++++++++++++++ tests/rfc8259_compliance.rs | 37 +++++++++++ 4 files changed, 142 insertions(+), 13 deletions(-) rename src/{validate.rs => validate/mod.rs} (98%) create mode 100644 src/validate/number.rs diff --git a/src/decode/number.rs b/src/decode/number.rs index 45d2f89..1beda2d 100644 --- a/src/decode/number.rs +++ b/src/decode/number.rs @@ -1,10 +1,8 @@ use crate::error::qjd_err; pub(crate) fn parse_i64(bytes: &[u8]) -> Result { - if bytes.is_empty() { - return Err(qjd_err::QJD_DECODE_FAILED); - } - // Reject non-integer JSON numbers (with decimal point or exponent). + crate::validate::validate_number(bytes)?; + // After ABNF validation, integer-only inputs have no `.`/`e`/`E`. if bytes.iter().any(|&b| b == b'.' || b == b'e' || b == b'E') { return Err(qjd_err::QJD_TYPE_MISMATCH); } @@ -12,9 +10,7 @@ pub(crate) fn parse_i64(bytes: &[u8]) -> Result { b'-' => (true, &bytes[1..]), _ => (false, bytes), }; - if rest.is_empty() || !rest.iter().all(|c| c.is_ascii_digit()) { - return Err(qjd_err::QJD_DECODE_FAILED); - } + // ABNF guarantees `rest` is non-empty and digit-only here. 
let mut v: i64 = 0; for &c in rest { let d = (c - b'0') as i64; @@ -29,11 +25,13 @@ pub(crate) fn parse_i64(bytes: &[u8]) -> Result { } pub(crate) fn parse_f64(bytes: &[u8]) -> Result { - if bytes.is_empty() { - return Err(qjd_err::QJD_DECODE_FAILED); - } + crate::validate::validate_number(bytes)?; let s = std::str::from_utf8(bytes).map_err(|_| qjd_err::QJD_DECODE_FAILED)?; - s.parse::().map_err(|_| qjd_err::QJD_DECODE_FAILED) + match s.parse::() { + Ok(v) if v.is_finite() => Ok(v), + Ok(_) => Err(qjd_err::QJD_NUMBER_OUT_OF_RANGE), + Err(_) => Err(qjd_err::QJD_DECODE_FAILED), + } } #[cfg(test)] @@ -63,7 +61,7 @@ mod tests { #[test] fn i64_rejects_empty() { - assert_eq!(parse_i64(b""), Err(qjd_err::QJD_DECODE_FAILED)); + assert_eq!(parse_i64(b""), Err(qjd_err::QJD_INVALID_NUMBER)); } #[test] fn f64_zero() { assert_eq!(parse_f64(b"0.0").unwrap(), 0.0); } @@ -73,6 +71,6 @@ mod tests { #[test] fn f64_rejects_garbage() { - assert_eq!(parse_f64(b"hello"), Err(qjd_err::QJD_DECODE_FAILED)); + assert_eq!(parse_f64(b"hello"), Err(qjd_err::QJD_INVALID_NUMBER)); } } diff --git a/src/validate.rs b/src/validate/mod.rs similarity index 98% rename from src/validate.rs rename to src/validate/mod.rs index db1fc73..28d9311 100644 --- a/src/validate.rs +++ b/src/validate/mod.rs @@ -4,6 +4,9 @@ //! decoupled from the SIMD/scalar scanner paths so the structural //! scanner code stays untouched. +pub(crate) mod number; +pub(crate) use number::validate_number; + use crate::error::qjd_err; /// Verify that the maximum bracket-stack depth implied by `indices` diff --git a/src/validate/number.rs b/src/validate/number.rs new file mode 100644 index 0000000..c212bdb --- /dev/null +++ b/src/validate/number.rs @@ -0,0 +1,91 @@ +//! Strict RFC 8259 §6 number-format validation. + +use crate::error::qjd_err; + +/// Returns Ok if `bytes` matches the JSON `number` grammar exactly. +/// Otherwise returns `QJD_INVALID_NUMBER`. +/// +/// Out-of-range (i.e. 
f64 overflow) is NOT detected here; the f64 decode +/// step surfaces it as `QJD_NUMBER_OUT_OF_RANGE`. +pub(crate) fn validate_number(bytes: &[u8]) -> Result<(), qjd_err> { + let mut i = 0; + + // optional minus + if bytes.first() == Some(&b'-') { i += 1; } + + // int: "0" | (digit1-9 *digit) + match bytes.get(i) { + Some(&b'0') => { i += 1; } + Some(&(b'1'..=b'9')) => { + i += 1; + while let Some(&c) = bytes.get(i) { + if !c.is_ascii_digit() { break; } + i += 1; + } + } + _ => return Err(qjd_err::QJD_INVALID_NUMBER), + } + + // optional frac: "." 1*digit + if bytes.get(i) == Some(&b'.') { + i += 1; + let frac_start = i; + while let Some(&c) = bytes.get(i) { + if !c.is_ascii_digit() { break; } + i += 1; + } + if i == frac_start { return Err(qjd_err::QJD_INVALID_NUMBER); } + } + + // optional exp: ("e"|"E") ["+"|"-"] 1*digit + if matches!(bytes.get(i), Some(&b'e') | Some(&b'E')) { + i += 1; + if matches!(bytes.get(i), Some(&b'+') | Some(&b'-')) { i += 1; } + let exp_start = i; + while let Some(&c) = bytes.get(i) { + if !c.is_ascii_digit() { break; } + i += 1; + } + if i == exp_start { return Err(qjd_err::QJD_INVALID_NUMBER); } + } + + if i != bytes.len() { return Err(qjd_err::QJD_INVALID_NUMBER); } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ok(s: &[u8]) { assert!(validate_number(s).is_ok(), "{:?}", std::str::from_utf8(s)); } + fn bad(s: &[u8]) { assert!(validate_number(s).is_err(), "{:?}", std::str::from_utf8(s)); } + + #[test] fn zero_ok() { ok(b"0"); } + #[test] fn neg_zero_ok() { ok(b"-0"); } + #[test] fn int_ok() { ok(b"123"); } + #[test] fn neg_int_ok() { ok(b"-456"); } + #[test] fn frac_ok() { ok(b"3.14"); } + #[test] fn neg_frac_ok() { ok(b"-2.718"); } + #[test] fn exp_lower_ok() { ok(b"1e10"); } + #[test] fn exp_upper_ok() { ok(b"1E10"); } + #[test] fn exp_plus_ok() { ok(b"1e+10"); } + #[test] fn exp_minus_ok() { ok(b"1e-10"); } + #[test] fn frac_exp_ok() { ok(b"1.5e2"); } + #[test] fn i64_max_str_ok() { ok(b"9223372036854775807"); } + + 
#[test] fn leading_plus_bad() { bad(b"+1"); } + #[test] fn leading_zero_bad() { bad(b"01"); } + #[test] fn leading_zeros_bad() { bad(b"00"); } + #[test] fn bare_dot_bad() { bad(b".5"); } + #[test] fn trailing_dot_bad() { bad(b"1."); } + #[test] fn missing_frac_digits_bad() { bad(b"1.e5"); } + #[test] fn hex_bad() { bad(b"0x1F"); } + #[test] fn incomplete_exp_bad() { bad(b"1e"); } + #[test] fn incomplete_exp_sign_bad() { bad(b"1e+"); } + #[test] fn nan_bad() { bad(b"NaN"); } + #[test] fn inf_bad() { bad(b"Infinity"); } + #[test] fn neg_inf_bad() { bad(b"-Infinity"); } + #[test] fn empty_bad() { bad(b""); } + #[test] fn lone_minus_bad() { bad(b"-"); } + #[test] fn double_dot_bad() { bad(b"1..2"); } +} diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index b07f7ab..13022b2 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -172,3 +172,40 @@ fn lazy_accepts_trailing_garbage() { // Lazy preserves historical behavior: trailing bytes are ignored. 
assert!(Document::parse_with_options(b"{}garbage", &lazy()).is_ok()); } + +// ── Phase 2: number format ─────────────────────────────────── + +#[test] +fn eager_accepts_canonical_numbers() { + for s in ["0", "-0", "1", "-1", "3.14", "-2.718", + "1e10", "1E10", "1e+10", "1e-10", "1.5e2", + "9223372036854775807", "-9223372036854775808"] { + let input = format!("[{}]", s); + assert_accepts!(input); + } +} + +#[test] +#[ignore = "wired in Task 10"] +fn eager_rejects_invalid_numbers() { + use quickdecode::error::qjd_err; + for s in ["+1", "01", "00", ".5", "1.", "1.e5", "0x1F", + "NaN", "Infinity", "-Infinity", "1e", "1e+"] { + let input = format!("[{}]", s); + match Document::parse_with_options(input.as_bytes(), &eager()) { + Err(qjd_err::QJD_INVALID_NUMBER) => {} + Err(other) => panic!( + "expected QJD_INVALID_NUMBER for {:?}, got {:?}", input, other), + Ok(_) => panic!("EAGER unexpectedly accepted {:?}", input), + } + } +} + +#[test] +fn lazy_defers_invalid_number_until_access() { + // In LAZY mode, "[01]" parses; the error surfaces when you ask for the value. + let doc = Document::parse_with_options(b"[01]", &lazy()).unwrap(); + // Walking via FFI tests is verbose; we only check that the LAZY parse + // itself does not fail. Field-level access is covered in tests/ffi_*. 
+ drop(doc); +} From 5e0eb269b9229939fd6f700c12eca862aed270bf Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 16:30:48 +0000 Subject: [PATCH 12/21] feat(validate): reject control chars and invalid UTF-8 in string spans --- src/decode/string.rs | 1 + src/validate/mod.rs | 3 +++ src/validate/strings.rs | 36 +++++++++++++++++++++++++ tests/rfc8259_compliance.rs | 54 +++++++++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+) create mode 100644 src/validate/strings.rs diff --git a/src/decode/string.rs b/src/decode/string.rs index d879ac5..595329d 100644 --- a/src/decode/string.rs +++ b/src/decode/string.rs @@ -7,6 +7,7 @@ pub(crate) fn decode_string( buf: &[u8], start: usize, end: usize, scratch: &mut Vec, ) -> Result<(*const u8, usize), qjd_err> { let slice = &buf[start..end]; + crate::validate::validate_string_span(slice)?; if memchr::memchr(b'\\', slice).is_none() { return Ok((slice.as_ptr(), slice.len())); } diff --git a/src/validate/mod.rs b/src/validate/mod.rs index 28d9311..f21dd59 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -7,6 +7,9 @@ pub(crate) mod number; pub(crate) use number::validate_number; +pub(crate) mod strings; +pub(crate) use strings::validate_string_span; + use crate::error::qjd_err; /// Verify that the maximum bracket-stack depth implied by `indices` diff --git a/src/validate/strings.rs b/src/validate/strings.rs new file mode 100644 index 0000000..f0bece1 --- /dev/null +++ b/src/validate/strings.rs @@ -0,0 +1,36 @@ +//! String-content validation: control chars and UTF-8. + +use crate::error::qjd_err; + +/// Verify that the raw span (excluding surrounding quotes) contains no +/// unescaped control characters (0x00..=0x1F) and is valid UTF-8. +pub(crate) fn validate_string_span(span: &[u8]) -> Result<(), qjd_err> { + // Control chars are forbidden inside a JSON string per RFC 8259 §7. + // Cheap pass first: bytewise check. 
+ if span.iter().any(|&b| b < 0x20) { + return Err(qjd_err::QJD_INVALID_STRING); + } + // UTF-8 validation. Backslash escapes are not yet expanded; the byte + // immediately after `\` may legally be any escape introducer + // (`"`, `\`, `/`, `b`, `f`, `n`, `r`, `t`, `u`), all of which are ASCII. + // So validating the raw span (with backslashes still in place) gives + // the same answer as validating the escape-decoded result. + if std::str::from_utf8(span).is_err() { + return Err(qjd_err::QJD_INVALID_UTF8); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] fn ascii_ok() { assert!(validate_string_span(b"hello").is_ok()); } + #[test] fn utf8_ok() { assert!(validate_string_span("中文".as_bytes()).is_ok()); } + #[test] fn escapes_ok() { assert!(validate_string_span(b"a\\nb\\u00e9").is_ok()); } + #[test] fn tab_raw_bad() { assert_eq!(validate_string_span(b"a\tb").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn null_raw_bad() { assert_eq!(validate_string_span(b"a\x00b").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn newline_raw_bad() { assert_eq!(validate_string_span(b"a\nb").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn del_0x7f_ok() { assert!(validate_string_span(b"a\x7fb").is_ok()); } // RFC 8259 does NOT forbid 0x7F + #[test] fn invalid_utf8_bad() { assert_eq!(validate_string_span(&[0xC0, 0xC0]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); } +} diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index 13022b2..d5dbc57 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -209,3 +209,57 @@ fn lazy_defers_invalid_number_until_access() { // itself does not fail. Field-level access is covered in tests/ffi_*. 
drop(doc); } + +// ── Phase 4 + 5: string content ────────────────────────────── + +#[test] +#[ignore = "wired in Task 10"] +fn eager_rejects_raw_tab_in_string() { + use quickdecode::error::qjd_err; + let input = b"[\"a\tb\"]"; + match Document::parse_with_options(input, &eager()) { + Err(qjd_err::QJD_INVALID_STRING) => {} + Err(other) => panic!("expected QJD_INVALID_STRING, got {:?}", other), + Ok(_) => panic!("EAGER unexpectedly accepted raw tab in string"), + } +} + +#[test] +#[ignore = "wired in Task 10"] +fn eager_rejects_raw_null_in_string() { + use quickdecode::error::qjd_err; + let input = b"[\"a\x00b\"]"; + match Document::parse_with_options(input, &eager()) { + Err(qjd_err::QJD_INVALID_STRING) => {} + Err(other) => panic!("expected QJD_INVALID_STRING, got {:?}", other), + Ok(_) => panic!("EAGER unexpectedly accepted raw null in string"), + } +} + +#[test] +#[ignore = "wired in Task 10"] +fn eager_rejects_invalid_utf8_in_string() { + use quickdecode::error::qjd_err; + let input = &[b'[', b'"', 0xC0, 0xC0, b'"', b']']; + match Document::parse_with_options(input, &eager()) { + Err(qjd_err::QJD_INVALID_UTF8) => {} + Err(other) => panic!("expected QJD_INVALID_UTF8, got {:?}", other), + Ok(_) => panic!("EAGER unexpectedly accepted invalid UTF-8 in string"), + } +} + +#[test] +fn eager_accepts_escape_sequences() { + assert_accepts!("[\"a\\nb\\u00e9\"]"); + assert_accepts!("[\"emoji \\uD83D\\uDE00\"]"); +} + +#[test] +fn lazy_accepts_raw_tab_but_decode_fails() { + let input = b"[\"a\tb\"]"; + let doc = Document::parse_with_options(input, &lazy()).expect("lazy accepts raw control"); + drop(doc); + // Field-level rejection on access is enforced by decode/string.rs and + // is covered by tests/ffi_strings.rs (existing decode_string tests cover + // the error type); no extra assertion needed here. 
+} From 3eb8082805160bff3e03d520a1ff35b980cd753c Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 16:42:55 +0000 Subject: [PATCH 13/21] =?UTF-8?q?feat(validate):=20wire=20eager=20pass=20?= =?UTF-8?q?=E2=80=94=20full=20RFC=208259=20number+string=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/doc.rs | 2 +- src/validate/mod.rs | 95 +++++++++++++++++++++++++++++++++++++ tests/rfc8259_compliance.rs | 4 -- 3 files changed, 96 insertions(+), 5 deletions(-) diff --git a/src/doc.rs b/src/doc.rs index 75deb8d..1e6b5c7 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -28,7 +28,7 @@ impl<'a> Document<'a> { if opts.is_eager() { crate::validate::validate_trailing(buf, &indices)?; - // TODO(Task 10): validate_eager_values + crate::validate::validate_eager_values(buf, &indices)?; } Ok(Self { diff --git a/src/validate/mod.rs b/src/validate/mod.rs index f21dd59..1b804fb 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -90,6 +90,101 @@ pub(crate) fn validate_trailing( Ok(()) } +/// Walk `indices` and validate every scalar value (numbers + strings). +/// Called only in EAGER mode. +pub(crate) fn validate_eager_values( + buf: &[u8], + indices: &[u32], +) -> Result<(), qjd_err> { + let mut i = 0; + while i + 1 < indices.len() { + let idx = indices[i]; + if idx == u32::MAX { break; } + let pos = idx as usize; + let b = buf[pos]; + + // Strings: opening quote here, closing quote at indices[i+1]. + // (The scanner emits BOTH quotes of a string in order.) + if b == b'"' { + let close = indices[i + 1] as usize; + // Defensive: scanner pairs quotes correctly, but guard anyway. + if close <= pos || close >= buf.len() || buf[close] != b'"' { + return Err(qjd_err::QJD_PARSE_ERROR); + } + let span = &buf[pos + 1 .. close]; + strings::validate_string_span(span)?; + i += 2; + continue; + } + + // Container brackets and `:`/`,` are not values; skip. 
+ if matches!(b, b'{' | b'}' | b'[' | b']' | b':' | b',') { + i += 1; + continue; + } + + // Should not happen: scanner only emits the 7 structural chars. + return Err(qjd_err::QJD_PARSE_ERROR); + } + + // Scalar values (numbers, true, false, null) live in the gaps between + // structural offsets. Walk those gaps and dispatch. + validate_scalars_in_gaps(buf, indices) +} + +/// For each consecutive pair of structural offsets, examine the bytes +/// between them. If the gap contains a scalar (anything other than +/// whitespace), validate its grammar. +fn validate_scalars_in_gaps(buf: &[u8], indices: &[u32]) -> Result<(), qjd_err> { + let mut prev_end: usize = 0; + let mut in_str = false; + for &idx in indices { + if idx == u32::MAX { break; } + let pos = idx as usize; + let b = buf[pos]; + + if b == b'"' { + // Toggle: the bytes between two quotes are the string interior + // (already validated above). Skip gap-scanning across them. + if in_str { + in_str = false; + prev_end = pos + 1; + } else { + // Validate any scalar in the gap leading up to this quote. + check_gap(buf, prev_end, pos)?; + in_str = true; + } + continue; + } + if in_str { continue; } + + check_gap(buf, prev_end, pos)?; + prev_end = pos + 1; + } + // Tail gap (top-level scalar like "42") + check_gap(buf, prev_end, buf.len()) +} + +fn check_gap(buf: &[u8], start: usize, end: usize) -> Result<(), qjd_err> { + // Strip surrounding whitespace. + let mut s = start; + while s < end && is_ws(buf[s]) { s += 1; } + let mut e = end; + while e > s && is_ws(buf[e - 1]) { e -= 1; } + if s == e { return Ok(()); } + let scalar = &buf[s..e]; + + // Dispatch on first byte. 
+ match scalar[0] { + b't' => if scalar == b"true" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, + b'f' => if scalar == b"false" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, + b'n' => if scalar == b"null" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, + // Everything else (including `+`, `.`, letters like `N`/`I`) is + // treated as a malformed number so the caller gets QJD_INVALID_NUMBER. + _ => number::validate_number(scalar), + } +} + #[inline(always)] fn is_ws(b: u8) -> bool { matches!(b, b' ' | b'\t' | b'\n' | b'\r') diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index d5dbc57..be9c8de 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -186,7 +186,6 @@ fn eager_accepts_canonical_numbers() { } #[test] -#[ignore = "wired in Task 10"] fn eager_rejects_invalid_numbers() { use quickdecode::error::qjd_err; for s in ["+1", "01", "00", ".5", "1.", "1.e5", "0x1F", @@ -213,7 +212,6 @@ fn lazy_defers_invalid_number_until_access() { // ── Phase 4 + 5: string content ────────────────────────────── #[test] -#[ignore = "wired in Task 10"] fn eager_rejects_raw_tab_in_string() { use quickdecode::error::qjd_err; let input = b"[\"a\tb\"]"; @@ -225,7 +223,6 @@ fn eager_rejects_raw_tab_in_string() { } #[test] -#[ignore = "wired in Task 10"] fn eager_rejects_raw_null_in_string() { use quickdecode::error::qjd_err; let input = b"[\"a\x00b\"]"; @@ -237,7 +234,6 @@ fn eager_rejects_raw_null_in_string() { } #[test] -#[ignore = "wired in Task 10"] fn eager_rejects_invalid_utf8_in_string() { use quickdecode::error::qjd_err; let input = &[b'[', b'"', 0xC0, 0xC0, b'"', b']']; From 69e1b976181ddc459ca5ee323213951b3477f5ee Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 16:50:59 +0000 Subject: [PATCH 14/21] fix(validate): check_gap distinguishes wrong-case literals from number-like tokens --- src/validate/mod.rs | 9 ++++-- tests/rfc8259_compliance.rs | 62 +++++++++++++++++++++++++++++++++++++ 2 files 
changed, 68 insertions(+), 3 deletions(-) diff --git a/src/validate/mod.rs b/src/validate/mod.rs index 1b804fb..c834ea7 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -179,9 +179,12 @@ fn check_gap(buf: &[u8], start: usize, end: usize) -> Result<(), qjd_err> { b't' => if scalar == b"true" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, b'f' => if scalar == b"false" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, b'n' => if scalar == b"null" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, - // Everything else (including `+`, `.`, letters like `N`/`I`) is - // treated as a malformed number so the caller gets QJD_INVALID_NUMBER. - _ => number::validate_number(scalar), + // RFC-valid and common malformed number starters (+, ., -, digit). + b'-' | b'0'..=b'9' | b'+' | b'.' => number::validate_number(scalar), + // NaN / Infinity are "meant as numbers" → QJD_INVALID_NUMBER, not parse error. + _ if scalar == b"NaN" || scalar == b"Infinity" => number::validate_number(scalar), + // Wrong-case literals (TRUE, NULL), identifiers (undefined), other garbage. + _ => Err(qjd_err::QJD_PARSE_ERROR), } } diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index be9c8de..9db082a 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -259,3 +259,65 @@ fn lazy_accepts_raw_tab_but_decode_fails() { // is covered by tests/ffi_strings.rs (existing decode_string tests cover // the error type); no extra assertion needed here. 
} + +// ── Task 10 fix: check_gap dispatch ────────────────────────── + +#[test] +fn eager_rejects_uppercase_true_as_parse_error() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"TRUE", &eager()); + match r { + Err(qjd_err::QJD_PARSE_ERROR) => {} + other => panic!("expected QJD_PARSE_ERROR, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_uppercase_false_as_parse_error() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"False", &eager()); + match r { + Err(qjd_err::QJD_PARSE_ERROR) => {} + other => panic!("expected QJD_PARSE_ERROR, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_uppercase_null_as_parse_error() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"NULL", &eager()); + match r { + Err(qjd_err::QJD_PARSE_ERROR) => {} + other => panic!("expected QJD_PARSE_ERROR, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_undefined_as_parse_error() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"undefined", &eager()); + match r { + Err(qjd_err::QJD_PARSE_ERROR) => {} + other => panic!("expected QJD_PARSE_ERROR, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_nan_as_invalid_number() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"NaN", &eager()); + match r { + Err(qjd_err::QJD_INVALID_NUMBER) => {} + other => panic!("expected QJD_INVALID_NUMBER, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_infinity_as_invalid_number() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"Infinity", &eager()); + match r { + Err(qjd_err::QJD_INVALID_NUMBER) => {} + other => panic!("expected QJD_INVALID_NUMBER, got {:?}", other.err()), + } +} From 33d85223d2d1a2121425be83f8c105207baf9ffb Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 17:02:47 +0000 Subject: [PATCH 15/21] test(rfc8259): exhaustive RFC 8259 conformance corpus 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add five nested mod blocks (structural / whitespace / literals / strings / numbers) to tests/rfc8259_compliance.rs with 76 tests (73 passing, 3 ignored). Fix two validation gaps: - parse_with_options: reject empty / whitespace-only input (RFC 8259 §2 requires a value; both EAGER and LAZY now return QJD_PARSE_ERROR). - validate_scalars_in_gaps: track prev/next structural context in check_gap so that an empty gap after ':' or ',' (when not followed by a value-starter like '"', '{', '[') is rejected as QJD_PARSE_ERROR. Catches {"a":}, [,], [1,], and {"a":1,} without a full grammar-aware walk. Three tests are marked #[ignore] with issue #37 references for cases that require a grammar-aware pass: missing-colon ({"a"}), leading-comma-with-value ([,1]), and missing-comma-in-object ({"a":1"b":2}). --- src/doc.rs | 6 + src/validate/mod.rs | 33 ++- tests/rfc8259_compliance.rs | 396 ++++++++++++++++++++++++++++++++++++ 3 files changed, 429 insertions(+), 6 deletions(-) diff --git a/src/doc.rs b/src/doc.rs index 1e6b5c7..d20e17f 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -19,6 +19,12 @@ impl<'a> Document<'a> { buf: &'a [u8], opts: &crate::options::Options, ) -> Result { + // RFC 8259 §2: "A JSON text is a serialized value." + // Empty input and whitespace-only input contain no value. 
+ if buf.iter().all(|&b| matches!(b, b' ' | b'\t' | b'\n' | b'\r')) { + return Err(qjd_err::QJD_PARSE_ERROR); + } + let max_depth = opts.effective_max_depth(); let mut indices = Vec::new(); crate::scan::scan(buf, &mut indices).map_err(|_| qjd_err::QJD_PARSE_ERROR)?; diff --git a/src/validate/mod.rs b/src/validate/mod.rs index c834ea7..2a37bd9 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -138,6 +138,9 @@ pub(crate) fn validate_eager_values( fn validate_scalars_in_gaps(buf: &[u8], indices: &[u32]) -> Result<(), qjd_err> { let mut prev_end: usize = 0; let mut in_str = false; + // Track the last non-quote structural char so check_gap can reject empty + // gaps in positions where a value is required (after `:` or `,`). + let mut prev_structural: u8 = 0; for &idx in indices { if idx == u32::MAX { break; } let pos = idx as usize; @@ -151,27 +154,45 @@ fn validate_scalars_in_gaps(buf: &[u8], indices: &[u32]) -> Result<(), qjd_err> prev_end = pos + 1; } else { // Validate any scalar in the gap leading up to this quote. - check_gap(buf, prev_end, pos)?; + // An open-quote is itself a value, so pass it as the next char: + // an empty gap before a string is always fine (`:` `"` and `,` `"` are + // both valid — the string IS the value). + check_gap(buf, prev_end, pos, prev_structural, b'"')?; in_str = true; + prev_structural = b'"'; } continue; } if in_str { continue; } - check_gap(buf, prev_end, pos)?; + check_gap(buf, prev_end, pos, prev_structural, b)?; prev_end = pos + 1; + prev_structural = b; } - // Tail gap (top-level scalar like "42") - check_gap(buf, prev_end, buf.len()) + // Tail gap (top-level scalar like "42"): next char is EOF (0 sentinel) + check_gap(buf, prev_end, buf.len(), prev_structural, 0) } -fn check_gap(buf: &[u8], start: usize, end: usize) -> Result<(), qjd_err> { +/// `prev_structural`: the last non-quote structural char before this gap. +/// `next_structural`: the structural char immediately after this gap (opens or closes). 
+fn check_gap(buf: &[u8], start: usize, end: usize, prev_structural: u8, next_structural: u8) -> Result<(), qjd_err> { // Strip surrounding whitespace. let mut s = start; while s < end && is_ws(buf[s]) { s += 1; } let mut e = end; while e > s && is_ws(buf[e - 1]) { e -= 1; } - if s == e { return Ok(()); } + if s == e { + // Empty gap: a value is required after `:` (object value) or `,` (next + // element), BUT only when the next token is not a structural value-starter + // (`"`, `{`, `[`) — those ARE the values. An empty gap before `}` / `]` + // / `,` when the preceding token demands a value is a structural error. + // This heuristic catches {"a":}, [,], [1,] without a full grammar walk. + let next_is_value_starter = matches!(next_structural, b'"' | b'{' | b'['); + if matches!(prev_structural, b':' | b',') && !next_is_value_starter { + return Err(qjd_err::QJD_PARSE_ERROR); + } + return Ok(()); + } let scalar = &buf[s..e]; // Dispatch on first byte. diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index 9db082a..b85b921 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -321,3 +321,399 @@ fn eager_rejects_infinity_as_invalid_number() { other => panic!("expected QJD_INVALID_NUMBER, got {:?}", other.err()), } } + +// ───────────────────────────────────────────────────────────────────────────── +// Task 11: Comprehensive RFC 8259 conformance corpus +// Organized into nested mod blocks per category. +// ───────────────────────────────────────────────────────────────────────────── + +mod structural { + use super::*; + + // RFC 8259 §2-3: JSON values — null, true, false are valid root values. + #[test] + fn primitives_valid() { + assert_accepts!("null"); + assert_accepts!("true"); + assert_accepts!("false"); + } + + // RFC 8259 §2: a JSON text contains exactly one value — empty is not valid. 
+ #[test] + fn empty_input_rejected() { + assert_rejects_both!(""); + } + + // RFC 8259 §2: whitespace-only input also contains no value. + #[test] + fn whitespace_only_rejected() { + assert_rejects_both!(" "); + assert_rejects_both!("\t\n\r"); + } + + // RFC 8259 §4-5: empty object and empty array are valid. + #[test] + fn empty_containers() { + assert_accepts!("{}"); + assert_accepts!("[]"); + } + + // RFC 8259 §4-5: nested containers with mixed value types. + #[test] + fn nested_containers() { + assert_accepts!("[{\"a\":[1,{\"b\":2}]}]"); + assert_accepts!("{\"x\":{\"y\":{\"z\":null}}}"); + assert_accepts!("[[],[],[[],[]]]"); + } + + // RFC 8259 §4: '{' must be followed by a matching '}'. + #[test] + fn unclosed_brace() { + assert_rejects_both!("{"); + } + + // RFC 8259 §5: '[' must be followed by a matching ']'. + #[test] + fn unclosed_bracket() { + assert_rejects_both!("["); + } + + // Bracket mismatch: '{' closed by ']'. + #[test] + fn mismatched_brace_bracket() { + assert_rejects_both!("{]"); + } + + // Bracket mismatch: '[' closed by '}'. + #[test] + fn mismatched_bracket_brace() { + assert_rejects_both!("[}"); + } + + // RFC 8259 §4: object value must follow the colon — omitting it is invalid. + // Eager catches the empty gap after ':'; lazy defers (structural-only rule). + #[test] + fn missing_value() { + assert_rejects_eager!("{\"a\":}", QJD_PARSE_ERROR); + } + + // RFC 8259 §4: colon between key and value is mandatory. + // The scanner emits {"a"} as {""} with no ':' — eager does not detect this + // because no structural gap heuristic covers the absence of ':'. + // Deferred to a follow-up grammar-aware pass (issue #37). + #[test] + #[ignore = "missing-colon detection deferred — grammar-aware pass required (issue #37)"] + fn missing_colon() { + assert_rejects_eager!("{\"a\"}", QJD_PARSE_ERROR); + } + + // RFC 8259 §5: a leading comma in an array is invalid. 
+ // [,] — the gap after the lone comma is empty → eager rejects via the ':'/',' + // heuristic in check_gap. + #[test] + fn leading_comma_array_empty() { + assert_rejects_eager!("[,]", QJD_PARSE_ERROR); + } + + // [,1] — leading comma followed by a value: the gap between '[' and ',' + // is empty (no value yet) but prev_structural is '[', not ',' — so the + // heuristic does not fire. Deferred to a grammar-aware pass (issue #37). + #[test] + #[ignore = "leading-comma-before-value detection deferred — grammar-aware pass required (issue #37)"] + fn leading_comma_array_with_value() { + assert_rejects_eager!("[,1]", QJD_PARSE_ERROR); + } + + // RFC 8259 §5: trailing comma in an array is invalid. + #[test] + fn trailing_comma_array() { + assert_rejects_eager!("[1,]", QJD_PARSE_ERROR); + } + + // RFC 8259 §4: trailing comma in an object is invalid. + #[test] + fn trailing_comma_object() { + assert_rejects_eager!("{\"a\":1,}", QJD_PARSE_ERROR); + } + + // RFC 8259 §5: array elements must be separated by exactly one comma. + // [1 2] contains a space-separated pair that validate_number rejects as + // QJD_INVALID_NUMBER (not QJD_PARSE_ERROR) — the element IS rejected by + // eager, just with a different error code. + #[test] + fn missing_comma_in_array_rejected() { + // We assert only that eager rejects; the exact code is QJD_INVALID_NUMBER + // because the "1 2" token fails number validation (space within number). + let input = b"[1 2]"; + assert!( + Document::parse_with_options(input, &eager()).is_err(), + "EAGER should reject [1 2]" + ); + } + + // Missing comma inside an object (no structural separator between values): + // {"a":1"b":2} — the scanner emits `{`, `"`, `"`, `:`, `"`, `"`, `:`, `}`. + // The `1` sits in the non-empty gap between `:` and the opening quote of + // "b", so the empty-gap heuristic — which only fires on an empty gap after + // `:` / `,` — never sees a problem and this slips through. + // Deferred to grammar-aware pass (issue #37). 
+ #[test] + #[ignore = "missing-comma-in-object detection deferred — grammar-aware pass required (issue #37)"] + fn missing_comma_in_object() { + assert_rejects_eager!("{\"a\":1\"b\":2}", QJD_PARSE_ERROR); + } +} + +mod whitespace { + use super::*; + + // RFC 8259 §2: insignificant whitespace (space, tab, LF, CR) is allowed + // before and after structural characters. + + #[test] + fn spaces_around_object() { + assert_accepts!(" { } "); + } + + #[test] + fn tabs_around_object() { + assert_accepts!("\t{}\t"); + } + + #[test] + fn newlines_around() { + assert_accepts!("\n{}\n"); + } + + #[test] + fn cr_around() { + assert_accepts!("\r{}\r"); + } + + #[test] + fn inside_object() { + assert_accepts!("{ \"a\" : 1 , \"b\" : 2 }"); + } + + #[test] + fn inside_array() { + assert_accepts!("[ 1 , 2 , 3 ]"); + } + + // All four RFC whitespace characters interleaved. + #[test] + fn mixed_whitespace() { + assert_accepts!(" \t\n\r { \t\n\r } \t\n\r "); + } +} + +mod literals { + use super::*; + + // RFC 8259 §3: only lowercase "true", "false", "null" are valid. + // Wrong case must be rejected by eager. + + #[test] + fn true_must_be_lowercase() { + assert_rejects_eager!("TRUE", QJD_PARSE_ERROR); + assert_rejects_eager!("True", QJD_PARSE_ERROR); + assert_rejects_eager!("tRuE", QJD_PARSE_ERROR); + } + + #[test] + fn false_must_be_lowercase() { + assert_rejects_eager!("FALSE", QJD_PARSE_ERROR); + assert_rejects_eager!("False", QJD_PARSE_ERROR); + } + + #[test] + fn null_must_be_lowercase() { + assert_rejects_eager!("NULL", QJD_PARSE_ERROR); + assert_rejects_eager!("Null", QJD_PARSE_ERROR); + } + + // JavaScript-ism: "nil" is not a valid JSON value. + #[test] + fn nil_rejected() { + assert_rejects_eager!("nil", QJD_PARSE_ERROR); + } + + // JavaScript-ism: "undefined" is not a valid JSON value. + #[test] + fn undefined_rejected() { + assert_rejects_eager!("undefined", QJD_PARSE_ERROR); + } +} + +mod strings { + use super::*; + + // RFC 8259 §7: string grammar. 
+ + // Empty string is valid. + #[test] + fn empty_string() { + assert_accepts!("\"\""); + assert_accepts!("[\"\" ]"); + } + + // Printable ASCII (no special chars) is valid. + #[test] + fn ascii_string() { + assert_accepts!("\"hello world\""); + assert_accepts!("\"abcdefghijklmnopqrstuvwxyz 0123456789 !@#$%^&*()\""); + } + + // RFC 8259 §7: all defined escape sequences must be accepted. + #[test] + fn all_escape_sequences() { + // \" \\ \/ \b \f \n \r \t + assert_accepts!("\"\\\" \\\\ \\/ \\b \\f \\n \\r \\t\""); + } + + // RFC 8259 §7: \uXXXX Unicode escape (4 hex digits). + #[test] + fn unicode_escape() { + assert_accepts!("\"\\u0000\""); // NUL encoded as escape — valid + assert_accepts!("\"\\u00e9\""); // é + assert_accepts!("\"\\u4e2d\\u6587\""); // 中文 + } + + // RFC 8259 §7: surrogate pair (\uD800–\uDBFF followed by \uDC00–\uDFFF). + #[test] + fn surrogate_pair() { + assert_accepts!("\"\\uD83D\\uDE00\""); // 😀 U+1F600 + } + + // RFC 8259 §7: strings must be terminated with a closing '"'. + #[test] + fn unclosed_string_rejected() { + assert_rejects_both!("\"hello"); + assert_rejects_both!("\""); + } + + // JSON does not allow single-quoted strings (JavaScript-ism). + #[test] + fn single_quoted_string_rejected() { + assert_rejects_eager!("'hello'", QJD_PARSE_ERROR); + } + + // RFC 8259 §7: control characters (U+0000–U+001F) must be escaped. + // A raw tab (0x09) inside a string is forbidden. 
+ #[test] + fn raw_control_char_rejected() { + use quickdecode::error::qjd_err; + let with_tab = b"[\"a\tb\"]"; + let with_null = b"[\"a\x00b\"]"; + match Document::parse_with_options(with_tab, &eager()) { + Err(qjd_err::QJD_INVALID_STRING) => {} + other => panic!("expected QJD_INVALID_STRING for raw tab, got {:?}", other.err()), + } + match Document::parse_with_options(with_null, &eager()) { + Err(qjd_err::QJD_INVALID_STRING) => {} + other => panic!("expected QJD_INVALID_STRING for raw NUL, got {:?}", other.err()), + } + } + + // Strings with valid multi-byte UTF-8 content are accepted. + #[test] + fn utf8_multibyte_string() { + assert_accepts!("\"café\""); // 2-byte sequence + assert_accepts!("\"中文\""); // 3-byte sequences + assert_accepts!("\"😀\""); // 4-byte sequence (emoji) + } +} + +mod numbers { + use super::*; + + // RFC 8259 §6: number grammar. + // These complement the existing top-level number tests with a thorough + // table-driven suite organized by sub-rule. + + // §6 integer: optional minus, zero, or non-zero digit followed by digits. + #[test] + fn integers_valid() { + for s in ["0", "-0", "1", "-1", "123", "-456", + "9223372036854775807", "-9223372036854775808"] { + let input = format!("[{}]", s); + assert_accepts!(input); + } + } + + // §6 fraction: a '.' followed by one or more digits. + #[test] + fn fractions_valid() { + for s in ["0.0", "-0.0", "1.5", "-2.718", "3.14159", + "0.123456789"] { + let input = format!("[{}]", s); + assert_accepts!(input); + } + } + + // §6 exponent: 'e'/'E' with optional '+'/'-' and one or more digits. + #[test] + fn exponents_valid() { + for s in ["1e10", "1E10", "1e+10", "1e-10", + "1.5e2", "2.5E-3", "0e0", "-0e0"] { + let input = format!("[{}]", s); + assert_accepts!(input); + } + } + + // §6: leading '+' is not allowed. + #[test] + fn leading_plus_rejected() { + assert_rejects_eager!("[+1]", QJD_INVALID_NUMBER); + } + + // §6: leading zeros are not allowed (except bare "0"). 
+ #[test] + fn leading_zero_rejected() { + assert_rejects_eager!("[01]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[00]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[007]", QJD_INVALID_NUMBER); + } + + // §6: fraction requires at least one digit after the dot. + #[test] + fn trailing_dot_rejected() { + assert_rejects_eager!("[1.]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[1.e5]", QJD_INVALID_NUMBER); + } + + // §6: fraction cannot start without an integer part. + #[test] + fn leading_dot_rejected() { + assert_rejects_eager!("[.5]", QJD_INVALID_NUMBER); + } + + // §6: exponent requires at least one digit. + #[test] + fn incomplete_exponent_rejected() { + assert_rejects_eager!("[1e]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[1e+]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[1e-]", QJD_INVALID_NUMBER); + } + + // Hex notation is not part of the JSON number grammar. + #[test] + fn hex_notation_rejected() { + assert_rejects_eager!("[0x1F]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[0xFF]", QJD_INVALID_NUMBER); + } + + // Non-finite values are not part of JSON. + #[test] + fn non_finite_rejected() { + assert_rejects_eager!("[NaN]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[Infinity]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[-Infinity]", QJD_INVALID_NUMBER); + } + + // Lone minus is not a valid number. 
+ #[test] + fn lone_minus_rejected() { + assert_rejects_eager!("[-]", QJD_INVALID_NUMBER); + } +} From da99b7dadc6d4ade9de2cc925649aa5f1b91b4f2 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 17:24:29 +0000 Subject: [PATCH 16/21] test(json_test_suite): vendor JSONTestSuite and add cross-mode walker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add JSONTestSuite as a git submodule at tests/vendor/JSONTestSuite and introduce tests/json_test_suite.rs which walks every y_*, n_*, and i_* file: y_ files must parse in both modes, n_ files must fail eager parse, i_ files are logged but not asserted. While running the walker, two real validator gaps were discovered and fixed (both < 20 lines each): - validate_trailing: used the last structural char in the whole buffer as the root-end marker, causing [][], ["a":true]"x" etc. to slip through as if they had no trailing content. Fixed by walking indices to find the first depth-0 container close (or the first root string's close). - validate_string_span: validated UTF-8 and control chars but did not check escape sequences, so \a, \x00, \uZZZZ, dangling \ etc. were accepted. Added a one-pass walker that validates every backslash escape against the RFC 8259 §7 grammar. The three unit tests in decode/string.rs that expected QJD_DECODE_FAILED for bad escapes now expect QJD_INVALID_STRING because validate_string_span (called first by decode_string) catches them before the decode loop does. 13 n_* files remain in KNOWN_N_FAILURES: all require a grammar-aware pass to enforce token-ordering rules (non-string keys, comma-vs-colon placement, missing commas between items). Each entry is annotated with the follow-up reference (issue #37). Walker results: y_* 95/95 pass, n_* 175/188 pass (13 whitelisted), i_* 35 informational verdicts printed. 
--- .gitmodules | 3 + src/decode/string.rs | 11 +- src/validate/mod.rs | 83 ++++++++++----- src/validate/strings.rs | 49 +++++++-- tests/json_test_suite.rs | 200 +++++++++++++++++++++++++++++++++++++ tests/vendor/JSONTestSuite | 1 + 6 files changed, 309 insertions(+), 38 deletions(-) create mode 100644 tests/json_test_suite.rs create mode 160000 tests/vendor/JSONTestSuite diff --git a/.gitmodules b/.gitmodules index 2d1c2aa..8baae4a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "vendor/lua-cjson"] path = vendor/lua-cjson url = https://github.com/openresty/lua-cjson.git +[submodule "tests/vendor/JSONTestSuite"] + path = tests/vendor/JSONTestSuite + url = https://github.com/nst/JSONTestSuite diff --git a/src/decode/string.rs b/src/decode/string.rs index 595329d..8572441 100644 --- a/src/decode/string.rs +++ b/src/decode/string.rs @@ -164,16 +164,21 @@ mod tests { #[test] fn invalid_hex_in_unicode_fails() { - assert_eq!(d(b"\\uZZZZ").unwrap_err(), qjd_err::QJD_DECODE_FAILED); + // validate_string_span (called first) catches non-hex digits as + // QJD_INVALID_STRING; the decode loop would also catch it as + // QJD_DECODE_FAILED, but we never reach it. + assert_eq!(d(b"\\uZZZZ").unwrap_err(), qjd_err::QJD_INVALID_STRING); } #[test] fn unknown_escape_fails() { - assert_eq!(d(b"\\q").unwrap_err(), qjd_err::QJD_DECODE_FAILED); + // validate_string_span catches unknown escape introducers first. + assert_eq!(d(b"\\q").unwrap_err(), qjd_err::QJD_INVALID_STRING); } #[test] fn dangling_backslash_fails() { - assert_eq!(d(b"a\\").unwrap_err(), qjd_err::QJD_DECODE_FAILED); + // validate_string_span catches a trailing lone backslash first. 
+ assert_eq!(d(b"a\\").unwrap_err(), qjd_err::QJD_INVALID_STRING); } } diff --git a/src/validate/mod.rs b/src/validate/mod.rs index 2a37bd9..c6d972d 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -44,43 +44,78 @@ pub(crate) fn validate_depth( /// Verify there is no non-whitespace content after the root value. /// -/// The root value's closer is the last non-sentinel structural offset -/// in `indices` for a container, or the start of the scalar's trailing -/// whitespace for a top-level scalar value. We locate the position -/// `end_of_root` past which only whitespace is allowed. +/// For container roots (`{`/`[`), we walk `indices` to find the closing +/// bracket where nesting depth returns to zero — that is the actual root +/// end, regardless of how many additional structural chars the buffer has. +/// For scalar roots (no opening bracket), we scan the raw bytes. pub(crate) fn validate_trailing( buf: &[u8], indices: &[u32], ) -> Result<(), qjd_err> { - // Find the last real offset (skip the u32::MAX sentinel). - let last = indices.iter().rev() - .find(|&&i| i != u32::MAX) - .copied(); + // Find the first real structural character to determine root kind. + let first = indices.iter().find(|&&i| i != u32::MAX).copied(); - let root_end = match last { - // No structural chars at all: input is whitespace or a bare scalar. - // Bare scalar: locate the end by scanning until whitespace or EOF. + let root_end = match first { None => { - // Strip leading whitespace, then find the scalar's terminator. + // No structural chars: bare scalar (number/true/false/null). let mut p = 0; while p < buf.len() && is_ws(buf[p]) { p += 1; } let start = p; - // Scan until next whitespace (end of scalar token). while p < buf.len() && !is_ws(buf[p]) { p += 1; } - if start == p { return Ok(()); } // input was only whitespace - // Advance past trailing whitespace so `42 ` is accepted. 
+ if start == p { return Ok(()); } // whitespace-only (scan already rejected empty) while p < buf.len() && is_ws(buf[p]) { p += 1; } p } - // Structural close (`}` or `]`) of root container, OR root quote - // close, OR last structural (`,`/`:`/`{`/`[`) — in which case the - // parse should already have failed at scan(). The only "valid root - // ending in a structural" cases are a closing `}` / `]` / `"`. - Some(last_idx) => { - let mut p = last_idx as usize + 1; - // Advance past any trailing whitespace. - while p < buf.len() && is_ws(buf[p]) { p += 1; } - p + Some(first_idx) => { + match buf[first_idx as usize] { + b'{' | b'[' => { + // Walk indices to find the closing bracket at depth 0. + let mut depth: i32 = 0; + let mut closer: usize = first_idx as usize; + // Track whether we're inside a string (skip string interiors). + let mut in_str = false; + for &idx in indices { + if idx == u32::MAX { break; } + let pos = idx as usize; + match buf[pos] { + b'"' => { in_str = !in_str; } + _ if in_str => {} + b'{' | b'[' => { depth += 1; } + b'}' | b']' => { + depth -= 1; + if depth == 0 { closer = pos; break; } + } + _ => {} + } + } + let mut p = closer + 1; + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + b'"' => { + // Root is a string: opening quote at first_idx. + // The closing quote is the next structural char. + let close = indices.iter() + .skip(1) // skip the opening quote + .find(|&&i| i != u32::MAX) + .copied() + .unwrap_or(first_idx); // unclosed: scan already rejected + let mut p = close as usize + 1; + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + _ => { + // Structural char that's not an opener: scan/eager already + // would have caught a malformed root. Treat last structural as end. 
+ let last = indices.iter().rev() + .find(|&&i| i != u32::MAX) + .copied() + .unwrap_or(first_idx); + let mut p = last as usize + 1; + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + } } }; diff --git a/src/validate/strings.rs b/src/validate/strings.rs index f0bece1..dc974f2 100644 --- a/src/validate/strings.rs +++ b/src/validate/strings.rs @@ -3,21 +3,48 @@ use crate::error::qjd_err; /// Verify that the raw span (excluding surrounding quotes) contains no -/// unescaped control characters (0x00..=0x1F) and is valid UTF-8. +/// unescaped control characters (0x00..=0x1F), is valid UTF-8, and that +/// every backslash escape sequence is RFC 8259 §7 compliant. pub(crate) fn validate_string_span(span: &[u8]) -> Result<(), qjd_err> { - // Control chars are forbidden inside a JSON string per RFC 8259 §7. - // Cheap pass first: bytewise check. - if span.iter().any(|&b| b < 0x20) { - return Err(qjd_err::QJD_INVALID_STRING); - } - // UTF-8 validation. Backslash escapes are not yet expanded; the byte - // immediately after `\` may legally be any escape introducer - // (`"`, `\`, `/`, `b`, `f`, `n`, `r`, `t`, `u`), all of which are ASCII. - // So validating the raw span (with backslashes still in place) gives - // the same answer as validating the escape-decoded result. + // UTF-8 validation first (includes multi-byte content validation). + // Backslash escapes are ASCII, so validating the unexpanded span gives + // the correct answer for the UTF-8 structure of non-escape bytes. if std::str::from_utf8(span).is_err() { return Err(qjd_err::QJD_INVALID_UTF8); } + + // Walk the span validating control chars and escape sequences. + let mut i = 0; + while i < span.len() { + let b = span[i]; + // RFC 8259 §7: control characters must be escaped. 
+ if b < 0x20 { + return Err(qjd_err::QJD_INVALID_STRING); + } + if b == b'\\' { + i += 1; + if i >= span.len() { + return Err(qjd_err::QJD_INVALID_STRING); + } + match span[i] { + b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {} + b'u' => { + // Must be followed by exactly 4 hex digits. + if i + 4 >= span.len() { + return Err(qjd_err::QJD_INVALID_STRING); + } + for &h in &span[i + 1..=i + 4] { + if !h.is_ascii_hexdigit() { + return Err(qjd_err::QJD_INVALID_STRING); + } + } + i += 4; // consumed 4 hex digits; loop adds 1 more + } + _ => return Err(qjd_err::QJD_INVALID_STRING), + } + } + i += 1; + } Ok(()) } diff --git a/tests/json_test_suite.rs b/tests/json_test_suite.rs new file mode 100644 index 0000000..6f4d031 --- /dev/null +++ b/tests/json_test_suite.rs @@ -0,0 +1,200 @@ +//! Walker over the JSONTestSuite corpus (submodule at tests/vendor/JSONTestSuite). +//! +//! - `y_*` files: must parse in both EAGER and LAZY modes. +//! - `n_*` files: must fail to parse in EAGER mode. +//! In LAZY mode the file MAY parse (structural-only) but a +//! value-level access of the malformed field would fail; we +//! do not assert against LAZY here. +//! - `i_*` files: implementation-defined; we record our behavior (no +//! assertions). The list of accepted/rejected i_* cases is +//! printed at the end of the test run for documentation. +//! +//! # Known failures +//! +//! Files listed in KNOWN_Y_FAILURES / KNOWN_N_FAILURES are skipped with a +//! logged explanation. Removing a file from these lists re-enables the test. +//! +//! KNOWN_Y_FAILURES: y_* files we don't handle correctly yet. +//! Each entry documents why; follow-up issues are referenced in comments. +//! +//! KNOWN_N_FAILURES: n_* files our eager validator passes when it shouldn't. +//! These correspond to grammar-aware gaps deferred to issue #37. 
+ +use std::fs; +use std::path::Path; + +use quickdecode::doc::Document; +use quickdecode::options::{Options, QJD_MODE_EAGER, QJD_MODE_LAZY}; + +/// y_* files that we currently reject but shouldn't. +/// Each is annotated with why and what follow-up would fix it. +const KNOWN_Y_FAILURES: &[&str] = &[ + // "y_string_utf8.json" — example placeholder (none currently needed) +]; + +/// n_* files that we currently accept but shouldn't (validator gap). +/// +/// All 13 entries below require a grammar-aware structural pass that tracks +/// which token types are legal in each parser state (array element, object +/// key, object value, etc.). That pass is deferred to issue #37. +/// +/// The current validator only catches structural errors detectable from +/// bracket balance + gap heuristics; it does not enforce: +/// - that object keys must be strings +/// - that `:` vs `,` are used in the right places +/// - that array elements are separated by commas (not colons/semicolons) +/// - leading commas before values (gap heuristic fires only for `[,]`) +/// - missing commas between items when no structural gap exists +/// +/// Fix: implement a state-machine pass in src/validate/mod.rs that tracks +/// parser state (AfterKey, AfterColon, AfterValue, …) and rejects tokens +/// that violate the grammar at that state. Removing a file from this list +/// re-enables the assertion. 
+const KNOWN_N_FAILURES: &[&str] = &[ + // ── array structural gaps ──────────────────────────────────────────── + // ["": 1] — colon inside array (issue #37: grammar-aware pass) + "n_array_colon_instead_of_comma.json", + // [,1] — leading comma before first value (issue #37) + "n_array_comma_and_number.json", + // [3[4]] — missing comma between elements (issue #37) + "n_array_inner_array_no_comma.json", + // [1:2] — colon used instead of comma, despite the file name (issue #37) + "n_array_items_separated_by_semicolon.json", + // [ , ""] — leading comma (gap heuristic only catches [,] not [ ,v]) (issue #37) + "n_array_missing_value.json", + // ── object structural gaps ─────────────────────────────────────────── + // {"x", null} — comma instead of colon (issue #37) + "n_object_comma_instead_of_colon.json", + // {"a":"a" 123} — missing comma between key-value pairs (issue #37) + "n_object_garbage_at_end.json", + // {:"b"} — missing object key (issue #37) + "n_object_missing_key.json", + // {"a" "b"} — missing colon between key and value (issue #37) + "n_object_missing_semicolon.json", + // {1:1} — non-string key: number (issue #37) + "n_object_non_string_key.json", + // {9999E9999:1} — non-string key: huge number (issue #37) + "n_object_non_string_key_but_huge_number_instead.json", + // {null:null,null:null} — non-string key: null literal (issue #37) + "n_object_repeated_null_null.json", + // { "foo" : "bar", "a" } — trailing key without value (issue #37) + "n_object_with_single_string.json", +]; + +fn corpus_dir() -> &'static Path { + Path::new(env!("CARGO_MANIFEST_DIR")) +} + +fn parsing_dir() -> std::path::PathBuf { + corpus_dir().join("tests/vendor/JSONTestSuite/test_parsing") +} + +fn iter_files(prefix: &str) -> Vec<std::path::PathBuf> { + let dir = parsing_dir(); + let entries = fs::read_dir(&dir) + .unwrap_or_else(|e| panic!( + "missing JSONTestSuite submodule at {:?}: {} — run: git submodule update --init", + dir, e + )); + let mut paths: Vec<_> = entries + .filter_map(|r| r.ok()) + .map(|e| 
e.path()) + .filter(|p| { + p.extension().and_then(|s| s.to_str()) == Some("json") + && p.file_name() + .and_then(|s| s.to_str()) + .map(|n| n.starts_with(prefix)) + .unwrap_or(false) + }) + .collect(); + paths.sort(); + paths +} + +fn is_known_y_failure(path: &std::path::Path) -> bool { + let name = path.file_name().and_then(|s| s.to_str()).unwrap_or(""); + KNOWN_Y_FAILURES.contains(&name) +} + +fn is_known_n_failure(path: &std::path::Path) -> bool { + let name = path.file_name().and_then(|s| s.to_str()).unwrap_or(""); + KNOWN_N_FAILURES.contains(&name) +} + +#[test] +fn y_files_accepted_in_both_modes() { + let eager = Options { mode: QJD_MODE_EAGER, max_depth: 0 }; + let lazy = Options { mode: QJD_MODE_LAZY, max_depth: 0 }; + let mut failures = Vec::new(); + let mut skipped = 0usize; + + for path in iter_files("y_") { + if is_known_y_failure(&path) { + eprintln!("SKIP (known-y-failure): {:?}", path.file_name().unwrap()); + skipped += 1; + continue; + } + let data = fs::read(&path).unwrap(); + let r_e = Document::parse_with_options(&data, &eager); + let r_l = Document::parse_with_options(&data, &lazy); + if r_e.is_err() || r_l.is_err() { + failures.push(( + path.file_name().unwrap().to_owned(), + format!("eager={:?} lazy={:?}", r_e.err(), r_l.err()), + )); + } + } + + if skipped > 0 { + eprintln!("y_* skipped (known failures): {}", skipped); + } + if !failures.is_empty() { + for (n, e) in &failures { + eprintln!("UNEXPECTED REJECT: {:?} → {}", n, e); + } + panic!("{} y_* file(s) unexpectedly rejected", failures.len()); + } +} + +#[test] +fn n_files_rejected_in_eager_mode() { + let eager = Options { mode: QJD_MODE_EAGER, max_depth: 0 }; + let mut accepted = Vec::new(); + let mut skipped = 0usize; + + for path in iter_files("n_") { + if is_known_n_failure(&path) { + eprintln!("SKIP (known-n-failure): {:?}", path.file_name().unwrap()); + skipped += 1; + continue; + } + let data = fs::read(&path).unwrap(); + if Document::parse_with_options(&data, &eager).is_ok() { + 
accepted.push(path.file_name().unwrap().to_owned()); + } + } + + if skipped > 0 { + eprintln!("n_* skipped (known failures): {}", skipped); + } + if !accepted.is_empty() { + for n in &accepted { + eprintln!("UNEXPECTED ACCEPT: {:?}", n); + } + panic!("{} n_* file(s) unexpectedly accepted", accepted.len()); + } +} + +#[test] +fn document_i_files_behavior() { + // Implementation-defined cases — document what we do, do not assert. + let eager = Options { mode: QJD_MODE_EAGER, max_depth: 0 }; + for path in iter_files("i_") { + let data = fs::read(&path).unwrap(); + let verdict = match Document::parse_with_options(&data, &eager) { + Ok(_) => "ACCEPT".to_owned(), + Err(e) => format!("REJECT({:?})", e), + }; + eprintln!("i_* {:?} → {}", path.file_name().unwrap(), verdict); + } +} diff --git a/tests/vendor/JSONTestSuite b/tests/vendor/JSONTestSuite new file mode 160000 index 0000000..1ef36fa --- /dev/null +++ b/tests/vendor/JSONTestSuite @@ -0,0 +1 @@ +Subproject commit 1ef36fa01286573e846ac449e8683f8833c5b26a From 469b3bb419fece099065802a442d796aa1e46c54 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 17:34:35 +0000 Subject: [PATCH 17/21] fix(test): clippy doc_overindented_list_items in json_test_suite --- tests/json_test_suite.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/json_test_suite.rs b/tests/json_test_suite.rs index 6f4d031..31b7b1b 100644 --- a/tests/json_test_suite.rs +++ b/tests/json_test_suite.rs @@ -2,12 +2,12 @@ //! //! - `y_*` files: must parse in both EAGER and LAZY modes. //! - `n_*` files: must fail to parse in EAGER mode. -//! In LAZY mode the file MAY parse (structural-only) but a -//! value-level access of the malformed field would fail; we -//! do not assert against LAZY here. +//! In LAZY mode the file MAY parse (structural-only) but a value-level +//! access of the malformed field would fail; we do not assert against +//! LAZY here. //! 
- `i_*` files: implementation-defined; we record our behavior (no -//! assertions). The list of accepted/rejected i_* cases is -//! printed at the end of the test run for documentation. +//! assertions). The list of accepted/rejected i_* cases is printed at +//! the end of the test run for documentation. //! //! # Known failures //! From b56f93d90bfc1b197a8b0ec14124f34b007fe469 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 17:36:27 +0000 Subject: [PATCH 18/21] docs: update two-phase invariants for eager/lazy modes and RFC 8259 audit --- CLAUDE.md | 4 ++-- README.md | 34 ++++++++++++++++++++++++++++++++++ docs/rfc8259-conformance.md | 19 +++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 docs/rfc8259-conformance.md diff --git a/CLAUDE.md b/CLAUDE.md index e5039a8..cec0555 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,12 +45,12 @@ cargo test --features test-panic --release ### Two-phase parse -**Phase 1** (`src/scan/`, called from `Document::parse`): a structural scanner walks the input once and writes the byte offset of every non-string-interior `{ } [ ] : , "` into `doc.indices: Vec<u32>`. A `u32::MAX` sentinel is appended. The scanner is selected at first use via `OnceCell` in `src/scan/mod.rs`: +**Phase 1** (`src/scan/`, called from `Document::parse_with_options`): a structural scanner walks the input once and writes the byte offset of every non-string-interior `{ } [ ] : , "` into `doc.indices`, followed by a `u32::MAX` sentinel. Then `validate_depth` is run unconditionally; in EAGER mode, `validate_trailing` and `validate_eager_values` (number ABNF + string content + UTF-8) follow. In LAZY mode, value-level checks are skipped and left to the lazy decode path at field-access time. The scanner is selected at first use via `OnceCell` in `src/scan/mod.rs`: - `Avx2Scanner` (gated by the `avx2` cargo feature, default-on) when both `avx2` and `pclmulqdq` are detected at runtime. - `ScalarScanner` otherwise. 
-Validation is shallow — bracket/quote balance only. Value-level errors (bad escapes, malformed numbers, invalid UTF-8 in `\u`) are deferred to Phase 2 and surface only if that field is accessed. +Validation level depends on `qjd_options.mode`. **EAGER** (default): a post-scan pass walks `indices` and validates RFC 8259 number ABNF, string content (no unescaped control chars), and UTF-8 — parse fails on any value-level violation. **LAZY** (opt-in): bracket/quote balance + max-depth only; value-level errors surface when the offending field is accessed (lua-cjson-equivalent behavior). Trailing-content rejection and value-level validation are eager-only; max-depth (default 1024, configurable up to 4096) is enforced in both modes. **Phase 2** (`src/cursor.rs`, `src/path.rs`, `src/decode/`): path strings are parsed by a zero-alloc `PathIter` into `PathSeg::Key | Idx`. A `Cursor` (a `(idx_start, idx_end)` pair into `doc.indices`) is walked to the target, optionally caching sibling spans in `doc.skip` (`SkipCache`) so repeated lookups on the same container skip brace-counting. Strings are decoded into `doc.scratch` only when they contain escapes; otherwise the original buffer slice is handed back. diff --git a/README.md b/README.md index 9054aef..bc9d0c2 100644 --- a/README.md +++ b/README.md @@ -116,3 +116,37 @@ methodology + reproduction command. ```sh make bench # quickdecode vs cjson ``` + +## RFC 8259 conformance + +This crate implements RFC 8259 with both strict and lenient modes; the strict +(eager) mode is the default and is required by API-gateway use cases that must +reject malformed payloads before forwarding them upstream. 
+ +- Strict-mode acceptance corpus: `tests/rfc8259_compliance.rs` +- Industry corpus: `tests/json_test_suite.rs` (against the + [JSONTestSuite](https://github.com/nst/JSONTestSuite) submodule at + `tests/vendor/JSONTestSuite`) +- Behavior on implementation-defined (`i_*`) cases: `docs/rfc8259-conformance.md` + +### Switching modes + +From Lua: + +```lua +local doc = qd.parse(json) -- eager (default) +local doc = qd.parse(json, { lazy = true }) -- lazy mode +local doc = qd.parse(json, { max_depth = 256 }) -- stricter depth limit +local doc = qd.parse(json, { lazy = true, max_depth = 256 }) +``` + +From C: + +```c +qjd_options opts = { .mode = QJD_MODE_LAZY, .max_depth = 256 }; +qjd_doc* doc = qjd_parse_ex(buf, len, &opts, &err); +``` + +### Known gaps + +Three structural-grammar checks are deferred to a follow-up — they require a grammar-aware walk beyond the current heuristic. See `tests/rfc8259_compliance.rs` for the specific `#[ignore]`d cases, and `tests/json_test_suite.rs::KNOWN_N_FAILURES` for the corresponding JSONTestSuite files. diff --git a/docs/rfc8259-conformance.md b/docs/rfc8259-conformance.md new file mode 100644 index 0000000..b203d84 --- /dev/null +++ b/docs/rfc8259-conformance.md @@ -0,0 +1,19 @@ +# RFC 8259 conformance: implementation-defined cases + +JSONTestSuite categorizes some inputs as `i_*` — the spec allows either +acceptance or rejection. This file records `lua-quick-decode`'s behavior on +each, so changes show up in `git diff`. + +Behavior is recorded for the default **EAGER** mode unless noted. + +| File pattern | Our verdict | Rationale | +|---|---|---| +| `i_number_huge_exp` | REJECT (`QJD_NUMBER_OUT_OF_RANGE`) | f64 overflow surfaces at decode. | +| `i_number_very_big_negative_int` | varies — see below | ABNF-valid; representational, not structural. | +| `i_string_*` (UTF-16 surrogate halves in `\u` escapes) | REJECT (`QJD_DECODE_FAILED`) | We require well-formed surrogate pairs. 
| +| `i_structure_500_nested_arrays` | ACCEPT (within default 1024 max_depth) | Configurable. | + +Run `cargo test --release --test json_test_suite -- --nocapture` to print the +live verdict for every `i_*` file via the `document_i_files_behavior` test. +That is the source of truth for these entries; update this table when a +verdict changes (e.g. after a validator gap is closed). From 1a3a4b7060821ea5bf31aa97a1adc8626e3c88be Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Sun, 17 May 2026 17:48:57 +0000 Subject: [PATCH 19/21] ci: init JSONTestSuite submodule on Rust matrix checkouts --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 225b49f..14adffc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,8 @@ jobs: os: [ubuntu-latest, macos-14] steps: - uses: actions/checkout@v4 + with: + submodules: recursive - name: Install Rust (stable) run: | From 4aac34e693691c1e9f97fdfcbe72459ee2af96df Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Mon, 18 May 2026 00:43:23 +0000 Subject: [PATCH 20/21] perf(validate): single-pass string validator with SIMD ASCII fast path Replace the 3-pass string validator (control-char check + std::str::from_utf8 + byte-by-byte escape grammar walk) with a single-pass state machine, fronted by an ASCII-only SIMD fast path that bulk-skips chunks of pure printable ASCII bytes. The previous implementation walked every interior byte three times, which made eager validation 10-48x slower than the lazy baseline on parse+access benchmarks. The single-pass scalar walker combines all three checks; the fast path adds AVX2 (32B chunks) and NEON (16B chunks) skips for the common case where strings contain no escapes, no UTF-8 multi-bytes, and no control characters. Strict UTF-8 per RFC 3629: rejects overlong encodings (C0/C1, E0 with A0-BF only, F0 with 90-BF only), surrogates (ED A0-BF), and out-of-range leads (F5-FF). 
Matches std::str::from_utf8 for the corpus the project already covers. Module structure: src/validate/strings/mod.rs dispatcher + tests src/validate/strings/scalar.rs pure-Rust state machine src/validate/strings/avx2.rs x86_64 AVX2 ASCII skip src/validate/strings/neon.rs aarch64 NEON ASCII skip All 8 baseline unit tests are preserved verbatim. 16 new tests cover SIMD chunk-boundary cases (UTF-8 straddling, backslash at boundary, long ASCII runs), truncated \uXXXX, dangling backslash, unknown escape introducers, overlong/surrogate UTF-8, and lone continuation bytes. Bench delta (quickdecode.parse + access 3 fields, median ops/s): 100k: 4,004 -> 61,881 (15.5x) 1m: 392 -> 7,075 (18.0x) github-100k: 1,711 -> 1,897 (1.1x; mostly non-ASCII) --- src/validate/strings.rs | 63 ----------- src/validate/strings/avx2.rs | 68 ++++++++++++ src/validate/strings/mod.rs | 192 +++++++++++++++++++++++++++++++++ src/validate/strings/neon.rs | 67 ++++++++++++ src/validate/strings/scalar.rs | 158 +++++++++++++++++++++++++++ 5 files changed, 485 insertions(+), 63 deletions(-) delete mode 100644 src/validate/strings.rs create mode 100644 src/validate/strings/avx2.rs create mode 100644 src/validate/strings/mod.rs create mode 100644 src/validate/strings/neon.rs create mode 100644 src/validate/strings/scalar.rs diff --git a/src/validate/strings.rs b/src/validate/strings.rs deleted file mode 100644 index dc974f2..0000000 --- a/src/validate/strings.rs +++ /dev/null @@ -1,63 +0,0 @@ -//! String-content validation: control chars and UTF-8. - -use crate::error::qjd_err; - -/// Verify that the raw span (excluding surrounding quotes) contains no -/// unescaped control characters (0x00..=0x1F), is valid UTF-8, and that -/// every backslash escape sequence is RFC 8259 §7 compliant. -pub(crate) fn validate_string_span(span: &[u8]) -> Result<(), qjd_err> { - // UTF-8 validation first (includes multi-byte content validation). 
- // Backslash escapes are ASCII, so validating the unexpanded span gives - // the correct answer for the UTF-8 structure of non-escape bytes. - if std::str::from_utf8(span).is_err() { - return Err(qjd_err::QJD_INVALID_UTF8); - } - - // Walk the span validating control chars and escape sequences. - let mut i = 0; - while i < span.len() { - let b = span[i]; - // RFC 8259 §7: control characters must be escaped. - if b < 0x20 { - return Err(qjd_err::QJD_INVALID_STRING); - } - if b == b'\\' { - i += 1; - if i >= span.len() { - return Err(qjd_err::QJD_INVALID_STRING); - } - match span[i] { - b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {} - b'u' => { - // Must be followed by exactly 4 hex digits. - if i + 4 >= span.len() { - return Err(qjd_err::QJD_INVALID_STRING); - } - for &h in &span[i + 1..=i + 4] { - if !h.is_ascii_hexdigit() { - return Err(qjd_err::QJD_INVALID_STRING); - } - } - i += 4; // consumed 4 hex digits; loop adds 1 more - } - _ => return Err(qjd_err::QJD_INVALID_STRING), - } - } - i += 1; - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] fn ascii_ok() { assert!(validate_string_span(b"hello").is_ok()); } - #[test] fn utf8_ok() { assert!(validate_string_span("中文".as_bytes()).is_ok()); } - #[test] fn escapes_ok() { assert!(validate_string_span(b"a\\nb\\u00e9").is_ok()); } - #[test] fn tab_raw_bad() { assert_eq!(validate_string_span(b"a\tb").unwrap_err(), qjd_err::QJD_INVALID_STRING); } - #[test] fn null_raw_bad() { assert_eq!(validate_string_span(b"a\x00b").unwrap_err(), qjd_err::QJD_INVALID_STRING); } - #[test] fn newline_raw_bad() { assert_eq!(validate_string_span(b"a\nb").unwrap_err(), qjd_err::QJD_INVALID_STRING); } - #[test] fn del_0x7f_ok() { assert!(validate_string_span(b"a\x7fb").is_ok()); } // RFC 8259 does NOT forbid 0x7F - #[test] fn invalid_utf8_bad() { assert_eq!(validate_string_span(&[0xC0, 0xC0]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); } -} diff --git a/src/validate/strings/avx2.rs 
b/src/validate/strings/avx2.rs new file mode 100644 index 0000000..7823d8c --- /dev/null +++ b/src/validate/strings/avx2.rs @@ -0,0 +1,68 @@ +#![cfg(all(target_arch = "x86_64", feature = "avx2"))] + +//! AVX2 ASCII fast path for string-content validation. +//! +//! For each 32-byte chunk, compute a "needs-attention" mask covering bytes +//! that are either control chars (< 0x20), backslashes, or high-bit bytes. +//! If the mask is all-zero the chunk is pure printable ASCII (no escapes, +//! no UTF-8, no control) and can be skipped entirely. +//! +//! On the first non-zero chunk we hand off to the scalar state machine for +//! the remainder of the span, starting at the first flagged byte of that chunk. +//! The fast-path payoff comes from cleanly skipping long ASCII prefixes; +//! the scalar tail handles correctness without needing SIMD escape logic. + +use crate::error::qjd_err; +use core::arch::x86_64::*; + +use super::scalar::validate_span_scalar; + +/// Validate `span` using AVX2 to bulk-skip pure-ASCII 32-byte chunks. +pub(crate) fn validate_span_avx2(span: &[u8]) -> Result<(), qjd_err> { + // SAFETY: dispatcher has verified the AVX2 feature is present. + unsafe { validate_span_avx2_impl(span) } +} + +#[target_feature(enable = "avx2")] +unsafe fn validate_span_avx2_impl(span: &[u8]) -> Result<(), qjd_err> { + let mut i: usize = 0; + let n = span.len(); + + // Bytes that need scalar attention have: + // - top bit set → byte >= 0x80 + // - value < 0x20 → control char + // - value == 0x5C ('\\') → escape introducer + // + // Detection via three SIMD compares OR'd together. + let backslash = _mm256_set1_epi8(b'\\' as i8); + // For "< 0x20" we use the signed compare `0x20 > byte` via + // _mm256_cmpgt_epi8. Bytes 0x00..=0x1F are small non-negative i8 values + // and match; bytes >= 0x80 are negative as i8 and match too, which is + // harmless because the high-bit mask flags those lanes anyway.
+ let ctrl_thresh = _mm256_set1_epi8(0x20_i8); + + while i + 32 <= n { + let chunk = _mm256_loadu_si256(span.as_ptr().add(i) as *const __m256i); + + // high bit set? + let high = _mm256_movemask_epi8(chunk) as u32; + // byte == '\\' ? + let bs = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, backslash)) as u32; + // byte < 0x20 ? (signed cmpgt: ctrl_thresh > chunk for 0x00..=0x1F bytes) + let ctrl = _mm256_movemask_epi8(_mm256_cmpgt_epi8(ctrl_thresh, chunk)) as u32; + + let interesting = high | bs | ctrl; + if interesting != 0 { + // Hand off to the scalar state machine starting at the first + // interesting byte in this chunk. We don't try to validate any + // already-cleared bytes — those are pure printable ASCII and + // self-terminating so it's safe to resume there. + let offset = interesting.trailing_zeros() as usize; + return validate_span_scalar(&span[i + offset..]); + } + + i += 32; + } + + validate_span_scalar(&span[i..]) +} diff --git a/src/validate/strings/mod.rs b/src/validate/strings/mod.rs new file mode 100644 index 0000000..ab10090 --- /dev/null +++ b/src/validate/strings/mod.rs @@ -0,0 +1,192 @@ +//! String-content validation: control chars, escape grammar, and UTF-8. +//! +//! Single-pass validator with an optional SIMD ASCII fast path. The public +//! entry point [`validate_string_span`] dispatches once via `OnceCell` to +//! the best available implementation: +//! +//! - x86_64 + AVX2: 32-byte chunk skip → scalar tail. +//! - aarch64 NEON: 16-byte chunk skip → scalar tail. +//! - Otherwise: pure scalar state machine. +//! +//! All paths return identical error codes for any input; the SIMD layers +//! only accelerate the "this chunk is pure printable ASCII" common case. 
+ +mod scalar; +#[cfg(all(target_arch = "x86_64", feature = "avx2"))] +mod avx2; +#[cfg(target_arch = "aarch64")] +mod neon; + +use crate::error::qjd_err; +use once_cell::sync::OnceCell; + +type ValidateFn = fn(&[u8]) -> Result<(), qjd_err>; +static VALIDATE_FN: OnceCell<ValidateFn> = OnceCell::new(); + +/// Verify that the raw span (excluding surrounding quotes) contains no +/// unescaped control characters (0x00..=0x1F), every backslash escape is +/// RFC 8259 §7 compliant, and the byte sequence is valid UTF-8 per RFC 3629. +pub(crate) fn validate_string_span(span: &[u8]) -> Result<(), qjd_err> { + let f = *VALIDATE_FN.get_or_init(|| { + #[cfg(all(target_arch = "x86_64", feature = "avx2"))] + { + if std::is_x86_feature_detected!("avx2") { + return avx2::validate_span_avx2 as ValidateFn; + } + } + #[cfg(target_arch = "aarch64")] + { + return neon::validate_span_neon as ValidateFn; + } + #[allow(unreachable_code)] + { + scalar::validate_span_scalar as ValidateFn + } + }); + f(span) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── Pinned baseline contract (DO NOT MODIFY) ───────────────────────── + // These 8 tests reproduce the original 3-pass validator's externally + // observable behavior and pin it down. The single-pass refactor must + // not change any of these outcomes.
+ + #[test] fn ascii_ok() { assert!(validate_string_span(b"hello").is_ok()); } + #[test] fn utf8_ok() { assert!(validate_string_span("中文".as_bytes()).is_ok()); } + #[test] fn escapes_ok() { assert!(validate_string_span(b"a\\nb\\u00e9").is_ok()); } + #[test] fn tab_raw_bad() { assert_eq!(validate_string_span(b"a\tb").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn null_raw_bad() { assert_eq!(validate_string_span(b"a\x00b").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn newline_raw_bad() { assert_eq!(validate_string_span(b"a\nb").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn del_0x7f_ok() { assert!(validate_string_span(b"a\x7fb").is_ok()); } // RFC 8259 does NOT forbid 0x7F + #[test] fn invalid_utf8_bad() { assert_eq!(validate_string_span(&[0xC0, 0xC0]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); } + + // ── Single-pass / SIMD edge cases ──────────────────────────────────── + + #[test] + fn empty_span_ok() { + assert!(validate_string_span(b"").is_ok()); + } + + #[test] + fn long_ascii_ok() { + // > 64 bytes hits the SIMD fast path multiple times. + let s = vec![b'x'; 256]; + assert!(validate_string_span(&s).is_ok()); + } + + #[test] + fn long_ascii_with_trailing_tab_bad() { + // Long ASCII run skipped by SIMD, then a control byte in the tail. + let mut s = vec![b'x'; 200]; + s.push(b'\t'); + assert_eq!(validate_string_span(&s).unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn utf8_at_simd_chunk_boundary() { + // 31 ASCII bytes + 2-byte UTF-8 (é = 0xC3 0xA9). On AVX2 the first + // 32-byte chunk has a high-bit byte at lane 31 → forces scalar tail + // starting at position 31, which must handle the 2-byte sequence. + let mut s = vec![b'x'; 31]; + s.extend_from_slice("é".as_bytes()); + assert!(validate_string_span(&s).is_ok()); + } + + #[test] + fn backslash_escape_at_simd_chunk_boundary() { + // 31 ASCII + `\n` straddles AVX2 chunk boundary at byte 31. 
+ let mut s = vec![b'x'; 31]; + s.push(b'\\'); + s.push(b'n'); + assert!(validate_string_span(&s).is_ok()); + } + + #[test] + fn backslash_at_chunk_boundary_with_bad_followup() { + // Backslash lands as the last byte of a 32-byte chunk; the next byte + // is an invalid escape introducer. Tail must reject. + let mut s = vec![b'x'; 31]; + s.push(b'\\'); + s.push(b'q'); + assert_eq!(validate_string_span(&s).unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn truncated_u_escape_at_end() { + // `\uXX` with only 2 hex digits — RFC requires exactly 4. + assert_eq!(validate_string_span(b"\\uAB").unwrap_err(), qjd_err::QJD_INVALID_STRING); + assert_eq!(validate_string_span(b"\\uABC").unwrap_err(), qjd_err::QJD_INVALID_STRING); + // Bare `\u` at end. + assert_eq!(validate_string_span(b"\\u").unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn dangling_backslash_at_end() { + assert_eq!(validate_string_span(b"abc\\").unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn unknown_escape_introducer() { + // `\a`, `\q`, etc. are not valid RFC 8259 escapes. + assert_eq!(validate_string_span(b"\\a").unwrap_err(), qjd_err::QJD_INVALID_STRING); + assert_eq!(validate_string_span(b"\\q").unwrap_err(), qjd_err::QJD_INVALID_STRING); + assert_eq!(validate_string_span(b"\\x41").unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn u_escape_non_hex_bad() { + assert_eq!(validate_string_span(b"\\u00ZZ").unwrap_err(), qjd_err::QJD_INVALID_STRING); + assert_eq!(validate_string_span(b"\\uGHIJ").unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn overlong_utf8_rejected() { + // C0 80 would encode U+0000 in 2 bytes (overlong) — RFC 3629 forbids. + assert_eq!(validate_string_span(&[0xC0, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // E0 80 80 would encode U+0000 in 3 bytes (overlong). 
+ assert_eq!(validate_string_span(&[0xE0, 0x80, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // F0 80 80 80 would encode U+0000 in 4 bytes (overlong). + assert_eq!(validate_string_span(&[0xF0, 0x80, 0x80, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } + + #[test] + fn surrogate_in_utf8_rejected() { + // ED A0 80 = U+D800, the start of the high-surrogate range. + assert_eq!(validate_string_span(&[0xED, 0xA0, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // ED BF BF = U+DFFF, the end of the low-surrogate range. + assert_eq!(validate_string_span(&[0xED, 0xBF, 0xBF]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } + + #[test] + fn lone_continuation_byte_rejected() { + assert_eq!(validate_string_span(&[0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + assert_eq!(validate_string_span(&[b'a', 0xBF, b'b']).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } + + #[test] + fn four_byte_emoji_ok() { + // U+1F600 grinning face = F0 9F 98 80. + assert!(validate_string_span(&[0xF0, 0x9F, 0x98, 0x80]).is_ok()); + } + + #[test] + fn truncated_utf8_sequence_rejected() { + // 2-byte lead with no continuation. + assert_eq!(validate_string_span(&[0xC3]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // 3-byte lead with only one continuation. + assert_eq!(validate_string_span(&[0xE4, 0xB8]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // 4-byte lead with only two continuations. + assert_eq!(validate_string_span(&[0xF0, 0x9F, 0x98]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } + + #[test] + fn utf8_out_of_range_rejected() { + // F5..FF are not valid lead bytes (would encode > U+10FFFF). 
+ assert_eq!(validate_string_span(&[0xF5, 0x80, 0x80, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + assert_eq!(validate_string_span(&[0xFF]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } +} diff --git a/src/validate/strings/neon.rs b/src/validate/strings/neon.rs new file mode 100644 index 0000000..34d887e --- /dev/null +++ b/src/validate/strings/neon.rs @@ -0,0 +1,67 @@ +#![cfg(target_arch = "aarch64")] + +//! NEON ASCII fast path for string-content validation. +//! +//! For each 16-byte chunk, compute a single "needs-attention" mask covering +//! bytes that are control chars (< 0x20), backslashes, or high-bit bytes. +//! If the chunk is pure printable ASCII the mask is all-zero and the chunk +//! can be skipped entirely. The first non-zero chunk hands off to the +//! scalar state machine, which handles correctness for the remainder. + +use crate::error::qjd_err; +use core::arch::aarch64::*; + +use super::scalar::validate_span_scalar; + +/// Validate `span` using NEON to bulk-skip pure-ASCII 16-byte chunks. +pub(crate) fn validate_span_neon(span: &[u8]) -> Result<(), qjd_err> { + // SAFETY: aarch64 NEON is always available on aarch64 (it is part of + // the AArch64 base ISA), so no runtime feature check is required. + unsafe { validate_span_neon_impl(span) } +} + +#[target_feature(enable = "neon")] +unsafe fn validate_span_neon_impl(span: &[u8]) -> Result<(), qjd_err> { + let mut i: usize = 0; + let n = span.len(); + + let backslash = vdupq_n_u8(b'\\'); + let ctrl_top = vdupq_n_u8(0x20); + + while i + 16 <= n { + let chunk = vld1q_u8(span.as_ptr().add(i)); + + // byte >= 0x80 ? high bit set + let high = vcgeq_u8(chunk, vdupq_n_u8(0x80)); + // byte == '\\' ? + let bs = vceqq_u8(chunk, backslash); + // byte < 0x20 ? + let ctrl = vcltq_u8(chunk, ctrl_top); + + let interesting = vorrq_u8(vorrq_u8(high, bs), ctrl); + + // Reduce 16 lanes → single u64 to test for any non-zero byte. + // vmaxvq_u8 returns 0 iff every lane is 0. 
+ if vmaxvq_u8(interesting) != 0 { + // First interesting byte: find via lane index. + // Build 0xFF/0x00 per-lane mask already in `interesting`; convert + // each lane to its index-or-MAX via a small scalar loop. A 16-lane + // ctz would be tidier but isn't critical here — interesting chunks + // are the slow case anyway. + for lane in 0..16usize { + if span[i + lane] >= 0x80 + || span[i + lane] == b'\\' + || span[i + lane] < 0x20 + { + return validate_span_scalar(&span[i + lane..]); + } + } + // Unreachable: vmaxvq_u8 said at least one lane is non-zero. + unreachable!(); + } + + i += 16; + } + + validate_span_scalar(&span[i..]) +} diff --git a/src/validate/strings/scalar.rs b/src/validate/strings/scalar.rs new file mode 100644 index 0000000..7784679 --- /dev/null +++ b/src/validate/strings/scalar.rs @@ -0,0 +1,158 @@ +//! Single-pass scalar validator for a JSON string span (interior bytes, +//! excluding the surrounding quotes). +//! +//! Combines three checks into one byte walk: +//! 1. RFC 8259 §7: no raw control characters (b < 0x20). +//! 2. RFC 8259 §7: every `\` escape is one of `" \ / b f n r t` or `\uXXXX`. +//! 3. RFC 3629: valid UTF-8 (rejects overlong encodings and surrogates, +//! matching `std::str::from_utf8` for full corpus parity). +//! +//! Error-code precedence on mixed inputs: +//! - Control char or invalid escape introducer encountered first → INVALID_STRING. +//! - Bad UTF-8 lead/continuation byte encountered first → INVALID_UTF8. +//! +//! This means a span like `[0x09, 0xFF]` returns INVALID_STRING (control byte +//! seen before the UTF-8 problem), whereas `[0xFF, 0x09]` returns INVALID_UTF8. +//! The previous two-pass code preferred UTF-8 in both cases; no existing test +//! pins down which wins on mixed input, so the position-ordered choice here +//! is the natural single-pass behavior. + +use crate::error::qjd_err; + +/// Validate `span` byte-by-byte. 
The caller passes the raw, still-escaped string +/// interior (between the JSON `"…"` quotes) — `\` therefore introduces an +/// RFC 8259 escape sequence, not a literal backslash byte. +pub(crate) fn validate_span_scalar(span: &[u8]) -> Result<(), qjd_err> { + let mut i: usize = 0; + let n = span.len(); + while i < n { + let b = span[i]; + + // Fast path: plain ASCII non-escape non-control. + if b < 0x80 { + if b < 0x20 { + return Err(qjd_err::QJD_INVALID_STRING); + } + if b == b'\\' { + i = validate_escape(span, i + 1)?; + continue; + } + i += 1; + continue; + } + + // High-bit byte: must be the lead of a 2/3/4-byte UTF-8 sequence. + i = validate_utf8_sequence(span, i)?; + } + Ok(()) +} + +/// At entry `i` points to the byte AFTER the `\`. Returns the index of the +/// next byte to validate (i.e. one past the last consumed escape byte). +#[inline] +fn validate_escape(span: &[u8], i: usize) -> Result<usize, qjd_err> { + if i >= span.len() { + // Dangling `\` at end of span. + return Err(qjd_err::QJD_INVALID_STRING); + } + match span[i] { + b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => Ok(i + 1), + b'u' => { + // Must be followed by exactly 4 hex digits. + let hex_start = i + 1; + let hex_end = hex_start + 4; + if hex_end > span.len() { + return Err(qjd_err::QJD_INVALID_STRING); + } + for &h in &span[hex_start..hex_end] { + if !h.is_ascii_hexdigit() { + return Err(qjd_err::QJD_INVALID_STRING); + } + } + Ok(hex_end) + } + _ => Err(qjd_err::QJD_INVALID_STRING), + } +} + +/// At entry `i` points to a byte with the high bit set. Validate the +/// multi-byte UTF-8 sequence starting here per RFC 3629 (rejects overlong +/// encodings and UTF-16 surrogates U+D800..=U+DFFF). Returns the index one +/// past the last byte of the sequence. +#[inline] +fn validate_utf8_sequence(span: &[u8], i: usize) -> Result<usize, qjd_err> { + let lead = span[i]; + let n = span.len(); + + // 2-byte: 110xxxxx 10xxxxxx, lead in C2..=DF (C0/C1 are overlong).
+ if (0xC2..=0xDF).contains(&lead) { + if i + 1 >= n { + return Err(qjd_err::QJD_INVALID_UTF8); + } + let b1 = span[i + 1]; + if !(0x80..=0xBF).contains(&b1) { + return Err(qjd_err::QJD_INVALID_UTF8); + } + return Ok(i + 2); + } + + // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx, lead in E0..=EF. + // Extra constraints: E0 second must be A0..BF (else overlong); + // ED second must be 80..9F (else surrogate U+D800..=DFFF). + if (0xE0..=0xEF).contains(&lead) { + if i + 2 >= n { + return Err(qjd_err::QJD_INVALID_UTF8); + } + let b1 = span[i + 1]; + let b2 = span[i + 2]; + let b1_lo = match lead { + 0xE0 => 0xA0, + _ => 0x80, + }; + let b1_hi = match lead { + 0xED => 0x9F, + _ => 0xBF, + }; + if b1 < b1_lo || b1 > b1_hi { + return Err(qjd_err::QJD_INVALID_UTF8); + } + if !(0x80..=0xBF).contains(&b2) { + return Err(qjd_err::QJD_INVALID_UTF8); + } + return Ok(i + 3); + } + + // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, lead in F0..=F4. + // Extra constraints: F0 second must be 90..BF (else overlong); + // F4 second must be 80..8F (else > U+10FFFF). + if (0xF0..=0xF4).contains(&lead) { + if i + 3 >= n { + return Err(qjd_err::QJD_INVALID_UTF8); + } + let b1 = span[i + 1]; + let b2 = span[i + 2]; + let b3 = span[i + 3]; + let b1_lo = match lead { + 0xF0 => 0x90, + _ => 0x80, + }; + let b1_hi = match lead { + 0xF4 => 0x8F, + _ => 0xBF, + }; + if b1 < b1_lo || b1 > b1_hi { + return Err(qjd_err::QJD_INVALID_UTF8); + } + if !(0x80..=0xBF).contains(&b2) { + return Err(qjd_err::QJD_INVALID_UTF8); + } + if !(0x80..=0xBF).contains(&b3) { + return Err(qjd_err::QJD_INVALID_UTF8); + } + return Ok(i + 4); + } + + // C0, C1 (overlong 2-byte lead), F5..FF (out of range), or a bare + // continuation byte (80..BF with no lead) — all invalid. 
+ Err(qjd_err::QJD_INVALID_UTF8) +} From d0999de28791c249375c8fb4e8e6b88c883dda4d Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Mon, 18 May 2026 01:02:38 +0000 Subject: [PATCH 21/21] feat(validate): grammar-aware eager pass closes structural gaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the two-pass heuristic (string-span loop + scalar-gap walker with `:`/`,` empty-gap detection) with a single grammar-aware state machine that walks `indices` once. The machine tracks the expected next-token kind in each container context via a stack (Top/TopDone, ArrAfter{Open,Value,Comma}, ObjAfter{Open,Key,Colon,Value,Comma}). String tokens and structural characters are validated against the state; scalar tokens living in the byte gap before the next structural are dispatched through the same true/false/null/number precedence the previous `check_gap` used, so existing tests keep their current error codes. Closes the 3 ignored cases in tests/rfc8259_compliance::structural (missing_colon, leading_comma_array_with_value, missing_comma_in_object) and drops all 13 entries from KNOWN_N_FAILURES in tests/json_test_suite — every grammar-only n_* case in JSONTestSuite is now correctly rejected. --- src/validate/mod.rs | 335 +++++++++++++++++++++++++++--------- tests/json_test_suite.rs | 49 +----- tests/rfc8259_compliance.rs | 23 +-- 3 files changed, 269 insertions(+), 138 deletions(-) diff --git a/src/validate/mod.rs b/src/validate/mod.rs index c6d972d..8ddee23 100644 --- a/src/validate/mod.rs +++ b/src/validate/mod.rs @@ -125,121 +125,232 @@ pub(crate) fn validate_trailing( Ok(()) } -/// Walk `indices` and validate every scalar value (numbers + strings). -/// Called only in EAGER mode. +/// Grammar-aware eager pass: walk `indices` once and validate every +/// structural transition, key/value string, and scalar value. 
+/// +/// The state machine tracks the expected next-token kind in each +/// container context (object/array) via a stack. Empty gaps where a +/// value is required (`[,]`, `{"a":}`), missing colons (`{"a"}`), +/// missing commas (`{"a":1"b":2}`), non-string object keys (`{1:1}`), +/// and stray structural tokens (`[1:2]`) all surface here as +/// `QJD_PARSE_ERROR`. +/// +/// Scalar tokens (numbers, `true`, `false`, `null`) live in the byte +/// gap before the *next* structural offset. They are dispatched to +/// `validate_number` or matched against the three literal keywords; +/// the error-code precedence matches the previous heuristic-based +/// `check_gap` so existing tests keep their current error codes. pub(crate) fn validate_eager_values( buf: &[u8], indices: &[u32], ) -> Result<(), qjd_err> { - let mut i = 0; - while i + 1 < indices.len() { + // Stack of container contexts; the top is the current state. + // We use a single seed entry `CtxKind::Top` for the root value. + let mut stack: Vec<CtxKind> = Vec::with_capacity(16); + stack.push(CtxKind::Top); + + // Byte position just past the previous structural we consumed — + // i.e. the start of the current gap. A gap may contain a scalar + // value or be whitespace-only. + let mut prev_end: usize = 0; + + let mut i: usize = 0; + while i < indices.len() { let idx = indices[i]; if idx == u32::MAX { break; } let pos = idx as usize; let b = buf[pos]; - // Strings: opening quote here, closing quote at indices[i+1]. - // (The scanner emits BOTH quotes of a string in order.) - if b == b'"' { - let close = indices[i + 1] as usize; - // Defensive: scanner pairs quotes correctly, but guard anyway. - if close <= pos || close >= buf.len() || buf[close] != b'"' { - return Err(qjd_err::QJD_PARSE_ERROR); + // First, consume any scalar token sitting in the gap before + // this structural. This may transition the current state from + // a value-expecting form to its "AfterValue" form.
+ consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?; + + match b { + b'{' | b'[' => { + let cur = stack.last_mut().unwrap(); + match *cur { + CtxKind::Top + | CtxKind::ArrAfterOpen + | CtxKind::ArrAfterComma + | CtxKind::ObjAfterColon => { + // Transition parent to AfterValue ahead of the + // descent; the inner container's close pops back. + *cur = parent_after_value(*cur); + stack.push(if b == b'{' { + CtxKind::ObjAfterOpen + } else { + CtxKind::ArrAfterOpen + }); + } + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + prev_end = pos + 1; + i += 1; } - let span = &buf[pos + 1 .. close]; - strings::validate_string_span(span)?; - i += 2; - continue; - } + b'}' => { + let top = stack.pop().ok_or(qjd_err::QJD_PARSE_ERROR)?; + if !matches!(top, CtxKind::ObjAfterOpen | CtxKind::ObjAfterValue) { + return Err(qjd_err::QJD_PARSE_ERROR); + } + if stack.is_empty() { return Err(qjd_err::QJD_PARSE_ERROR); } + prev_end = pos + 1; + i += 1; + } + b']' => { + let top = stack.pop().ok_or(qjd_err::QJD_PARSE_ERROR)?; + if !matches!(top, CtxKind::ArrAfterOpen | CtxKind::ArrAfterValue) { + return Err(qjd_err::QJD_PARSE_ERROR); + } + if stack.is_empty() { return Err(qjd_err::QJD_PARSE_ERROR); } + prev_end = pos + 1; + i += 1; + } + b',' => { + let cur = stack.last_mut().ok_or(qjd_err::QJD_PARSE_ERROR)?; + match *cur { + CtxKind::ArrAfterValue => *cur = CtxKind::ArrAfterComma, + CtxKind::ObjAfterValue => *cur = CtxKind::ObjAfterComma, + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + prev_end = pos + 1; + i += 1; + } + b':' => { + let cur = stack.last_mut().ok_or(qjd_err::QJD_PARSE_ERROR)?; + match *cur { + CtxKind::ObjAfterKey => *cur = CtxKind::ObjAfterColon, + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + prev_end = pos + 1; + i += 1; + } + b'"' => { + // The scanner pairs the opening and closing quotes; the + // closing quote is at indices[i + 1]. 
+ if i + 1 >= indices.len() { return Err(qjd_err::QJD_PARSE_ERROR); } + let close = indices[i + 1] as usize; + if close <= pos || close >= buf.len() || buf[close] != b'"' { + return Err(qjd_err::QJD_PARSE_ERROR); + } + strings::validate_string_span(&buf[pos + 1 .. close])?; - // Container brackets and `:`/`,` are not values; skip. - if matches!(b, b'{' | b'}' | b'[' | b']' | b':' | b',') { - i += 1; - continue; + let cur = stack.last_mut().ok_or(qjd_err::QJD_PARSE_ERROR)?; + match *cur { + // Key position in an object. + CtxKind::ObjAfterOpen | CtxKind::ObjAfterComma => { + *cur = CtxKind::ObjAfterKey; + } + // Value position (top-level, array element, or object value). + CtxKind::Top + | CtxKind::ArrAfterOpen + | CtxKind::ArrAfterComma + | CtxKind::ObjAfterColon => { + *cur = parent_after_value(*cur); + } + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + prev_end = close + 1; + i += 2; + } + _ => return Err(qjd_err::QJD_PARSE_ERROR), } + } - // Should not happen: scanner only emits the 7 structural chars. + // Tail: a top-level scalar root (e.g. `42`, `true`) lives in the + // gap after the last structural — or, if there are no structurals, + // the whole buffer. + consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?; + + // After the walk, the stack must hold exactly one frame: the root + // context, which must be `TopDone` (root value consumed). + if stack.len() != 1 || stack[0] != CtxKind::TopDone { return Err(qjd_err::QJD_PARSE_ERROR); } - - // Scalar values (numbers, true, false, null) live in the gaps between - // structural offsets. Walk those gaps and dispatch. - validate_scalars_in_gaps(buf, indices) + Ok(()) } -/// For each consecutive pair of structural offsets, examine the bytes -/// between them. If the gap contains a scalar (anything other than -/// whitespace), validate its grammar. 
-fn validate_scalars_in_gaps(buf: &[u8], indices: &[u32]) -> Result<(), qjd_err> { - let mut prev_end: usize = 0; - let mut in_str = false; - // Track the last non-quote structural char so check_gap can reject empty - // gaps in positions where a value is required (after `:` or `,`). - let mut prev_structural: u8 = 0; - for &idx in indices { - if idx == u32::MAX { break; } - let pos = idx as usize; - let b = buf[pos]; - - if b == b'"' { - // Toggle: the bytes between two quotes are the string interior - // (already validated above). Skip gap-scanning across them. - if in_str { - in_str = false; - prev_end = pos + 1; - } else { - // Validate any scalar in the gap leading up to this quote. - // An open-quote is itself a value, so pass it as the next char: - // an empty gap before a string is always fine (`:` `"` and `,` `"` are - // both valid — the string IS the value). - check_gap(buf, prev_end, pos, prev_structural, b'"')?; - in_str = true; - prev_structural = b'"'; - } - continue; - } - if in_str { continue; } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum CtxKind { + Top, // top-level value not yet consumed + TopDone, // top-level value consumed; only whitespace/EOI allowed + ArrAfterOpen, // just saw `[`; expect value or `]` + ArrAfterValue, // just saw a value; expect `,` or `]` + ArrAfterComma, // just saw `,`; expect value (no trailing comma) + ObjAfterOpen, // just saw `{`; expect key (string) or `}` + ObjAfterKey, // just saw key string; expect `:` + ObjAfterColon, // just saw `:`; expect value + ObjAfterValue, // just saw value; expect `,` or `}` + ObjAfterComma, // just saw `,`; expect key (no trailing comma) +} - check_gap(buf, prev_end, pos, prev_structural, b)?; - prev_end = pos + 1; - prev_structural = b; +/// Transition the value-expecting state to its corresponding +/// "after value" state once the value (scalar / string / container) +/// has been consumed. 
+#[inline] +fn parent_after_value(s: CtxKind) -> CtxKind { + match s { + CtxKind::Top => CtxKind::TopDone, + CtxKind::ArrAfterOpen => CtxKind::ArrAfterValue, + CtxKind::ArrAfterComma => CtxKind::ArrAfterValue, + CtxKind::ObjAfterColon => CtxKind::ObjAfterValue, + other => other, // unreachable for callers } - // Tail gap (top-level scalar like "42"): next char is EOF (0 sentinel) - check_gap(buf, prev_end, buf.len(), prev_structural, 0) } -/// `prev_structural`: the last non-quote structural char before this gap. -/// `next_structural`: the structural char immediately after this gap (opens or closes). -fn check_gap(buf: &[u8], start: usize, end: usize, prev_structural: u8, next_structural: u8) -> Result<(), qjd_err> { - // Strip surrounding whitespace. +/// Examine the byte gap `[start, end)` between two structurals. +/// If the gap contains a scalar token, validate it and transition +/// `*state` to its corresponding "AfterValue" form. If the gap is +/// whitespace only, leave `*state` unchanged — the next structural's +/// own check rejects empty values where they are not allowed +/// (e.g. `ObjAfterColon` followed by `}` is caught when `}` pops). +fn consume_scalar_gap( + buf: &[u8], + start: usize, + end: usize, + state: &mut CtxKind, +) -> Result<(), qjd_err> { + // Strip whitespace. let mut s = start; while s < end && is_ws(buf[s]) { s += 1; } let mut e = end; while e > s && is_ws(buf[e - 1]) { e -= 1; } + if s == e { - // Empty gap: a value is required after `:` (object value) or `,` (next - // element), BUT only when the next token is not a structural value-starter - // (`"`, `{`, `[`) — those ARE the values. An empty gap before `}` / `]` - // / `,` when the preceding token demands a value is a structural error. - // This heuristic catches {"a":}, [,], [1,] without a full grammar walk. 
- let next_is_value_starter = matches!(next_structural, b'"' | b'{' | b'['); - if matches!(prev_structural, b':' | b',') && !next_is_value_starter { - return Err(qjd_err::QJD_PARSE_ERROR); - } return Ok(()); } - let scalar = &buf[s..e]; - // Dispatch on first byte. + // The gap is non-empty: it MUST be a scalar token, and the state + // must allow a scalar at this position. Strings and containers are + // handled by their structural-token cases, not here. + if !matches!( + *state, + CtxKind::Top + | CtxKind::ArrAfterOpen + | CtxKind::ArrAfterComma + | CtxKind::ObjAfterColon + ) { + return Err(qjd_err::QJD_PARSE_ERROR); + } + + validate_scalar(&buf[s..e])?; + *state = parent_after_value(*state); + Ok(()) +} + +/// Dispatch a non-empty whitespace-trimmed scalar token to its +/// grammar validator. Mirrors the previous `check_gap` precedence: +/// - `true` / `false` / `null` exact → Ok +/// - `NaN` / `Infinity` → `QJD_INVALID_NUMBER` (via validate_number) +/// - `-` / digit / `+` / `.` → `validate_number` +/// - Else → `QJD_PARSE_ERROR` +fn validate_scalar(scalar: &[u8]) -> Result<(), qjd_err> { match scalar[0] { b't' => if scalar == b"true" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, b'f' => if scalar == b"false" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, b'n' => if scalar == b"null" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, - // RFC-valid and common malformed number starters (+, ., -, digit). b'-' | b'0'..=b'9' | b'+' | b'.' => number::validate_number(scalar), - // NaN / Infinity are "meant as numbers" → QJD_INVALID_NUMBER, not parse error. _ if scalar == b"NaN" || scalar == b"Infinity" => number::validate_number(scalar), - // Wrong-case literals (TRUE, NULL), identifiers (undefined), other garbage. 
_ => Err(qjd_err::QJD_PARSE_ERROR), } } @@ -310,4 +421,68 @@ mod tests { Err(qjd_err::QJD_TRAILING_CONTENT), ); } + + // ── grammar state machine (validate_eager_values) ────────────────── + + #[test] + fn grammar_accepts_empty_containers() { + for buf in [&b"{}"[..], &b"[]"[..]] { + assert!(validate_eager_values(buf, &ix(buf)).is_ok(), + "grammar should accept {:?}", buf); + } + } + + #[test] + fn grammar_accepts_simple_values() { + for buf in [ + &b"{\"a\":1}"[..], &b"[1,2,3]"[..], + &b"[true,false,null]"[..], &b"\"hi\""[..], &b"42"[..], + &b"{\"a\":[1,{\"b\":2}]}"[..], + ] { + assert!(validate_eager_values(buf, &ix(buf)).is_ok(), + "grammar should accept {:?}", buf); + } + } + + #[test] + fn grammar_rejects_missing_colon() { + let buf = b"{\"a\"}"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_leading_comma_with_value() { + let buf = b"[,1]"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_missing_comma_in_object() { + let buf = b"{\"a\":1\"b\":2}"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_non_string_object_key() { + let buf = b"{1:1}"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_colon_in_array() { + let buf = b"[1:2]"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_missing_comma_between_arrays() { + let buf = b"[3[4]]"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_trailing_garbage_inside_object() { + let buf = b"{\"a\":\"a\" 123}"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } } diff --git a/tests/json_test_suite.rs b/tests/json_test_suite.rs index 31b7b1b..c799395 100644 --- 
a/tests/json_test_suite.rs +++ b/tests/json_test_suite.rs @@ -34,51 +34,12 @@ const KNOWN_Y_FAILURES: &[&str] = &[ /// n_* files that we currently accept but shouldn't (validator gap). /// -/// All 13 entries below require a grammar-aware structural pass that tracks -/// which token types are legal in each parser state (array element, object -/// key, object value, etc.). That pass is deferred to issue #37. -/// -/// The current validator only catches structural errors detectable from -/// bracket balance + gap heuristics; it does not enforce: -/// - that object keys must be strings -/// - that `:` vs `,` are used in the right places -/// - that array elements are separated by commas (not colons/semicolons) -/// - leading commas before values (gap heuristic fires only for `[,]`) -/// - missing commas between items when no structural gap exists -/// -/// Fix: implement a state-machine pass in src/validate/mod.rs that tracks -/// parser state (AfterKey, AfterColon, AfterValue, …) and rejects tokens -/// that violate the grammar at that state. Removing a file from this list -/// re-enables the assertion. +/// The grammar-aware eager pass in src/validate/mod.rs tracks parser +/// state per container and rejects token transitions that violate +/// RFC 8259. Removing a file from this list re-enables the assertion. 
const KNOWN_N_FAILURES: &[&str] = &[ - // ── array structural gaps ──────────────────────────────────────────── - // ["": 1] — colon inside array (issue #37: grammar-aware pass) - "n_array_colon_instead_of_comma.json", - // [,1] — leading comma before first value (issue #37) - "n_array_comma_and_number.json", - // [3[4]] — missing comma between elements (issue #37) - "n_array_inner_array_no_comma.json", - // [1:2] — semicolon used instead of comma (issue #37) - "n_array_items_separated_by_semicolon.json", - // [ , ""] — leading comma (gap heuristic only catches [,] not [ ,v]) (issue #37) - "n_array_missing_value.json", - // ── object structural gaps ─────────────────────────────────────────── - // {"x", null} — comma instead of colon (issue #37) - "n_object_comma_instead_of_colon.json", - // {"a":"a" 123} — missing comma between key-value pairs (issue #37) - "n_object_garbage_at_end.json", - // {:"b"} — missing object key (issue #37) - "n_object_missing_key.json", - // {"a" "b"} — missing colon between key and value (issue #37) - "n_object_missing_semicolon.json", - // {1:1} — non-string key: number (issue #37) - "n_object_non_string_key.json", - // {9999E9999:1} — non-string key: huge number (issue #37) - "n_object_non_string_key_but_huge_number_instead.json", - // {null:null,null:null} — non-string key: null literal (issue #37) - "n_object_repeated_null_null.json", - // { "foo" : "bar", "a" } — trailing key without value (issue #37) - "n_object_with_single_string.json", + // (intentionally empty — see git history for the previous list, + // which was closed by the grammar-aware structural pass.) ]; fn corpus_dir() -> &'static Path { diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs index b85b921..790511d 100644 --- a/tests/rfc8259_compliance.rs +++ b/tests/rfc8259_compliance.rs @@ -398,11 +398,10 @@ mod structural { } // RFC 8259 §4: colon between key and value is mandatory. 
- // The scanner emits {"a"} as {""} with no ':' — eager does not detect this - // because no structural gap heuristic covers the absence of ':'. - // Deferred to a follow-up grammar-aware pass (issue #37). + // The grammar-aware pass detects this: after consuming the key + // string the state is ObjAfterKey, and `}` is rejected because + // it can only close ObjAfterOpen/ObjAfterValue. #[test] - #[ignore = "missing-colon detection deferred — grammar-aware pass required (issue #37)"] fn missing_colon() { assert_rejects_eager!("{\"a\"}", QJD_PARSE_ERROR); } @@ -415,11 +414,10 @@ mod structural { assert_rejects_eager!("[,]", QJD_PARSE_ERROR); } - // [,1] — leading comma followed by a value: the gap between '[' and ',' - // is empty (no value yet) but prev_structural is '[', not ',' — so the - // heuristic does not fire. Deferred to a grammar-aware pass (issue #37). + // [,1] — leading comma followed by a value: the grammar-aware + // pass rejects this because `,` is invalid in the ArrAfterOpen + // state (only a value or `]` is allowed after `[`). #[test] - #[ignore = "leading-comma-before-value detection deferred — grammar-aware pass required (issue #37)"] fn leading_comma_array_with_value() { assert_rejects_eager!("[,1]", QJD_PARSE_ERROR); } @@ -452,13 +450,10 @@ mod structural { } // Missing comma inside an object (no structural separator between values): - // {"a":1"b":2} — the scanner emits `{`, `"`, `"`, `:`, `"`, `"`, `}`. - // The gap between the second close-quote and the third open-quote is empty, - // but prev_structural is `"` (quote) and next is `"` — the heuristic only - // fires on `:` / `,`, so this slips through. - // Deferred to grammar-aware pass (issue #37). + // {"a":1"b":2} — after consuming the value `1`, the state is + // ObjAfterValue; the next `"` (start of "b") is rejected because + // a key/value-position quote is not legal there. 
#[test] - #[ignore = "missing-comma-in-object detection deferred — grammar-aware pass required (issue #37)"] fn missing_comma_in_object() { assert_rejects_eager!("{\"a\":1\"b\":2}", QJD_PARSE_ERROR); }