diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 225b49f..14adffc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,8 @@ jobs: os: [ubuntu-latest, macos-14] steps: - uses: actions/checkout@v4 + with: + submodules: recursive - name: Install Rust (stable) run: | diff --git a/.gitmodules b/.gitmodules index 2d1c2aa..8baae4a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "vendor/lua-cjson"] path = vendor/lua-cjson url = https://github.com/openresty/lua-cjson.git +[submodule "tests/vendor/JSONTestSuite"] + path = tests/vendor/JSONTestSuite + url = https://github.com/nst/JSONTestSuite diff --git a/CLAUDE.md b/CLAUDE.md index e5039a8..cec0555 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,12 +45,12 @@ cargo test --features test-panic --release ### Two-phase parse -**Phase 1** (`src/scan/`, called from `Document::parse`): a structural scanner walks the input once and writes the byte offset of every non-string-interior `{ } [ ] : , "` into `doc.indices: Vec`. A `u32::MAX` sentinel is appended. The scanner is selected at first use via `OnceCell` in `src/scan/mod.rs`: +**Phase 1** (`src/scan/`, called from `Document::parse_with_options`): a structural scanner walks the input once and writes the byte offset of every non-string-interior `{ } [ ] : , "` into `doc.indices`. Then `validate_depth` is run unconditionally; in EAGER mode, `validate_trailing` and `validate_eager_values` (number ABNF + string content + UTF-8) follow. In LAZY mode, value-level checks are skipped at parse time and deferred to the lazy decode path at field-access time. The `u32::MAX` sentinel is appended to `doc.indices` immediately after the scan, before any of these validators run. The scanner is selected at first use via `OnceCell` in `src/scan/mod.rs`: - `Avx2Scanner` (gated by the `avx2` cargo feature, default-on) when both `avx2` and `pclmulqdq` are detected at runtime. - `ScalarScanner` otherwise. -Validation is shallow — bracket/quote balance only. 
Value-level errors (bad escapes, malformed numbers, invalid UTF-8 in `\u`) are deferred to Phase 2 and surface only if that field is accessed. +Validation level depends on `qjd_options.mode`. **EAGER** (default): a post-scan pass walks `indices` and validates RFC 8259 number ABNF, string content (no unescaped control chars), and UTF-8 — parse fails on any value-level violation. **LAZY** (opt-in): bracket/quote balance + max-depth only; value-level errors surface when the offending field is accessed (lua-cjson-equivalent behavior). Trailing-content rejection and value-level validation are eager-only; max-depth (default 1024, configurable up to 4096) is enforced in both modes. **Phase 2** (`src/cursor.rs`, `src/path.rs`, `src/decode/`): path strings are parsed by a zero-alloc `PathIter` into `PathSeg::Key | Idx`. A `Cursor` (a `(idx_start, idx_end)` pair into `doc.indices`) is walked to the target, optionally caching sibling spans in `doc.skip` (`SkipCache`) so repeated lookups on the same container skip brace-counting. Strings are decoded into `doc.scratch` only when they contain escapes; otherwise the original buffer slice is handed back. diff --git a/README.md b/README.md index 9054aef..bc9d0c2 100644 --- a/README.md +++ b/README.md @@ -116,3 +116,37 @@ methodology + reproduction command. ```sh make bench # quickdecode vs cjson ``` + +## RFC 8259 conformance + +This crate implements RFC 8259 with both strict and lenient modes; the strict +(eager) mode is the default and is required by API-gateway use cases that must +reject malformed payloads before forwarding them upstream. 
+ +- Strict-mode acceptance corpus: `tests/rfc8259_compliance.rs` +- Industry corpus: `tests/json_test_suite.rs` (against the + [JSONTestSuite](https://github.com/nst/JSONTestSuite) submodule at + `tests/vendor/JSONTestSuite`) +- Behavior on implementation-defined (`i_*`) cases: `docs/rfc8259-conformance.md` + +### Switching modes + +From Lua: + +```lua +local doc = qd.parse(json) -- eager (default) +local doc = qd.parse(json, { lazy = true }) -- lazy mode +local doc = qd.parse(json, { max_depth = 256 }) -- stricter depth limit +local doc = qd.parse(json, { lazy = true, max_depth = 256 }) +``` + +From C: + +```c +qjd_options opts = { .mode = QJD_MODE_LAZY, .max_depth = 256 }; +qjd_doc* doc = qjd_parse_ex(buf, len, &opts, &err); +``` + +### Known gaps + +Three structural-grammar checks are deferred to a follow-up — they require a grammar-aware walk beyond the current heuristic. See `tests/rfc8259_compliance.rs` for the specific `#[ignore]`d cases, and `tests/json_test_suite.rs::KNOWN_N_FAILURES` for the corresponding JSONTestSuite files. diff --git a/docs/rfc8259-conformance.md b/docs/rfc8259-conformance.md new file mode 100644 index 0000000..b203d84 --- /dev/null +++ b/docs/rfc8259-conformance.md @@ -0,0 +1,19 @@ +# RFC 8259 conformance: implementation-defined cases + +JSONTestSuite categorizes some inputs as `i_*` — the spec allows either +acceptance or rejection. This file records `lua-quick-decode`'s behavior on +each, so changes show up in `git diff`. + +Behavior is recorded for the default **EAGER** mode unless noted. + +| File pattern | Our verdict | Rationale | +|---|---|---| +| `i_number_huge_exp` | REJECT (`QJD_NUMBER_OUT_OF_RANGE`) | f64 overflow surfaces at decode. | +| `i_number_very_big_negative_int` | varies — see below | ABNF-valid; representational, not structural. | +| `i_string_*` (UTF-16 surrogate halves in `\u` escapes) | REJECT (`QJD_DECODE_FAILED`) | We require well-formed surrogate pairs. 
| +| `i_structure_500_nested_arrays` | ACCEPT (within default 1024 max_depth) | Configurable. | + +Run `cargo test --release --test json_test_suite -- --nocapture` to print the +live verdict for every `i_*` file via the `document_i_files_behavior` test. +That is the source of truth for these entries; update this table when a +verdict changes (e.g. after a validator gap is closed). diff --git a/include/lua_quick_decode.h b/include/lua_quick_decode.h index e3aeab2..f920ab1 100644 --- a/include/lua_quick_decode.h +++ b/include/lua_quick_decode.h @@ -9,15 +9,21 @@ extern "C" { #endif typedef enum { - QJD_OK = 0, - QJD_PARSE_ERROR = 1, - QJD_NOT_FOUND = 2, - QJD_TYPE_MISMATCH = 3, - QJD_OUT_OF_RANGE = 4, - QJD_DECODE_FAILED = 5, - QJD_INVALID_PATH = 6, - QJD_INVALID_ARG = 7, - QJD_OOM = 8 + QJD_OK = 0, + QJD_PARSE_ERROR = 1, + QJD_NOT_FOUND = 2, + QJD_TYPE_MISMATCH = 3, + QJD_OUT_OF_RANGE = 4, + QJD_DECODE_FAILED = 5, + QJD_INVALID_PATH = 6, + QJD_INVALID_ARG = 7, + QJD_OOM = 8, + QJD_NESTING_TOO_DEEP = 9, + QJD_TRAILING_CONTENT = 10, + QJD_NUMBER_OUT_OF_RANGE = 11, + QJD_INVALID_NUMBER = 12, + QJD_INVALID_STRING = 13, + QJD_INVALID_UTF8 = 14 } qjd_err; typedef enum { @@ -25,6 +31,15 @@ typedef enum { QJD_T_STR = 3, QJD_T_ARR = 4, QJD_T_OBJ = 5 } qjd_type; +#define QJD_MODE_EAGER 0u +#define QJD_MODE_LAZY 1u +#define QJD_DEFAULT_MAX_DEPTH 1024u + +typedef struct { + uint32_t mode; /* QJD_MODE_EAGER (0) or QJD_MODE_LAZY (1) */ + uint32_t max_depth; /* 0 = use QJD_DEFAULT_MAX_DEPTH */ +} qjd_options; + typedef struct qjd_doc qjd_doc; typedef struct { @@ -38,6 +53,8 @@ typedef struct { const char* qjd_strerror(int code); qjd_doc* qjd_parse(const uint8_t* buf, size_t len, int* err_out); +qjd_doc* qjd_parse_ex(const uint8_t* buf, size_t len, + const qjd_options* opts, int* err_out); void qjd_free (qjd_doc* doc); int qjd_get_str (qjd_doc*, const char* path, size_t path_len, diff --git a/lua/quickdecode.lua b/lua/quickdecode.lua index 0851895..5ab6c5f 100644 --- 
a/lua/quickdecode.lua +++ b/lua/quickdecode.lua @@ -7,9 +7,16 @@ typedef struct { uint32_t idx_start, idx_end, _reserved0, _reserved1; } qjd_cursor; +typedef struct { + uint32_t mode; + uint32_t max_depth; +} qjd_options; + const char* qjd_strerror(int code); -qjd_doc* qjd_parse(const uint8_t* buf, size_t len, int* err_out); -void qjd_free(qjd_doc* doc); +qjd_doc* qjd_parse (const uint8_t* buf, size_t len, int* err_out); +qjd_doc* qjd_parse_ex(const uint8_t* buf, size_t len, + const qjd_options* opts, int* err_out); +void qjd_free (qjd_doc* doc); int qjd_get_str (qjd_doc*, const char* path, size_t path_len, const uint8_t** p, size_t* n); int qjd_get_i64 (qjd_doc*, const char* path, size_t path_len, int64_t* out); @@ -48,11 +55,31 @@ local strp_box = ffi.new("const uint8_t*[1]") local cur_box = ffi.new("qjd_cursor[1]") local NOT_FOUND = 2 +-- Error codes mirrored from include/lua_quick_decode.h. Kept in sync manually; +-- src/error.rs has the authoritative numbering. +local ERR = { + OK = 0, + PARSE_ERROR = 1, + NOT_FOUND = 2, + TYPE_MISMATCH = 3, + OUT_OF_RANGE = 4, + DECODE_FAILED = 5, + INVALID_PATH = 6, + INVALID_ARG = 7, + OOM = 8, + NESTING_TOO_DEEP = 9, + TRAILING_CONTENT = 10, + NUMBER_OUT_OF_RANGE = 11, + INVALID_NUMBER = 12, + INVALID_STRING = 13, + INVALID_UTF8 = 14, +} local _M = { T_NULL = 0, T_BOOL = 1, T_NUM = 2, T_STR = 3, T_ARR = 4, T_OBJ = 5, } +_M.ERR = ERR local Doc = {}; Doc.__index = Doc local Cursor = {}; Cursor.__index = Cursor @@ -63,8 +90,31 @@ local function check_err(rc) error("quickdecode: " .. 
ffi.string(C.qjd_strerror(rc))) end -function _M.parse(json_str) - local ptr = C.qjd_parse(json_str, #json_str, err_box) +local opts_box = ffi.new("qjd_options[1]") + +local MODE_EAGER = 0 +local MODE_LAZY = 1 + +function _M.parse(json_str, opts) + local ptr + if opts == nil then + ptr = C.qjd_parse(json_str, #json_str, err_box) + else + if type(opts) ~= "table" then + error("quickdecode.parse: opts must be a table") + end + local lazy = opts.lazy + if lazy ~= nil and type(lazy) ~= "boolean" then + error("quickdecode.parse: opts.lazy must be a boolean") + end + local max_depth = opts.max_depth or 0 + if type(max_depth) ~= "number" or max_depth < 0 or max_depth ~= math.floor(max_depth) then + error("quickdecode.parse: opts.max_depth must be a non-negative integer") + end + opts_box[0].mode = lazy and MODE_LAZY or MODE_EAGER + opts_box[0].max_depth = max_depth + ptr = C.qjd_parse_ex(json_str, #json_str, opts_box, err_box) + end if ptr == nil then error("quickdecode: " .. ffi.string(C.qjd_strerror(err_box[0]))) end diff --git a/src/decode/number.rs b/src/decode/number.rs index 45d2f89..1beda2d 100644 --- a/src/decode/number.rs +++ b/src/decode/number.rs @@ -1,10 +1,8 @@ use crate::error::qjd_err; pub(crate) fn parse_i64(bytes: &[u8]) -> Result { - if bytes.is_empty() { - return Err(qjd_err::QJD_DECODE_FAILED); - } - // Reject non-integer JSON numbers (with decimal point or exponent). + crate::validate::validate_number(bytes)?; + // After ABNF validation, integer-only inputs have no `.`/`e`/`E`. if bytes.iter().any(|&b| b == b'.' || b == b'e' || b == b'E') { return Err(qjd_err::QJD_TYPE_MISMATCH); } @@ -12,9 +10,7 @@ pub(crate) fn parse_i64(bytes: &[u8]) -> Result { b'-' => (true, &bytes[1..]), _ => (false, bytes), }; - if rest.is_empty() || !rest.iter().all(|c| c.is_ascii_digit()) { - return Err(qjd_err::QJD_DECODE_FAILED); - } + // ABNF guarantees `rest` is non-empty and digit-only here. 
let mut v: i64 = 0; for &c in rest { let d = (c - b'0') as i64; @@ -29,11 +25,13 @@ pub(crate) fn parse_i64(bytes: &[u8]) -> Result { } pub(crate) fn parse_f64(bytes: &[u8]) -> Result { - if bytes.is_empty() { - return Err(qjd_err::QJD_DECODE_FAILED); - } + crate::validate::validate_number(bytes)?; let s = std::str::from_utf8(bytes).map_err(|_| qjd_err::QJD_DECODE_FAILED)?; - s.parse::().map_err(|_| qjd_err::QJD_DECODE_FAILED) + match s.parse::() { + Ok(v) if v.is_finite() => Ok(v), + Ok(_) => Err(qjd_err::QJD_NUMBER_OUT_OF_RANGE), + Err(_) => Err(qjd_err::QJD_DECODE_FAILED), + } } #[cfg(test)] @@ -63,7 +61,7 @@ mod tests { #[test] fn i64_rejects_empty() { - assert_eq!(parse_i64(b""), Err(qjd_err::QJD_DECODE_FAILED)); + assert_eq!(parse_i64(b""), Err(qjd_err::QJD_INVALID_NUMBER)); } #[test] fn f64_zero() { assert_eq!(parse_f64(b"0.0").unwrap(), 0.0); } @@ -73,6 +71,6 @@ mod tests { #[test] fn f64_rejects_garbage() { - assert_eq!(parse_f64(b"hello"), Err(qjd_err::QJD_DECODE_FAILED)); + assert_eq!(parse_f64(b"hello"), Err(qjd_err::QJD_INVALID_NUMBER)); } } diff --git a/src/decode/string.rs b/src/decode/string.rs index d879ac5..8572441 100644 --- a/src/decode/string.rs +++ b/src/decode/string.rs @@ -7,6 +7,7 @@ pub(crate) fn decode_string( buf: &[u8], start: usize, end: usize, scratch: &mut Vec, ) -> Result<(*const u8, usize), qjd_err> { let slice = &buf[start..end]; + crate::validate::validate_string_span(slice)?; if memchr::memchr(b'\\', slice).is_none() { return Ok((slice.as_ptr(), slice.len())); } @@ -163,16 +164,21 @@ mod tests { #[test] fn invalid_hex_in_unicode_fails() { - assert_eq!(d(b"\\uZZZZ").unwrap_err(), qjd_err::QJD_DECODE_FAILED); + // validate_string_span (called first) catches non-hex digits as + // QJD_INVALID_STRING; the decode loop would also catch it as + // QJD_DECODE_FAILED, but we never reach it. 
+ assert_eq!(d(b"\\uZZZZ").unwrap_err(), qjd_err::QJD_INVALID_STRING); } #[test] fn unknown_escape_fails() { - assert_eq!(d(b"\\q").unwrap_err(), qjd_err::QJD_DECODE_FAILED); + // validate_string_span catches unknown escape introducers first. + assert_eq!(d(b"\\q").unwrap_err(), qjd_err::QJD_INVALID_STRING); } #[test] fn dangling_backslash_fails() { - assert_eq!(d(b"a\\").unwrap_err(), qjd_err::QJD_DECODE_FAILED); + // validate_string_span catches a trailing lone backslash first. + assert_eq!(d(b"a\\").unwrap_err(), qjd_err::QJD_INVALID_STRING); } } diff --git a/src/doc.rs b/src/doc.rs index 707bb44..d20e17f 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -12,10 +12,31 @@ pub struct Document<'a> { impl<'a> Document<'a> { pub fn parse(buf: &'a [u8]) -> Result { + Self::parse_with_options(buf, &crate::options::Options::default()) + } + + pub fn parse_with_options( + buf: &'a [u8], + opts: &crate::options::Options, + ) -> Result { + // RFC 8259 §2: "A JSON text is a serialized value." + // Empty input and whitespace-only input contain no value. + if buf.iter().all(|&b| matches!(b, b' ' | b'\t' | b'\n' | b'\r')) { + return Err(qjd_err::QJD_PARSE_ERROR); + } + + let max_depth = opts.effective_max_depth(); let mut indices = Vec::new(); crate::scan::scan(buf, &mut indices).map_err(|_| qjd_err::QJD_PARSE_ERROR)?; - // Sentinel simplifies boundary checks during Phase 2. 
indices.push(u32::MAX); + + crate::validate::validate_depth(buf, &indices, max_depth)?; + + if opts.is_eager() { + crate::validate::validate_trailing(buf, &indices)?; + crate::validate::validate_eager_values(buf, &indices)?; + } + Ok(Self { buf, indices, @@ -169,4 +190,19 @@ mod tests { fn parse_error_on_malformed() { assert!(Document::parse(b"{").is_err()); } + + #[test] + fn parse_with_options_defaults_match_parse() { + let opts = crate::options::Options::default(); + let a = Document::parse(b"{\"a\":1}").unwrap(); + let b = Document::parse_with_options(b"{\"a\":1}", &opts).unwrap(); + assert_eq!(a.indices, b.indices); + } + + #[test] + fn parse_with_lazy_skips_eager_validation() { + // Trailing content is an eager-only check; lazy must accept it. + let opts = crate::options::Options { mode: crate::options::QJD_MODE_LAZY, max_depth: 0 }; + assert!(Document::parse_with_options(b"{}garbage", &opts).is_ok()); + } } diff --git a/src/error.rs b/src/error.rs index 270ea10..72ff3e9 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,15 +3,21 @@ #[repr(C)] #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum qjd_err { - QJD_OK = 0, - QJD_PARSE_ERROR = 1, - QJD_NOT_FOUND = 2, - QJD_TYPE_MISMATCH = 3, - QJD_OUT_OF_RANGE = 4, - QJD_DECODE_FAILED = 5, - QJD_INVALID_PATH = 6, - QJD_INVALID_ARG = 7, - QJD_OOM = 8, + QJD_OK = 0, + QJD_PARSE_ERROR = 1, + QJD_NOT_FOUND = 2, + QJD_TYPE_MISMATCH = 3, + QJD_OUT_OF_RANGE = 4, + QJD_DECODE_FAILED = 5, + QJD_INVALID_PATH = 6, + QJD_INVALID_ARG = 7, + QJD_OOM = 8, + QJD_NESTING_TOO_DEEP = 9, + QJD_TRAILING_CONTENT = 10, + QJD_NUMBER_OUT_OF_RANGE = 11, + QJD_INVALID_NUMBER = 12, + QJD_INVALID_STRING = 13, + QJD_INVALID_UTF8 = 14, } #[repr(C)] @@ -27,15 +33,21 @@ pub enum qjd_type { pub fn strerror(code: qjd_err) -> &'static str { match code { - qjd_err::QJD_OK => "ok", - qjd_err::QJD_PARSE_ERROR => "JSON parse error", - qjd_err::QJD_NOT_FOUND => "path not found", - qjd_err::QJD_TYPE_MISMATCH => "type mismatch at path", - 
qjd_err::QJD_OUT_OF_RANGE => "numeric out of range", - qjd_err::QJD_DECODE_FAILED => "decode failed", - qjd_err::QJD_INVALID_PATH => "invalid path syntax", - qjd_err::QJD_INVALID_ARG => "invalid argument", - qjd_err::QJD_OOM => "out of memory", + qjd_err::QJD_OK => "ok", + qjd_err::QJD_PARSE_ERROR => "JSON parse error", + qjd_err::QJD_NOT_FOUND => "path not found", + qjd_err::QJD_TYPE_MISMATCH => "type mismatch at path", + qjd_err::QJD_OUT_OF_RANGE => "numeric out of range", + qjd_err::QJD_DECODE_FAILED => "decode failed", + qjd_err::QJD_INVALID_PATH => "invalid path syntax", + qjd_err::QJD_INVALID_ARG => "invalid argument", + qjd_err::QJD_OOM => "out of memory", + qjd_err::QJD_NESTING_TOO_DEEP => "nesting depth exceeds limit", + qjd_err::QJD_TRAILING_CONTENT => "trailing content after root value", + qjd_err::QJD_NUMBER_OUT_OF_RANGE => "number out of representable range", + qjd_err::QJD_INVALID_NUMBER => "invalid number format (RFC 8259)", + qjd_err::QJD_INVALID_STRING => "invalid string content (unescaped control char)", + qjd_err::QJD_INVALID_UTF8 => "invalid UTF-8 in string", } } @@ -50,6 +62,9 @@ mod tests { qjd_err::QJD_TYPE_MISMATCH, qjd_err::QJD_OUT_OF_RANGE, qjd_err::QJD_DECODE_FAILED, qjd_err::QJD_INVALID_PATH, qjd_err::QJD_INVALID_ARG, qjd_err::QJD_OOM, + qjd_err::QJD_NESTING_TOO_DEEP, qjd_err::QJD_TRAILING_CONTENT, + qjd_err::QJD_NUMBER_OUT_OF_RANGE, qjd_err::QJD_INVALID_NUMBER, + qjd_err::QJD_INVALID_STRING, qjd_err::QJD_INVALID_UTF8, ] { assert!(!strerror(code).is_empty()); } diff --git a/src/ffi.rs b/src/ffi.rs index b5110b1..4cecaef 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -55,16 +55,22 @@ pub struct qjd_doc(pub(crate) Document<'static>); pub unsafe extern "C" fn qjd_strerror(code: c_int) -> *const c_char { // Hardcoded NUL-terminated map; avoids runtime allocation and lifetime issues. 
let s: &'static [u8] = match code { - 0 => b"ok\0", - 1 => b"JSON parse error\0", - 2 => b"path not found\0", - 3 => b"type mismatch at path\0", - 4 => b"numeric out of range\0", - 5 => b"decode failed\0", - 6 => b"invalid path syntax\0", - 7 => b"invalid argument\0", - 8 => b"out of memory\0", - _ => b"unknown error code\0", + 0 => b"ok\0", + 1 => b"JSON parse error\0", + 2 => b"path not found\0", + 3 => b"type mismatch at path\0", + 4 => b"numeric out of range\0", + 5 => b"decode failed\0", + 6 => b"invalid path syntax\0", + 7 => b"invalid argument\0", + 8 => b"out of memory\0", + 9 => b"nesting depth exceeds limit\0", + 10 => b"trailing content after root value\0", + 11 => b"number out of representable range\0", + 12 => b"invalid number format (RFC 8259)\0", + 13 => b"invalid string content (unescaped control char)\0", + 14 => b"invalid UTF-8 in string\0", + _ => b"unknown error code\0", }; s.as_ptr() as *const c_char } @@ -75,8 +81,8 @@ pub unsafe extern "C" fn qjd_strerror(code: c_int) -> *const c_char { /// /// - `buf` must point to `len` readable bytes, or be NULL (in which case the /// function returns NULL with `*err_out = QJD_INVALID_ARG`). -/// - `err_out` must point to a writable `int`, or be NULL (in which case the -/// function returns NULL with no error code written). +/// - `err_out` may be NULL. When non-NULL it receives `QJD_OK` on success or +/// an error code on failure. /// - The buffer must remain valid and unmodified for the lifetime of the /// returned `qjd_doc*`; the document borrows it. /// - On success, the returned pointer must be freed exactly once with @@ -86,20 +92,44 @@ pub unsafe extern "C" fn qjd_parse( buf: *const u8, len: usize, err_out: *mut c_int, +) -> *mut qjd_doc { + let default = crate::options::Options::default(); + qjd_parse_ex(buf, len, &default as *const _, err_out) +} + +/// Parse with caller-supplied options. `opts` may be NULL to mean defaults +/// (eager mode, default max_depth). 
+/// +/// # Safety +/// +/// Same as `qjd_parse`, with the additional contract that `opts`, when +/// non-NULL, points to a readable `qjd_options` for the duration of the call +/// (the struct is copied internally). +#[no_mangle] +pub unsafe extern "C" fn qjd_parse_ex( + buf: *const u8, + len: usize, + opts: *const crate::options::Options, + err_out: *mut c_int, ) -> *mut qjd_doc { let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - if buf.is_null() || err_out.is_null() { + if buf.is_null() { if !err_out.is_null() { *err_out = qjd_err::QJD_INVALID_ARG as c_int; } return ptr::null_mut(); } + let opts_owned = if opts.is_null() { + crate::options::Options::default() + } else { + *opts + }; let slice: &'static [u8] = std::slice::from_raw_parts(buf, len); - match Document::parse(slice) { + match Document::parse_with_options(slice, &opts_owned) { Ok(d) => { - *err_out = qjd_err::QJD_OK as c_int; + if !err_out.is_null() { *err_out = qjd_err::QJD_OK as c_int; } Box::into_raw(Box::new(qjd_doc(d))) } Err(e) => { - *err_out = e as c_int; + if !err_out.is_null() { *err_out = e as c_int; } ptr::null_mut() } } diff --git a/src/lib.rs b/src/lib.rs index 83f161b..87f5c6a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,14 @@ //! lua-quick-decode: Rust JSON decoder for LuaJIT FFI consumers. pub mod error; +pub mod options; pub(crate) mod scan; mod skip_cache; -mod doc; +pub mod doc; mod path; mod cursor; mod decode; +mod validate; pub mod ffi; #[doc(hidden)] diff --git a/src/options.rs b/src/options.rs new file mode 100644 index 0000000..3c1241c --- /dev/null +++ b/src/options.rs @@ -0,0 +1,55 @@ +#![allow(non_camel_case_types)] + +pub const QJD_MODE_EAGER: u32 = 0; +pub const QJD_MODE_LAZY: u32 = 1; +pub const QJD_DEFAULT_MAX_DEPTH: u32 = 1024; +pub const QJD_MAX_MAX_DEPTH: u32 = 4096; + +/// Caller-visible parse options. Layout is FFI-stable: kept in sync with +/// `qjd_options` in `include/lua_quick_decode.h`. 
+#[repr(C)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct Options { + /// `QJD_MODE_EAGER` (0) — full RFC 8259 validation during parse. + /// `QJD_MODE_LAZY` (1) — structural-only; defer value errors to access. + pub mode: u32, + /// Max bracket nesting depth. `0` selects `QJD_DEFAULT_MAX_DEPTH` (1024). + /// Values >`QJD_MAX_MAX_DEPTH` are clamped to that ceiling. + pub max_depth: u32, +} + +impl Default for Options { + fn default() -> Self { + Self { mode: QJD_MODE_EAGER, max_depth: 0 } + } +} + +#[allow(dead_code)] // used in Task 6+ validators +impl Options { + pub(crate) fn effective_max_depth(&self) -> u32 { + let d = if self.max_depth == 0 { QJD_DEFAULT_MAX_DEPTH } else { self.max_depth }; + d.min(QJD_MAX_MAX_DEPTH) + } + + pub(crate) fn is_eager(&self) -> bool { + self.mode == QJD_MODE_EAGER + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] fn default_is_eager() { assert!(Options::default().is_eager()); } + + #[test] + fn zero_max_depth_falls_back_to_default() { + assert_eq!(Options::default().effective_max_depth(), QJD_DEFAULT_MAX_DEPTH); + } + + #[test] + fn huge_max_depth_is_clamped() { + let o = Options { mode: 0, max_depth: u32::MAX }; + assert_eq!(o.effective_max_depth(), QJD_MAX_MAX_DEPTH); + } +} diff --git a/src/validate/mod.rs b/src/validate/mod.rs new file mode 100644 index 0000000..8ddee23 --- /dev/null +++ b/src/validate/mod.rs @@ -0,0 +1,488 @@ +//! Post-scan validators invoked by Document::parse_with_options. +//! +//! Walking the already-emitted `indices` array is intentionally +//! decoupled from the SIMD/scalar scanner paths so the structural +//! scanner code stays untouched. + +pub(crate) mod number; +pub(crate) use number::validate_number; + +pub(crate) mod strings; +pub(crate) use strings::validate_string_span; + +use crate::error::qjd_err; + +/// Verify that the maximum bracket-stack depth implied by `indices` +/// does not exceed `max_depth`. 
Walks indices once; assumes scan() has +/// already validated bracket pairing. +/// +/// `indices` is the post-scan vector with the trailing u32::MAX sentinel. +pub(crate) fn validate_depth( + buf: &[u8], + indices: &[u32], + max_depth: u32, +) -> Result<(), qjd_err> { + let mut depth: u32 = 0; + for &idx in indices { + if idx == u32::MAX { break; } + match buf[idx as usize] { + b'{' | b'[' => { + depth += 1; + if depth > max_depth { + return Err(qjd_err::QJD_NESTING_TOO_DEEP); + } + } + b'}' | b']' => { + // Cannot underflow: scan() already validated pairing. + depth -= 1; + } + _ => {} + } + } + Ok(()) +} + +/// Verify there is no non-whitespace content after the root value. +/// +/// For container roots (`{`/`[`), we walk `indices` to find the closing +/// bracket where nesting depth returns to zero — that is the actual root +/// end, regardless of how many additional structural chars the buffer has. +/// For scalar roots (no opening bracket), we scan the raw bytes. +pub(crate) fn validate_trailing( + buf: &[u8], + indices: &[u32], +) -> Result<(), qjd_err> { + // Find the first real structural character to determine root kind. + let first = indices.iter().find(|&&i| i != u32::MAX).copied(); + + let root_end = match first { + None => { + // No structural chars: bare scalar (number/true/false/null). + let mut p = 0; + while p < buf.len() && is_ws(buf[p]) { p += 1; } + let start = p; + while p < buf.len() && !is_ws(buf[p]) { p += 1; } + if start == p { return Ok(()); } // whitespace-only (scan already rejected empty) + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + Some(first_idx) => { + match buf[first_idx as usize] { + b'{' | b'[' => { + // Walk indices to find the closing bracket at depth 0. + let mut depth: i32 = 0; + let mut closer: usize = first_idx as usize; + // Track whether we're inside a string (skip string interiors). 
+ let mut in_str = false; + for &idx in indices { + if idx == u32::MAX { break; } + let pos = idx as usize; + match buf[pos] { + b'"' => { in_str = !in_str; } + _ if in_str => {} + b'{' | b'[' => { depth += 1; } + b'}' | b']' => { + depth -= 1; + if depth == 0 { closer = pos; break; } + } + _ => {} + } + } + let mut p = closer + 1; + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + b'"' => { + // Root is a string: opening quote at first_idx. + // The closing quote is the next structural char. + let close = indices.iter() + .skip(1) // skip the opening quote + .find(|&&i| i != u32::MAX) + .copied() + .unwrap_or(first_idx); // unclosed: scan already rejected + let mut p = close as usize + 1; + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + _ => { + // Structural char that's not an opener: scan/eager already + // would have caught a malformed root. Treat last structural as end. + let last = indices.iter().rev() + .find(|&&i| i != u32::MAX) + .copied() + .unwrap_or(first_idx); + let mut p = last as usize + 1; + while p < buf.len() && is_ws(buf[p]) { p += 1; } + p + } + } + } + }; + + if root_end < buf.len() { + return Err(qjd_err::QJD_TRAILING_CONTENT); + } + Ok(()) +} + +/// Grammar-aware eager pass: walk `indices` once and validate every +/// structural transition, key/value string, and scalar value. +/// +/// The state machine tracks the expected next-token kind in each +/// container context (object/array) via a stack. Empty gaps where a +/// value is required (`[,]`, `{"a":}`), missing colons (`{"a"}`), +/// missing commas (`{"a":1"b":2}`), non-string object keys (`{1:1}`), +/// and stray structural tokens (`[1:2]`) all surface here as +/// `QJD_PARSE_ERROR`. +/// +/// Scalar tokens (numbers, `true`, `false`, `null`) live in the byte +/// gap before the *next* structural offset. 
They are dispatched to +/// `validate_number` or matched against the three literal keywords; +/// the error-code precedence matches the previous heuristic-based +/// `check_gap` so existing tests keep their current error codes. +pub(crate) fn validate_eager_values( + buf: &[u8], + indices: &[u32], +) -> Result<(), qjd_err> { + // Stack of container contexts; the top is the current state. + // We use a single seed entry `CtxKind::Top` for the root value. + let mut stack: Vec = Vec::with_capacity(16); + stack.push(CtxKind::Top); + + // Byte position just past the previous structural we consumed — + // i.e. the start of the current gap. A gap may contain a scalar + // value or be whitespace-only. + let mut prev_end: usize = 0; + + let mut i: usize = 0; + while i < indices.len() { + let idx = indices[i]; + if idx == u32::MAX { break; } + let pos = idx as usize; + let b = buf[pos]; + + // First, consume any scalar token sitting in the gap before + // this structural. This may transition the current state from + // a value-expecting form to its "AfterValue" form. + consume_scalar_gap(buf, prev_end, pos, stack.last_mut().unwrap())?; + + match b { + b'{' | b'[' => { + let cur = stack.last_mut().unwrap(); + match *cur { + CtxKind::Top + | CtxKind::ArrAfterOpen + | CtxKind::ArrAfterComma + | CtxKind::ObjAfterColon => { + // Transition parent to AfterValue ahead of the + // descent; the inner container's close pops back. 
+ *cur = parent_after_value(*cur); + stack.push(if b == b'{' { + CtxKind::ObjAfterOpen + } else { + CtxKind::ArrAfterOpen + }); + } + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + prev_end = pos + 1; + i += 1; + } + b'}' => { + let top = stack.pop().ok_or(qjd_err::QJD_PARSE_ERROR)?; + if !matches!(top, CtxKind::ObjAfterOpen | CtxKind::ObjAfterValue) { + return Err(qjd_err::QJD_PARSE_ERROR); + } + if stack.is_empty() { return Err(qjd_err::QJD_PARSE_ERROR); } + prev_end = pos + 1; + i += 1; + } + b']' => { + let top = stack.pop().ok_or(qjd_err::QJD_PARSE_ERROR)?; + if !matches!(top, CtxKind::ArrAfterOpen | CtxKind::ArrAfterValue) { + return Err(qjd_err::QJD_PARSE_ERROR); + } + if stack.is_empty() { return Err(qjd_err::QJD_PARSE_ERROR); } + prev_end = pos + 1; + i += 1; + } + b',' => { + let cur = stack.last_mut().ok_or(qjd_err::QJD_PARSE_ERROR)?; + match *cur { + CtxKind::ArrAfterValue => *cur = CtxKind::ArrAfterComma, + CtxKind::ObjAfterValue => *cur = CtxKind::ObjAfterComma, + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + prev_end = pos + 1; + i += 1; + } + b':' => { + let cur = stack.last_mut().ok_or(qjd_err::QJD_PARSE_ERROR)?; + match *cur { + CtxKind::ObjAfterKey => *cur = CtxKind::ObjAfterColon, + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + prev_end = pos + 1; + i += 1; + } + b'"' => { + // The scanner pairs the opening and closing quotes; the + // closing quote is at indices[i + 1]. + if i + 1 >= indices.len() { return Err(qjd_err::QJD_PARSE_ERROR); } + let close = indices[i + 1] as usize; + if close <= pos || close >= buf.len() || buf[close] != b'"' { + return Err(qjd_err::QJD_PARSE_ERROR); + } + strings::validate_string_span(&buf[pos + 1 .. close])?; + + let cur = stack.last_mut().ok_or(qjd_err::QJD_PARSE_ERROR)?; + match *cur { + // Key position in an object. + CtxKind::ObjAfterOpen | CtxKind::ObjAfterComma => { + *cur = CtxKind::ObjAfterKey; + } + // Value position (top-level, array element, or object value). 
+ CtxKind::Top + | CtxKind::ArrAfterOpen + | CtxKind::ArrAfterComma + | CtxKind::ObjAfterColon => { + *cur = parent_after_value(*cur); + } + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + prev_end = close + 1; + i += 2; + } + _ => return Err(qjd_err::QJD_PARSE_ERROR), + } + } + + // Tail: a top-level scalar root (e.g. `42`, `true`) lives in the + // gap after the last structural — or, if there are no structurals, + // the whole buffer. + consume_scalar_gap(buf, prev_end, buf.len(), stack.last_mut().unwrap())?; + + // After the walk, the stack must hold exactly one frame: the root + // context, which must be `TopDone` (root value consumed). + if stack.len() != 1 || stack[0] != CtxKind::TopDone { + return Err(qjd_err::QJD_PARSE_ERROR); + } + Ok(()) +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum CtxKind { + Top, // top-level value not yet consumed + TopDone, // top-level value consumed; only whitespace/EOI allowed + ArrAfterOpen, // just saw `[`; expect value or `]` + ArrAfterValue, // just saw a value; expect `,` or `]` + ArrAfterComma, // just saw `,`; expect value (no trailing comma) + ObjAfterOpen, // just saw `{`; expect key (string) or `}` + ObjAfterKey, // just saw key string; expect `:` + ObjAfterColon, // just saw `:`; expect value + ObjAfterValue, // just saw value; expect `,` or `}` + ObjAfterComma, // just saw `,`; expect key (no trailing comma) +} + +/// Transition the value-expecting state to its corresponding +/// "after value" state once the value (scalar / string / container) +/// has been consumed. +#[inline] +fn parent_after_value(s: CtxKind) -> CtxKind { + match s { + CtxKind::Top => CtxKind::TopDone, + CtxKind::ArrAfterOpen => CtxKind::ArrAfterValue, + CtxKind::ArrAfterComma => CtxKind::ArrAfterValue, + CtxKind::ObjAfterColon => CtxKind::ObjAfterValue, + other => other, // unreachable for callers + } +} + +/// Examine the byte gap `[start, end)` between two structurals. 
+/// If the gap contains a scalar token, validate it and transition +/// `*state` to its corresponding "AfterValue" form. If the gap is +/// whitespace only, leave `*state` unchanged — the next structural's +/// own check rejects empty values where they are not allowed +/// (e.g. `ObjAfterColon` followed by `}` is caught when `}` pops). +fn consume_scalar_gap( + buf: &[u8], + start: usize, + end: usize, + state: &mut CtxKind, +) -> Result<(), qjd_err> { + // Strip whitespace. + let mut s = start; + while s < end && is_ws(buf[s]) { s += 1; } + let mut e = end; + while e > s && is_ws(buf[e - 1]) { e -= 1; } + + if s == e { + return Ok(()); + } + + // The gap is non-empty: it MUST be a scalar token, and the state + // must allow a scalar at this position. Strings and containers are + // handled by their structural-token cases, not here. + if !matches!( + *state, + CtxKind::Top + | CtxKind::ArrAfterOpen + | CtxKind::ArrAfterComma + | CtxKind::ObjAfterColon + ) { + return Err(qjd_err::QJD_PARSE_ERROR); + } + + validate_scalar(&buf[s..e])?; + *state = parent_after_value(*state); + Ok(()) +} + +/// Dispatch a non-empty whitespace-trimmed scalar token to its +/// grammar validator. Mirrors the previous `check_gap` precedence: +/// - `true` / `false` / `null` exact → Ok +/// - `NaN` / `Infinity` → `QJD_INVALID_NUMBER` (via validate_number) +/// - `-` / digit / `+` / `.` → `validate_number` +/// - Else → `QJD_PARSE_ERROR` +fn validate_scalar(scalar: &[u8]) -> Result<(), qjd_err> { + match scalar[0] { + b't' => if scalar == b"true" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, + b'f' => if scalar == b"false" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, + b'n' => if scalar == b"null" { Ok(()) } else { Err(qjd_err::QJD_PARSE_ERROR) }, + b'-' | b'0'..=b'9' | b'+' | b'.' 
=> number::validate_number(scalar), + _ if scalar == b"NaN" || scalar == b"Infinity" => number::validate_number(scalar), + _ => Err(qjd_err::QJD_PARSE_ERROR), + } +} + +#[inline(always)] +fn is_ws(b: u8) -> bool { + matches!(b, b' ' | b'\t' | b'\n' | b'\r') +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ix(buf: &[u8]) -> Vec { + let mut v = Vec::new(); + crate::scan::scan(buf, &mut v).unwrap(); + v.push(u32::MAX); + v + } + + #[test] + fn under_limit_ok() { + let buf = b"[[1]]"; + assert!(validate_depth(buf, &ix(buf), 2).is_ok()); + } + + #[test] + fn over_limit_rejected() { + let buf = b"[[[1]]]"; + assert_eq!( + validate_depth(buf, &ix(buf), 2), + Err(qjd_err::QJD_NESTING_TOO_DEEP), + ); + } + + #[test] + fn trailing_clean_container() { + let buf = b"{}"; + assert!(validate_trailing(buf, &ix(buf)).is_ok()); + } + + #[test] + fn trailing_whitespace_accepted() { + let buf = b"{} \n\t"; + assert!(validate_trailing(buf, &ix(buf)).is_ok()); + } + + #[test] + fn trailing_garbage_rejected() { + let buf = b"{}garbage"; + assert_eq!( + validate_trailing(buf, &ix(buf)), + Err(qjd_err::QJD_TRAILING_CONTENT), + ); + } + + #[test] + fn bare_scalar_trailing_ws_accepted() { + let buf = b"42 \n\t"; + assert!(validate_trailing(buf, &ix(buf)).is_ok()); + } + + #[test] + fn two_root_scalars_rejected() { + let buf = b"1 2"; + assert_eq!( + validate_trailing(buf, &ix(buf)), + Err(qjd_err::QJD_TRAILING_CONTENT), + ); + } + + // ── grammar state machine (validate_eager_values) ────────────────── + + #[test] + fn grammar_accepts_empty_containers() { + for buf in [&b"{}"[..], &b"[]"[..]] { + assert!(validate_eager_values(buf, &ix(buf)).is_ok(), + "grammar should accept {:?}", buf); + } + } + + #[test] + fn grammar_accepts_simple_values() { + for buf in [ + &b"{\"a\":1}"[..], &b"[1,2,3]"[..], + &b"[true,false,null]"[..], &b"\"hi\""[..], &b"42"[..], + &b"{\"a\":[1,{\"b\":2}]}"[..], + ] { + assert!(validate_eager_values(buf, &ix(buf)).is_ok(), + "grammar should accept {:?}", buf); + 
} + } + + #[test] + fn grammar_rejects_missing_colon() { + let buf = b"{\"a\"}"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_leading_comma_with_value() { + let buf = b"[,1]"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_missing_comma_in_object() { + let buf = b"{\"a\":1\"b\":2}"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_non_string_object_key() { + let buf = b"{1:1}"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_colon_in_array() { + let buf = b"[1:2]"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_missing_comma_between_arrays() { + let buf = b"[3[4]]"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } + + #[test] + fn grammar_rejects_trailing_garbage_inside_object() { + let buf = b"{\"a\":\"a\" 123}"; + assert_eq!(validate_eager_values(buf, &ix(buf)), Err(qjd_err::QJD_PARSE_ERROR)); + } +} diff --git a/src/validate/number.rs b/src/validate/number.rs new file mode 100644 index 0000000..c212bdb --- /dev/null +++ b/src/validate/number.rs @@ -0,0 +1,91 @@ +//! Strict RFC 8259 §6 number-format validation. + +use crate::error::qjd_err; + +/// Returns Ok if `bytes` matches the JSON `number` grammar exactly. +/// Otherwise returns `QJD_INVALID_NUMBER`. +/// +/// Out-of-range (i.e. f64 overflow) is NOT detected here; the f64 decode +/// step surfaces it as `QJD_NUMBER_OUT_OF_RANGE`. 
+pub(crate) fn validate_number(bytes: &[u8]) -> Result<(), qjd_err> { + let mut i = 0; + + // optional minus + if bytes.first() == Some(&b'-') { i += 1; } + + // int: "0" | (digit1-9 *digit) + match bytes.get(i) { + Some(&b'0') => { i += 1; } + Some(&(b'1'..=b'9')) => { + i += 1; + while let Some(&c) = bytes.get(i) { + if !c.is_ascii_digit() { break; } + i += 1; + } + } + _ => return Err(qjd_err::QJD_INVALID_NUMBER), + } + + // optional frac: "." 1*digit + if bytes.get(i) == Some(&b'.') { + i += 1; + let frac_start = i; + while let Some(&c) = bytes.get(i) { + if !c.is_ascii_digit() { break; } + i += 1; + } + if i == frac_start { return Err(qjd_err::QJD_INVALID_NUMBER); } + } + + // optional exp: ("e"|"E") ["+"|"-"] 1*digit + if matches!(bytes.get(i), Some(&b'e') | Some(&b'E')) { + i += 1; + if matches!(bytes.get(i), Some(&b'+') | Some(&b'-')) { i += 1; } + let exp_start = i; + while let Some(&c) = bytes.get(i) { + if !c.is_ascii_digit() { break; } + i += 1; + } + if i == exp_start { return Err(qjd_err::QJD_INVALID_NUMBER); } + } + + if i != bytes.len() { return Err(qjd_err::QJD_INVALID_NUMBER); } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ok(s: &[u8]) { assert!(validate_number(s).is_ok(), "{:?}", std::str::from_utf8(s)); } + fn bad(s: &[u8]) { assert!(validate_number(s).is_err(), "{:?}", std::str::from_utf8(s)); } + + #[test] fn zero_ok() { ok(b"0"); } + #[test] fn neg_zero_ok() { ok(b"-0"); } + #[test] fn int_ok() { ok(b"123"); } + #[test] fn neg_int_ok() { ok(b"-456"); } + #[test] fn frac_ok() { ok(b"3.14"); } + #[test] fn neg_frac_ok() { ok(b"-2.718"); } + #[test] fn exp_lower_ok() { ok(b"1e10"); } + #[test] fn exp_upper_ok() { ok(b"1E10"); } + #[test] fn exp_plus_ok() { ok(b"1e+10"); } + #[test] fn exp_minus_ok() { ok(b"1e-10"); } + #[test] fn frac_exp_ok() { ok(b"1.5e2"); } + #[test] fn i64_max_str_ok() { ok(b"9223372036854775807"); } + + #[test] fn leading_plus_bad() { bad(b"+1"); } + #[test] fn leading_zero_bad() { bad(b"01"); } + 
#[test] fn leading_zeros_bad() { bad(b"00"); } + #[test] fn bare_dot_bad() { bad(b".5"); } + #[test] fn trailing_dot_bad() { bad(b"1."); } + #[test] fn missing_frac_digits_bad() { bad(b"1.e5"); } + #[test] fn hex_bad() { bad(b"0x1F"); } + #[test] fn incomplete_exp_bad() { bad(b"1e"); } + #[test] fn incomplete_exp_sign_bad() { bad(b"1e+"); } + #[test] fn nan_bad() { bad(b"NaN"); } + #[test] fn inf_bad() { bad(b"Infinity"); } + #[test] fn neg_inf_bad() { bad(b"-Infinity"); } + #[test] fn empty_bad() { bad(b""); } + #[test] fn lone_minus_bad() { bad(b"-"); } + #[test] fn double_dot_bad() { bad(b"1..2"); } +} diff --git a/src/validate/strings/avx2.rs b/src/validate/strings/avx2.rs new file mode 100644 index 0000000..7823d8c --- /dev/null +++ b/src/validate/strings/avx2.rs @@ -0,0 +1,68 @@ +#![cfg(all(target_arch = "x86_64", feature = "avx2"))] + +//! AVX2 ASCII fast path for string-content validation. +//! +//! For each 32-byte chunk, compute a "needs-attention" mask covering bytes +//! that are either control chars (< 0x20), backslashes, or high-bit bytes. +//! If the mask is all-zero the chunk is pure printable ASCII (no escapes, +//! no UTF-8, no control) and can be skipped entirely. +//! +//! On the first non-zero chunk we hand off to the scalar state machine for +//! the remainder of the span — we don't try to bit-scan inside the chunk. +//! The fast-path payoff comes from cleanly skipping long ASCII prefixes; +//! the scalar tail handles correctness without needing SIMD escape logic. + +use crate::error::qjd_err; +use core::arch::x86_64::*; + +use super::scalar::validate_span_scalar; + +/// Validate `span` using AVX2 to bulk-skip pure-ASCII 32-byte chunks. +pub(crate) fn validate_span_avx2(span: &[u8]) -> Result<(), qjd_err> { + // SAFETY: dispatcher has verified the AVX2 feature is present. 
+ unsafe { validate_span_avx2_impl(span) } +} + +#[target_feature(enable = "avx2")] +unsafe fn validate_span_avx2_impl(span: &[u8]) -> Result<(), qjd_err> { + let mut i: usize = 0; + let n = span.len(); + + // ASCII bytes that need scalar attention have: + // - top bit set → byte >= 0x80 + // - value < 0x20 → control char + // - value == 0x5C ('\\') → escape introducer + // + // Detection via three SIMD compares OR'd together. + let backslash = _mm256_set1_epi8(b'\\' as i8); + // For "< 0x20" we use a signed unsigned trick: compare against 0x1F via + // unsigned MAX. _mm256_cmpgt_epi8 is signed, but bytes <0x20 are also + // <0x20 as signed positive values, so signed cmpgt works here for the + // 0x00..=0x1F range (none of which has the high bit set). + let ctrl_thresh = _mm256_set1_epi8(0x20_i8); + + while i + 32 <= n { + let chunk = _mm256_loadu_si256(span.as_ptr().add(i) as *const __m256i); + + // high bit set? + let high = _mm256_movemask_epi8(chunk) as u32; + // byte == '\\' ? + let bs = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, backslash)) as u32; + // byte < 0x20 ? (signed cmpgt: ctrl_thresh > chunk for 0x00..=0x1F bytes) + let ctrl = _mm256_movemask_epi8(_mm256_cmpgt_epi8(ctrl_thresh, chunk)) as u32; + + let interesting = high | bs | ctrl; + if interesting != 0 { + // Hand off to the scalar state machine starting at the first + // interesting byte in this chunk. We don't try to validate any + // already-cleared bytes — those are pure printable ASCII and + // self-terminating so it's safe to resume there. + let offset = interesting.trailing_zeros() as usize; + return validate_span_scalar(&span[i + offset..]); + } + + i += 32; + } + + validate_span_scalar(&span[i..]) +} diff --git a/src/validate/strings/mod.rs b/src/validate/strings/mod.rs new file mode 100644 index 0000000..ab10090 --- /dev/null +++ b/src/validate/strings/mod.rs @@ -0,0 +1,192 @@ +//! String-content validation: control chars, escape grammar, and UTF-8. +//! +//! 
Single-pass validator with an optional SIMD ASCII fast path. The public +//! entry point [`validate_string_span`] dispatches once via `OnceCell` to +//! the best available implementation: +//! +//! - x86_64 + AVX2: 32-byte chunk skip → scalar tail. +//! - aarch64 NEON: 16-byte chunk skip → scalar tail. +//! - Otherwise: pure scalar state machine. +//! +//! All paths return identical error codes for any input; the SIMD layers +//! only accelerate the "this chunk is pure printable ASCII" common case. + +mod scalar; +#[cfg(all(target_arch = "x86_64", feature = "avx2"))] +mod avx2; +#[cfg(target_arch = "aarch64")] +mod neon; + +use crate::error::qjd_err; +use once_cell::sync::OnceCell; + +type ValidateFn = fn(&[u8]) -> Result<(), qjd_err>; +static VALIDATE_FN: OnceCell = OnceCell::new(); + +/// Verify that the raw span (excluding surrounding quotes) contains no +/// unescaped control characters (0x00..=0x1F), every backslash escape is +/// RFC 8259 §7 compliant, and the byte sequence is valid UTF-8 per RFC 3629. +pub(crate) fn validate_string_span(span: &[u8]) -> Result<(), qjd_err> { + let f = *VALIDATE_FN.get_or_init(|| { + #[cfg(all(target_arch = "x86_64", feature = "avx2"))] + { + if std::is_x86_feature_detected!("avx2") { + return avx2::validate_span_avx2 as ValidateFn; + } + } + #[cfg(target_arch = "aarch64")] + { + return neon::validate_span_neon as ValidateFn; + } + #[allow(unreachable_code)] + { + scalar::validate_span_scalar as ValidateFn + } + }); + f(span) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── Pinned baseline contract (DO NOT MODIFY) ───────────────────────── + // These 8 tests reproduce the original 3-pass validator's externally + // observable behavior and pin it down. The single-pass refactor must + // not change any of these outcomes. 
+ + #[test] fn ascii_ok() { assert!(validate_string_span(b"hello").is_ok()); } + #[test] fn utf8_ok() { assert!(validate_string_span("中文".as_bytes()).is_ok()); } + #[test] fn escapes_ok() { assert!(validate_string_span(b"a\\nb\\u00e9").is_ok()); } + #[test] fn tab_raw_bad() { assert_eq!(validate_string_span(b"a\tb").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn null_raw_bad() { assert_eq!(validate_string_span(b"a\x00b").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn newline_raw_bad() { assert_eq!(validate_string_span(b"a\nb").unwrap_err(), qjd_err::QJD_INVALID_STRING); } + #[test] fn del_0x7f_ok() { assert!(validate_string_span(b"a\x7fb").is_ok()); } // RFC 8259 does NOT forbid 0x7F + #[test] fn invalid_utf8_bad() { assert_eq!(validate_string_span(&[0xC0, 0xC0]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); } + + // ── Single-pass / SIMD edge cases ──────────────────────────────────── + + #[test] + fn empty_span_ok() { + assert!(validate_string_span(b"").is_ok()); + } + + #[test] + fn long_ascii_ok() { + // > 64 bytes hits the SIMD fast path multiple times. + let s = vec![b'x'; 256]; + assert!(validate_string_span(&s).is_ok()); + } + + #[test] + fn long_ascii_with_trailing_tab_bad() { + // Long ASCII run skipped by SIMD, then a control byte in the tail. + let mut s = vec![b'x'; 200]; + s.push(b'\t'); + assert_eq!(validate_string_span(&s).unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn utf8_at_simd_chunk_boundary() { + // 31 ASCII bytes + 2-byte UTF-8 (é = 0xC3 0xA9). On AVX2 the first + // 32-byte chunk has a high-bit byte at lane 31 → forces scalar tail + // starting at position 31, which must handle the 2-byte sequence. + let mut s = vec![b'x'; 31]; + s.extend_from_slice("é".as_bytes()); + assert!(validate_string_span(&s).is_ok()); + } + + #[test] + fn backslash_escape_at_simd_chunk_boundary() { + // 31 ASCII + `\n` straddles AVX2 chunk boundary at byte 31. 
+ let mut s = vec![b'x'; 31]; + s.push(b'\\'); + s.push(b'n'); + assert!(validate_string_span(&s).is_ok()); + } + + #[test] + fn backslash_at_chunk_boundary_with_bad_followup() { + // Backslash lands as the last byte of a 32-byte chunk; the next byte + // is an invalid escape introducer. Tail must reject. + let mut s = vec![b'x'; 31]; + s.push(b'\\'); + s.push(b'q'); + assert_eq!(validate_string_span(&s).unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn truncated_u_escape_at_end() { + // `\uXX` with only 2 hex digits — RFC requires exactly 4. + assert_eq!(validate_string_span(b"\\uAB").unwrap_err(), qjd_err::QJD_INVALID_STRING); + assert_eq!(validate_string_span(b"\\uABC").unwrap_err(), qjd_err::QJD_INVALID_STRING); + // Bare `\u` at end. + assert_eq!(validate_string_span(b"\\u").unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn dangling_backslash_at_end() { + assert_eq!(validate_string_span(b"abc\\").unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn unknown_escape_introducer() { + // `\a`, `\q`, etc. are not valid RFC 8259 escapes. + assert_eq!(validate_string_span(b"\\a").unwrap_err(), qjd_err::QJD_INVALID_STRING); + assert_eq!(validate_string_span(b"\\q").unwrap_err(), qjd_err::QJD_INVALID_STRING); + assert_eq!(validate_string_span(b"\\x41").unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn u_escape_non_hex_bad() { + assert_eq!(validate_string_span(b"\\u00ZZ").unwrap_err(), qjd_err::QJD_INVALID_STRING); + assert_eq!(validate_string_span(b"\\uGHIJ").unwrap_err(), qjd_err::QJD_INVALID_STRING); + } + + #[test] + fn overlong_utf8_rejected() { + // C0 80 would encode U+0000 in 2 bytes (overlong) — RFC 3629 forbids. + assert_eq!(validate_string_span(&[0xC0, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // E0 80 80 would encode U+0000 in 3 bytes (overlong). 
+ assert_eq!(validate_string_span(&[0xE0, 0x80, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // F0 80 80 80 would encode U+0000 in 4 bytes (overlong). + assert_eq!(validate_string_span(&[0xF0, 0x80, 0x80, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } + + #[test] + fn surrogate_in_utf8_rejected() { + // ED A0 80 = U+D800, the start of the high-surrogate range. + assert_eq!(validate_string_span(&[0xED, 0xA0, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // ED BF BF = U+DFFF, the end of the low-surrogate range. + assert_eq!(validate_string_span(&[0xED, 0xBF, 0xBF]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } + + #[test] + fn lone_continuation_byte_rejected() { + assert_eq!(validate_string_span(&[0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + assert_eq!(validate_string_span(&[b'a', 0xBF, b'b']).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } + + #[test] + fn four_byte_emoji_ok() { + // U+1F600 grinning face = F0 9F 98 80. + assert!(validate_string_span(&[0xF0, 0x9F, 0x98, 0x80]).is_ok()); + } + + #[test] + fn truncated_utf8_sequence_rejected() { + // 2-byte lead with no continuation. + assert_eq!(validate_string_span(&[0xC3]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // 3-byte lead with only one continuation. + assert_eq!(validate_string_span(&[0xE4, 0xB8]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + // 4-byte lead with only two continuations. + assert_eq!(validate_string_span(&[0xF0, 0x9F, 0x98]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } + + #[test] + fn utf8_out_of_range_rejected() { + // F5..FF are not valid lead bytes (would encode > U+10FFFF). 
+ assert_eq!(validate_string_span(&[0xF5, 0x80, 0x80, 0x80]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + assert_eq!(validate_string_span(&[0xFF]).unwrap_err(), qjd_err::QJD_INVALID_UTF8); + } +} diff --git a/src/validate/strings/neon.rs b/src/validate/strings/neon.rs new file mode 100644 index 0000000..34d887e --- /dev/null +++ b/src/validate/strings/neon.rs @@ -0,0 +1,67 @@ +#![cfg(target_arch = "aarch64")] + +//! NEON ASCII fast path for string-content validation. +//! +//! For each 16-byte chunk, compute a single "needs-attention" mask covering +//! bytes that are control chars (< 0x20), backslashes, or high-bit bytes. +//! If the chunk is pure printable ASCII the mask is all-zero and the chunk +//! can be skipped entirely. The first non-zero chunk hands off to the +//! scalar state machine, which handles correctness for the remainder. + +use crate::error::qjd_err; +use core::arch::aarch64::*; + +use super::scalar::validate_span_scalar; + +/// Validate `span` using NEON to bulk-skip pure-ASCII 16-byte chunks. +pub(crate) fn validate_span_neon(span: &[u8]) -> Result<(), qjd_err> { + // SAFETY: aarch64 NEON is always available on aarch64 (it is part of + // the AArch64 base ISA), so no runtime feature check is required. + unsafe { validate_span_neon_impl(span) } +} + +#[target_feature(enable = "neon")] +unsafe fn validate_span_neon_impl(span: &[u8]) -> Result<(), qjd_err> { + let mut i: usize = 0; + let n = span.len(); + + let backslash = vdupq_n_u8(b'\\'); + let ctrl_top = vdupq_n_u8(0x20); + + while i + 16 <= n { + let chunk = vld1q_u8(span.as_ptr().add(i)); + + // byte >= 0x80 ? high bit set + let high = vcgeq_u8(chunk, vdupq_n_u8(0x80)); + // byte == '\\' ? + let bs = vceqq_u8(chunk, backslash); + // byte < 0x20 ? + let ctrl = vcltq_u8(chunk, ctrl_top); + + let interesting = vorrq_u8(vorrq_u8(high, bs), ctrl); + + // Reduce 16 lanes → single u64 to test for any non-zero byte. + // vmaxvq_u8 returns 0 iff every lane is 0. 
+ if vmaxvq_u8(interesting) != 0 { + // First interesting byte: find via lane index. + // Build 0xFF/0x00 per-lane mask already in `interesting`; convert + // each lane to its index-or-MAX via a small scalar loop. A 16-lane + // ctz would be tidier but isn't critical here — interesting chunks + // are the slow case anyway. + for lane in 0..16usize { + if span[i + lane] >= 0x80 + || span[i + lane] == b'\\' + || span[i + lane] < 0x20 + { + return validate_span_scalar(&span[i + lane..]); + } + } + // Unreachable: vmaxvq_u8 said at least one lane is non-zero. + unreachable!(); + } + + i += 16; + } + + validate_span_scalar(&span[i..]) +} diff --git a/src/validate/strings/scalar.rs b/src/validate/strings/scalar.rs new file mode 100644 index 0000000..7784679 --- /dev/null +++ b/src/validate/strings/scalar.rs @@ -0,0 +1,158 @@ +//! Single-pass scalar validator for a JSON string span (interior bytes, +//! excluding the surrounding quotes). +//! +//! Combines three checks into one byte walk: +//! 1. RFC 8259 §7: no raw control characters (b < 0x20). +//! 2. RFC 8259 §7: every `\` escape is one of `" \ / b f n r t` or `\uXXXX`. +//! 3. RFC 3629: valid UTF-8 (rejects overlong encodings and surrogates, +//! matching `std::str::from_utf8` for full corpus parity). +//! +//! Error-code precedence on mixed inputs: +//! - Control char or invalid escape introducer encountered first → INVALID_STRING. +//! - Bad UTF-8 lead/continuation byte encountered first → INVALID_UTF8. +//! +//! This means a span like `[0x09, 0xFF]` returns INVALID_STRING (control byte +//! seen before the UTF-8 problem), whereas `[0xFF, 0x09]` returns INVALID_UTF8. +//! The previous two-pass code preferred UTF-8 in both cases; no existing test +//! pins down which wins on mixed input, so the position-ordered choice here +//! is the natural single-pass behavior. + +use crate::error::qjd_err; + +/// Validate `span` byte-by-byte. 
The caller passes the unescaped string +/// interior (between the JSON `"…"` quotes) — `\` therefore introduces an +/// RFC 8259 escape sequence, not a literal backslash byte. +pub(crate) fn validate_span_scalar(span: &[u8]) -> Result<(), qjd_err> { + let mut i: usize = 0; + let n = span.len(); + while i < n { + let b = span[i]; + + // Fast path: plain ASCII non-escape non-control. + if b < 0x80 { + if b < 0x20 { + return Err(qjd_err::QJD_INVALID_STRING); + } + if b == b'\\' { + i = validate_escape(span, i + 1)?; + continue; + } + i += 1; + continue; + } + + // High-bit byte: must be the lead of a 2/3/4-byte UTF-8 sequence. + i = validate_utf8_sequence(span, i)?; + } + Ok(()) +} + +/// At entry `i` points to the byte AFTER the `\`. Returns the index of the +/// next byte to validate (i.e. one past the last consumed escape byte). +#[inline] +fn validate_escape(span: &[u8], i: usize) -> Result { + if i >= span.len() { + // Dangling `\` at end of span. + return Err(qjd_err::QJD_INVALID_STRING); + } + match span[i] { + b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => Ok(i + 1), + b'u' => { + // Must be followed by exactly 4 hex digits. + let hex_start = i + 1; + let hex_end = hex_start + 4; + if hex_end > span.len() { + return Err(qjd_err::QJD_INVALID_STRING); + } + for &h in &span[hex_start..hex_end] { + if !h.is_ascii_hexdigit() { + return Err(qjd_err::QJD_INVALID_STRING); + } + } + Ok(hex_end) + } + _ => Err(qjd_err::QJD_INVALID_STRING), + } +} + +/// At entry `i` points to a byte with the high bit set. Validate the +/// multi-byte UTF-8 sequence starting here per RFC 3629 (rejects overlong +/// encodings and UTF-16 surrogates U+D800..=U+DFFF). Returns the index one +/// past the last byte of the sequence. +#[inline] +fn validate_utf8_sequence(span: &[u8], i: usize) -> Result { + let lead = span[i]; + let n = span.len(); + + // 2-byte: 110xxxxx 10xxxxxx, lead in C2..=DF (C0/C1 are overlong). 
+ if (0xC2..=0xDF).contains(&lead) { + if i + 1 >= n { + return Err(qjd_err::QJD_INVALID_UTF8); + } + let b1 = span[i + 1]; + if !(0x80..=0xBF).contains(&b1) { + return Err(qjd_err::QJD_INVALID_UTF8); + } + return Ok(i + 2); + } + + // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx, lead in E0..=EF. + // Extra constraints: E0 second must be A0..BF (else overlong); + // ED second must be 80..9F (else surrogate U+D800..=DFFF). + if (0xE0..=0xEF).contains(&lead) { + if i + 2 >= n { + return Err(qjd_err::QJD_INVALID_UTF8); + } + let b1 = span[i + 1]; + let b2 = span[i + 2]; + let b1_lo = match lead { + 0xE0 => 0xA0, + _ => 0x80, + }; + let b1_hi = match lead { + 0xED => 0x9F, + _ => 0xBF, + }; + if b1 < b1_lo || b1 > b1_hi { + return Err(qjd_err::QJD_INVALID_UTF8); + } + if !(0x80..=0xBF).contains(&b2) { + return Err(qjd_err::QJD_INVALID_UTF8); + } + return Ok(i + 3); + } + + // 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, lead in F0..=F4. + // Extra constraints: F0 second must be 90..BF (else overlong); + // F4 second must be 80..8F (else > U+10FFFF). + if (0xF0..=0xF4).contains(&lead) { + if i + 3 >= n { + return Err(qjd_err::QJD_INVALID_UTF8); + } + let b1 = span[i + 1]; + let b2 = span[i + 2]; + let b3 = span[i + 3]; + let b1_lo = match lead { + 0xF0 => 0x90, + _ => 0x80, + }; + let b1_hi = match lead { + 0xF4 => 0x8F, + _ => 0xBF, + }; + if b1 < b1_lo || b1 > b1_hi { + return Err(qjd_err::QJD_INVALID_UTF8); + } + if !(0x80..=0xBF).contains(&b2) { + return Err(qjd_err::QJD_INVALID_UTF8); + } + if !(0x80..=0xBF).contains(&b3) { + return Err(qjd_err::QJD_INVALID_UTF8); + } + return Ok(i + 4); + } + + // C0, C1 (overlong 2-byte lead), F5..FF (out of range), or a bare + // continuation byte (80..BF with no lead) — all invalid. + Err(qjd_err::QJD_INVALID_UTF8) +} diff --git a/tests/ffi_options_smoke.rs b/tests/ffi_options_smoke.rs new file mode 100644 index 0000000..83d942d --- /dev/null +++ b/tests/ffi_options_smoke.rs @@ -0,0 +1,44 @@ +//! 
Smoke test for qjd_parse_ex and qjd_options C ABI. + +use std::os::raw::c_int; + +use quickdecode::ffi::{qjd_doc, qjd_free, qjd_parse, qjd_parse_ex}; +use quickdecode::options::Options; + +#[test] +fn parse_ex_default_options_matches_parse() { + let buf = b"{\"a\":1}"; + let mut err: c_int = -1; + let d1: *mut qjd_doc = unsafe { qjd_parse(buf.as_ptr(), buf.len(), &mut err) }; + assert!(!d1.is_null()); + assert_eq!(err, 0); + + let opts = Options { mode: 0, max_depth: 0 }; + let mut err2: c_int = -1; + let d2: *mut qjd_doc = unsafe { qjd_parse_ex(buf.as_ptr(), buf.len(), &opts, &mut err2) }; + assert!(!d2.is_null()); + assert_eq!(err2, 0); + + unsafe { qjd_free(d1); qjd_free(d2); } +} + +#[test] +fn parse_ex_null_opts_uses_defaults() { + let buf = b"{}"; + let mut err: c_int = -1; + let d: *mut qjd_doc = unsafe { + qjd_parse_ex(buf.as_ptr(), buf.len(), std::ptr::null(), &mut err) + }; + assert!(!d.is_null()); + assert_eq!(err, 0); + unsafe { qjd_free(d) }; +} + +#[test] +fn parse_ex_null_err_returns_null_on_bad_buf() { + let opts = Options { mode: 0, max_depth: 0 }; + let d: *mut qjd_doc = unsafe { + qjd_parse_ex(std::ptr::null(), 0, &opts, std::ptr::null_mut()) + }; + assert!(d.is_null()); +} diff --git a/tests/json_test_suite.rs b/tests/json_test_suite.rs new file mode 100644 index 0000000..c799395 --- /dev/null +++ b/tests/json_test_suite.rs @@ -0,0 +1,161 @@ +//! Walker over the JSONTestSuite corpus (submodule at tests/vendor/JSONTestSuite). +//! +//! - `y_*` files: must parse in both EAGER and LAZY modes. +//! - `n_*` files: must fail to parse in EAGER mode. +//! In LAZY mode the file MAY parse (structural-only) but a value-level +//! access of the malformed field would fail; we do not assert against +//! LAZY here. +//! - `i_*` files: implementation-defined; we record our behavior (no +//! assertions). The list of accepted/rejected i_* cases is printed at +//! the end of the test run for documentation. +//! +//! # Known failures +//! +//! 
Files listed in KNOWN_Y_FAILURES / KNOWN_N_FAILURES are skipped with a +//! logged explanation. Removing a file from these lists re-enables the test. +//! +//! KNOWN_Y_FAILURES: y_* files we don't handle correctly yet. +//! Each entry documents why; follow-up issues are referenced in comments. +//! +//! KNOWN_N_FAILURES: n_* files our eager validator passes when it shouldn't. +//! These correspond to grammar-aware gaps deferred to issue #37. + +use std::fs; +use std::path::Path; + +use quickdecode::doc::Document; +use quickdecode::options::{Options, QJD_MODE_EAGER, QJD_MODE_LAZY}; + +/// y_* files that we currently reject but shouldn't. +/// Each is annotated with why and what follow-up would fix it. +const KNOWN_Y_FAILURES: &[&str] = &[ + // "y_string_utf8.json" — example placeholder (none currently needed) +]; + +/// n_* files that we currently accept but shouldn't (validator gap). +/// +/// The grammar-aware eager pass in src/validate/mod.rs tracks parser +/// state per container and rejects token transitions that violate +/// RFC 8259. Removing a file from this list re-enables the assertion. +const KNOWN_N_FAILURES: &[&str] = &[ + // (intentionally empty — see git history for the previous list, + // which was closed by the grammar-aware structural pass.) 
+]; + +fn corpus_dir() -> &'static Path { + Path::new(env!("CARGO_MANIFEST_DIR")) +} + +fn parsing_dir() -> std::path::PathBuf { + corpus_dir().join("tests/vendor/JSONTestSuite/test_parsing") +} + +fn iter_files(prefix: &str) -> Vec { + let dir = parsing_dir(); + let entries = fs::read_dir(&dir) + .unwrap_or_else(|e| panic!( + "missing JSONTestSuite submodule at {:?}: {} — run: git submodule update --init", + dir, e + )); + let mut paths: Vec<_> = entries + .filter_map(|r| r.ok()) + .map(|e| e.path()) + .filter(|p| { + p.extension().and_then(|s| s.to_str()) == Some("json") + && p.file_name() + .and_then(|s| s.to_str()) + .map(|n| n.starts_with(prefix)) + .unwrap_or(false) + }) + .collect(); + paths.sort(); + paths +} + +fn is_known_y_failure(path: &std::path::Path) -> bool { + let name = path.file_name().and_then(|s| s.to_str()).unwrap_or(""); + KNOWN_Y_FAILURES.contains(&name) +} + +fn is_known_n_failure(path: &std::path::Path) -> bool { + let name = path.file_name().and_then(|s| s.to_str()).unwrap_or(""); + KNOWN_N_FAILURES.contains(&name) +} + +#[test] +fn y_files_accepted_in_both_modes() { + let eager = Options { mode: QJD_MODE_EAGER, max_depth: 0 }; + let lazy = Options { mode: QJD_MODE_LAZY, max_depth: 0 }; + let mut failures = Vec::new(); + let mut skipped = 0usize; + + for path in iter_files("y_") { + if is_known_y_failure(&path) { + eprintln!("SKIP (known-y-failure): {:?}", path.file_name().unwrap()); + skipped += 1; + continue; + } + let data = fs::read(&path).unwrap(); + let r_e = Document::parse_with_options(&data, &eager); + let r_l = Document::parse_with_options(&data, &lazy); + if r_e.is_err() || r_l.is_err() { + failures.push(( + path.file_name().unwrap().to_owned(), + format!("eager={:?} lazy={:?}", r_e.err(), r_l.err()), + )); + } + } + + if skipped > 0 { + eprintln!("y_* skipped (known failures): {}", skipped); + } + if !failures.is_empty() { + for (n, e) in &failures { + eprintln!("UNEXPECTED REJECT: {:?} → {}", n, e); + } + panic!("{} y_* 
file(s) unexpectedly rejected", failures.len()); + } +} + +#[test] +fn n_files_rejected_in_eager_mode() { + let eager = Options { mode: QJD_MODE_EAGER, max_depth: 0 }; + let mut accepted = Vec::new(); + let mut skipped = 0usize; + + for path in iter_files("n_") { + if is_known_n_failure(&path) { + eprintln!("SKIP (known-n-failure): {:?}", path.file_name().unwrap()); + skipped += 1; + continue; + } + let data = fs::read(&path).unwrap(); + if Document::parse_with_options(&data, &eager).is_ok() { + accepted.push(path.file_name().unwrap().to_owned()); + } + } + + if skipped > 0 { + eprintln!("n_* skipped (known failures): {}", skipped); + } + if !accepted.is_empty() { + for n in &accepted { + eprintln!("UNEXPECTED ACCEPT: {:?}", n); + } + panic!("{} n_* file(s) unexpectedly accepted", accepted.len()); + } +} + +#[test] +fn document_i_files_behavior() { + // Implementation-defined cases — document what we do, do not assert. + let eager = Options { mode: QJD_MODE_EAGER, max_depth: 0 }; + for path in iter_files("i_") { + let data = fs::read(&path).unwrap(); + let verdict = match Document::parse_with_options(&data, &eager) { + Ok(_) => "ACCEPT".to_owned(), + Err(e) => format!("REJECT({:?})", e), + }; + eprintln!("i_* {:?} → {}", path.file_name().unwrap(), verdict); + } +} diff --git a/tests/lua/options_spec.lua b/tests/lua/options_spec.lua new file mode 100644 index 0000000..c689d2a --- /dev/null +++ b/tests/lua/options_spec.lua @@ -0,0 +1,36 @@ +local qd = require "quickdecode" + +describe("parse with options", function() + it("accepts no second arg (default eager)", function() + assert.is_not_nil(qd.parse('{"a":1}')) + end) + + it("accepts an empty opts table", function() + assert.is_not_nil(qd.parse('{"a":1}', {})) + end) + + it("accepts lazy=true and tolerates trailing content", function() + -- Trailing content is eager-only; lazy must parse OK. 
+ assert.is_not_nil(qd.parse('{}garbage', { lazy = true })) + end) + + it("accepts max_depth", function() + assert.is_not_nil(qd.parse('[[[1]]]', { max_depth = 1024 })) + end) + + it("rejects invalid mode key value", function() + assert.has_error(function() + qd.parse('{}', { lazy = "yes please" }) + end) + end) + + it("accepts lazy=true and max_depth combined", function() + assert.is_not_nil(qd.parse('[[1]]', { lazy = true, max_depth = 256 })) + end) + + it("rejects fractional max_depth", function() + assert.has_error(function() + qd.parse('{}', { max_depth = 1.5 }) + end) + end) +end) diff --git a/tests/rfc8259_compliance.rs b/tests/rfc8259_compliance.rs new file mode 100644 index 0000000..790511d --- /dev/null +++ b/tests/rfc8259_compliance.rs @@ -0,0 +1,714 @@ +//! RFC 8259 conformance suite. +//! +//! Cross-mode contract: +//! * `y_*` inputs MUST parse successfully in both EAGER and LAZY modes, +//! and any specified field-level access MUST return the expected value. +//! * `n_*` inputs MUST fail to parse in EAGER mode, and MUST either +//! fail to parse OR fail on the documented field access in LAZY mode. +//! * `i_*` inputs document our current behavior; we assert what we do +//! today (so regressions surface), referencing JSONTestSuite naming. +//! +//! RFC 8259 references are in section-paragraph form, e.g. RFC8259 §6 for +//! the number grammar. + +use quickdecode::doc::Document; +use quickdecode::options::{Options, QJD_MODE_EAGER, QJD_MODE_LAZY}; + +fn eager() -> Options { Options { mode: QJD_MODE_EAGER, max_depth: 0 } } +fn lazy() -> Options { Options { mode: QJD_MODE_LAZY, max_depth: 0 } } + +/// Asserts the input is accepted in both modes. +/// +/// Usage: `assert_accepts!("[]");` +#[macro_export] +macro_rules! 
assert_accepts { + ($input:expr) => {{ + let buf: &[u8] = $input.as_ref(); + let r_eager = Document::parse_with_options(buf, &eager()); + assert!(r_eager.is_ok(), + "EAGER unexpectedly rejected {:?}: {:?}", $input, r_eager.err()); + let r_lazy = Document::parse_with_options(buf, &lazy()); + assert!(r_lazy.is_ok(), + "LAZY unexpectedly rejected {:?}: {:?}", $input, r_lazy.err()); + }}; +} + +/// Asserts the input is REJECTED by eager parse. +/// +/// Usage: `assert_rejects_eager!("01", QJD_INVALID_NUMBER);` +#[macro_export] +macro_rules! assert_rejects_eager { + ($input:expr, $expected_err:ident) => {{ + use quickdecode::error::qjd_err; + let buf: &[u8] = $input.as_ref(); + let expected = qjd_err::$expected_err; + match Document::parse_with_options(buf, &eager()) { + Err(e) if e == expected => {} + Err(other) => panic!( + "EAGER rejected {:?} with {:?}, expected {:?}", + $input, other, expected), + Ok(_) => panic!("EAGER unexpectedly accepted {:?}", $input), + } + }}; +} + +/// Asserts the input is rejected at parse time in BOTH modes (structural). +#[macro_export] +macro_rules! assert_rejects_both { + ($input:expr) => {{ + let buf: &[u8] = $input.as_ref(); + assert!(Document::parse_with_options(buf, &eager()).is_err(), + "EAGER unexpectedly accepted {:?}", $input); + assert!(Document::parse_with_options(buf, &lazy()).is_err(), + "LAZY unexpectedly accepted {:?}", $input); + }}; +} + +// ───────────────────────────────────────────────────────────── +// Scaffold smoke tests — replaced by Task 11 with full corpus. 
+// ───────────────────────────────────────────────────────────── + +#[test] +fn smoke_accepts_empty_object() { assert_accepts!("{}"); } + +#[test] +fn smoke_accepts_empty_array() { assert_accepts!("[]"); } + +#[test] +fn smoke_rejects_unmatched_brace_both_modes() { + assert_rejects_both!("{"); +} + +#[test] +#[should_panic(expected = "expected QJD_INVALID_NUMBER")] +fn macro_rejects_wrong_error_code() { + // Sanity: passing the wrong expected variant must panic. + // `{` is rejected as QJD_PARSE_ERROR, NOT QJD_INVALID_NUMBER. + // With the buggy macro, this test would NOT panic (false positive + // — the macro would silently bind whatever Err came back). + assert_rejects_eager!("{", QJD_INVALID_NUMBER); +} + +// ── Phase 3: nesting depth ─────────────────────────────────── + +#[test] +fn rejects_deeply_nested_at_default_limit() { + use quickdecode::error::qjd_err; + let mut buf = String::new(); + for _ in 0..1100 { buf.push('['); } + for _ in 0..1100 { buf.push(']'); } + match Document::parse_with_options(buf.as_bytes(), &eager()) { + Err(qjd_err::QJD_NESTING_TOO_DEEP) => {} + other => panic!("expected QJD_NESTING_TOO_DEEP, got {:?}", other.err()), + } +} + +#[test] +fn lazy_mode_also_enforces_max_depth() { + use quickdecode::error::qjd_err; + let mut buf = String::new(); + for _ in 0..1100 { buf.push('['); } + for _ in 0..1100 { buf.push(']'); } + assert_eq!( + Document::parse_with_options(buf.as_bytes(), &lazy()).err().unwrap(), + qjd_err::QJD_NESTING_TOO_DEEP, + ); +} + +#[test] +fn accepts_nested_at_configured_limit() { + let mut buf = String::new(); + for _ in 0..256 { buf.push('['); } + for _ in 0..256 { buf.push(']'); } + let opts = Options { mode: QJD_MODE_EAGER, max_depth: 256 }; + assert!(Document::parse_with_options(buf.as_bytes(), &opts).is_ok()); +} + +#[test] +fn rejects_when_one_past_configured_limit() { + let mut buf = String::new(); + for _ in 0..33 { buf.push('['); } + for _ in 0..33 { buf.push(']'); } + let opts = Options { mode: QJD_MODE_EAGER, 
max_depth: 32 }; + assert!(Document::parse_with_options(buf.as_bytes(), &opts).is_err()); +} + +// ── Phase 6: trailing content ──────────────────────────────── + +#[test] +fn eager_rejects_trailing_content() { + use quickdecode::error::qjd_err; + assert_eq!( + Document::parse_with_options(b"{}garbage", &eager()).err().unwrap(), + qjd_err::QJD_TRAILING_CONTENT, + ); +} + +#[test] +fn eager_rejects_multiple_root_values() { + use quickdecode::error::qjd_err; + assert_eq!( + Document::parse_with_options(b"1 2", &eager()).err().unwrap(), + qjd_err::QJD_TRAILING_CONTENT, + ); + assert_eq!( + Document::parse_with_options(b"true false", &eager()).err().unwrap(), + qjd_err::QJD_TRAILING_CONTENT, + ); +} + +#[test] +fn eager_accepts_trailing_whitespace() { + assert_accepts!("{} \n\t"); +} + +#[test] +fn eager_accepts_top_level_scalar_with_trailing_whitespace() { + assert_accepts!("42 \n\t"); +} + +#[test] +fn lazy_accepts_trailing_garbage() { + // Lazy preserves historical behavior: trailing bytes are ignored. 
+ assert!(Document::parse_with_options(b"{}garbage", &lazy()).is_ok()); +} + +// ── Phase 2: number format ─────────────────────────────────── + +#[test] +fn eager_accepts_canonical_numbers() { + for s in ["0", "-0", "1", "-1", "3.14", "-2.718", + "1e10", "1E10", "1e+10", "1e-10", "1.5e2", + "9223372036854775807", "-9223372036854775808"] { + let input = format!("[{}]", s); + assert_accepts!(input); + } +} + +#[test] +fn eager_rejects_invalid_numbers() { + use quickdecode::error::qjd_err; + for s in ["+1", "01", "00", ".5", "1.", "1.e5", "0x1F", + "NaN", "Infinity", "-Infinity", "1e", "1e+"] { + let input = format!("[{}]", s); + match Document::parse_with_options(input.as_bytes(), &eager()) { + Err(qjd_err::QJD_INVALID_NUMBER) => {} + Err(other) => panic!( + "expected QJD_INVALID_NUMBER for {:?}, got {:?}", input, other), + Ok(_) => panic!("EAGER unexpectedly accepted {:?}", input), + } + } +} + +#[test] +fn lazy_defers_invalid_number_until_access() { + // In LAZY mode, "[01]" parses; the error surfaces when you ask for the value. + let doc = Document::parse_with_options(b"[01]", &lazy()).unwrap(); + // Walking via FFI tests is verbose; we only check that the LAZY parse + // itself does not fail. Field-level access is covered in tests/ffi_*. 
+ drop(doc); +} + +// ── Phase 4 + 5: string content ────────────────────────────── + +#[test] +fn eager_rejects_raw_tab_in_string() { + use quickdecode::error::qjd_err; + let input = b"[\"a\tb\"]"; + match Document::parse_with_options(input, &eager()) { + Err(qjd_err::QJD_INVALID_STRING) => {} + Err(other) => panic!("expected QJD_INVALID_STRING, got {:?}", other), + Ok(_) => panic!("EAGER unexpectedly accepted raw tab in string"), + } +} + +#[test] +fn eager_rejects_raw_null_in_string() { + use quickdecode::error::qjd_err; + let input = b"[\"a\x00b\"]"; + match Document::parse_with_options(input, &eager()) { + Err(qjd_err::QJD_INVALID_STRING) => {} + Err(other) => panic!("expected QJD_INVALID_STRING, got {:?}", other), + Ok(_) => panic!("EAGER unexpectedly accepted raw null in string"), + } +} + +#[test] +fn eager_rejects_invalid_utf8_in_string() { + use quickdecode::error::qjd_err; + let input = &[b'[', b'"', 0xC0, 0xC0, b'"', b']']; + match Document::parse_with_options(input, &eager()) { + Err(qjd_err::QJD_INVALID_UTF8) => {} + Err(other) => panic!("expected QJD_INVALID_UTF8, got {:?}", other), + Ok(_) => panic!("EAGER unexpectedly accepted invalid UTF-8 in string"), + } +} + +#[test] +fn eager_accepts_escape_sequences() { + assert_accepts!("[\"a\\nb\\u00e9\"]"); + assert_accepts!("[\"emoji \\uD83D\\uDE00\"]"); +} + +#[test] +fn lazy_accepts_raw_tab_but_decode_fails() { + let input = b"[\"a\tb\"]"; + let doc = Document::parse_with_options(input, &lazy()).expect("lazy accepts raw control"); + drop(doc); + // Field-level rejection on access is enforced by decode/string.rs and + // is covered by tests/ffi_strings.rs (existing decode_string tests cover + // the error type); no extra assertion needed here. 
+} + +// ── Task 10 fix: check_gap dispatch ────────────────────────── + +#[test] +fn eager_rejects_uppercase_true_as_parse_error() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"TRUE", &eager()); + match r { + Err(qjd_err::QJD_PARSE_ERROR) => {} + other => panic!("expected QJD_PARSE_ERROR, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_uppercase_false_as_parse_error() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"False", &eager()); + match r { + Err(qjd_err::QJD_PARSE_ERROR) => {} + other => panic!("expected QJD_PARSE_ERROR, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_uppercase_null_as_parse_error() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"NULL", &eager()); + match r { + Err(qjd_err::QJD_PARSE_ERROR) => {} + other => panic!("expected QJD_PARSE_ERROR, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_undefined_as_parse_error() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"undefined", &eager()); + match r { + Err(qjd_err::QJD_PARSE_ERROR) => {} + other => panic!("expected QJD_PARSE_ERROR, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_nan_as_invalid_number() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"NaN", &eager()); + match r { + Err(qjd_err::QJD_INVALID_NUMBER) => {} + other => panic!("expected QJD_INVALID_NUMBER, got {:?}", other.err()), + } +} + +#[test] +fn eager_rejects_infinity_as_invalid_number() { + use quickdecode::error::qjd_err; + let r = Document::parse_with_options(b"Infinity", &eager()); + match r { + Err(qjd_err::QJD_INVALID_NUMBER) => {} + other => panic!("expected QJD_INVALID_NUMBER, got {:?}", other.err()), + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Task 11: Comprehensive RFC 8259 conformance corpus +// Organized into nested mod blocks per category. 
+// ───────────────────────────────────────────────────────────────────────────── + +mod structural { + use super::*; + + // RFC 8259 §2-3: JSON values — null, true, false are valid root values. + #[test] + fn primitives_valid() { + assert_accepts!("null"); + assert_accepts!("true"); + assert_accepts!("false"); + } + + // RFC 8259 §2: a JSON text contains exactly one value — empty is not valid. + #[test] + fn empty_input_rejected() { + assert_rejects_both!(""); + } + + // RFC 8259 §2: whitespace-only input also contains no value. + #[test] + fn whitespace_only_rejected() { + assert_rejects_both!(" "); + assert_rejects_both!("\t\n\r"); + } + + // RFC 8259 §4-5: empty object and empty array are valid. + #[test] + fn empty_containers() { + assert_accepts!("{}"); + assert_accepts!("[]"); + } + + // RFC 8259 §4-5: nested containers with mixed value types. + #[test] + fn nested_containers() { + assert_accepts!("[{\"a\":[1,{\"b\":2}]}]"); + assert_accepts!("{\"x\":{\"y\":{\"z\":null}}}"); + assert_accepts!("[[],[],[[],[]]]"); + } + + // RFC 8259 §4: '{' must be followed by a matching '}'. + #[test] + fn unclosed_brace() { + assert_rejects_both!("{"); + } + + // RFC 8259 §5: '[' must be followed by a matching ']'. + #[test] + fn unclosed_bracket() { + assert_rejects_both!("["); + } + + // Bracket mismatch: '{' closed by ']'. + #[test] + fn mismatched_brace_bracket() { + assert_rejects_both!("{]"); + } + + // Bracket mismatch: '[' closed by '}'. + #[test] + fn mismatched_bracket_brace() { + assert_rejects_both!("[}"); + } + + // RFC 8259 §4: object value must follow the colon — omitting it is invalid. + // Eager catches the empty gap after ':'; lazy defers (structural-only rule). + #[test] + fn missing_value() { + assert_rejects_eager!("{\"a\":}", QJD_PARSE_ERROR); + } + + // RFC 8259 §4: colon between key and value is mandatory. 
+ // The grammar-aware pass detects this: after consuming the key + // string the state is ObjAfterKey, and `}` is rejected because + // it can only close ObjAfterOpen/ObjAfterValue. + #[test] + fn missing_colon() { + assert_rejects_eager!("{\"a\"}", QJD_PARSE_ERROR); + } + + // RFC 8259 §5: a leading comma in an array is invalid. + // [,] — the gaps on both sides of the lone comma are empty → eager rejects via the ':'/',' + // heuristic in check_gap. + #[test] + fn leading_comma_array_empty() { + assert_rejects_eager!("[,]", QJD_PARSE_ERROR); + } + + // [,1] — leading comma followed by a value: the grammar-aware + // pass rejects this because `,` is invalid in the ArrAfterOpen + // state (only a value or `]` is allowed after `[`). + #[test] + fn leading_comma_array_with_value() { + assert_rejects_eager!("[,1]", QJD_PARSE_ERROR); + } + + // RFC 8259 §5: trailing comma in an array is invalid. + #[test] + fn trailing_comma_array() { + assert_rejects_eager!("[1,]", QJD_PARSE_ERROR); + } + + // RFC 8259 §4: trailing comma in an object is invalid. + #[test] + fn trailing_comma_object() { + assert_rejects_eager!("{\"a\":1,}", QJD_PARSE_ERROR); + } + + // RFC 8259 §5: array elements must be separated by exactly one comma. + // [1 2] contains a space-separated pair that validate_number rejects as + // QJD_INVALID_NUMBER (not QJD_PARSE_ERROR) — the element IS rejected by + // eager, just with a different error code. + #[test] + fn missing_comma_in_array_rejected() { + // We assert only that eager rejects; the exact code is QJD_INVALID_NUMBER + // because the "1 2" token fails number validation (space within number). 
+ let input = b"[1 2]"; + assert!( + Document::parse_with_options(input, &eager()).is_err(), + "EAGER should reject [1 2]" + ); + } + + // Missing comma inside an object (no structural separator between values): + // {"a":1"b":2} — after consuming the value `1`, the state is + // ObjAfterValue; the next `"` (start of "b") is rejected because + // a key/value-position quote is not legal there. + #[test] + fn missing_comma_in_object() { + assert_rejects_eager!("{\"a\":1\"b\":2}", QJD_PARSE_ERROR); + } +} + +mod whitespace { + use super::*; + + // RFC 8259 §2: insignificant whitespace (space, tab, LF, CR) is allowed + // before and after structural characters. + + #[test] + fn spaces_around_object() { + assert_accepts!(" { } "); + } + + #[test] + fn tabs_around_object() { + assert_accepts!("\t{}\t"); + } + + #[test] + fn newlines_around() { + assert_accepts!("\n{}\n"); + } + + #[test] + fn cr_around() { + assert_accepts!("\r{}\r"); + } + + #[test] + fn inside_object() { + assert_accepts!("{ \"a\" : 1 , \"b\" : 2 }"); + } + + #[test] + fn inside_array() { + assert_accepts!("[ 1 , 2 , 3 ]"); + } + + // All four RFC whitespace characters interleaved. + #[test] + fn mixed_whitespace() { + assert_accepts!(" \t\n\r { \t\n\r } \t\n\r "); + } +} + +mod literals { + use super::*; + + // RFC 8259 §3: only lowercase "true", "false", "null" are valid. + // Wrong case must be rejected by eager. + + #[test] + fn true_must_be_lowercase() { + assert_rejects_eager!("TRUE", QJD_PARSE_ERROR); + assert_rejects_eager!("True", QJD_PARSE_ERROR); + assert_rejects_eager!("tRuE", QJD_PARSE_ERROR); + } + + #[test] + fn false_must_be_lowercase() { + assert_rejects_eager!("FALSE", QJD_PARSE_ERROR); + assert_rejects_eager!("False", QJD_PARSE_ERROR); + } + + #[test] + fn null_must_be_lowercase() { + assert_rejects_eager!("NULL", QJD_PARSE_ERROR); + assert_rejects_eager!("Null", QJD_PARSE_ERROR); + } + + // JavaScript-ism: "nil" is not a valid JSON value. 
+ #[test] + fn nil_rejected() { + assert_rejects_eager!("nil", QJD_PARSE_ERROR); + } + + // JavaScript-ism: "undefined" is not a valid JSON value. + #[test] + fn undefined_rejected() { + assert_rejects_eager!("undefined", QJD_PARSE_ERROR); + } +} + +mod strings { + use super::*; + + // RFC 8259 §7: string grammar. + + // Empty string is valid. + #[test] + fn empty_string() { + assert_accepts!("\"\""); + assert_accepts!("[\"\" ]"); + } + + // Printable ASCII (no special chars) is valid. + #[test] + fn ascii_string() { + assert_accepts!("\"hello world\""); + assert_accepts!("\"abcdefghijklmnopqrstuvwxyz 0123456789 !@#$%^&*()\""); + } + + // RFC 8259 §7: all defined escape sequences must be accepted. + #[test] + fn all_escape_sequences() { + // \" \\ \/ \b \f \n \r \t + assert_accepts!("\"\\\" \\\\ \\/ \\b \\f \\n \\r \\t\""); + } + + // RFC 8259 §7: \uXXXX Unicode escape (4 hex digits). + #[test] + fn unicode_escape() { + assert_accepts!("\"\\u0000\""); // NUL encoded as escape — valid + assert_accepts!("\"\\u00e9\""); // é + assert_accepts!("\"\\u4e2d\\u6587\""); // 中文 + } + + // RFC 8259 §7: surrogate pair (\uD800–\uDBFF followed by \uDC00–\uDFFF). + #[test] + fn surrogate_pair() { + assert_accepts!("\"\\uD83D\\uDE00\""); // 😀 U+1F600 + } + + // RFC 8259 §7: strings must be terminated with a closing '"'. + #[test] + fn unclosed_string_rejected() { + assert_rejects_both!("\"hello"); + assert_rejects_both!("\""); + } + + // JSON does not allow single-quoted strings (JavaScript-ism). + #[test] + fn single_quoted_string_rejected() { + assert_rejects_eager!("'hello'", QJD_PARSE_ERROR); + } + + // RFC 8259 §7: control characters (U+0000–U+001F) must be escaped. + // A raw tab (0x09) inside a string is forbidden. 
+ #[test] + fn raw_control_char_rejected() { + use quickdecode::error::qjd_err; + let with_tab = b"[\"a\tb\"]"; + let with_null = b"[\"a\x00b\"]"; + match Document::parse_with_options(with_tab, &eager()) { + Err(qjd_err::QJD_INVALID_STRING) => {} + other => panic!("expected QJD_INVALID_STRING for raw tab, got {:?}", other.err()), + } + match Document::parse_with_options(with_null, &eager()) { + Err(qjd_err::QJD_INVALID_STRING) => {} + other => panic!("expected QJD_INVALID_STRING for raw NUL, got {:?}", other.err()), + } + } + + // Strings with valid multi-byte UTF-8 content are accepted. + #[test] + fn utf8_multibyte_string() { + assert_accepts!("\"café\""); // 2-byte sequence + assert_accepts!("\"中文\""); // 3-byte sequences + assert_accepts!("\"😀\""); // 4-byte sequence (emoji) + } +} + +mod numbers { + use super::*; + + // RFC 8259 §6: number grammar. + // These complement the existing top-level number tests with a thorough + // table-driven suite organized by sub-rule. + + // §6 integer: optional minus, zero, or non-zero digit followed by digits. + #[test] + fn integers_valid() { + for s in ["0", "-0", "1", "-1", "123", "-456", + "9223372036854775807", "-9223372036854775808"] { + let input = format!("[{}]", s); + assert_accepts!(input); + } + } + + // §6 fraction: a '.' followed by one or more digits. + #[test] + fn fractions_valid() { + for s in ["0.0", "-0.0", "1.5", "-2.718", "3.14159", + "0.123456789"] { + let input = format!("[{}]", s); + assert_accepts!(input); + } + } + + // §6 exponent: 'e'/'E' with optional '+'/'-' and one or more digits. + #[test] + fn exponents_valid() { + for s in ["1e10", "1E10", "1e+10", "1e-10", + "1.5e2", "2.5E-3", "0e0", "-0e0"] { + let input = format!("[{}]", s); + assert_accepts!(input); + } + } + + // §6: leading '+' is not allowed. + #[test] + fn leading_plus_rejected() { + assert_rejects_eager!("[+1]", QJD_INVALID_NUMBER); + } + + // §6: leading zeros are not allowed (except bare "0"). 
+ #[test] + fn leading_zero_rejected() { + assert_rejects_eager!("[01]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[00]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[007]", QJD_INVALID_NUMBER); + } + + // §6: fraction requires at least one digit after the dot. + #[test] + fn trailing_dot_rejected() { + assert_rejects_eager!("[1.]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[1.e5]", QJD_INVALID_NUMBER); + } + + // §6: fraction cannot start without an integer part. + #[test] + fn leading_dot_rejected() { + assert_rejects_eager!("[.5]", QJD_INVALID_NUMBER); + } + + // §6: exponent requires at least one digit. + #[test] + fn incomplete_exponent_rejected() { + assert_rejects_eager!("[1e]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[1e+]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[1e-]", QJD_INVALID_NUMBER); + } + + // Hex notation is not part of the JSON number grammar. + #[test] + fn hex_notation_rejected() { + assert_rejects_eager!("[0x1F]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[0xFF]", QJD_INVALID_NUMBER); + } + + // Non-finite values are not part of JSON. + #[test] + fn non_finite_rejected() { + assert_rejects_eager!("[NaN]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[Infinity]", QJD_INVALID_NUMBER); + assert_rejects_eager!("[-Infinity]", QJD_INVALID_NUMBER); + } + + // Lone minus is not a valid number. + #[test] + fn lone_minus_rejected() { + assert_rejects_eager!("[-]", QJD_INVALID_NUMBER); + } +} diff --git a/tests/vendor/JSONTestSuite b/tests/vendor/JSONTestSuite new file mode 160000 index 0000000..1ef36fa --- /dev/null +++ b/tests/vendor/JSONTestSuite @@ -0,0 +1 @@ +Subproject commit 1ef36fa01286573e846ac449e8683f8833c5b26a