From a74e5398cca41791f6c03ebe6ca40ca60d8950b9 Mon Sep 17 00:00:00 2001 From: Tilo Sloboda Date: Fri, 12 Jun 2026 14:03:55 -0700 Subject: [PATCH 1/5] 1.2.0: recognize Null / NULL as nil; add recognized-literal corner-case tests Null and NULL join null / None / undefined as recognized spellings of nil (SQL / R / PHP var_export / YAML / DB-derived input), in BOTH the C extension and the pure-Ruby parser, in every position the existing spellings work (top-level, object value, array element). Quoted ("NULL") and embedded (NULL Island) forms stay strings. Uppercase TRUE/FALSE intentionally NOT added (thin precedent: PHP/SQL emit lowercase/numeric booleans; only R uses uppercase). Also adds the corner-case tests discussed: NaN/Infinity (real Float objects, +Infinity, array elements) and recognized-literal classification boundaries (quoted stays string; wrong-case stays string). Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 4 ++ ext/smarter_json/smarter_json.c | 7 ++- lib/smarter_json/parser.rb | 11 ++-- lib/smarter_json/version.rb | 2 +- spec/parser_spec.rb | 105 ++++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ade2eef..c36b281 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,10 @@ > ⚠️ We discourage the use of `process(input).first` / `process(input)[0]` because it silently drops potential additional documents > Please use `process_one` if you are expecting only one JSON doc, e.g. in API payloads, because it emits on_warning if it finds multiple docs. +## 1.2.0 (unreleased) + +- `Null` and `NULL` are now read as `nil` (joining `null` / `None` / `undefined`), for SQL / R / PHP / YAML / DB-derived input — in every position the existing spellings work. Quoted (`"NULL"`) or embedded (`NULL Island`) forms stay strings. 
+ ## 1.1.2 (2026-06-12) RSpec tests: 1,097 diff --git a/ext/smarter_json/smarter_json.c b/ext/smarter_json/smarter_json.c index cf74957..77ccf8a 100644 --- a/ext/smarter_json/smarter_json.c +++ b/ext/smarter_json/smarter_json.c @@ -979,7 +979,8 @@ static VALUE fj_classify_quoteless(fj_state *st, const char *p0, long n0) { if (fj_tok_eq(p, n, "true") || fj_tok_eq(p, n, "True")) return Qtrue; if (fj_tok_eq(p, n, "false") || fj_tok_eq(p, n, "False")) return Qfalse; - if (fj_tok_eq(p, n, "null") || fj_tok_eq(p, n, "None") || fj_tok_eq(p, n, "undefined")) return Qnil; + if (fj_tok_eq(p, n, "null") || fj_tok_eq(p, n, "Null") || fj_tok_eq(p, n, "NULL") || + fj_tok_eq(p, n, "None") || fj_tok_eq(p, n, "undefined")) return Qnil; if (fj_tok_eq(p, n, "NaN")) return rb_float_new(NAN); if (fj_tok_eq(p, n, "Infinity")) return rb_float_new(INFINITY); @@ -1273,8 +1274,10 @@ static VALUE fj_parse_value(fj_state *st) { case 'T': return fj_parse_literal(st, "True", Qtrue); case 'F': return fj_parse_literal(st, "False", Qfalse); case 'u': return fj_parse_literal(st, "undefined", Qnil); - case 'N': /* NaN (number) vs None (Python null) */ + case 'N': /* NaN (number); None / Null / NULL (null) */ if (fj_byte_at(st, 1) == 'a') return fj_parse_number(st); + if (fj_byte_at(st, 1) == 'u') return fj_parse_literal(st, "Null", Qnil); + if (fj_byte_at(st, 1) == 'U') return fj_parse_literal(st, "NULL", Qnil); return fj_parse_literal(st, "None", Qnil); default: if (b == '-' || b == '+' || b == '.' || b == 'I' || (b >= '0' && b <= '9')) { diff --git a/lib/smarter_json/parser.rb b/lib/smarter_json/parser.rb index 6ccfa51..5c04aba 100644 --- a/lib/smarter_json/parser.rb +++ b/lib/smarter_json/parser.rb @@ -1210,10 +1210,11 @@ def parse_value # Disambiguate NaN (number) from None (Python null) at a strict position. 
def parse_upper_n - if byte_at(1) == 0x61 # 'a' → NaN - parse_number - else - parse_literal_keyword("None", nil) + case byte_at(1) + when 0x61 then parse_number # 'a' -> NaN + when 0x75 then parse_literal_keyword("Null", nil) # 'u' -> Null + when 0x55 then parse_literal_keyword("NULL", nil) # 'U' -> NULL + else parse_literal_keyword("None", nil) end end @@ -1378,7 +1379,7 @@ def classify_quoteless(str) case str when "true", "True" then return true when "false", "False" then return false - when "null", "None" then return nil + when "null", "Null", "NULL", "None" then return nil when "undefined" then return nil when "NaN" then return Float::NAN when "Infinity", "+Infinity" then return Float::INFINITY diff --git a/lib/smarter_json/version.rb b/lib/smarter_json/version.rb index ea3301a..27aa438 100644 --- a/lib/smarter_json/version.rb +++ b/lib/smarter_json/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module SmarterJSON - VERSION = "1.1.2" + VERSION = "1.2.0" end diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb index 2fa2583..be8a0ec 100644 --- a/spec/parser_spec.rb +++ b/spec/parser_spec.rb @@ -299,6 +299,71 @@ expect(SmarterJSON.process("NaN", acceleration: acceleration).first).to be_a(Float).and(be_nan) expect(SmarterJSON.process_one("NaN", acceleration: acceleration)).to be_a(Float).and(be_nan) end + + it "parses +Infinity at the top level" do + expect(SmarterJSON.process("+Infinity", acceleration: acceleration)).to eq([Float::INFINITY]) + expect(SmarterJSON.process_one("+Infinity", acceleration: acceleration)).to eq(Float::INFINITY) + end + + it "parses NaN and Infinity as array elements" do + result = SmarterJSON.process_one("[1, NaN, Infinity, -Infinity]", acceleration: acceleration) + expect(result[0]).to eq(1) + expect(result[1]).to be_a(Float).and(be_nan) + expect(result[2]).to eq(Float::INFINITY) + expect(result[3]).to eq(-Float::INFINITY) + end + + it "returns real Float objects (Float::INFINITY / Float::NAN), usable as numbers" do + inf = 
SmarterJSON.process_one("Infinity", acceleration: acceleration) + nan = SmarterJSON.process_one("NaN", acceleration: acceleration) + expect(inf).to be_a(Float) + expect(inf.infinite?).to eq(1) # +Infinity (Float#infinite? => 1) + expect(inf + 1).to eq(Float::INFINITY) # arithmetic works + expect(nan).to be_a(Float) + expect(nan).to be_nan # <-- the real check: did SmarterJSON return a true NaN? + expect(nan).not_to eq(nan) # <-- just re-demonstrates IEEE behavior (NaN != itself), not OUR behavior + end + + it "classifies recognized literals to their values alongside NaN/Infinity (recognized-literals-win)" do + result = SmarterJSON.process_one(<<~JSON, acceleration: acceleration) + { + happy: True, + sad: False, + nothing: None, + missing: undefined, + score: Infinity, + debt: -Infinity, + ratio: NaN + } + JSON + expect(result["happy"]).to be(true) + expect(result["sad"]).to be(false) + expect(result["nothing"]).to be_nil + expect(result["missing"]).to be_nil + expect(result["score"]).to eq(Float::INFINITY) + expect(result["debt"]).to eq(-Float::INFINITY) + expect(result["ratio"]).to be_a(Float).and(be_nan) + end + end + + describe "recognized-literal classification boundaries" do + # The literal classification (true/True, false/False, null/Null/NULL/None/undefined, + # NaN, Infinity) applies ONLY to unquoted tokens, and matches exactly. 
+ + it "keeps a QUOTED recognized literal as a string (classification is quoteless-only)" do + expect(SmarterJSON.process_one('{"a": "True"}', acceleration: acceleration)).to eq({ "a" => "True" }) + expect(SmarterJSON.process_one('{"a": "NaN"}', acceleration: acceleration)).to eq({ "a" => "NaN" }) + expect(SmarterJSON.process_one('{"a": "Infinity"}', acceleration: acceleration)).to eq({ "a" => "Infinity" }) + expect(SmarterJSON.process_one('{"a": "None"}', acceleration: acceleration)).to eq({ "a" => "None" }) + expect(SmarterJSON.process_one('{"a": "null"}', acceleration: acceleration)).to eq({ "a" => "null" }) + end + + it "does not recognize wrong-case variants (exact match) — they stay quoteless strings" do + expect(SmarterJSON.process_one("{a: TRUE}", acceleration: acceleration)).to eq({ "a" => "TRUE" }) + expect(SmarterJSON.process_one("{a: nan}", acceleration: acceleration)).to eq({ "a" => "nan" }) + expect(SmarterJSON.process_one("{a: infinity}", acceleration: acceleration)).to eq({ "a" => "infinity" }) + expect(SmarterJSON.process_one("{a: NONE}", acceleration: acceleration)).to eq({ "a" => "NONE" }) + end end describe "explicit + sign on numbers" do @@ -875,6 +940,46 @@ end end + describe "Null / NULL (SQL / R / PHP / YAML null)" do + # Null and NULL join null / None / undefined as recognized spellings of nil + # (SQL / R / PHP var_export / YAML / DB-derived input). TRUE/FALSE are intentionally + # NOT added — uppercase booleans have far thinner precedent (see CHANGELOG/discussion). 
+ + it "parses Null as nil (top level)" do + expect(SmarterJSON.process("Null", acceleration: acceleration)).to eq([nil]) + expect(SmarterJSON.process_one("Null", acceleration: acceleration)).to be_nil + end + + it "parses NULL as nil (top level)" do + expect(SmarterJSON.process("NULL", acceleration: acceleration)).to eq([nil]) + expect(SmarterJSON.process_one("NULL", acceleration: acceleration)).to be_nil + end + + it "recognizes Null / NULL as object value and array element" do + expect(SmarterJSON.process_one("{a: Null, b: NULL}", acceleration: acceleration)).to eq({ "a" => nil, "b" => nil }) + expect(SmarterJSON.process_one("[Null, NULL]", acceleration: acceleration)).to eq([nil, nil]) + end + + it "recognizes Null / NULL surrounded by whitespace" do + expect(SmarterJSON.process_one("{ a: Null , b: NULL }", acceleration: acceleration)).to eq({ "a" => nil, "b" => nil }) + end + + it "keeps a QUOTED Null / NULL as a string (classification is quoteless-only)" do + expect(SmarterJSON.process_one('{"a": "Null"}', acceleration: acceleration)).to eq({ "a" => "Null" }) + expect(SmarterJSON.process_one('{"a": "NULL"}', acceleration: acceleration)).to eq({ "a" => "NULL" }) + end + + it "does NOT recognize Null / NULL embedded in a larger token (stays a string)" do + expect(SmarterJSON.process_one("{a: NULL Island}", acceleration: acceleration)).to eq({ "a" => "NULL Island" }) + expect(SmarterJSON.process_one("{a: Null and void}", acceleration: acceleration)).to eq({ "a" => "Null and void" }) + expect(SmarterJSON.process_one("{a: Nullable}", acceleration: acceleration)).to eq({ "a" => "Nullable" }) + end + + it "leaves None / null / undefined unchanged (still nil)" do + expect(SmarterJSON.process_one("{a: None, b: null, c: undefined}", acceleration: acceleration)).to eq({ "a" => nil, "b" => nil, "c" => nil }) + end + end + describe "Python literals" do it "parses True as true" do expect(SmarterJSON.process("True", acceleration: acceleration)).to eq([true]) From 
6594c51f5a117378844d239cccc8e8e02988c8c6 Mon Sep 17 00:00:00 2001 From: Tilo Sloboda Date: Fri, 12 Jun 2026 16:54:36 -0700 Subject: [PATCH 2/5] Leading-zero numbers: sign/dot/exponent -> number, bare integer -> string A leading-zero token now reads as a number when it carries a sign, a decimal point, or an exponent (+007 -> 7, -000023.5 -> -23.5, 00.0 -> 0.0, 007e2 -> 700.0). A bare leading-zero integer (000001, 02) still reads as a string, so IDs, zip codes, and account numbers keep their zeros. C path: fj_try_decimal (quoteless/container) and fj_parse_number (strict/top-level) consume leading zeros and reject a bare leading-zero integer. Ruby path: DEC_RE admits leading zeros, numeric_value returns the string for a bare leading-zero integer, parse_number raises at top level. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 1 + ext/smarter_json/smarter_json.c | 50 +++++++++++++++++++++++++++++---- lib/smarter_json/parser.rb | 29 +++++++++++++++++-- spec/parser_spec.rb | 41 ++++++++++++++++++++++++--- 4 files changed, 109 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c36b281..4996fa3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ ## 1.2.0 (unreleased) +- A leading-zero token now reads as a number when it carries a sign, a decimal point, or an exponent (`+007` → `7`, `-000023.5` → `-23.5`, `00.0` → `0.0`, `007e2` → `700.0`). A bare leading-zero integer (`000001`, `02`) still reads as a string, so IDs, zip codes, and account numbers keep their zeros. - `Null` and `NULL` are now read as `nil` (joining `null` / `None` / `undefined`), for SQL / R / PHP / YAML / DB-derived input — in every position the existing spellings work. Quoted (`"NULL"`) or embedded (`NULL Island`) forms stay strings. 
## 1.1.2 (2026-06-12) diff --git a/ext/smarter_json/smarter_json.c b/ext/smarter_json/smarter_json.c index 77ccf8a..947f4ff 100644 --- a/ext/smarter_json/smarter_json.c +++ b/ext/smarter_json/smarter_json.c @@ -641,16 +641,31 @@ static FJ_ALWAYS_INLINE VALUE fj_float_from_parts(fj_state *st, uint64_t m10, in * per-byte '_' test, dropping to a slow step only when an underscore appears. */ static int fj_try_decimal(fj_state *st, const char *p, long n, VALUE *out) { long i = 0; - int is_float = 0, neg = 0, has_digit = 0, overflow = 0; + int is_float = 0, neg = 0, has_digit = 0, overflow = 0, has_sign = 0, had_leading_zero = 0; uint64_t m10 = 0; int m10digits = 0, frac = 0; int64_t e10 = 0; - if (i < n && (p[i] == '-' || p[i] == '+')) { neg = (p[i] == '-'); i++; } + if (i < n && (p[i] == '-' || p[i] == '+')) { has_sign = 1; neg = (p[i] == '-'); i++; } - /* Integer part: a single '0', or [1-9] then digits/underscores. */ + /* Integer part: a single '0', or [1-9] then digits/underscores. A leading '0' followed + * by more digits (a leading-zero token) is consumed too but flagged: a BARE leading-zero + * integer (no sign / dot / exponent) is rejected below and kept as a string, so zip / + * account / check numbers preserve their zeros. */ if (i < n && p[i] == '0') { has_digit = 1; m10digits = 1; i++; + if (i < n && p[i] >= '0' && p[i] <= '9') { + had_leading_zero = 1; + for (;;) { + while (i < n && p[i] >= '0' && p[i] <= '9') { + if (m10digits < 18) { m10 = m10 * 10 + (uint64_t)(p[i] - '0'); m10digits++; } + else overflow = 1; + i++; + } + if (i < n && p[i] == '_') { i++; continue; } + break; + } + } } else if (i < n && p[i] >= '1' && p[i] <= '9') { has_digit = 1; for (;;) { @@ -699,6 +714,8 @@ static int fj_try_decimal(fj_state *st, const char *p, long n, VALUE *out) { if (i != n) return 0; /* token not fully consumed -> not a number (string) */ if (!has_digit) return 0; /* e.g. "." 
or "+" -> not a number (string) */ + /* A BARE leading-zero integer (no sign / dot / exponent) is an ID, not a number. */ + if (had_leading_zero && !has_sign && !is_float) return 0; if (!is_float) { *out = fj_int_from_parts(m10, m10digits, neg, overflow, p, n); @@ -730,13 +747,13 @@ static VALUE fj_parse_number(fj_state *st) { const char *p = buf + st->pos; /* buf[len] == '\0' (RSTRING_PTR) is the scan sentinel */ const char *np = p; /* token start, includes a leading sign */ long nlen; - int is_float = 0, neg = 0, overflow = 0; + int is_float = 0, neg = 0, overflow = 0, has_sign = 0, had_leading_zero = 0; uint64_t m10 = 0; /* mantissa: integer + fraction digits */ int m10digits = 0; /* mantissa digit chars (caps the Eisel-Lemire fast path at 18) */ int frac = 0; /* fraction digit chars: e10 -= frac */ int64_t e10 = 0; - if (*p == '-' || *p == '+') { neg = (*p == '-'); p++; } + if (*p == '-' || *p == '+') { has_sign = 1; neg = (*p == '-'); p++; } /* Cold branches (rare, not perf-critical): sync the cursor, reuse scalar helpers. */ if (*p == 'I') { st->pos = p - buf; fj_consume_keyword(st, "Infinity"); return rb_float_new(neg ? -INFINITY : INFINITY); } @@ -755,10 +772,25 @@ static VALUE fj_parse_number(fj_state *st) { return rb_str_to_inum(hx, 16, 0); } - /* Integer part: a single '0', or [1-9] then digits/underscores. */ + /* Integer part: a single '0', or [1-9] then digits/underscores. A leading '0' followed + * by more digits is consumed but flagged; a BARE leading-zero integer (no sign / dot / + * exponent) is rejected after the scan — it is an ID, not a number, and has no bare + * top-level quoteless-string form, so it raises (matching `000001`). 
*/ if (*p == '0') { m10digits = 1; /* one leading zero, counted as a single mantissa digit */ p++; + if (*p >= '0' && *p <= '9') { + had_leading_zero = 1; + for (;;) { + while (*p >= '0' && *p <= '9') { + if (m10digits < 18) { m10 = m10 * 10 + (uint64_t)(*p - '0'); m10digits++; } + else overflow = 1; + p++; + } + if (*p == '_') { p++; continue; } + break; + } + } } else if (*p >= '1' && *p <= '9') { for (;;) { while (*p >= '0' && *p <= '9') { @@ -811,6 +843,12 @@ static VALUE fj_parse_number(fj_state *st) { st->pos = p - buf; nlen = p - np; + /* A BARE leading-zero integer is an ID, not a number; at this top-level / strict + * position there is no quoteless-string form, so it raises. */ + if (had_leading_zero && !has_sign && !is_float) { + fj_error(st, "invalid number with a leading zero"); + } + if (!is_float) { return fj_int_from_parts(m10, m10digits, neg, overflow, np, nlen); } diff --git a/lib/smarter_json/parser.rb b/lib/smarter_json/parser.rb index 5c04aba..77b5c1f 100644 --- a/lib/smarter_json/parser.rb +++ b/lib/smarter_json/parser.rb @@ -739,7 +739,7 @@ class Parser # Mantissa must carry at least one digit (int part, or a leading-dot fraction), so a # bare exponent like "-e695881" is NOT a number — it falls through to a quoteless # string, matching the C path. Trailing exponent stays optional. - DEC_RE = /\A[-+]?(?:(?:0|[1-9][0-9_]*)(?:\.[0-9_]*)?|\.[0-9_]+)(?:[eE][-+]?[0-9_]+)?\z/.freeze + DEC_RE = /\A[-+]?(?:[0-9][0-9_]*(?:\.[0-9_]*)?|\.[0-9_]+)(?:[eE][-+]?[0-9_]+)?\z/.freeze # A decimal BigDecimal() would reject as-is: a leading dot (".5") or a dot not # followed by a digit ("5.", "5.e3"). Matches iff normalize_for_bigdecimal # would change the string — so when it doesn't match, we skip normalization. @@ -1406,7 +1406,15 @@ def numeric_value(str) # number tokens that is a real per-value allocation. Underscores are rare, so only # pay it when the token actually contains one (measured +27% on long-token decimals). body = str.include?("_") ? 
str.delete("_") : str - body.match?(/[.eE]/) ? decimal_value(body) : body.to_i + return decimal_value(body) if body.match?(/[.eE]/) + + # A BARE leading-zero integer (no sign / dot / exponent) is an ID — a zip code, + # account number, phone number — not a number; keep it a string so the zeros survive. + # A sign (+007 / -007) signals numeric intent (IDs never carry a sign), so those parse. + c0 = body.getbyte(0) + return NOT_NUMERIC if c0 == ZERO && body.bytesize > 1 + + body.to_i end # True when the token starts with [+-]?0[xX] — the only shape HEX_RE can match. @@ -1664,10 +1672,13 @@ def decode_unicode_escape(i) def parse_number negative = false + signed = false if byte == MINUS negative = true + signed = true advance(1) elsif byte == PLUS + signed = true advance(1) end @@ -1681,6 +1692,7 @@ def parse_number end int_start = @pos + had_leading_zero = false if byte == ZERO advance(1) @@ -1693,6 +1705,12 @@ def parse_number value = @input.byteslice(hex_start, @pos - hex_start).delete("_").to_i(16) return negative ? -value : value end + # A run of further digits after the single leading '0' (007, 00023) — consume them + # and flag it; the reject check below turns a bare leading-zero integer into an error. + if (b = byte) && b >= ZERO && b <= NINE + had_leading_zero = true + advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE) + end elsif byte && byte >= 0x31 && byte <= NINE advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE) elsif byte == DOT @@ -1718,6 +1736,13 @@ def parse_number advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE) end + # A BARE leading-zero integer is an ID, not a number; at this top-level / strict + # position there is no quoteless-string form, so it raises (a sign or a dot/exponent + # signals numeric intent and is allowed: +007 -> 7, -000023.5 -> -23.5, 007e2 -> 700.0). 
+ if had_leading_zero && !signed && !is_float + raise error("invalid number with a leading zero") + end + slice = @input.byteslice(int_start, @pos - int_start).delete("_") value = is_float ? decimal_value(slice) : slice.to_i negative ? -value : value diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb index be8a0ec..f3aa698 100644 --- a/spec/parser_spec.rb +++ b/spec/parser_spec.rb @@ -646,7 +646,7 @@ end end - describe "leading-zero numbers fall through to quoteless strings" do + describe "bare leading-zero integers fall through to quoteless strings" do it 'parses 0080 as the string "0080"' do expect(SmarterJSON.process("port: 0080", acceleration: acceleration)).to eq([{ "port" => "0080" }]) expect(SmarterJSON.process_one("port: 0080", acceleration: acceleration)).to eq({ "port" => "0080" }) @@ -663,6 +663,39 @@ end end + describe "leading-zero numbers with sign / dot / exponent parse as numbers" do + # A leading-zero token carries numeric intent — and parses as a NUMBER — when it + # has a sign, a decimal point, or an exponent. A BARE leading-zero integer stays a + # string (block above) so zip / account / check numbers keep their zeros; IDs never + # carry a sign, dot, or exponent. 
+ + it "parses a signed leading-zero integer as a number" do + expect(SmarterJSON.process_one("{a: +007, b: -007}", acceleration: acceleration)).to eq({ "a" => 7, "b" => -7 }) + end + + it "parses a leading-zero decimal as a number" do + expect(SmarterJSON.process_one("{a: 00.00, b: -000023.5, c: 00001.5}", acceleration: acceleration)).to eq({ "a" => 0.0, "b" => -23.5, "c" => 1.5 }) + end + + it "parses leading-zero scientific notation as a number" do + expect(SmarterJSON.process_one("{a: 00e5, b: 007e2, c: +00e5}", acceleration: acceleration)).to eq({ "a" => 0.0, "b" => 700.0, "c" => 0.0 }) + end + + it "applies in arrays and at the top level too" do + expect(SmarterJSON.process_one("[000001, -000023.5, 007e2]", acceleration: acceleration)).to eq(["000001", -23.5, 700.0]) + expect(SmarterJSON.process_one("-000023.5", acceleration: acceleration)).to eq(-23.5) + expect(SmarterJSON.process_one("+007", acceleration: acceleration)).to eq(7) + end + + it "a bare leading-zero integer still has no top-level form (raises)" do + expect { SmarterJSON.process_one("000001", acceleration: acceleration) }.to raise_error(SmarterJSON::ParseError) + end + + it "leaves single-zero and non-leading-zero numbers unchanged" do + expect(SmarterJSON.process_one("{a: 0, b: 0.5, c: -0, d: 1.5e3}", acceleration: acceleration)).to eq({ "a" => 0, "b" => 0.5, "c" => 0, "d" => 1500.0 }) + end + end + describe "implicit root object" do it "parses key: value at top level without outer {}" do expect(SmarterJSON.process("host: localhost\nport: 5432", acceleration: acceleration)).to eq([{ "host" => "localhost", "port" => 5432 }]) @@ -1897,9 +1930,9 @@ def parse_values(input, **opts) expect(result["true1"]).to eq(true) expect(result["false1"]).to eq(false) expect(result["null1"]).to be_nil - expect(result["str1"]).to eq("00") # leading zero → string - expect(result["str2"]).to eq("00.0") - expect(result["str3"]).to eq("02") + expect(result["str1"]).to eq("00") # bare leading zero -> string (same case 
as an account number) + expect(result["str2"]).to eq(0.0) # leading zero + dot -> number + expect(result["str3"]).to eq("02") # bare leading zero -> string (same case as an account number) end it "parses strings_test.hjson and recognizes string-vs-literal distinction" do From ec03b61e45843427b42309fa7cf1c0844d70b4cf Mon Sep 17 00:00:00 2001 From: Tilo Sloboda Date: Fri, 12 Jun 2026 17:59:42 -0700 Subject: [PATCH 3/5] Leading-zero numbers: treat an underscore right after the zero as a separator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A token like 0_0.5 is 00.5 once the underscore separator is removed, so it should parse as a number (the dot signals numeric intent) — and identically on the C and Ruby paths, in containers and at the top level. The C scanners diverged: their leading-'0' branch entered its digit run-loop only on a digit, not on '_', so 0_0.5 / 0_0e5 stayed strings (C) while the Ruby container path already parsed them as numbers. The top-level path raised on both. All three scanners (fj_try_decimal, fj_parse_number, Ruby parse_number) now enter the run-loop on '_' too, matching the [1-9] branch; had_leading_zero is flagged only when a real digit is consumed, so a bare 0_0 / 0_07 still stays a string (raises at the top level). Defect in the unreleased 1.2.0 leading-zero work; never shipped, so no CHANGELOG change. Co-Authored-By: Claude Opus 4.8 --- ext/smarter_json/smarter_json.c | 14 ++++++++++---- lib/smarter_json/parser.rb | 14 +++++++++----- spec/parser_spec.rb | 28 ++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/ext/smarter_json/smarter_json.c b/ext/smarter_json/smarter_json.c index 947f4ff..6ef8cf4 100644 --- a/ext/smarter_json/smarter_json.c +++ b/ext/smarter_json/smarter_json.c @@ -654,10 +654,13 @@ static int fj_try_decimal(fj_state *st, const char *p, long n, VALUE *out) { * account / check numbers preserve their zeros. 
*/ if (i < n && p[i] == '0') { has_digit = 1; m10digits = 1; i++; - if (i < n && p[i] >= '0' && p[i] <= '9') { - had_leading_zero = 1; + /* A leading '0' followed by more digits (possibly underscore-separated, like the + * [1-9] branch below) is a leading-zero token: consume the run and flag it, so + * 0_5.0 / 0_0.5 behave exactly like 05.0 / 00.5 on both paths. */ + if (i < n && ((p[i] >= '0' && p[i] <= '9') || p[i] == '_')) { for (;;) { while (i < n && p[i] >= '0' && p[i] <= '9') { + had_leading_zero = 1; if (m10digits < 18) { m10 = m10 * 10 + (uint64_t)(p[i] - '0'); m10digits++; } else overflow = 1; i++; @@ -779,10 +782,13 @@ static VALUE fj_parse_number(fj_state *st) { if (*p == '0') { m10digits = 1; /* one leading zero, counted as a single mantissa digit */ p++; - if (*p >= '0' && *p <= '9') { - had_leading_zero = 1; + /* A leading '0' followed by more digits (possibly underscore-separated, like the + * [1-9] branch below) is a leading-zero token: consume the run and flag it, so the + * underscore is just a separator (0_0.5 behaves like 00.5). */ + if ((*p >= '0' && *p <= '9') || *p == '_') { for (;;) { while (*p >= '0' && *p <= '9') { + had_leading_zero = 1; if (m10digits < 18) { m10 = m10 * 10 + (uint64_t)(*p - '0'); m10digits++; } else overflow = 1; p++; diff --git a/lib/smarter_json/parser.rb b/lib/smarter_json/parser.rb index 77b5c1f..923c2e5 100644 --- a/lib/smarter_json/parser.rb +++ b/lib/smarter_json/parser.rb @@ -1705,11 +1705,15 @@ def parse_number value = @input.byteslice(hex_start, @pos - hex_start).delete("_").to_i(16) return negative ? -value : value end - # A run of further digits after the single leading '0' (007, 00023) — consume them - # and flag it; the reject check below turns a bare leading-zero integer into an error. 
- if (b = byte) && b >= ZERO && b <= NINE - had_leading_zero = true - advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE) + # A run of further digits after the single leading '0' (007, 00023, or the + # underscore-separated 0_0) — consume it and flag the leading zero; the reject check + # below turns a bare leading-zero integer into an error. The underscore is only a + # separator, so 0_0.5 behaves like 00.5. + if (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE) + while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE) + had_leading_zero = true if b >= ZERO && b <= NINE + advance(1) + end end elsif byte && byte >= 0x31 && byte <= NINE advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE) diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb index f3aa698..409a511 100644 --- a/spec/parser_spec.rb +++ b/spec/parser_spec.rb @@ -694,6 +694,34 @@ it "leaves single-zero and non-leading-zero numbers unchanged" do expect(SmarterJSON.process_one("{a: 0, b: 0.5, c: -0, d: 1.5e3}", acceleration: acceleration)).to eq({ "a" => 0, "b" => 0.5, "c" => 0, "d" => 1500.0 }) end + + it "treats an underscore right after the leading zero like any other leading-zero run" do + # 0_0.5 is 00.5 once the underscore is removed -> a number, because of the dot; + # 0_0e5 is 00e5 -> a number, because of the exponent. Both paths must agree (the C + # scanner once bailed on the 0_ shape and kept the whole token a string). + expect(SmarterJSON.process_one("{a: 0_0.5, b: 0_0e5}", acceleration: acceleration)).to eq({ "a" => 0.5, "b" => 0.0 }) + # a bare 0_0 (no sign / dot / exponent) is still an ID -> string. 
+ expect(SmarterJSON.process_one("{a: 0_0, b: 0_07}", acceleration: acceleration)).to eq({ "a" => "0_0", "b" => "0_07" }) + end + + it "treats 0_ at the top level just like the underscore-free form" do + # The underscore is only a digit separator, so 0_0.5 behaves like 00.5 everywhere: + # at the top level it is a recognized number (00.5 -> 0.5 there too). + expect(SmarterJSON.process_one("0_0.5", acceleration: acceleration)).to eq(0.5) + expect(SmarterJSON.process_one("0_0e5", acceleration: acceleration)).to eq(0.0) + # ...while a bare 0_0 has no top-level form, same as 00 / 000001 -> it raises. + expect { SmarterJSON.process_one("0_0", acceleration: acceleration) }.to raise_error(SmarterJSON::ParseError) + end + end + + describe "version-like strings (multiple dots) stay strings" do + it "keeps dotted version numbers as strings on both paths" do + expect(SmarterJSON.process_one("{a: 1.1.0, b: 0.0.2, c: 2.0.0.1, d: 2.0.2.0.pre1}", acceleration: acceleration)).to eq({ "a" => "1.1.0", "b" => "0.0.2", "c" => "2.0.0.1", "d" => "2.0.2.0.pre1" }) + end + + it "keeps leading-zero version numbers as strings too" do + expect(SmarterJSON.process_one("{a: 00.0.2, b: 007.0.1, c: 0.9.7}", acceleration: acceleration)).to eq({ "a" => "00.0.2", "b" => "007.0.1", "c" => "0.9.7" }) + end end describe "implicit root object" do From f5c4ee0ac69adce63de2daf8684a5c14e62ad452 Mon Sep 17 00:00:00 2001 From: Tilo Sloboda Date: Fri, 12 Jun 2026 18:03:04 -0700 Subject: [PATCH 4/5] update docs --- README.md | 5 +++-- docs/_introduction.md | 2 +- docs/examples.md | 24 ++++++++++++++++++++---- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0827981..6e60563 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ A lenient, fast JSON processor for Ruby. 
It extracts strict JSON, NDJSON, JSONL, ## Features at a glance -- **Reads the whole human-JSON superset, no modes or flags** — strict JSON, NDJSON, JSONL, JSON5, HJSON, JSONC, plus comments, trailing commas, unquoted / single / triple / smart quotes, an implicit root object, `NaN` / `Infinity` / hex / underscores, Python & JavaScript literals, a UTF-8 BOM, mixed line endings, and any Ruby encoding (see [What it accepts](#what-it-accepts-beyond-strict-json) for the full list). +- **Reads the whole human-JSON superset, no modes or flags** — strict JSON, NDJSON, JSONL, JSON5, HJSON, JSONC, plus comments, trailing commas, unquoted / single / triple / smart quotes, an implicit root object, `NaN` / `Infinity` / hex / underscores, Python / JavaScript / SQL literals, a UTF-8 BOM, mixed line endings, and any Ruby encoding (see [What it accepts](#what-it-accepts-beyond-strict-json) for the full list). - **Every document from multi-document input, in one call** — `process` returns an `Array` of all of them; `process_one` returns the single value and warns if there was more than one (never raises; routed to `on_warning`, else `Rails.logger`, else `Kernel.warn`). - **Streaming in bounded memory** — pass a block, or use `foreach(path_or_io)` for a composable `Enumerator` you can `.select` / `.map` / `.lazy` over. - **Recovers JSON from LLM / markdown noise** — strips markdown code fences, surrounding prose, and `` tags, and pulls every payload out of one messy blob. 
@@ -75,7 +75,8 @@ Three things set it apart: - Trailing commas; unquoted keys (`{host: localhost}`); single-quoted, triple-quoted (`'''…'''`), and quoteless string values - Implicit root object — a config file that starts with `key: value`, no outer `{}` - `NaN`, `Infinity`, hex (`0xFF`), leading `+` / `.`, underscores in numbers (`1_000_000`) -- UTF-8 BOM, smart/curly quotes (in keys and values), Python literals (`True` / `False` / `None`), JavaScript `undefined` +- Leading-zero numbers (which strict JSON rejects): a token with a sign, decimal point, or exponent reads as a number (`-007.5` → `-7.5`, `007e2` → `700.0`), but a bare leading-zero integer is kept as a string (`007`, `02`) so IDs, zip codes, and account numbers don't lose their zeros +- UTF-8 BOM, smart/curly quotes (in keys and values), Python literals (`True` / `False` / `None`), JavaScript `undefined`, case-variant null (`Null` / `NULL`, as SQL / R / YAML emit it) - Mixed CR / LF / CRLF line endings, and any Ruby-supported input encoding (via `encoding:`) - Duplicate keys (last value wins by default; configurable) diff --git a/docs/_introduction.md b/docs/_introduction.md index b060e2c..5100178 100644 --- a/docs/_introduction.md +++ b/docs/_introduction.md @@ -29,7 +29,7 @@ Most JSON parsers reject anything that isn't perfectly strict JSON, and they mak ## What it accepts, beyond strict JSON -Comments (`//`, `/* … */`, `#` — a `#`/`//` only starts a comment when preceded by whitespace, so `url: http://x.com` reads as a string, not a truncated value), markdown-wrapped / chatty blobs around the payload, trailing commas, unquoted / single- / triple-quoted / quoteless strings, an implicit root object (`key: value`, no braces), `NaN` / `Infinity` / hex / underscored numbers, Python (`True` / `False` / `None`) and JavaScript (`undefined`) literals, smart quotes, a UTF-8 BOM, mixed CR / LF / CRLF line endings, any Ruby-supported input encoding (via `encoding:`), and duplicate keys. 
The full list — with the human-JSON spec references it's drawn from — is kept in one place: [**What it accepts, beyond strict JSON**](../README.md#what-it-accepts-beyond-strict-json) in the README. +Comments (`//`, `/* … */`, `#` — a `#`/`//` only starts a comment when preceded by whitespace, so `url: http://x.com` reads as a string, not a truncated value), markdown-wrapped / chatty blobs around the payload, trailing commas, unquoted / single- / triple-quoted / quoteless strings, an implicit root object (`key: value`, no braces), `NaN` / `Infinity` / hex / underscored numbers, leading-zero numbers (a signed / decimal / exponent token like `-007.5` is a number, a bare `007` is kept as a string so IDs keep their zeros), Python (`True` / `False` / `None`), JavaScript (`undefined`), and SQL / R / YAML (`Null` / `NULL`) literals, smart quotes, a UTF-8 BOM, mixed CR / LF / CRLF line endings, any Ruby-supported input encoding (via `encoding:`), and duplicate keys. The full list — with the human-JSON spec references it's drawn from — is kept in one place: [**What it accepts, beyond strict JSON**](../README.md#what-it-accepts-beyond-strict-json) in the README. It raises only on genuinely unreadable input (unterminated string, mismatched bracket), with line and column in the message — never on valid-but-lenient input. diff --git a/docs/examples.md b/docs/examples.md index f9fdc6d..fe924a4 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -145,7 +145,23 @@ JSON A `#`/`//` only starts a comment when preceded by whitespace, so `http://example.com` stays a string rather than being truncated. 
-### Example 10: Wrapper Noise Around a Payload +### Example 10: Leading-Zero IDs and SQL `NULL` + +```ruby +SmarterJSON.process_one(<<~JSON) + { + user_id: 007, # bare leading zero -> kept as a string + zip: 02139, # ditto: zip codes keep their leading zero + balance: -007.50, # a sign / decimal point / exponent makes it a number + deleted_at: NULL # SQL / R / YAML null spelling -> nil + } +JSON +# => {"user_id"=>"007", "zip"=>"02139", "balance"=>-7.5, "deleted_at"=>nil} +``` + +A bare leading-zero integer is kept as a string so identifiers, zip codes, and account numbers don't lose their zeros; a sign, decimal point, or exponent marks numeric intent (`-007.50` → `-7.5`). `Null` and `NULL` join `null` / `None` / `undefined` as spellings of `nil`; a quoted `"NULL"` stays a string. + +### Example 11: Wrapper Noise Around a Payload #### Fenced payload @@ -197,14 +213,14 @@ TEXT # => [{"a"=>1}, {"b"=>2}] ``` -### Example 11: Write JSON +### Example 12: Write JSON ```ruby SmarterJSON.generate({ "a" => 1, "b" => [2, 3] }) # => '{"a":1,"b":[2,3]}' SmarterJSON.generate([1, 2, 3]) # => '[1,2,3]' ``` -### Example 12: Write NDJSON +### Example 13: Write NDJSON An Array writes one element per line: @@ -212,7 +228,7 @@ An Array writes one element per line: SmarterJSON.generate([{ "id" => 1 }, { "id" => 2 }], format: :ndjson) # => "{\"id\":1}\n{\"id\":2}\n" ``` -### Example 13: Round-Trip Read and Write +### Example 14: Round-Trip Read and Write ```ruby obj = { "a" => 1, "b" => [2, "three", nil, true] } From 2a1f619d7df686905874b7619ae21c6476380a0f Mon Sep 17 00:00:00 2001 From: Tilo Sloboda Date: Fri, 12 Jun 2026 18:38:09 -0700 Subject: [PATCH 5/5] 1.2.0 review polish: CHANGELOG clarity, doc consistency, comment trim - CHANGELOG: flag that leading-zero decimals/exponents previously read as strings (a String->Float change, not just a new capability), and add the `RSpec tests: 1,143` line to match the other entries. 
- Align the Null/NULL source list to "SQL / R / PHP / YAML" in README and docs/_introduction.md (CHANGELOG and the spec already say PHP). - Trim the now-redundant inner leading-zero comments in fj_try_decimal and fj_parse_number (the outer comment already explains the bare-rejection). No behavior change; 1143 examples, 0 failures. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 4 +++- README.md | 2 +- docs/_introduction.md | 2 +- ext/smarter_json/smarter_json.c | 10 ++++------ 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4996fa3..cc0f665 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,9 @@ ## 1.2.0 (unreleased) -- A leading-zero token now reads as a number when it carries a sign, a decimal point, or an exponent (`+007` → `7`, `-000023.5` → `-23.5`, `00.0` → `0.0`, `007e2` → `700.0`). A bare leading-zero integer (`000001`, `02`) still reads as a string, so IDs, zip codes, and account numbers keep their zeros. +RSpec tests: 1,143 + +- A leading-zero token now reads as a number when it carries a sign, a decimal point, or an exponent (`+007` → `7`, `-000023.5` → `-23.5`, `00.0` → `0.0`, `007e2` → `700.0`) — previously these were kept as strings. A bare leading-zero integer (`000001`, `02`) still reads as a string, so IDs, zip codes, and account numbers keep their zeros. - `Null` and `NULL` are now read as `nil` (joining `null` / `None` / `undefined`), for SQL / R / PHP / YAML / DB-derived input — in every position the existing spellings work. Quoted (`"NULL"`) or embedded (`NULL Island`) forms stay strings. 
## 1.1.2 (2026-06-12) diff --git a/README.md b/README.md index 6e60563..2bc9597 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Three things set it apart: - Implicit root object — a config file that starts with `key: value`, no outer `{}` - `NaN`, `Infinity`, hex (`0xFF`), leading `+` / `.`, underscores in numbers (`1_000_000`) - Leading-zero numbers (which strict JSON rejects): a token with a sign, decimal point, or exponent reads as a number (`-007.5` → `-7.5`, `007e2` → `700.0`), but a bare leading-zero integer is kept as a string (`007`, `02`) so IDs, zip codes, and account numbers don't lose their zeros -- UTF-8 BOM, smart/curly quotes (in keys and values), Python literals (`True` / `False` / `None`), JavaScript `undefined`, case-variant null (`Null` / `NULL`, as SQL / R / YAML emit it) +- UTF-8 BOM, smart/curly quotes (in keys and values), Python literals (`True` / `False` / `None`), JavaScript `undefined`, case-variant null (`Null` / `NULL`, as SQL / R / PHP / YAML emit it) - Mixed CR / LF / CRLF line endings, and any Ruby-supported input encoding (via `encoding:`) - Duplicate keys (last value wins by default; configurable) diff --git a/docs/_introduction.md b/docs/_introduction.md index 5100178..92a5c31 100644 --- a/docs/_introduction.md +++ b/docs/_introduction.md @@ -29,7 +29,7 @@ Most JSON parsers reject anything that isn't perfectly strict JSON, and they mak ## What it accepts, beyond strict JSON -Comments (`//`, `/* … */`, `#` — a `#`/`//` only starts a comment when preceded by whitespace, so `url: http://x.com` reads as a string, not a truncated value), markdown-wrapped / chatty blobs around the payload, trailing commas, unquoted / single- / triple-quoted / quoteless strings, an implicit root object (`key: value`, no braces), `NaN` / `Infinity` / hex / underscored numbers, leading-zero numbers (a signed / decimal / exponent token like `-007.5` is a number, a bare `007` is kept as a string so IDs keep their zeros), Python (`True` / `False` / 
`None`), JavaScript (`undefined`), and SQL / R / YAML (`Null` / `NULL`) literals, smart quotes, a UTF-8 BOM, mixed CR / LF / CRLF line endings, any Ruby-supported input encoding (via `encoding:`), and duplicate keys. The full list — with the human-JSON spec references it's drawn from — is kept in one place: [**What it accepts, beyond strict JSON**](../README.md#what-it-accepts-beyond-strict-json) in the README. +Comments (`//`, `/* … */`, `#` — a `#`/`//` only starts a comment when preceded by whitespace, so `url: http://x.com` reads as a string, not a truncated value), markdown-wrapped / chatty blobs around the payload, trailing commas, unquoted / single- / triple-quoted / quoteless strings, an implicit root object (`key: value`, no braces), `NaN` / `Infinity` / hex / underscored numbers, leading-zero numbers (a signed / decimal / exponent token like `-007.5` is a number, a bare `007` is kept as a string so IDs keep their zeros), Python (`True` / `False` / `None`), JavaScript (`undefined`), and SQL / R / PHP / YAML (`Null` / `NULL`) literals, smart quotes, a UTF-8 BOM, mixed CR / LF / CRLF line endings, any Ruby-supported input encoding (via `encoding:`), and duplicate keys. The full list — with the human-JSON spec references it's drawn from — is kept in one place: [**What it accepts, beyond strict JSON**](../README.md#what-it-accepts-beyond-strict-json) in the README. It raises only on genuinely unreadable input (unterminated string, mismatched bracket), with line and column in the message — never on valid-but-lenient input. diff --git a/ext/smarter_json/smarter_json.c b/ext/smarter_json/smarter_json.c index 6ef8cf4..4660d99 100644 --- a/ext/smarter_json/smarter_json.c +++ b/ext/smarter_json/smarter_json.c @@ -654,9 +654,8 @@ static int fj_try_decimal(fj_state *st, const char *p, long n, VALUE *out) { * account / check numbers preserve their zeros. 
*/ if (i < n && p[i] == '0') { has_digit = 1; m10digits = 1; i++; - /* A leading '0' followed by more digits (possibly underscore-separated, like the - * [1-9] branch below) is a leading-zero token: consume the run and flag it, so - * 0_5.0 / 0_0.5 behave exactly like 05.0 / 00.5 on both paths. */ + /* Underscore-separated too (like the [1-9] branch below), so 0_5.0 / 0_0.5 behave + * exactly like 05.0 / 00.5 on both paths. */ if (i < n && ((p[i] >= '0' && p[i] <= '9') || p[i] == '_')) { for (;;) { while (i < n && p[i] >= '0' && p[i] <= '9') { @@ -782,9 +781,8 @@ static VALUE fj_parse_number(fj_state *st) { if (*p == '0') { m10digits = 1; /* one leading zero, counted as a single mantissa digit */ p++; - /* A leading '0' followed by more digits (possibly underscore-separated, like the - * [1-9] branch below) is a leading-zero token: consume the run and flag it, so the - * underscore is just a separator (0_0.5 behaves like 00.5). */ + /* Underscore-separated too (like the [1-9] branch below), so the underscore is just a + * separator (0_0.5 behaves like 00.5). */ if ((*p >= '0' && *p <= '9') || *p == '_') { for (;;) { while (*p >= '0' && *p <= '9') {