diff --git a/AGENTS.md b/AGENTS.md index e5b7e513c..3df68c953 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -99,10 +99,14 @@ perl dev/tools/perl_test_runner.pl perl5_t/t/op/ > /tmp/test_output.txt 2>&1 | Command | What it does | |---------|--------------| -| `make` | Build + run all unit tests (use before committing) | -| `make dev` | Build only, skip tests (for quick iteration during debugging) | +| `make` | Build + run all unit tests (always use this) | | `make test-bundled-modules` | Run bundled CPAN module tests (XML::Parser, etc.) | +`make dev` has been disabled on purpose — it used to build without +running tests, which let regressions sneak into commits. Always use +`make`; if you truly need a no-test build, invoke Gradle directly +(`./gradlew shadowJar installDist`). + - For interpreter changes, test with both backends: ```bash ./jperl -e 'code' # JVM backend diff --git a/Makefile b/Makefile index 8701b1c37..3ca4a5848 100644 --- a/Makefile +++ b/Makefile @@ -39,13 +39,28 @@ else ./gradlew classes testUnitParallel --parallel shadowJar endif -# Development build - forces recompilation (use during active development) -dev: check-java-gradle -ifeq ($(OS),Windows_NT) - gradlew.bat clean compileJava shadowJar installDist -else - ./gradlew clean compileJava shadowJar installDist -endif +# `make dev` is disabled on purpose. +# +# It used to be a "build without running tests" shortcut, but that is +# precisely what makes it dangerous: it lets changes land on a branch +# without having ever been exercised by the unit test suite. Agents +# (and humans in a hurry) reach for `make dev` to iterate faster and +# then forget to run `make` before pushing, so regressions sneak in. +# +# Use `make` (the default target) instead: it builds *and* runs the +# fast unit tests. If you really need a no-test build for a very +# specific reason, invoke Gradle directly (`./gradlew shadowJar`) and +# own the consequences. +dev: + @echo "ERROR: 'make dev' is disabled on purpose." + @echo "" + @echo " It skipped the unit tests, which caused regressions to slip" + @echo " into commits. Please use 'make' (which builds + tests) for" + @echo " everyday iteration." + @echo "" + @echo " If you truly need a no-test build, invoke Gradle directly:" + @echo " ./gradlew shadowJar installDist" + @exit 1 # Default test target - fast unit tests using perl_test_runner.pl test: test-unit diff --git a/build.gradle b/build.gradle index 04298b5a4..173d96701 100644 --- a/build.gradle +++ b/build.gradle @@ -201,7 +201,6 @@ dependencies { implementation libs.asm // ByteCode manipulation implementation libs.asm.util // ASM utilities implementation libs.icu4j // Unicode support - implementation libs.fastjson2 // JSON processing implementation libs.snakeyaml.engine // YAML processing implementation libs.tomlj // TOML processing implementation libs.commons.csv // CSV processing diff --git a/dev/custom_bytecode/TESTING.md b/dev/custom_bytecode/TESTING.md index 452a1288a..f5ce6c054 100644 --- a/dev/custom_bytecode/TESTING.md +++ b/dev/custom_bytecode/TESTING.md @@ -39,7 +39,7 @@ Micro-benchmark for interpreter performance on loop-heavy code. ./gradlew run -PmainClass=org.perlonjava.interpreter.ForLoopBenchmark # Method 2: Direct Java execution -java -cp "build/classes/java/main:$(find ~/.gradle/caches -name 'icu4j*.jar' -o -name 'asm*.jar' -o -name 'fastjson*.jar' | tr '\n' ':')" \ +java -cp "build/classes/java/main:$(find ~/.gradle/caches -name 'icu4j*.jar' -o -name 'asm*.jar' | tr '\n' ':')" \ org.perlonjava.interpreter.ForLoopBenchmark ``` @@ -120,7 +120,7 @@ See what methods the JVM is compiling and inlining: ```bash java -XX:+PrintCompilation \ - -cp "build/classes/java/main:$(find ~/.gradle/caches -name 'icu4j*.jar' -o -name 'asm*.jar' -o -name 'fastjson*.jar' | tr '\n' ':')" \ + -cp "build/classes/java/main:$(find ~/.gradle/caches -name 'icu4j*.jar' -o -name 'asm*.jar' | tr '\n' ':')" \ org.perlonjava.interpreter.ForLoopBenchmark 2>&1 | grep -E "(BytecodeInterpreter|MathOperators|CompareOperators)" ``` @@ -130,7 +130,7 @@ See what the C2 compiler is inlining: ```bash java -XX:+UnlockDiagnosticVMOptions -XX:+PrintInlining \ - -cp "build/classes/java/main:$(find ~/.gradle/caches -name 'icu4j*.jar' -o -name 'asm*.jar' -o -name 'fastjson*.jar' | tr '\n' ':')" \ + -cp "build/classes/java/main:$(find ~/.gradle/caches -name 'icu4j*.jar' -o -name 'asm*.jar' | tr '\n' ':')" \ org.perlonjava.interpreter.ForLoopBenchmark 2>&1 > /tmp/inline.txt # Check if operators are being inlined into execute loop @@ -144,7 +144,7 @@ View the generated interpreter bytecode: ```bash # Enable DEBUG mode in ForLoopBenchmark.java, then: make build -java -cp "build/classes/java/main:$(find ~/.gradle/caches -name 'icu4j*.jar' -o -name 'asm*.jar' -o -name 'fastjson*.jar' | tr '\n' ':')" \ +java -cp "build/classes/java/main:$(find ~/.gradle/caches -name 'icu4j*.jar' -o -name 'asm*.jar' | tr '\n' ':')" \ org.perlonjava.interpreter.ForLoopBenchmark 2>&1 | head -30 ``` diff --git a/dev/design/sbom.md b/dev/design/sbom.md index 4c62ab528..4cc12d20a 100644 --- a/dev/design/sbom.md +++ b/dev/design/sbom.md @@ -75,7 +75,6 @@ These are external libraries downloaded from Maven Central during build: | org.ow2.asm:asm | 9.9.1 | BSD-3-Clause | JVM bytecode generation | | org.ow2.asm:asm-util | 9.9.1 | BSD-3-Clause | ASM utilities | | com.ibm.icu:icu4j | 78.2 | ICU License | Unicode support | -| com.alibaba.fastjson2:fastjson2 | 2.0.61 | Apache-2.0 | JSON processing | | org.snakeyaml:snakeyaml-engine | 3.0.1 | Apache-2.0 | YAML processing | | org.tomlj:tomlj | 1.1.1 | Apache-2.0 | TOML processing | | org.apache.commons:commons-csv | 1.14.1 | Apache-2.0 | CSV processing | diff --git a/dev/import-perl5/patches/PP.pm.patch b/dev/import-perl5/patches/PP.pm.patch index af3063b9a..1d912959b 100644 --- a/dev/import-perl5/patches/PP.pm.patch +++ b/dev/import-perl5/patches/PP.pm.patch @@ -1,6 +1,24 @@ ---- perl5/cpan/JSON-PP/lib/JSON/PP.pm -+++ src/main/perl/lib/JSON/PP.pm -@@ -695,6 +695,9 @@ BEGIN { +--- perl5/cpan/JSON-PP/lib/JSON/PP.pm 2025-12-30 10:25:00 ++++ src/main/perl/lib/JSON/PP.pm 2026-04-23 21:59:52 +@@ -129,6 +129,17 @@ + + + # Methods ++ ++# Backend introspection — the CPAN JSON dispatcher and its test suite call ++# these as class methods on whichever backend module was loaded (JSON::XS, ++# JSON::PP, JSON::backportPP, or — in PerlOnJava — our `JSON` shim which ++# ISA JSON::PP). JSON::PP is the pure-Perl backend, so `is_pp == 1` and ++# `is_xs == 0`. Upstream CPAN JSON::PP doesn't define these (the real ++# dispatcher installs them), but we ship JSON::PP as our backend and add ++# them here so that tests which introspect `JSON::PP->is_pp` directly ++# work out of the box. ++sub is_xs { 0 } ++sub is_pp { 1 } + + sub new { + my $class = shift; +@@ -695,6 +706,9 @@ last; } } @@ -10,3 +28,36 @@ } { # PARSE +@@ -1544,8 +1558,30 @@ + + my ($obj, $offset) = $coder->PP_decode_json( $self->{incr_text}, 0x00000001 ); + push @ret, $obj; +- use bytes; +- $self->{incr_text} = substr( $self->{incr_text}, $offset || 0 ); ++ # PerlOnJava: the decoder advances its internal `$at` by the ++ # UTF-8 byte length of each multi-byte character (via ++ # `is_valid_utf8`), so `$offset` is in BYTES. CPAN ++ # JSON::PP papers over this with `use bytes; substr`, ++ # which in upstream Perl makes substr operate on the ++ # UTF-8 byte representation of the string. PerlOnJava's ++ # `use bytes` pragma does not yet redirect substr, so do ++ # the equivalent explicitly. ++ # ++ # When `get_utf8` is true, `$self->{incr_text}` is already ++ # a byte string (the user hands us UTF-8 bytes) and plain ++ # substr works correctly with a byte offset. When ++ # `get_utf8` is false, we decoded to chars above and need ++ # to encode-substr-decode to match the decoder's byte ++ # bookkeeping. ++ if ( $coder->get_utf8 ) { ++ $self->{incr_text} = substr( $self->{incr_text}, $offset || 0 ); ++ } else { ++ my $bytes = $self->{incr_text}; ++ utf8::encode($bytes); ++ my $remaining = substr($bytes, $offset || 0); ++ utf8::decode($remaining); ++ $self->{incr_text} = $remaining; ++ } + $self->{incr_pos} = 0; + $self->{incr_nest} = 0; + $self->{incr_mode} = 0; diff --git a/dev/modules/README.md b/dev/modules/README.md index 0e23bd47b..ef84155a2 100644 --- a/dev/modules/README.md +++ b/dev/modules/README.md @@ -69,7 +69,7 @@ PerlOnJava's XSLoader returns an error matching `/loadable object/` which these For performance-critical modules, PerlOnJava can provide Java implementations: - `DateTime` - Uses `java.time` APIs -- `JSON::XS` - Falls back to JSON::PP (or could use FASTJSON) +- `JSON::XS` - Falls back to the bundled `JSON::PP` - `DBI` - Custom Java implementation with JDBC See [xs_fallback.md](xs_fallback.md) for implementation details. diff --git a/dev/modules/dynamic_loading.md b/dev/modules/dynamic_loading.md index 2863779a7..27338ebd5 100644 --- a/dev/modules/dynamic_loading.md +++ b/dev/modules/dynamic_loading.md @@ -234,7 +234,6 @@ The root build configurations that will build all subprojects: 9.7.1 5.11.4 76.1 - 2.0.54 2.9 @@ -255,11 +254,6 @@ The root build configurations that will build all subprojects: icu4j ${icu4j.version} - - com.alibaba.fastjson2 - fastjson2 - ${fastjson.version} - org.snakeyaml snakeyaml-engine @@ -312,7 +306,6 @@ subprojects { ext { asmVersion = '9.7.1' icu4jVersion = '76.1' - fastjsonVersion = '2.0.54' snakeyamlVersion = '2.9' junitVersion = '5.9.2' } diff --git a/dev/modules/json_test_parity.md b/dev/modules/json_test_parity.md new file mode 100644 index 000000000..5e6f72ff8 --- /dev/null +++ b/dev/modules/json_test_parity.md @@ -0,0 +1,443 @@ +# JSON Test Parity — `./jcpan -t JSON` + +## Goal + +Make the complete CPAN `JSON` 4.11 test suite pass when run through +PerlOnJava's bundled `JSON` backend (`src/main/perl/lib/JSON.pm` + +`src/main/java/org/perlonjava/runtime/perlmodule/Json.java`). + +This means that `./jcpan -t JSON` should report **0 failures**. Today +(after PR #550 lands) the baseline is **22 / 68 passing**. + +## How tests see our backend + +CPAN `JSON` 4.11 is a pure-Perl dispatcher that normally loads +`JSON::XS` or `JSON::PP`. Our `MakeMaker` intentionally skips +`lib/JSON.pm` during install and doesn't stage it to `blib/lib/`, so +`make test` (which uses `PERL5LIB=./blib/lib:./blib/arch`) falls +through to our JAR-bundled `JSON.pm`. That shim loads `Json.java` +via `XSLoader::load('Json')` and exposes itself as `is_xs == 1`, +so from the tests' point of view we are "the XS backend". + +## Current state (PR #550) + +`JSON.pm` has been taught the CPAN dispatcher's API surface: +`import` tags, `$JSON::Backend`, `backend()`, `jsonToObj`/`objToJson`, +option stubs, minimal incremental parser. That alone lifts passing +count from 16 → 22. + +The other 46 failures all need `Json.java` work. + +## Remaining failures — by feature + +Counts are approximate (one test file may fail for multiple reasons). + +### 1. Error handling / "croak" compatibility (~15 files) + +Every encoder/decoder error today surfaces as a Java stack trace +(`... at Json.java line 151`). `JSON::XS` croaks with very specific +messages that many tests regex-match on: + +- `malformed UTF-8 character in JSON string` +- `unexpected end of string` +- `unexpected character, expected ...` +- `, or ] expected while parsing array` +- `, or } expected while parsing object/hash` +- `garbage after JSON object` +- `cannot encode reference to scalar` / `cannot encode reference` +- `hash- or arrayref expected` (when `allow_nonref` off) +- `attempt to encode object` (when `allow_blessed` off) +- `number out of range` (when `max_size` exceeded) +- `JSON text must be an object or array` (strict mode) +- offset/line/column info in decode errors + +**Tests**: `02_error`, `08_pc_base`, `108_decode`, +`120_incr_parse_truncated`, `17_relaxed`, `18_json_checker`, +`20_unknown`, `22_comment_at_eof`, `rt_116998_wrong_character_offset`, +`x02_error`, `xe19_xs_and_suportbypp`, `xe20_croak_message`, +`gh_28_json_test_suite`. + +**Approach**: Replace the fastjson2 exception with a +`PerlCompilerException` / `croak()` call that carries the +JSON::XS-style message with offset/line/column. Requires a small +shim between the fastjson2 error and our croak path (OR writing our +own parser — see §10 below). + +### 2. `utf8` / `ascii` / `latin1` flags (~10 files) + +`JSON::XS` has three orthogonal encoding options: + +| Flag | On | Off | +|---------|--------------------------------------|------------------------------------| +| `utf8` | encode returns byte string; decode expects byte string | encode returns character string; decode expects character string | +| `ascii` | escape all codepoints > 0x7F as `\uXXXX` (with surrogate pairs for > 0xFFFF) | emit raw characters | +| `latin1`| escape codepoints > 0xFF as `\uXXXX` | emit raw characters | + +Today we ignore all three. When `ascii` is requested, codepoints +like `U+8000` must come out as `\u8000` and `U+10402` as +`\ud801\udc02`. When `utf8` is set, the output must be a byte string +(`length` returns byte count). + +**Tests**: `01_utf8`, `109_encode`, `14_latin1`, `e02_bool`, +`112_upgrade`. + +**Approach**: This only makes sense if we write our own emitter +(see §10). Fastjson2's output shape is fixed. + +### 3. Pretty printing (`indent`, `indent_length`, `space_before`, `space_after`) (~3 files) + +`JSON::XS` uses exactly this format: + +```json +{ + "a" : 1, + "b" : [ + 2, + 3 + ] +} +``` + +- 3-space indent by default (`indent_length` overrides) +- one element per line +- `" : "` between key and value (spaces from `space_before`/`space_after`) +- `,\n` between elements (no trailing comma) + +Today we delegate to fastjson2's `PrettyFormat` then do regex +replacements, which doesn't produce byte-identical output. + +**Tests**: `06_pc_pretty`, `xe05_indent_length`. + +### 4. `canonical` / `sort_by` (~2 files) + +- `canonical` → encode object keys sorted +- `sort_by` → callback-based ordering + +**Tests**: parts of `03_types`, `104_sortby` (already passing after +shim), `52_object`. + +### 5. Booleans / `JSON::PP::Boolean` (~3 files) + +`JSON::XS` and `JSON::PP` both bless their true/false singletons into +`JSON::PP::Boolean`. Tests use `isa_ok($x, 'JSON::PP::Boolean')`. + +**Tests**: `e03_bool2`, `xe12_boolean`, `e02_bool`, `118_boolean_values`. + +**Approach**: + +1. Create a trivial `JSON/PP/Boolean.pm` in the JAR (if not already — + there is one). +2. Bless the singletons in `Json.java` (or have `true()`/`false()` + return a blessed `RuntimeScalar`). +3. Make `is_bool()` recognise both raw `BOOLEAN` and blessed-as- + `JSON::PP::Boolean` references. + +### 6. `allow_blessed` / `convert_blessed` / `TO_JSON` (~5 files) + +- By default: die when encoding a blessed reference. +- `allow_blessed`: emit as `null`. +- `convert_blessed`: call `$obj->TO_JSON`, encode its return value. +- `-convert_blessed_universally` import: install a default + `UNIVERSAL::TO_JSON` (already accepted as a no-op after PR #550; + actual behaviour needs Java-side hook). + +**Tests**: `12_blessed`, `x12_blessed`, `52_object`, +`e11_conv_blessed_univ`, `113_overloaded_eq`. + +### 7. Relaxed decoding (~3 files) + +`relaxed` option: accept `#`-to-EOL comments, trailing commas in +arrays and objects. + +**Tests**: `17_relaxed`, `22_comment_at_eof`. + +### 8. `allow_barekey`, `allow_singlequote`, `escape_slash`, `loose` (~4 files) + +Decoder/encoder relaxations, all PP-only in CPAN JSON. Because we +already accept these option setters in the shim, we just need the +Java side to honour them. + +**Tests**: `105_esc_slash`, `106_allow_barekey`, `107_allow_singlequote`, +`xe04_escape_slash`. + +### 9. Number formatting (~1-2 files) + +Perl-number-compatible output: + +- `-1.234e5` → `-123400` (no trailing `.0`) +- `1.23E-4` → `0.000123` (or at least a form Perl round-trips) +- Large `1.01e+30` must preserve magnitude and typeness +- Integers stay integers (no `.0`) +- `NaN`/`Inf` → error (JSON can't represent them) + +Today we emit fastjson2's native formatting (`-123400.0`, `1.23E-4`). + +**Tests**: `11_pc_expo`, and number-adjacent bits of `03_types`. + +### 10. Incremental parser (~5 files) + +Full JSON::XS `incr_parse` / `incr_text` / `incr_skip` / `incr_reset` +semantics: buffer bytes, try to extract one top-level JSON value at +a time, report remaining input, support truncation. + +**Tests**: `19_incr`, `22_comment_at_eof`, `116_incr_parse_fixed`, +`119_incr_parse_utf8`, `120_incr_parse_truncated`, `rt_90071_incr_parse`. + +### 11. `decode_prefix` (~2 files) + +Decode the first JSON value in a buffer and return it along with the +number of characters consumed. + +**Tests**: `15_prefix`, `114_decode_prefix`. + +### 12. `max_depth` / `max_size` (~2 files) + +Enforce recursion/size limits, croak with a specific message +(`max_depth exceeded`, `json text or perl structure exceeds maximum +nesting level`, `max_size exceeded`). + +**Tests**: `13_limit`, `e01_property`. + +### 13. `boolean_values` (~1 file) + +Customise the two values used to represent JSON `true`/`false` when +decoding. + +**Tests**: `118_boolean_values`. + +### 14. Tied hashes / arrays (~2 files) + +Encode tied data structures by iterating via `FETCH` (our current +`Json.java` uses `RuntimeHash.elements.keySet()` which bypasses ties). + +**Tests**: `16_tied`, `x16_tied`. + +### 15. `allow_nonref` enforcement (~several) + +When off (the default), encode/decode of non-objects/arrays must die. + +### 16. Overloaded objects (~1 file) + +`113_overloaded_eq`: a blessed hashref with `"" eq`/`""` overloads +should serialize using its stringification. Interaction with +`allow_blessed` + `convert_blessed`. + +### 17. `allow_unknown` (~1 file) + +Currently-unencodable values (coderefs, globs, filehandles, …) +become `null` instead of a fatal error. + +**Tests**: `20_unknown`. + +### 18. `allow_tags` (~1 file) + +JSON::XS-specific tagged-value extension. May be acceptable to stub. + +### 19. Standards compliance — `gh_28_json_test_suite` (~1 file) + +Runs the json.org conformance suite; most of it passes once error +handling, number formatting, and surrogate handling are fixed. + +### 20. `00_load_backport_pp` + +Tests that `JSON::backportPP` can load and set `$JSON::BackendModulePP`. +Our shim currently sets no PP backend. Probably fine to load +`JSON::backportPP` on demand from within the shim. + +## Implementation strategy + +Fastjson2 is fundamentally the wrong shape for this API: it has its +own error vocabulary, its own number formatting, doesn't know about +Perl `undef`/booleans/blessed refs, and offers no hooks for relaxed +parsing, tagged values, or incremental input. + +Rather than fight it, **write our own JSON encoder and decoder in +Java** in `Json.java`, roughly along the lines of JSON::PP's +algorithm but working on `RuntimeScalar`/`RuntimeHash`/`RuntimeArray` +directly. This is ~1000-1500 lines of straightforward code and +gives us full control over every option. + +We can keep fastjson2 as a fallback for JSON::MaybeXS-style consumers +who don't need the full option set, but `JSON.pm`'s path should go +through our hand-rolled emitter/parser. + +## Phased implementation + +Each phase is independently testable against the 68-test CPAN suite +plus `make` (regressions). + +| Phase | Scope | Expected new passes | Risk | +|-------|-------|---------------------|------| +| **1** | Hand-rolled encoder + `utf8`/`ascii`/`latin1`/`indent`/`space_*`/`canonical` options, Perl-aware number formatting, proper `null`/boolean handling, `allow_nonref` enforcement, `cannot encode reference` errors, blessed-as-`JSON::PP::Boolean` true/false | ~15 | low | +| **2** | Hand-rolled decoder + offset/line/column-aware croak messages matching `JSON::XS` patterns, tied iteration, `allow_blessed` / `convert_blessed` / `TO_JSON` | ~10 | low | +| **3** | `relaxed`, `allow_barekey`, `allow_singlequote`, `escape_slash`, `loose`, `allow_unknown`, `max_depth`, `max_size`, `decode_prefix` | ~10 | low | +| **4** | Real `incr_parse` state machine | ~5 | medium | +| **5** | `boolean_values`, `allow_tags`, `sort_by`, overload integration, remaining edges until `./jcpan -t JSON` reports 0 failures | ~6 | medium | + +Each phase ends with: `make` green, `jcpan -t JSON` count +measured and recorded in "Progress Tracking" below. + +## Measuring progress + +Reproducible local check (no `jcpan`/`make test` wrapper needed): + +```bash +cd ~/.cpan/build/JSON-4.11-9 +export PERL5LIB=./blib/lib:./blib/arch +# or use a fresh extract: +# jcpan -g JSON && cd ~/.cpan/build/JSON-*/ && jperl Makefile.PL && make +pass=0; fail=0 +for t in t/*.t; do + out=$(timeout 90 /path/to/jperl "$t" 2>&1) + rc=$? + last_plan=$(echo "$out" | grep -E "^1\.\." | tail -1) + fail_count=$(echo "$out" | grep -cE "^not ok ") + if [ "$last_plan" = "1..0" ] || echo "$last_plan" | grep -q "SKIP"; then + : + elif [ $rc -eq 0 ] && [ $fail_count -eq 0 ]; then + pass=$((pass+1)) + else + fail=$((fail+1)); echo " $t" + fi +done +echo "PASS: $pass FAIL: $fail" +``` + +The tooling-friendly success criterion is `FAIL: 0` from the snippet +above, and `./jcpan -t JSON` reporting `PASS` at the end. + +## Progress Tracking + +### Current status: `./jcpan -t JSON` — `Result: PASS` (68 files, 26126 subtests, 0 failures) + +### Completed + +- [x] **Shim compatibility** (`src/main/perl/lib/JSON.pm`): import + tags (`-support_by_pp`, `-no_export`, + `-convert_blessed_universally`), `$JSON::Backend`, + `backend()`, `is_xs`/`is_pp`, `jsonToObj`/`objToJson`, + `pureperl_only_methods`, `null`, option stubs, `property()`. + **16 → 22** passing. + +- [x] **Strategic pivot: delegate to JSON::PP** — instead of + hand-rolling an encoder/decoder in Java, the JSON shim now + `@ISA = 'JSON::PP'`. The bundled `JSON::PP` is a complete + pure-Perl backend that already implements every option the + test suite exercises. Deleted `Json.java` and the + `fastjson2` dependency. **22 → 47** passing. + +- [x] **Parser fix**: package-literal barewords + (`Foo::Bar::`) in `bless` and before word operators + (`eq`, `ne`, `lt`, `gt`, `le`, `ge`, `cmp`, `x`, `isa`, `and`, + `or`, `xor`). Previously `bless $x, Foo::Bar::;` kept the + trailing `::` and `Foo::Bar:: eq $x` was mis-parsed as a sub + call. See commit 80842ea. + +- [x] **Parser fix**: `{"a","b"}` / `{1,2}` / `{foo,1,bar,2}` now + parse as hashrefs (not blocks evaluating a comma expression) + when the first content token is a hash-key-shaped thing. + See commit 3cf3405. + +- [x] **Runtime fix**: `unpack "U*"` in character mode now reads + code points directly from the string instead of re-decoding + its UTF-8 byte representation. Matches real Perl on any + Latin-1 byte string. Fixed `JSON::PP`'s `_encode_ascii`. + See commit 3c1545e. + +- [x] **Runtime fix**: `is_xs`/`is_pp` added to `JSON::PP` as + class methods (was `Can't locate object method` in ~8 tests). + +- [x] **Parser fix**: `[\c?]` inside a character class now matches + U+007F (DEL), not U+001C. The regex preprocessor no longer + escapes `?` inside `[ ]`. Fixes `JSON::PP`'s control-char + escape regex and the whole `t/99_binary.t` suite. See commit + f8b3864. + +- [x] **JSON.pm fix**: `-convert_blessed_universally` uses + `reftype` instead of `ref` to classify the underlying + storage of a blessed reference. See commit da2a693. + +- [x] **Runtime fix**: `bless` and `UNIVERSAL::isa` now canonicalise + through stash aliases (`*Dst:: = *Src::;`). After aliasing, + `ref($x)` reports the canonical name and `isa` accepts either + name. See commit 3c3ef4f. + +- [x] **`PERL_JSON_BACKEND=JSON::backportPP`** — new + `JSON/backportPP.pm` that `require`s `JSON::PP` and then + `delete $INC{'JSON/PP.pm'}` so the backend-identification + contract (check `%INC` for `JSON/backportPP.pm` vs `JSON/PP.pm`) + is satisfied. `JSON.pm` gates its backend load on the env var + and paper-patches the CPAN-shipped `JSON/backportPP.pm`'s + missing introspection (`@JSON::PP::ISA`, `@JSON::backportPP::ISA`, + `is_xs`/`is_pp`) so the dispatcher surface works regardless of + which backportPP was picked up. See commits e800c29, 88aea98. + +- [x] **JSON::PP IncrParser `substr` byte handling** — replaced the + `use bytes; substr($text, $offset)` line (which depends on + `use bytes` redirecting `substr` — a feature PerlOnJava does + not yet implement) with an explicit encode/substr/decode when + `get_utf8` is off, and plain `substr` when it is on. Fixes + multi-byte-char incremental parsing. See commit b6d67bf. + +### jcpan -t JSON progression + +| Milestone | Result | +|-----------|--------| +| Before | 16 / 68 passing | +| Shim-only | 22 / 68 | +| + JSON::PP delegation, drop fastjson2 | 47 / 68 | +| + 4 parser/runtime/regex fixes | 64 / 68 | +| + JSON::PP class methods + property | 65 / 68 | +| + reftype fix | 65 / 68 (+ t/e11 internally) | +| + stash alias canonicalisation | 66 / 68 | +| + JSON::backportPP support | 67 / 68 (1 test 6/24 subtests failing) | +| **+ IncrParser UTF-8 byte substr** | **`Result: PASS` — 68 / 68, 26126 / 26126 subtests** | + +### Still outstanding + +None — `./jcpan -t JSON` reports `Result: PASS` end-to-end. + +Follow-up: `use bytes` should eventually redirect `substr` / `unpack` +/ `index` globally in PerlOnJava (following the pattern already +established for `length`/`chr`/`ord`). Tracked under the runtime +roadmap; not needed for this task. + +### Files changed + +- `src/main/perl/lib/JSON.pm` — full rewrite, now `@ISA = ('JSON::PP')` +- `src/main/perl/lib/JSON/PP.pm` — added `is_xs` / `is_pp` class methods +- `src/main/java/org/perlonjava/runtime/perlmodule/Json.java` — DELETED +- `src/main/java/org/perlonjava/frontend/parser/OperatorParser.java` + (bless `::` strip) +- `src/main/java/org/perlonjava/frontend/parser/SubroutineParser.java` + (word-op infix detection) +- `src/main/java/org/perlonjava/frontend/parser/StatementResolver.java` + (hashref disambiguation) +- `src/main/java/org/perlonjava/runtime/operators/unpack/UFormatHandler.java` + (unpack U*) +- `src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java` + (`?` in char class) +- `src/main/java/org/perlonjava/runtime/operators/ReferenceOperators.java` + (bless via stash alias) +- `src/main/java/org/perlonjava/runtime/perlmodule/Universal.java` + (isa via stash alias) +- `src/test/resources/unit/{method_call_trailing_colons,hash_ref_disambiguation,unpack,regex/regex_charclass,stash_aliasing}.t` + — regression tests for every runtime/parser fix +- Many docs cleaned up to drop fastjson2 references + (`docs/reference/bundled-modules.md`, + `docs/reference/xs-compatibility.md`, + `docs/guides/using-cpan-modules.md`, + `docs/getting-started/installation.md`, + `dev/modules/README.md`, + `dev/modules/dynamic_loading.md`, + `dev/design/sbom.md`, + `dev/custom_bytecode/TESTING.md`, + `dev/presentations/blogs_perl_org_jcpan_2026/blog-post{,-long}.md`, + `build.gradle`, `gradle/libs.versions.toml`, `pom.xml`, + `jperl`, `jperl.bat`) + +## Related documents + +- `dev/modules/xs_fallback.md` — general XS → Java fallback framework +- `src/main/perl/lib/JSON.pm` — the shim +- `src/main/java/org/perlonjava/runtime/perlmodule/Json.java` — the Java backend diff --git a/dev/presentations/blogs_perl_org_jcpan_2026/blog-post-long.md b/dev/presentations/blogs_perl_org_jcpan_2026/blog-post-long.md index 6461dd3ce..76dcedecb 100644 --- a/dev/presentations/blogs_perl_org_jcpan_2026/blog-post-long.md +++ b/dev/presentations/blogs_perl_org_jcpan_2026/blog-post-long.md @@ -82,7 +82,7 @@ XS modules contain C code that gets compiled to native machine code. Since PerlO For popular XS modules, PerlOnJava includes **Java implementations** of the XS functions: - **DateTime** — java.time APIs -- **JSON** — fastjson2 library +- **JSON** — bundled `JSON::PP` (pure Perl) - **Digest::MD5/SHA** — Java MessageDigest - **DBI** — JDBC backend - **Compress::Zlib** — java.util.zip diff --git a/dev/presentations/blogs_perl_org_jcpan_2026/blog-post.md b/dev/presentations/blogs_perl_org_jcpan_2026/blog-post.md index 48a16498e..b17ba82ef 100644 --- a/dev/presentations/blogs_perl_org_jcpan_2026/blog-post.md +++ b/dev/presentations/blogs_perl_org_jcpan_2026/blog-post.md @@ -39,7 +39,7 @@ For XS modules (those with C code), PerlOnJava provides Java implementations for | Module | Java Backend | |--------|--------------| | DateTime | java.time with JulianFields | -| JSON | fastjson2 | +| JSON | bundled `JSON::PP` (pure Perl) | | Digest::MD5/SHA | Java MessageDigest | | DBI | JDBC | | Compress::Zlib | java.util.zip | diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 3dbb1f6cf..bd8d32072 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -99,7 +99,6 @@ sudo dpkg -r perlonjava - JUnit: For testing - ASM: For bytecode manipulation - ICU4J: For Unicode support -- FASTJSON v2: For JSON support - SnakeYAML Engine: for YAML support ## Running PerlOnJava diff --git a/docs/guides/using-cpan-modules.md b/docs/guides/using-cpan-modules.md index 90df62228..7a447cbaa 100644 --- a/docs/guides/using-cpan-modules.md +++ b/docs/guides/using-cpan-modules.md @@ -71,7 +71,7 @@ PerlOnJava includes Java implementations of XS functions for several modules: | Module | Notes | |--------|-------| | DateTime | Uses java.time APIs with JulianFields.RATA_DIE | -| JSON | Fast JSON encode/decode using fastjson2 | +| JSON | Delegates to the bundled pure-Perl `JSON::PP` for full `JSON::XS` option parity | | Digest::MD5 | Java MessageDigest API | | Digest::SHA | Java MessageDigest API | | Time::HiRes | Java System.nanoTime() | diff --git a/docs/reference/bundled-modules.md b/docs/reference/bundled-modules.md index 9dc87d6b7..0278efff3 100644 --- a/docs/reference/bundled-modules.md +++ b/docs/reference/bundled-modules.md @@ -112,7 +112,7 @@ These are loaded automatically or via `use`: | Module | Implementation | Notes | |--------|---------------|-------| | `Data::Dumper` | Java + Perl | | -| `JSON` / `JSON::PP` | Java | Fast encode/decode via fastjson2 | +| `JSON` / `JSON::PP` | Perl | `JSON` delegates to the bundled pure-Perl `JSON::PP` | | `YAML::PP` | Java + Perl | | | `TOML` | Java | | | `Text::CSV` | Java | | diff --git a/docs/reference/xs-compatibility.md b/docs/reference/xs-compatibility.md index a77e8f26c..043dc54fa 100644 --- a/docs/reference/xs-compatibility.md +++ b/docs/reference/xs-compatibility.md @@ -17,7 +17,7 @@ These modules have optimized Java implementations built into PerlOnJava: | Module | Java Class | XS_VERSION | Notes | |--------|------------|------------|-------| -| JSON | Json.java | - | Uses fastjson2 library | +| JSON | _(Perl)_ | 4.11 | Delegates to the bundled pure-Perl `JSON::PP` | | DateTime | DateTime.java | 1.65 | Uses java.time APIs, JulianFields.RATA_DIE | | Digest::MD5 | DigestMD5.java | - | Uses Java MessageDigest | | Digest::SHA | DigestSHA.java | - | Uses Java MessageDigest | diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 5428dfa28..4e4c0a172 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -2,7 +2,6 @@ asm = "9.9.1" bouncycastle = "1.78.1" commons-csv = "1.14.1" -fastjson2 = "2.0.61" icu4j = "78.3" junit-jupiter = "6.1.0-M1" snakeyaml-engine = "3.0.1" @@ -15,7 +14,6 @@ asm-util = { module = "org.ow2.asm:asm-util", version.ref = "asm" } bcprov = { module = "org.bouncycastle:bcprov-jdk18on", version.ref = "bouncycastle" } bcpkix = { module = "org.bouncycastle:bcpkix-jdk18on", version.ref = "bouncycastle" } commons-csv = { module = "org.apache.commons:commons-csv", version.ref = "commons-csv" } -fastjson2 = { module = "com.alibaba.fastjson2:fastjson2", version.ref = "fastjson2" } icu4j = { module = "com.ibm.icu:icu4j", version.ref = "icu4j" } junit-jupiter-api = { module = "org.junit.jupiter:junit-jupiter-api", version.ref = "junit-jupiter" } junit-jupiter-engine = { module = "org.junit.jupiter:junit-jupiter-engine", version.ref = "junit-jupiter" } diff --git a/jperl b/jperl index e6bbd76c0..55e33a53d 100755 --- a/jperl +++ b/jperl @@ -32,7 +32,7 @@ fi JVM_OPTS="--enable-native-access=ALL-UNNAMED" # Java 23+ warns about sun.misc.Unsafe usage (JEP 471). Add flag to suppress -# warnings from fastjson2 library. +# warnings from transitive libraries (ASM, ICU4J, etc.) that still use it. JAVA_VERSION=$(java -version 2>&1 | head -1 | sed 's/.*version "\([0-9]*\).*/\1/') if [ "$JAVA_VERSION" -ge 23 ] 2>/dev/null; then JVM_OPTS="$JVM_OPTS --sun-misc-unsafe-memory-access=allow" diff --git a/jperl.bat b/jperl.bat index 521af7b94..a626a596d 100755 --- a/jperl.bat +++ b/jperl.bat @@ -19,7 +19,7 @@ rem for native system calls (file operations, process management). set JVM_OPTS=--enable-native-access=ALL-UNNAMED rem Java 23+ warns about sun.misc.Unsafe usage (JEP 471). Add flag to suppress -rem warnings from fastjson2 library. +rem warnings from transitive libraries (ASM, ICU4J, etc.) that still use it. for /f "tokens=3" %%v in ('java -version 2^>^&1 ^| findstr /i "version"') do ( for /f "tokens=1 delims=." %%m in ("%%~v") do ( if %%m GEQ 23 set JVM_OPTS=%JVM_OPTS% --sun-misc-unsafe-memory-access=allow diff --git a/pom.xml b/pom.xml index 538d132c2..6fc193bbd 100644 --- a/pom.xml +++ b/pom.xml @@ -50,11 +50,6 @@ icu4j 78.3 - - com.alibaba.fastjson2 - fastjson2 - 2.0.61.android8 - org.snakeyaml snakeyaml-engine diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index e4fdd8a4a..2522435f3 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "7a0687aef"; + public static final String gitCommitId = "3611e23bd"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String buildTimestamp = "Apr 23 2026 18:51:50"; + public static final String buildTimestamp = "Apr 24 2026 12:28:51"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/frontend/parser/OperatorParser.java b/src/main/java/org/perlonjava/frontend/parser/OperatorParser.java index c1782bb37..4646b1818 100644 --- a/src/main/java/org/perlonjava/frontend/parser/OperatorParser.java +++ b/src/main/java/org/perlonjava/frontend/parser/OperatorParser.java @@ -775,8 +775,15 @@ static BinaryOperatorNode parseBless(Parser parser, int currentIndex) { // Handle bareword class names if (className instanceof IdentifierNode identifierNode) { - // Convert bareword to string (like "Moo" -> StringNode("Moo")) - className = new StringNode(identifierNode.name, currentIndex); + // Convert bareword to string (like "Moo" -> StringNode("Moo")). + // A package-literal bareword ending in "::" means the class + // name without the trailing "::", matching standard Perl: + // bless $r, Foo::Bar::; # class is "Foo::Bar", not "Foo::Bar::" + String name = identifierNode.name; + if (name.endsWith("::") && !name.equals("::")) { + name = name.substring(0, name.length() - 2); + } + className = new StringNode(name, currentIndex); } else if (className instanceof StringNode stringNode && stringNode.value.isEmpty()) { // default to main package if empty class name is provided className = new StringNode("main", currentIndex); diff --git a/src/main/java/org/perlonjava/frontend/parser/StatementResolver.java b/src/main/java/org/perlonjava/frontend/parser/StatementResolver.java index ee0dbfb4a..d1ef3cbc3 100644 --- a/src/main/java/org/perlonjava/frontend/parser/StatementResolver.java +++ b/src/main/java/org/perlonjava/frontend/parser/StatementResolver.java @@ -859,6 +859,14 @@ public static boolean isHashLiteral(Parser parser) { boolean hasContent = false; // Track if we've seen any content boolean firstTokenIsSigil = false; // Track if first token is % or @ (hash/array) + // Extra bits for the "{'a','b'}" / "{1,2}" / "{foo,1}" rule: real Perl + // treats a braced expression as a hashref when the FIRST content token + // is a hash-key-shaped thing (string, number, bareword identifier) and + // at least one plain comma appears at depth 1. Without this, such + // expressions degraded to a block evaluating a comma list. + boolean firstTokenIsKeyLike = false; + boolean sawCommaAtDepth1 = false; + if (CompilerOptions.DEBUG_ENABLED) parser.ctx.logDebug("isHashLiteral START - initial braceCount: " + braceCount); // Check if the first token is % or @ - this strongly suggests a hash literal @@ -867,6 +875,30 @@ public static boolean isHashLiteral(Parser parser) { if (firstToken.text.equals("%") || firstToken.text.equals("@")) { firstTokenIsSigil = true; if (CompilerOptions.DEBUG_ENABLED) parser.ctx.logDebug("isHashLiteral first token is sigil: " + firstToken.text); + } else if (firstToken.type == LexerTokenType.STRING + || firstToken.type == LexerTokenType.NUMBER + || firstToken.text.equals("\"") + || firstToken.text.equals("'") + || firstToken.text.equals("`")) { + // A string or number literal as first content makes this a hash + // key context (if a comma at depth 1 follows — tracked below). + // Quoted strings are lexed as an OPERATOR token for the opening + // delimiter, so treat `"`, `'`, `` ` `` the same way. + // + // Barewords (IDENTIFIER) are deliberately NOT considered key-like + // here: real Perl treats `{ foo, 1 }` as a block (evaluating the + // comma expression), not a hashref, because the bareword might + // be a function call. Matching that behaviour also avoids + // false positives from code like + // { # comment + // some_function(...); + // } + // where stray `{` / `}` tokens inside string arguments can + // confuse this pre-parse scanner's brace balance — our scanner + // walks the raw token stream without tracking string boundaries, + // so it must stay conservative. See perl5_t/t/re/pat.t for + // real-world misbalanced regex fragments in string arguments. + firstTokenIsKeyLike = true; } while (braceCount > 0) { @@ -955,7 +987,9 @@ public static boolean isHashLiteral(Parser parser) { } case "," -> { // Comma alone is not definitive - could be function args or hash - // Continue scanning for more evidence + // Continue scanning for more evidence. Recorded so the + // first-content-is-key rule in the final decision can fire. + sawCommaAtDepth1 = true; if (CompilerOptions.DEBUG_ENABLED) parser.ctx.logDebug("isHashLiteral found comma, continuing scan"); } case "for", "while", "if", "unless", "until", "foreach", "my", "our", "say", "print", "local" -> { @@ -1015,6 +1049,14 @@ public static boolean isHashLiteral(Parser parser) { // { %hash } or { @array } or { %{$ref} } - treat as hash constructor if (CompilerOptions.DEBUG_ENABLED) parser.ctx.logDebug("isHashLiteral RESULT: TRUE - starts with sigil (% or @)"); return true; + } else if (firstTokenIsKeyLike && sawCommaAtDepth1) { + // `{ 'a', 'b' }`, `{ 1, 2 }`, `{ foo, 1 }` — first token is a + // hash-key-shaped thing (string/number/bareword, not a block + // keyword), followed by a comma. Real Perl treats this as a + // hashref constructor. `;` would have triggered hasBlockIndicator + // and exited above, so we know there is no statement separator. + if (CompilerOptions.DEBUG_ENABLED) parser.ctx.logDebug("isHashLiteral RESULT: TRUE - first token key-like + comma at depth 1"); + return true; } else if (parser.insideBracedDereference) { // Inside %{...}, inner {} should default to hash constructor, not block. // Perl 5 sets PL_expect = XTERM after %{, making the next { a hash constructor. diff --git a/src/main/java/org/perlonjava/frontend/parser/SubroutineParser.java b/src/main/java/org/perlonjava/frontend/parser/SubroutineParser.java index e16819139..1f1f3acd4 100644 --- a/src/main/java/org/perlonjava/frontend/parser/SubroutineParser.java +++ b/src/main/java/org/perlonjava/frontend/parser/SubroutineParser.java @@ -351,7 +351,14 @@ static Node parseSubroutineCall(Parser parser, boolean isMethod) { || nextTok.text.equals("]") || nextTok.text.equals(",") || nextTok.type == LexerTokenType.EOF; - boolean infixOp = nextTok.type == LexerTokenType.OPERATOR + // Word-operators (eq, ne, lt, gt, le, ge, cmp, x, isa, and, or, + // xor, …) are produced by the lexer as IDENTIFIER tokens but are + // listed in INFIX_OP. Accept both OPERATOR and IDENTIFIER token + // types here so a bareword like `Foo::Bar::` followed by one of + // them is not mistaken for a sub call consuming the word + // operator as its first argument. + boolean infixOp = (nextTok.type == LexerTokenType.OPERATOR + || nextTok.type == LexerTokenType.IDENTIFIER) && (INFIX_OP.contains(nextTok.text) || nextTok.text.equals("?") || nextTok.text.equals(":")); diff --git a/src/main/java/org/perlonjava/runtime/operators/ReferenceOperators.java b/src/main/java/org/perlonjava/runtime/operators/ReferenceOperators.java index 73f9d9319..b91baa608 100644 --- a/src/main/java/org/perlonjava/runtime/operators/ReferenceOperators.java +++ b/src/main/java/org/perlonjava/runtime/operators/ReferenceOperators.java @@ -32,6 +32,15 @@ public static RuntimeScalar bless(RuntimeScalar runtimeScalar, RuntimeScalar cla if (str.isEmpty()) { str = "main"; } + // Canonicalise the class name through any stash aliases + // (`*Foo:: = *Bar::`). In Perl, `bless` binds the referent to + // the stash object itself, whose `HvNAME` is the canonical + // package name — so if Foo has been aliased to Bar, a later + // `bless $x, "Foo"` reports `ref($x) eq "Bar"`. Without this + // canonicalisation, `ref` would return "Foo" and + // `$x->isa("Bar")` would miss the linearised hierarchy that + // the aliased stash exposes. + str = GlobalVariable.resolveStashAlias(str); RuntimeBase referent = (RuntimeBase) runtimeScalar.value; int newBlessId = NameNormalizer.getBlessId(str); diff --git a/src/main/java/org/perlonjava/runtime/operators/Unpack.java b/src/main/java/org/perlonjava/runtime/operators/Unpack.java index 153fd3165..4c193d339 100644 --- a/src/main/java/org/perlonjava/runtime/operators/Unpack.java +++ b/src/main/java/org/perlonjava/runtime/operators/Unpack.java @@ -81,7 +81,9 @@ public static RuntimeList unpack(int ctx, RuntimeBase... args) { // Create state object - always starts in character mode UnpackState state = new UnpackState(dataString, false, utf8Flagged); - // Check if template starts with U0 to switch to byte mode + // Check if template starts with U0 to switch to byte mode. + // (See inline `case 'C'/'U' 0` handler below for the historical + // naming quirk.) if (template.startsWith("U0")) { state.switchToByteMode(); } @@ -250,7 +252,15 @@ private static RuntimeList unpackInternal(String template, UnpackState state, continue; } - // Check for explicit mode modifiers + // Check for explicit mode modifiers. Historical quirk: the + // internal state's `characterMode` name is inverted from what + // the Perl docs call "character mode". `C0` puts the state + // into what the rest of the code (StringFormatHandler etc.) + // treats as "operate on raw code-units", which matches the + // Perl `C0` semantics of "strip UTF-8 flag & treat as bytes". + // `U0` puts it into the opposite mode. Don't touch this + // unless you're ready to audit every other handler that + // keys off `state.isCharacterMode()`. if (format == 'C' && i + 1 < template.length() && template.charAt(i + 1) == '0') { state.switchToCharacterMode(); i += 2; // Skip 'C0' diff --git a/src/main/java/org/perlonjava/runtime/operators/unpack/UFormatHandler.java b/src/main/java/org/perlonjava/runtime/operators/unpack/UFormatHandler.java index b20873678..850f3b068 100644 --- a/src/main/java/org/perlonjava/runtime/operators/unpack/UFormatHandler.java +++ b/src/main/java/org/perlonjava/runtime/operators/unpack/UFormatHandler.java @@ -22,16 +22,66 @@ public UFormatHandler(boolean startsWithU) { @Override public void unpack(UnpackState state, List output, int count, boolean isStarCount) { for (int i = 0; i < count; i++) { - // Check if input has UTF-8 flag (is already Unicode string) - if (state.isUTF8Data()) { - // Input is UTF-8 flagged string - read codepoints directly - if (state.hasMoreCodePoints()) { - output.add(new RuntimeScalar(state.nextCodePoint())); - } else { + // The U format reads one Unicode code point. Real-Perl + // semantics: + // + // * Template starts with bare `U` (e.g. `"U*"`) or has + // switched to Perl "character mode" via `U0`: read one + // code point directly from the string. Default for + // most templates used by Perl code (e.g. JSON::PP's + // `_encode_ascii`: `unpack("U*", $str)` on a character + // string yields the string's code points one by one). + // * After `C0` (or default when template does NOT start + // with bare `U`): bytes are interpreted as UTF-8 and one + // character is decoded per consumed group. + // + // `UnpackState`'s internal `isCharacterMode()` keys off a + // naming calibrated for the numerous other format handlers + // (`StringFormatHandler`, `NumericFormatHandler`, …) and + // cannot be changed without auditing all of them. Here we + // combine `startsWithU` (set when the template begins with + // bare `U`) with `!isCharacterMode()` (set after `U0`). + // + // When we're in byte mode but reading code points (the + // `U0…U` case), we advance both `codePointIndex` AND + // `buffer.position` so a later `switchToCharacterMode` + // (triggered by e.g. `C0`) recomputes `codePointIndex` + // correctly from the byte offset. Without that, the + // code-point progress inside this handler would be lost the + // moment the template switches back to character mode. + // + // Covered by `op/utf8decode.t` (the `C0U*` path), + // `op/pack.t` (the `U0U C0 W` sequence), and the JSON::PP + // `_encode_ascii` round-trip in `unpack.t` (the starts-with- + // U path). + boolean readCodePoints = startsWithU || !state.isCharacterMode(); + if (readCodePoints) { + if (!state.hasMoreCodePoints()) { break; } + int cp = state.nextCodePoint(); + output.add(new RuntimeScalar(cp)); + // Keep the byte buffer in sync so subsequent mode + // switches don't rewind us to the start. + if (!state.isCharacterMode()) { + ByteBuffer buf = state.getBuffer(); + if (buf != null) { + // `originalBytes` layout mirrors what + // `UnpackState.switchToByteMode` uses: ISO-8859-1 + // (1 byte per code point) when `!isUTF8Data`, + // UTF-8 (variable length) when `isUTF8Data`. + int advance = state.isUTF8Data() + ? (cp <= 0x7F ? 1 + : cp <= 0x7FF ? 2 + : cp <= 0xFFFF ? 3 + : cp <= 0x10FFFF ? 4 + : 5) + : 1; + int newPos = Math.min(buf.position() + advance, buf.limit()); + buf.position(newPos); + } + } } else { - // Input is byte string - decode UTF-8 ByteBuffer buffer = state.getBuffer(); if (!buffer.hasRemaining()) { break; // Just stop unpacking diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Json.java b/src/main/java/org/perlonjava/runtime/perlmodule/Json.java deleted file mode 100644 index 2ec568e94..000000000 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Json.java +++ /dev/null @@ -1,228 +0,0 @@ -package org.perlonjava.runtime.perlmodule; - -import com.alibaba.fastjson2.JSON; -import com.alibaba.fastjson2.JSONArray; -import com.alibaba.fastjson2.JSONObject; -import com.alibaba.fastjson2.JSONWriter; -import org.perlonjava.runtime.runtimetypes.*; - -import java.math.BigDecimal; - -import static org.perlonjava.runtime.runtimetypes.RuntimeScalarCache.*; -import static org.perlonjava.runtime.runtimetypes.RuntimeScalarType.*; - -/** - * The {@code Json} class provides methods for encoding and decoding JSON data - * within a Perl-like runtime environment. It extends {@link PerlModuleBase} and - * offers functionality to convert between JSON strings and Perl data structures. - *

- * Note: Some methods are defined in src/main/perl/lib/JSON.pm - */ -public class Json extends PerlModuleBase { - - /** - * Constructs a new {@code Json} instance and initializes the module with the name "JSON". - */ - public Json() { - super("JSON", false); - } - - /** - * Initializes the JSON module by registering methods and defining exports. - */ - public static void initialize() { - Json json = new Json(); - try { - json.registerMethod("encode", null); - json.registerMethod("decode", null); - json.registerMethod("true", "getTrue", ""); - json.registerMethod("false", "getFalse", ""); - json.registerMethod("null", "getNull", ""); - json.registerMethod("is_bool", "isBool", "$"); - } catch (NoSuchMethodException e) { - System.err.println("Warning: Missing Json method: " + e.getMessage()); - } - } - - /** - * Checks if the given argument is a boolean. - * - * @param args the runtime array containing the argument to check - * @param ctx the runtime context - * @return a {@link RuntimeList} indicating whether the argument is a boolean - */ - public static RuntimeList isBool(RuntimeArray args, int ctx) { - RuntimeScalar res = args.get(0); - return getScalarBoolean(res.type == RuntimeScalarType.BOOLEAN).getList(); - } - - /** - * Returns a {@link RuntimeList} representing the boolean value true. - * - * @param args the runtime array - * @param ctx the runtime context - * @return a {@link RuntimeList} representing true - */ - public static RuntimeList getTrue(RuntimeArray args, int ctx) { - return scalarTrue.getList(); - } - - /** - * Returns a {@link RuntimeList} representing the boolean value false. - * - * @param args the runtime array - * @param ctx the runtime context - * @return a {@link RuntimeList} representing false - */ - public static RuntimeList getFalse(RuntimeArray args, int ctx) { - return scalarFalse.getList(); - } - - /** - * Returns a {@link RuntimeList} representing a null value. - * - * @param args the runtime array - * @param ctx the runtime context - * @return a {@link RuntimeList} representing null - */ - public static RuntimeList getNull(RuntimeArray args, int ctx) { - return scalarUndef.getList(); - } - - /** - * Encodes a Perl data structure into a JSON string with specific formatting options. - * - * @param args the runtime array containing the instance and Perl data structure - * @param ctx the runtime context - * @return a {@link RuntimeList} containing the JSON string - * @throws IllegalStateException if the number of arguments is incorrect - */ - public static RuntimeList encode(RuntimeArray args, int ctx) { - if (args.size() != 2) { - throw new IllegalStateException("Bad number of arguments for Json method"); - } - RuntimeScalar instance = args.get(0); - RuntimeScalar perlData = args.get(1); - Object json = convertRuntimeScalarToJson(perlData); - - // Retrieve the instance settings - RuntimeHash hash = instance.hashDeref(); - boolean indent = hash.get("indent").getBoolean(); - boolean spaceBefore = hash.get("space_before").getBoolean(); - boolean spaceAfter = hash.get("space_after").getBoolean(); - - // Configure JSON serialization options - JSONWriter.Feature[] features = indent ? new JSONWriter.Feature[]{JSONWriter.Feature.PrettyFormat} : new JSONWriter.Feature[0]; - - // Serialize JSON with the configured options - String jsonString = JSON.toJSONString(json, features); - - // Post-process the JSON string for custom indentation - if (indent) { - jsonString = jsonString.replaceAll("\t", " "); // Replace default indentation with 3 spaces - } - - // Post-process the JSON string for space_before and space_after - if (spaceBefore) { - jsonString = jsonString.replaceAll(":", " :"); - } - if (spaceAfter) { - jsonString = jsonString.replaceAll(",", ", "); - jsonString = jsonString.replaceAll(":", ": "); - } - - return new RuntimeScalar(jsonString).getList(); - } - - /** - * Decodes a JSON string into a Perl data structure with specific instance settings. - * - * @param args the runtime array containing the instance and JSON string - * @param ctx the runtime context - * @return a {@link RuntimeList} containing the Perl data structure - * @throws IllegalStateException if the number of arguments is incorrect - */ - public static RuntimeList decode(RuntimeArray args, int ctx) { - if (args.size() != 2) { - throw new IllegalStateException("Bad number of arguments for Json method"); - } - RuntimeScalar instance = args.get(0); - RuntimeScalar jsonString = args.get(1); - Object json = JSON.parse(jsonString.toString()); - return convertJsonToRuntimeScalar(json).getList(); - } - - /** - * Converts a JSON object to a {@link RuntimeScalar}. - * - * @param json the JSON object to convert - * @return a {@link RuntimeScalar} representing the JSON object - */ - private static RuntimeScalar convertJsonToRuntimeScalar(Object json) { - if (json instanceof JSONObject jsonObject) { - RuntimeHash hash = new RuntimeHash(); - for (String key : jsonObject.keySet()) { - hash.put(key, convertJsonToRuntimeScalar(jsonObject.get(key))); - } - return hash.createReference(); - } else if (json instanceof JSONArray jsonArray) { - RuntimeArray array = new RuntimeArray(); - for (int i = 0; i < jsonArray.size(); i++) { - array.elements.add(convertJsonToRuntimeScalar(jsonArray.get(i))); - } - return array.createReference(); - } else if (json instanceof String) { - return new RuntimeScalar((String) json); - } else if (json instanceof Integer) { - return new RuntimeScalar((Integer) json); - } else if (json instanceof Long) { - return new RuntimeScalar((Long) json); - } else if (json instanceof Double) { - return new RuntimeScalar((Double) json); - } else if (json instanceof Boolean) { - return new RuntimeScalar((Boolean) json); - } else if (json instanceof BigDecimal) { - // Convert BigDecimal to double - return new RuntimeScalar(((BigDecimal) json).doubleValue()); - } else { - return new RuntimeScalar(); // Represents null or undefined - } - } - - /** - * Converts a {@link RuntimeScalar} to a JSON object. - * - * @param scalar the {@link RuntimeScalar} to convert - * @return the JSON object representation of the scalar - */ - private static Object convertRuntimeScalarToJson(RuntimeScalar scalar) { - switch (scalar.type) { - case HASHREFERENCE: - JSONObject jsonObject = new JSONObject(); - RuntimeHash hash = (RuntimeHash) scalar.value; - for (String key : hash.elements.keySet()) { - jsonObject.put(key, convertRuntimeScalarToJson(hash.get(key))); - } - return jsonObject; - case ARRAYREFERENCE: - JSONArray jsonArray = new JSONArray(); - RuntimeArray array = (RuntimeArray) scalar.value; - for (RuntimeScalar element : array.elements) { - jsonArray.add(convertRuntimeScalarToJson(element)); - } - return jsonArray; - case STRING, BYTE_STRING, VSTRING: - return scalar.toString(); - case DOUBLE: - return scalar.getDouble(); - case INTEGER: - return scalar.getLong(); - case BOOLEAN: - return scalar.getBoolean(); - case READONLY_SCALAR: - return convertRuntimeScalarToJson((RuntimeScalar) scalar.value); - default: - return null; - } - } -} diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java b/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java index 6ca192430..fe7b517f8 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java @@ -349,6 +349,10 @@ public static RuntimeList isa(RuntimeArray args, int ctx) { } else if (normalizedArg.startsWith("::")) { normalizedArg = normalizedArg.substring(2); } + // Canonicalise through stash aliases (`*Foo:: = *Bar::;`): an argument + // like "Dummy::True" must still match an object blessed into "JSON::PP::Boolean" + // if the two package names are aliases. + normalizedArg = GlobalVariable.resolveStashAlias(normalizedArg); return new RuntimeScalar(linearizedClasses.contains(normalizedArg)).getList(); } diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java index b9f0d73c9..6e5d9b676 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java @@ -926,7 +926,17 @@ static int handleRegexCharacterClassEscape(int offset, String s, StringBuilder s lastChar = c; wasEscape = false; break; - case '(', ')', '*', '?', '<', '>', '\'', '"', '`', '@', '#', '=', '&': + case '(', ')', '*', '<', '>', '\'', '"', '`', '@', '#', '=', '&': + // NOTE: '?' is deliberately NOT in this list. None of these + // characters need to be escaped inside a Java regex character + // class, so the backslash is purely cosmetic — except that a + // backslashed '?' combines with a preceding `\c` to form + // `\c\?`, which Java then parses as `\c\` (control-backslash + // = U+001C) followed by a literal '?', silently corrupting + // patterns like `[\n\t\c?]` (Perl: matches LF, TAB, DEL; + // pre-fix: also matched U+001C). See + // dev/modules/json_test_parity.md and t/99_binary.t in the + // CPAN JSON distribution for a motivating example. sb.append('\\'); sb.append(Character.toChars(c)); first = false; diff --git a/src/main/perl/lib/JSON.pm b/src/main/perl/lib/JSON.pm index 2baa3cf13..382466fd9 100644 --- a/src/main/perl/lib/JSON.pm +++ b/src/main/perl/lib/JSON.pm @@ -1,43 +1,168 @@ package JSON; -our $VERSION = '4.11'; +# PerlOnJava's bundled JSON backend. +# +# CPAN `JSON` is a pure-Perl dispatcher that loads `JSON::XS` or +# `JSON::PP`. In PerlOnJava we use `JSON::PP` as the real backend — +# it is a complete pure-Perl implementation that handles every +# option, edge case and error message the CPAN test suite checks +# for. A previous iteration of this shim delegated to a partial +# Java implementation in `Json.java`; that implementation did not +# honour most of the JSON::XS options and was replaced by the +# JSON::PP inheritance below. `Json.java` is kept in the tree but +# no longer loaded by this module. +# +# We still provide the dispatcher's surface so code that writes +# use JSON -support_by_pp; +# my $v = JSON->backend->VERSION; +# continues to load cleanly. -use Exporter "import"; -use warnings; use strict; -use Symbol; -use Carp; +use warnings; +use Carp (); + +# Pick the pure-Perl backend to load. The CPAN `JSON` dispatcher +# honours `$ENV{PERL_JSON_BACKEND}`: the value `JSON::backportPP` +# asks for a self-contained copy of JSON::PP that should not leave +# `$INC{'JSON/PP.pm'}` populated. PerlOnJava ships both +# `JSON::PP` and `JSON::backportPP`; the latter loads JSON::PP's +# source without registering the PP file in `%INC`. +BEGIN { + my $backend = $ENV{PERL_JSON_BACKEND}; + if (defined $backend && $backend eq 'JSON::backportPP') { + require JSON::backportPP; + } else { + require JSON::PP; + } + # The CPAN `JSON::backportPP` declares `package JSON::PP;` internally + # but only sets `@JSON::backportPP::ISA = ('Exporter')`, leaving + # `@JSON::PP::ISA` empty AND leaving `JSON::backportPP` without + # `isa('JSON::PP')`, `is_pp`, `is_xs`. Our own `JSON::PP.pm` / + # `JSON::backportPP.pm` set these up correctly, but the CPAN tarball + # ships its own copies that shadow ours under + # `PERL5LIB=./blib/lib:./blib/arch` during `make test`. Paper over + # both omissions so the dispatcher surface behaves the same either + # way. + unless (@JSON::PP::ISA) { + @JSON::PP::ISA = ('Exporter'); + } + unless (grep { $_ eq 'JSON::PP' } @JSON::backportPP::ISA) { + push @JSON::backportPP::ISA, 'JSON::PP'; + } + unless (defined &JSON::backportPP::is_pp) { + no warnings 'once'; + *JSON::backportPP::is_pp = sub { 1 }; + *JSON::backportPP::is_xs = sub { 0 }; + } + unless (defined &JSON::PP::is_pp) { + no warnings 'once'; + *JSON::PP::is_pp = sub { 1 }; + *JSON::PP::is_xs = sub { 0 }; + } +} -XSLoader::load( 'Json' ); +our $VERSION = '4.11'; -# NOTE: The rest of the code is in file: -# src/main/java/org/perlonjava/perlmodule/Json.java +# Inherit all encode/decode/option methods from JSON::PP. Using @ISA +# (rather than re-exporting every sub) means future JSON::PP updates +# are picked up automatically. +our @ISA = ('JSON::PP'); -our @EXPORT = ("encode_json", "decode_json", "to_json", "from_json"); +our @EXPORT = qw(from_json to_json jsonToObj objToJson encode_json decode_json); -my @PublicMethods = qw/ - ascii latin1 utf8 pretty indent space_before space_after relaxed canonical allow_nonref - allow_blessed convert_blessed filter_json_object filter_json_single_key_object - shrink max_depth max_size encode decode decode_prefix allow_unknown - allow_tags -/; +# Backend introspection variables populated by the real CPAN JSON +# after it loads a backend. Mirror the CPAN semantics: when +# `PERL_JSON_BACKEND` picked `JSON::backportPP`, report that +# (and set the module-PP name to match); otherwise report +# JSON::PP as the backend. Tests look at both. +our ($Backend, $BackendModule, $BackendModulePP); +BEGIN { + my $chosen = (defined $ENV{PERL_JSON_BACKEND} + && $ENV{PERL_JSON_BACKEND} eq 'JSON::backportPP') + ? 'JSON::backportPP' + : 'JSON::PP'; + $Backend = $chosen; + $BackendModule = $chosen; + $BackendModulePP = $chosen; +} +our $BackendModuleXS; # left undef: no XS backend available + +our $DEBUG = 0; +$DEBUG = $ENV{PERL_JSON_DEBUG} if exists $ENV{PERL_JSON_DEBUG}; -my @Properties = qw/ - ascii latin1 utf8 indent space_before space_after relaxed canonical allow_nonref - allow_blessed convert_blessed shrink max_depth max_size allow_unknown - allow_tags +my %RequiredVersion = ( + 'JSON::PP' => '2.27203', + 'JSON::XS' => '2.34', +); + +# PP-only methods, reported by pureperl_only_methods(). +my @PPOnlyMethods = qw/ + indent_length sort_by + allow_singlequote allow_bignum loose allow_barekey escape_slash as_nonblessed /; -sub new { - my ($class) = @_; - return bless {}, $class; +# CPAN JSON.pm supports several special import tags; accept them as +# no-ops so modules that use them at `use` time continue to load. +sub import { + my $pkg = shift; + my @to_export; + my $no_export; + + for my $tag (@_) { + if ($tag eq '-support_by_pp') { + # already supported — JSON::PP is our backend + next; + } + elsif ($tag eq '-no_export') { + $no_export++; + next; + } + elsif ($tag eq '-convert_blessed_universally') { + # Install a default UNIVERSAL::TO_JSON like CPAN JSON does. + # The hook unwraps blessed hashrefs/arrayrefs into plain refs; + # it must use `reftype` (not `ref`) because `ref` on a blessed + # ref returns the CLASS name, not the underlying reftype. + require Scalar::Util; + my $org_encode = JSON::PP->can('encode'); + no warnings 'redefine'; + no strict 'refs'; + *{'JSON::PP::encode'} = sub { + local *UNIVERSAL::TO_JSON = sub { + my $rt = Scalar::Util::reftype($_[0]) // ''; + return $rt eq 'HASH' ? { %{$_[0]} } + : $rt eq 'ARRAY' ? [ @{$_[0]} ] + : undef; + }; + $org_encode->(@_); + }; + next; + } + push @to_export, $tag; + } + return if $no_export; + __PACKAGE__->export_to_level(1, $pkg, @to_export); } -sub is_xs { 1 }; -sub is_pp { 0 }; +# CPAN's `encode_json` / `decode_json` are package subs in JSON::PP, +# not imported-to-this-package. Re-export them under the JSON +# package so `JSON::encode_json(...)` and a bare `encode_json(...)` +# after `use JSON` both work. +*encode_json = \&JSON::PP::encode_json; +*decode_json = \&JSON::PP::decode_json; +*is_bool = \&JSON::PP::is_bool; +*true = \&JSON::PP::true; +*false = \&JSON::PP::false; +*null = sub { undef }; + +# Backend introspection. Works both as class and instance method. +sub backend { $Backend } +sub is_xs { 0 } +sub is_pp { 1 } +sub require_xs_version { $RequiredVersion{'JSON::XS'} } +sub pureperl_only_methods { @PPOnlyMethods } -# INTERFACES +# INTERFACES — thin wrappers matching CPAN JSON.pm exactly. sub to_json ($@) { if ( @@ -58,7 +183,6 @@ sub to_json ($@) { $json->encode($_[0]); } - sub from_json ($@) { if ( ref($_[0]) eq 'JSON' or $_[0] eq 'JSON' ) { Carp::croak "from_json should not be called as a method."; @@ -75,129 +199,98 @@ sub from_json ($@) { return $json->decode( $_[0] ); } +sub jsonToObj { + my $alt = 'from_json'; + if (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON')) { + shift @_; + $alt = 'decode'; + } + Carp::carp "'jsonToObj' will be obsoleted. Please use '$alt' instead."; + return JSON::from_json(@_); +} + +sub objToJson { + my $alt = 'to_json'; + if (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON')) { + shift @_; + $alt = 'encode'; + } + Carp::carp "'objToJson' will be obsoleted. Please use '$alt' instead."; + return JSON::to_json(@_); +} + sub boolean { - # might be called as method or as function, so pop() to get the last arg instead of shift() to get the first + # might be called as method or as function; use pop() to fetch the + # intended boolean regardless. pop() ? true() : false() } -sub require_xs_version {} - -sub backend {} +# `property()` lets callers introspect (or toggle) any named option via the +# same name as its setter/getter. Neither JSON::PP nor the stock JSON shim +# provides it, but the CPAN JSON.pm dispatcher does — and several tests use +# it (t/e01_property.t). Implement it inline here. +my @PropertyNames = qw( + ascii latin1 utf8 indent space_before space_after relaxed canonical + allow_nonref allow_blessed convert_blessed shrink max_depth max_size + allow_unknown allow_tags +); sub property { - my ($self, $name, $value) = @_; - - if (@_ == 1) { + my $self = shift; + if (@_ == 0) { + # Return all properties as a hashref my %props; - for $name (@Properties) { - my $method = 'get_' . $name; - if ($name eq 'max_size') { - my $value = $self->$method(); - $props{$name} = $value == 1 ? 0 : $value; - next; - } - $props{$name} = $self->$method(); + for my $name (@PropertyNames) { + my $getter = 'get_' . $name; + my $v = $self->can($getter) ? $self->$getter() : undef; + # CPAN JSON.pm maps max_size == 1 back to 0 for reporting + $v = 0 if $name eq 'max_size' && defined $v && $v == 1; + $props{$name} = $v; } return \%props; } - elsif (@_ > 3) { - Carp::croak('property() can take only the option within 2 arguments.'); - } - elsif (@_ == 2) { - if ( my $method = $self->can('get_' . $name) ) { - if ($name eq 'max_size') { - my $value = $self->$method(); - return $value == 1 ? 0 : $value; - } - $self->$method(); - } - } - else { - $self->$name($value); - } - -} - -BEGIN { - my @xs_compati_bit_properties = qw( - latin1 ascii utf8 indent canonical space_before space_after allow_nonref shrink - allow_blessed convert_blessed relaxed allow_unknown - allow_tags - ); - my @pp_bit_properties = qw( - allow_singlequote allow_bignum loose - allow_barekey escape_slash as_nonblessed - ); - for my $name (@xs_compati_bit_properties, @pp_bit_properties) { - my $sym_ref = Symbol::qualify_to_ref($name, __PACKAGE__); - *$sym_ref = sub { - if ($_[1] // 1) { - $_[0]->{$name} = 1; - } - else { - $_[0]->{$name} = 0; - } - $_[0]; - }; - $sym_ref = Symbol::qualify_to_ref("get_$name", __PACKAGE__); - *$sym_ref = sub { - $_[0]->{$name} ? 1 : ''; - }; - } -} - -# pretty printing - -sub pretty { - my ($self, $v) = @_; - my $enable = defined $v ? $v : 1; + Carp::croak('property() can take only the option within 2 arguments.') + if @_ > 2; - if ($enable) { # indent_length(3) for JSON::XS compatibility - $self->indent(1)->space_before(1)->space_after(1); - } - else { - $self->indent(0)->space_before(0)->space_after(0); + my ($name, $value) = @_; + if (@_ == 1) { + # Getter form + my $getter = 'get_' . $name; + return undef unless $self->can($getter); + my $v = $self->$getter(); + $v = 0 if $name eq 'max_size' && defined $v && $v == 1; + return $v; } - - $self; -} - -# Functions - -my $JSON; # cache - -sub encode_json ($) { # encode - ($JSON ||= __PACKAGE__->new->utf8)->encode(@_); -} - - -sub decode_json { # decode - ($JSON ||= __PACKAGE__->new->utf8)->decode(@_); + # Setter form: property($name, $value) -> delegate to $self->$name($value) + return $self->$name($value); } 1; __END__ -Author and Copyright messages from the original JSON.pm: +=head1 NAME -=head1 AUTHOR +JSON - PerlOnJava bundled JSON backend (delegates to JSON::PP) -Makamaka Hannyaharamitu, Emakamaka[at]cpan.orgE +=head1 DESCRIPTION -JSON::XS was written by Marc Lehmann Eschmorp[at]schmorp.deE +This module exposes the CPAN C dispatcher API but is backed +unconditionally by L, which is shipped inside the PerlOnJava +JAR. The original hand-coded Java encoder in C has been +retired in favour of the complete pure-Perl implementation so that +every JSON::XS-style option, error message, and edge case matches the +CPAN test suite. -The release of this new version owes to the courtesy of Marc Lehmann. - -=head1 CURRENT MAINTAINER +=head1 AUTHOR -Kenichi Ishigaki, Eishigaki[at]cpan.orgE +The CPAN C module is maintained by Kenichi Ishigaki, originally +authored by Makamaka Hannyaharamitu. This PerlOnJava shim preserves +the author/licence of the original. =head1 COPYRIGHT AND LICENSE -Copyright 2005-2013 by Makamaka Hannyaharamitu - -Most of the documentation is taken from JSON::XS by Marc Lehmann +Copyright 2005-2013 by Makamaka Hannyaharamitu. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/src/main/perl/lib/JSON/PP.pm b/src/main/perl/lib/JSON/PP.pm index b739ccf83..266a17429 100644 --- a/src/main/perl/lib/JSON/PP.pm +++ b/src/main/perl/lib/JSON/PP.pm @@ -130,6 +130,17 @@ sub from_json($) { # Methods +# Backend introspection — the CPAN JSON dispatcher and its test suite call +# these as class methods on whichever backend module was loaded (JSON::XS, +# JSON::PP, JSON::backportPP, or — in PerlOnJava — our `JSON` shim which +# ISA JSON::PP). JSON::PP is the pure-Perl backend, so `is_pp == 1` and +# `is_xs == 0`. Upstream CPAN JSON::PP doesn't define these (the real +# dispatcher installs them), but we ship JSON::PP as our backend and add +# them here so that tests which introspect `JSON::PP->is_pp` directly +# work out of the box. +sub is_xs { 0 } +sub is_pp { 1 } + sub new { my $class = shift; my $self = { @@ -1547,8 +1558,30 @@ sub incr_parse { my ($obj, $offset) = $coder->PP_decode_json( $self->{incr_text}, 0x00000001 ); push @ret, $obj; - use bytes; - $self->{incr_text} = substr( $self->{incr_text}, $offset || 0 ); + # PerlOnJava: the decoder advances its internal `$at` by the + # UTF-8 byte length of each multi-byte character (via + # `is_valid_utf8`), so `$offset` is in BYTES. CPAN + # JSON::PP papers over this with `use bytes; substr`, + # which in upstream Perl makes substr operate on the + # UTF-8 byte representation of the string. PerlOnJava's + # `use bytes` pragma does not yet redirect substr, so do + # the equivalent explicitly. + # + # When `get_utf8` is true, `$self->{incr_text}` is already + # a byte string (the user hands us UTF-8 bytes) and plain + # substr works correctly with a byte offset. When + # `get_utf8` is false, we decoded to chars above and need + # to encode-substr-decode to match the decoder's byte + # bookkeeping. + if ( $coder->get_utf8 ) { + $self->{incr_text} = substr( $self->{incr_text}, $offset || 0 ); + } else { + my $bytes = $self->{incr_text}; + utf8::encode($bytes); + my $remaining = substr($bytes, $offset || 0); + utf8::decode($remaining); + $self->{incr_text} = $remaining; + } $self->{incr_pos} = 0; $self->{incr_nest} = 0; $self->{incr_mode} = 0; diff --git a/src/main/perl/lib/JSON/backportPP.pm b/src/main/perl/lib/JSON/backportPP.pm new file mode 100644 index 000000000..c80f5d16a --- /dev/null +++ b/src/main/perl/lib/JSON/backportPP.pm @@ -0,0 +1,54 @@ +# PerlOnJava's JSON::backportPP. +# +# The CPAN `JSON` dispatcher's `PERL_JSON_BACKEND=JSON::backportPP` +# contract says: load JSON::PP's implementation without populating +# `$INC{'JSON/PP.pm'}`, and expose it as the class `JSON::backportPP` +# whose base class is `JSON::PP`. +# +# We achieve this by defining an empty `JSON::backportPP` class that +# inherits from `JSON::PP`, then `require`-ing JSON::PP and removing +# `JSON/PP.pm` from `%INC` so the contract is satisfied. + +package JSON::backportPP; + +use strict; +use warnings; + +our $VERSION = '4.18'; +our @ISA = ('JSON::PP'); + +sub is_xs { 0 } +sub is_pp { 1 } + +BEGIN { + require JSON::PP; + # Backend-identification contract: callers distinguish + # "backportPP backend was loaded" from "JSON::PP was loaded + # directly" by checking `$INC{'JSON/PP.pm'}`. Hide our + # `require JSON::PP` by removing that entry after load so tests + # and CPAN dispatcher logic see only `JSON/backportPP.pm` in + # `%INC`. The actual JSON::PP code stays loaded (all subs are + # defined in-place in the interpreter). + delete $INC{'JSON/PP.pm'}; +} + +1; + +__END__ + +=head1 NAME + +JSON::backportPP - PerlOnJava shim providing the JSON::PP API without +populating C<$INC{'JSON/PP.pm'}> + +=head1 DESCRIPTION + +Loaded in place of C when the environment variable +C is set (see L). The +package C is a thin subclass of C; +calling any JSON::PP method through Cmethod> +dispatches to the same implementation as if C had been +loaded directly. C<%INC> distinguishes the two — C +is set, C is not. + +=cut diff --git a/src/test/resources/unit/hash_ref_disambiguation.t b/src/test/resources/unit/hash_ref_disambiguation.t new file mode 100644 index 000000000..38d2faa68 --- /dev/null +++ b/src/test/resources/unit/hash_ref_disambiguation.t @@ -0,0 +1,51 @@ +use strict; +use warnings; +use Test::More tests => 14; + +# Regression tests for block-vs-hashref disambiguation inside `{ ... }`. +# Real Perl treats `{ KEY, VALUE, ... }` as an anonymous hash when the +# first content token is a STRING or NUMBER literal (or fat-comma key) +# and a plain comma appears at depth 1. Bare identifiers as the first +# token make it a BLOCK — `{ foo, 1 }` could be `foo()` called with +# `1` as arg. Matching that behaviour also keeps our pre-parse scanner +# conservative w.r.t. string contents it can't see past. + +# --- hashref forms --- + +is(ref(sub { {"a","b"} }->()), 'HASH', '{"a","b"} is a hashref'); +is(ref(sub { {1,2,3,4} }->()), 'HASH', '{1,2,3,4} is a hashref'); +is(ref(sub { {"foo",1,"bar",2} }->()), 'HASH', '{"foo",1,"bar",2} is a hashref'); +is(ref(sub { { a => 1 } }->()), 'HASH', 'fat-comma hash still works'); +is(ref(sub { +{"a","b"} }->()), 'HASH', 'explicit +{} still works'); +is(ref(sub { {} }->()), 'HASH', 'empty {} still a hashref'); + +# --- content is preserved --- + +{ + my $h = sub { {"a","1","b","2"} }->(); + is_deeply($h, {a=>1, b=>2}, '{"a","1","b","2"} content'); +} + +# --- block forms are not mis-classified --- + +my @m = map { $_ + 1 } (1..3); +is_deeply(\@m, [2,3,4], 'map { $_ + 1 } still a block'); + +my @g = grep { $_ > 1 } (1..3); +is_deeply(\@g, [2,3], 'grep { $_ > 1 } still a block'); + +my @s = sort { $a <=> $b } (3,1,2); +is_deeply(\@s, [1,2,3], 'sort { $a <=> $b } still a block'); + +my $d = do { 5 + 6 }; +is($d, 11, 'do { ... } still a block'); + +my $e = eval { 7 * 8 }; +is($e, 56, 'eval { ... } still a block'); + +my $r = sub { return { k => 1 } }->(); +is(ref($r), 'HASH', 'return {...} still a hashref'); + +# Block with a leading keyword and comma expression +my @x = do { "foo", "bar" }; +is_deeply(\@x, ["foo","bar"], 'do { "foo", "bar" } is a block (list context)'); \ No newline at end of file diff --git a/src/test/resources/unit/method_call_trailing_colons.t b/src/test/resources/unit/method_call_trailing_colons.t index b27731d4a..b843a389f 100644 --- a/src/test/resources/unit/method_call_trailing_colons.t +++ b/src/test/resources/unit/method_call_trailing_colons.t @@ -1,6 +1,6 @@ use strict; use warnings; -use Test::More tests => 5; +use Test::More tests => 10; # Regression: `Foo::->bar()` should pass class name "Foo", not "Foo::" # See dev/modules/ppi.md (RC1). @@ -21,3 +21,25 @@ is(Foo::Bar::->classname,'Foo::Bar', 'nested bareword class with trailing :: is ok(Foo::->isa_check, 'isa still works through trailing-:: invocant'); ok(Foo::Bar::->isa_check, 'isa finds parent class through trailing-:: invocant'); +# Regression: `bless $ref, Foo::Bar::;` should strip trailing "::". +# Previously produced ref "Foo::Bar::" (keeping the ::), breaking +# isa/ref checks and ->method() dispatch. +{ + my $obj = bless { k => 1 }, Foo::Bar::; + is(ref($obj), 'Foo::Bar', 'bless + trailing :: strips the ::'); + ok($obj->isa('Foo::Bar'), '...and isa works'); +} + +# Regression: `Foo::Bar:: eq $x` was mis-parsed as a sub call consuming +# `eq` as its first argument, producing an "Undefined subroutine" error. +# The word-operators eq/ne/lt/gt/le/ge/cmp/x/isa/and/or/xor are lexed +# as IDENTIFIER tokens but still participate in expressions. +{ + my $x = 'Foo::Bar'; + ok(Foo::Bar:: eq $x, 'package literal followed by eq works'); + ok(Foo::Bar:: ne 'zzz', 'package literal followed by ne works'); +} + +# Sanity: the package literal by itself still stringifies correctly. +is(Foo::Bar::, 'Foo::Bar', 'package literal evaluates to class name'); + diff --git a/src/test/resources/unit/regex/regex_charclass.t b/src/test/resources/unit/regex/regex_charclass.t index d2b381bf1..4e4dd588f 100644 --- a/src/test/resources/unit/regex/regex_charclass.t +++ b/src/test/resources/unit/regex/regex_charclass.t @@ -41,4 +41,23 @@ $pattern = qr/[[:punct:]]+/; $match = $string =~ $pattern; ok($match, '\'Hello, World!\' matches \'[[:punct:]]+\''); +# Regression: `[\c?]` must match only U+007F (DEL), not U+001C. +# Earlier, the preprocessor escaped `?` inside [] as `\?`, so `[\c?]` +# leaked out as `[\c\?]` which Java parses as `\c\` (= 0x1C) plus a +# literal `?`, matching the wrong code point. This silently corrupted +# patterns like `[\n\t\c?[:^cntrl:]]` used by JSON::PP's ASCII escape +# regex (and any other Perl code that uses `\c?` inside a class). +subtest 'bracketed \c? matches DEL only' => sub { + my @matched; + for my $i (0x00..0x1f, 0x7f) { + push @matched, sprintf("0x%02x", $i) if chr($i) =~ /[\c?]/; + } + is_deeply(\@matched, ["0x7f"], '[\c?] matches U+007F only'); + + # Sanity: literal `?` still works in classes and as a quantifier. + ok("?" =~ /[?]/, "bracketed literal ? still matches"); + ok("colour" =~ /colou?r/, "? quantifier still works"); + ok("color" =~ /colou?r/, "? quantifier still works (absent)"); +}; + done_testing(); diff --git a/src/test/resources/unit/stash_aliasing.t b/src/test/resources/unit/stash_aliasing.t index 1b9fd5584..139434c16 100644 --- a/src/test/resources/unit/stash_aliasing.t +++ b/src/test/resources/unit/stash_aliasing.t @@ -75,4 +75,25 @@ subtest 'chain alias' => sub { 'chained alias resolves through to terminal package' ); }; +subtest 'bless through aliased package name' => sub { + package StashAliasBlessSrc; + sub greet { "hello from src" } + package main; + *StashAliasBlessDst:: = *StashAliasBlessSrc::; + + # bless using the aliased (Dst) name should be equivalent to blessing + # through the canonical (Src) name: ref reports the canonical name, + # isa succeeds for BOTH the aliased and canonical names, and method + # dispatch goes through to the shared stash. + my $x = bless {}, 'StashAliasBlessDst'; + is( ref($x), 'StashAliasBlessSrc', + 'ref() returns canonical (Src) name after blessing through alias' ); + ok( $x->isa('StashAliasBlessSrc'), + 'isa(canonical-name) is true for object blessed through alias' ); + ok( $x->isa('StashAliasBlessDst'), + 'isa(alias-name) is true too (alias == canonical)' ); + is( $x->greet, 'hello from src', + 'method dispatch works through the aliased stash' ); +}; + done_testing; diff --git a/src/test/resources/unit/unpack.t b/src/test/resources/unit/unpack.t index 7c90e51d0..abe244830 100644 --- a/src/test/resources/unit/unpack.t +++ b/src/test/resources/unit/unpack.t @@ -160,4 +160,43 @@ subtest 'Star (*) modifier tests' => sub { cmp_ok(abs($unpacked[1] - 2.22222222), '<', 0.00000001, 'Second double value'); }; +subtest 'U format reads code points, does not UTF-8-decode' => sub { + # Regression: `unpack "U*", "\xc2\xb6"` must return (194, 182) — one code + # point per character of the string — NOT (182) obtained by re-decoding + # the two bytes as a single UTF-8 sequence. A template starting with + # a bare `U` implies `U0` (Perl's "character mode"), so `U` reads code + # points directly from the string. Previously the handler keyed on a + # storage flag (isUTF8Data, true only for code points > 255), which + # caused Latin-1 byte strings to be mis-decoded. + + no utf8; + my @u1 = unpack "U*", "\xc2\xb6"; + is_deeply(\@u1, [194, 182], '{\xc2,\xb6} unpacks as two code points'); + + # ASCII stays ASCII. + my @u2 = unpack "U*", "abc"; + is_deeply(\@u2, [97, 98, 99], 'ASCII U* returns byte values'); + + # After `C0` Perl switches to "byte mode" — the string is treated as + # UTF-8 bytes and `U` decodes one multi-byte sequence per read. + my @u3 = unpack "C0U*", "\xc2\xb6"; + is_deeply(\@u3, [182], 'C0U* decodes as UTF-8'); + + # After `U0` we stay in character mode and `U` reads code points + # directly (same as starting the template with `U`). + my @u4 = unpack "U0U*", "\xc2\xb6"; + is_deeply(\@u4, [194, 182], 'U0U* reads code points'); + + # A string that really does contain a high Unicode code point round-trips. + my @u5 = unpack "U*", "\x{8000}\x{20}"; + is_deeply(\@u5, [0x8000, 0x20], 'high Unicode U* returns code points'); + + # And the `U0U C0 W` sequence from op/pack.t must advance both the + # code-point and byte cursors when U is read in byte mode, so that + # the subsequent `C0 W` sees the *next* character, not the one we + # just consumed. + my @u6 = unpack "U0U C0 W", "\xf8\xf9\xfa"; + is_deeply(\@u6, [248, 249], 'U0U C0 W advances both cursors'); +}; + done_testing();