diff --git a/.gitignore b/.gitignore
index 37286a45f..37c52279f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,8 @@ perl5/
# Ignore the t/ directory (Perl test suite copied from perl5/t/)
# These should be synced from perl5 repo, not committed here
t/
+# But allow bundled module test directories
+!src/test/resources/module/*/t/
# Ignore perl5_t/ directory (module tests from perl5/lib/*.t)
# These are synced from perl5 repo via sync.pl, not committed here
diff --git a/AGENTS.md b/AGENTS.md
index 676299076..84fea0ba2 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -82,6 +82,7 @@ PerlOnJava does **not** implement the following Perl features:
|---------|--------------|
| `make` | Build + run all unit tests (use before committing) |
| `make dev` | Build only, skip tests (for quick iteration during debugging) |
+| `make test-bundled-modules` | Run bundled CPAN module tests (XML::Parser, etc.) |
- For interpreter changes, test with both backends:
```bash
diff --git a/Makefile b/Makefile
index c022f5f5d..645d0597f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all clean test test-unit test-interpreter test-exiftool test-all test-gradle test-gradle-unit test-gradle-all test-gradle-parallel test-maven-parallel build run wrapper check-java-gradle dev ci sbom sbom-java sbom-perl sbom-clean check-links
+.PHONY: all clean test test-unit test-interpreter test-bundled-modules test-exiftool test-all test-gradle test-gradle-unit test-gradle-all test-gradle-parallel test-maven-parallel build run wrapper check-java-gradle dev ci sbom sbom-java sbom-perl sbom-clean check-links
all: build
@@ -64,6 +64,15 @@ test-interpreter:
@echo "Running unit tests with bytecode interpreter..."
JPERL_INTERPRETER=1 perl dev/tools/perl_test_runner.pl --jobs 8 --timeout 60 --output test_interpreter_results.json src/test/resources/unit
+# Bundled CPAN module tests (XML::Parser, etc.)
+# Tests live under src/test/resources/module/{ModuleName}/t/
+test-bundled-modules: check-java-gradle
+ifeq ($(OS),Windows_NT)
+ gradlew.bat testModule --rerun-tasks
+else
+ ./gradlew testModule --rerun-tasks
+endif
+
# Image::ExifTool test suite (Image-ExifTool-13.44/t/ directory)
test-exiftool:
@echo "Running Image::ExifTool tests..."
diff --git a/README.md b/README.md
index 6bc9ba884..780ed53b4 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ PerlOnJava compiles Perl to JVM bytecode. One jar file runs on Linux, macOS, and
- **Single jar distribution** — no installation, no dependencies beyond Java
- **Full toolchain** — `jperl`, `jperldoc`, `jcpan`, `jprove`
-- **150+ modules included** — [DBI](docs/guides/database-access.md), HTTP::Tiny, JSON, YAML, Text::CSV, and more
+- **150+ modules included** — [DBI](docs/guides/database-access.md), HTTP::Tiny, JSON, XML::Parser, YAML, Text::CSV, and more
- **Install more with jcpan** — [pure-Perl CPAN modules](docs/guides/using-cpan-modules.md) work out of the box
- **JDBC database access** — [PostgreSQL, MySQL, SQLite, Oracle](docs/guides/database-access.md) via standard JDBC drivers
- **Embed in Java apps** — [JSR-223 ScriptEngine](docs/guides/java-integration.md) integration
diff --git a/build.gradle b/build.gradle
index 6cc2556a1..d6f14929b 100644
--- a/build.gradle
+++ b/build.gradle
@@ -264,6 +264,22 @@ tasks.register('testAll', Test) {
shouldRunAfter testUnit
}
+// Bundled module tests (XML::Parser, etc.)
+// Tests live under src/test/resources/module/{ModuleName}/t/
+tasks.register('testModule', Test) {
+ description = 'Runs bundled CPAN module tests (e.g. XML::Parser)'
+ group = 'verification'
+
+ testClassesDirs = sourceSets.test.output.classesDirs
+ classpath = sourceSets.test.runtimeClasspath
+
+ useJUnitPlatform {
+ includeTags 'module'
+ }
+
+ shouldRunAfter testUnit
+}
+
// Shadow JAR configuration for creating standalone executable
shadowJar {
archiveClassifier.set('')
diff --git a/dev/design/xml_parser_xs.md b/dev/design/xml_parser_xs.md
new file mode 100644
index 000000000..6b12d87eb
--- /dev/null
+++ b/dev/design/xml_parser_xs.md
@@ -0,0 +1,48 @@
+# XML::Parser Java XS Implementation Plan
+
+## Overview
+
+XML::Parser is implemented as a Java XS module (`XMLParserExpat.java`) backed by JDK's built-in SAX parser (`javax.xml.parsers.SAXParser`). This replaces the native C/XS expat bindings with a pure-Java equivalent, dispatching SAX events to the same Perl callback interface.
+
+## Architecture
+
+- **Java XS**: `src/main/java/org/perlonjava/runtime/perlmodule/XMLParserExpat.java`
+- **Perl shim**: `src/main/perl/lib/XML/Parser/Expat.pm` (modified from upstream)
+- **Backend**: JDK SAX (Apache Xerces built into the JDK)
+
+### Key Design Decisions
+
+1. **SAX vs DOM**: SAX chosen for streaming event model that maps naturally to expat's callback API
+2. **Namespace dualvars**: Namespace-qualified names use `DualVar(numericIndex, stringName)` matching expat's behavior where `int($name)` gives namespace index
+3. **BYTE_STRING encoding**: ParseString uses `ISO_8859_1` for `BYTE_STRING` input to avoid double-encoding raw UTF-8 bytes
+4. **SystemId un-resolution**: SAX resolves relative systemIds to absolute `file:///` URIs; `unresolveSysId()` strips the base to recover the original relative paths
+
+## Test Status
+
+**Current: 45/45 test files pass (100%), 434/434 subtests pass (100%)**
+
+All XML::Parser 2.56 tests pass (excluding 2 `Devel::CheckLib` C compiler detection tests that are irrelevant to PerlOnJava).
+
+### Running Tests
+
+Tests are stored in `src/test/resources/module/XML-Parser/` and run via:
+
+```bash
+make test-bundled-modules
+```
+
+This uses JUnit 5 (`ModuleTestExecutionTest.java`) with `@Tag("module")` to discover and execute all `.t` files under `module/*/t/`. The test runner `chdir`s to the module directory so relative paths resolve correctly.
+
+## Completed Phases
+
+### Phase 5: Final fixes for 47/47 (2026-04-07)
+
+**Tests fixed**: decl.t (44/46→46/46), foreign_dtd.t (0/5→5/5), checklib_findcc.t (2/3→3/3), checklib_tmpdir.t (1/3→3/3)
+
+1. **NOTATION type format fix**: Off-by-one bug in `attributeDecl()` — `substring(8)` → `substring(9)` to strip the space SAX adds after `NOTATION`
+2. **XMLDecl for text declarations**: Added `fireTextDeclHandler()` in `resolveEntity()` to fire the XMLDecl callback for text declarations in external parsed entities (with `version=undef`), before `convertEncoding()` rewrites the encoding
+3. **UseForeignDTD**: When `UseForeignDTD => 1` and no DOCTYPE exists, calls ExternEnt handler with `(parser, base, undef, undef)`, reads DTD content, injects a synthetic `<!DOCTYPE ...>` declaration after the XML declaration, and resolves the synthetic system ID in `resolveEntity()`
+4. **"undefined entity" error message**: SAX reports `"was referenced, but not declared"` for undefined entities; mapped to expat's `"undefined entity"` format in `formatError()`
+5. **Devel::CheckLib**: Replaced 9-line stub with real upstream source from XML-Parser-2.56 tarball
+
+Files: XMLParserExpat.java
diff --git a/dev/modules/README.md b/dev/modules/README.md
index 10525676e..c4a50c6ae 100644
--- a/dev/modules/README.md
+++ b/dev/modules/README.md
@@ -124,3 +124,4 @@ PERL_PARAMS_UTIL_PP=1 ./jcpan -t Class::Load
- [dbix_class.md](dbix_class.md) - DBIx::Class support
- [log4perl-compatibility.md](log4perl-compatibility.md) - Log::Log4perl
- [term_readkey.md](term_readkey.md) - Term::ReadKey
+- [xml_parser.md](xml_parser.md) - XML::Parser (Java XS via JDK SAX)
diff --git a/dev/modules/xml_parser.md b/dev/modules/xml_parser.md
new file mode 100644
index 000000000..02461fd6f
--- /dev/null
+++ b/dev/modules/xml_parser.md
@@ -0,0 +1,388 @@
+# XML::Parser Support for PerlOnJava
+
+> **Active plan and progress tracking**: See [`dev/design/xml_parser_xs.md`](../design/xml_parser_xs.md)
+>
+> This document contains the original architecture and reference material.
+> For current status, TODOs, and implementation progress, use the design doc.
+
+## Overview
+
+**Module**: XML::Parser 2.56 (depends on XML::Parser::Expat XS backend)
+**Test command**: `./jcpan --jobs 8 -t XML::Parser`
+**Status**: 41/47 test files pass (95%)
+**Branch**: `feature/xml-parser`
+
+## Problem Statement
+
+XML::Parser is one of the most widely-used CPAN XML modules. It's a required dependency for
+XML::SAX::Expat, XML::Twig, XML::RSS, SVG::Parser, and many other modules. It's also an
+optional dependency of XML::Simple (whose `t/A_XMLParser.t` and `t/C_External_Entities.t` tests
+currently SKIP because XML::Parser is missing).
+
+The module currently fails to install via jcpan because:
+
+1. **Makefile.PL uses `Devel::CheckLib`** to verify that `libexpat` (a C library) is available.
+ This check runs `check_lib(lib => ['expat'], header => ['expat.h'])` which tries to compile
+ a C test program — this fails under PerlOnJava since there's no C compiler integration.
+2. Even if Makefile.PL succeeded, **Expat.xs cannot be compiled** — PerlOnJava runs on the JVM
+ and cannot load native `.so`/`.dylib` objects.
+
+## Solution: Java XS Implementation
+
+Implement `XML::Parser::Expat` as a **Java XS class** (`XMLParserExpat.java`) using the JDK's
+built-in `javax.xml.parsers.SAXParser` as the XML parsing engine. This follows the established
+pattern of `HTMLParser.java`, `DateTime.java`, `DigestMD5.java`, etc.
+
+**No new Maven/Gradle dependencies required** — Java's SAX parser is part of the JDK standard
+library (`java.xml` module).
+
+### Why JDK SAX and not libexpat?
+
+- PerlOnJava cannot load native libraries (no JNI/FFM for expat)
+- JDK SAX provides event-based parsing identical in concept to expat
+- Zero external dependencies — the project already follows this pattern for DateTime (`java.time`),
+ Digest::MD5 (`java.security.MessageDigest`), HTML::Parser (Jsoup), etc.
+- JDK SAX supports all core expat features: elements, attributes, characters, PIs, comments,
+ CDATA sections, DTD declarations, namespace processing, external entities
+
+## Dependency Tree
+
+```
+XML::Parser 2.56
+├── XML::Parser::Expat (XS → Java XS implementation)
+│ ├── XSLoader (loads XMLParserExpat.java)
+│ ├── File::Spec (bundled)
+│ └── File::ShareDir (CPAN, for encoding maps)
+├── XML::Parser::Style::Debug (pure Perl, bundled in CPAN dist)
+├── XML::Parser::Style::Subs (pure Perl)
+├── XML::Parser::Style::Tree (pure Perl)
+├── XML::Parser::Style::Objects (pure Perl)
+├── XML::Parser::Style::Stream (pure Perl)
+├── XML::Parser::ContentModel (pure Perl, in Expat.pm)
+├── XML::Parser::ExpatNB (pure Perl, in Expat.pm)
+└── LWP::UserAgent (optional, for external entity fetching)
+```
+
+## Architecture
+
+### Component Overview
+
+```
+┌─────────────────────────────────────────────┐
+│ Perl layer (from CPAN, installed by jcpan) │
+│ ├── XML::Parser (Parser.pm) │
+│ ├── XML::Parser::Style::* (pure Perl) │
+│ └── XML::Parser::ExpatNB (in Expat.pm) │
+├─────────────────────────────────────────────┤
+│ Perl shim (bundled in jar:PERL5LIB) │
+│ └── XML/Parser/Expat.pm │
+│ - Loads Java XS via XSLoader │
+│ - Pure Perl methods: setHandlers, │
+│ context, namespace methods, etc. │
+│ - Delegates XS calls to Java │
+├─────────────────────────────────────────────┤
+│ Java XS (XMLParserExpat.java) │
+│ └── Implements XS functions: │
+│ ParserCreate, ParseString, ParseStream │
+│ Set*Handler, Get*Position, etc. │
+│ Uses javax.xml.parsers.SAXParser │
+└─────────────────────────────────────────────┘
+```
+
+### Key Design Decisions
+
+1. **Reuse CPAN's Parser.pm**: The high-level `XML::Parser` module is pure Perl. We install it
+ from CPAN and only replace the XS backend (`XML::Parser::Expat`).
+
+2. **Bundled Expat.pm shim**: We provide our own `XML/Parser/Expat.pm` in `jar:PERL5LIB` that:
+ - Calls `XSLoader::load('XML::Parser::Expat')` to load `XMLParserExpat.java`
+ - Contains all the pure Perl methods from the original Expat.pm (context tracking, namespace
+ methods, xml_escape, etc.)
+ - Delegates XS-only functions (ParserCreate, ParseString, etc.) to the Java class
+
+3. **Opaque parser handle**: The `{Parser}` field in the Expat object stores a Java `SAXParser`
+ wrapper object (as `RuntimeScalarType.JAVAOBJECT`), similar to how DigestMD5 stores
+ `MessageDigest`.
+
+4. **Callback dispatch**: The Java SAX `ContentHandler`/`LexicalHandler`/`DTDHandler` methods
+ invoke Perl handler coderefs stored in the Expat hash, using `RuntimeCode.apply()`.
+
+## XS Function Mapping
+
+### Tier 1 — Core (required for basic XML::Parser usage)
+
+| XS Function | Java SAX Backend | Notes |
+|---|---|---|
+| `ParserCreate(self, enc, ns)` | `SAXParserFactory.newInstance()` | Store parser in `$self->{Parser}` as JAVAOBJECT |
+| `ParseString(parser, string)` | `parser.parse(InputSource)` | Convert string to `InputSource` |
+| `ParseStream(parser, ioref, delim)` | `parser.parse(InputStream)` | Read from Perl IO handle; Stream_Delimiter support |
+| `SetStartElementHandler` | `ContentHandler.startElement()` | Dispatch to Perl `Start` handler |
+| `SetEndElementHandler` | `ContentHandler.endElement()` | Dispatch to Perl `End` handler |
+| `SetCharacterDataHandler` | `ContentHandler.characters()` | Dispatch to Perl `Char` handler |
+| `SetProcessingInstructionHandler` | `ContentHandler.processingInstruction()` | Dispatch to Perl `Proc` handler |
+| `SetCommentHandler` | `LexicalHandler.comment()` | Dispatch to Perl `Comment` handler |
+| `SetStartCdataHandler` | `LexicalHandler.startCDATA()` | Dispatch to Perl `CdataStart` handler |
+| `SetEndCdataHandler` | `LexicalHandler.endCDATA()` | Dispatch to Perl `CdataEnd` handler |
+| `SetDefaultHandler` | Custom tracking | Catch-all for unhandled events |
+| `SetXMLDeclHandler` | Custom prolog detection | Parse `<?xml ...?>` prolog manually or via SAX property |
+| `GetCurrentLineNumber` | `Locator.getLineNumber()` | SAX Locator |
+| `GetCurrentColumnNumber` | `Locator.getColumnNumber()` | SAX Locator |
+| `SetBase` / `GetBase` | Field on Java wrapper | Simple string get/set |
+| `ParserRelease` / `ParserFree` | Clear references | No native memory to free |
+| `UnsetAllHandlers` | Clear all handler SVs | Used by `finish()` |
+
+### Tier 2 — DTD Features (required for ExifTool, XML::SAX::Expat)
+
+| XS Function | Java SAX Backend | Notes |
+|---|---|---|
+| `SetUnparsedEntityDeclHandler` | `DTDHandler.unparsedEntityDecl()` | |
+| `SetNotationDeclHandler` | `DTDHandler.notationDecl()` | |
+| `SetExternalEntityRefHandler` | `EntityResolver.resolveEntity()` | Map to Perl's ExternEnt handler |
+| `SetExtEntFinishHandler` | Post-entity callback | |
+| `SetEntityDeclHandler` | `DeclHandler.internalEntityDecl()` / `externalEntityDecl()` | |
+| `SetElementDeclHandler` | `DeclHandler.elementDecl()` | Return ContentModel object |
+| `SetAttListDeclHandler` | `DeclHandler.attributeDecl()` | |
+| `SetDoctypeHandler` | `LexicalHandler.startDTD()` | |
+| `SetEndDoctypeHandler` | `LexicalHandler.endDTD()` | |
+| `GetSpecifiedAttributeCount` | Track in `startElement` | |
+| `ElementIndex` | Depth-first counter | |
+
+### Tier 3 — Advanced / Incremental Parsing
+
+| XS Function | Java SAX Backend | Notes |
+|---|---|---|
+| `ParsePartial` | Chunked `InputSource` | For ExpatNB `parse_more()` |
+| `ParseDone` | Signal end-of-stream | |
+| `GetCurrentByteIndex` | Approximate via char counting | SAX Locator lacks byte offset |
+| `GetCurrentByteCount` | Approximate or stub | |
+| `RecognizedString` | Reconstruct from events | Not directly available in SAX |
+| `OriginalString` | Reconstruct or stub | Not directly available in SAX |
+| `PositionContext` | Track input buffer | Reconstruct context around current position |
+| `DefaultCurrent` | Re-fire to default handler | |
+| `SkipUntil` | Suppress callbacks until index | |
+| `GenerateNSName` | Perl-level implementation | Already in Expat.pm |
+| `LoadEncoding` / `FreeEncoding` | Stub / no-op | Java handles encodings natively |
+| `ExpatVersion` / `ExpatVersionInfo` | Return synthetic values | e.g., `"PerlOnJava SAX/1.0"` |
+| `ErrorString` | Map SAX exception messages | |
+| Security methods | No-op stubs | Java SAX has its own security model |
+
+## Expat.pm Shim Design
+
+The bundled `XML/Parser/Expat.pm` in `jar:PERL5LIB` replaces the CPAN version. It contains:
+
+1. **All pure Perl code from the original Expat.pm** — namespace methods, context tracking,
+ `xml_escape()`, `ContentModel` package, `ExpatNB` package, `Encinfo` package
+2. **`XSLoader::load('XML::Parser::Expat')`** instead of native XS loading
+3. **Adjusted `%Handler_Setters`** — maps handler type names to Java-backed setter functions
+ registered by `XMLParserExpat.java`
+
+The Perl methods that wrap XS calls (`parse`, `current_line`, `base`, etc.) work unchanged
+because they delegate to the Java-registered functions through the same calling convention.
+
+## MakeMaker Integration
+
+### Problem
+XML::Parser's `Makefile.PL` uses `Devel::CheckLib` (bundled in `./inc/`) to verify libexpat:
+
+```perl
+use lib './inc';
+use Devel::CheckLib;
+unless (check_lib(lib => ['expat'], header => ['expat.h'], ...)) {
+ warn "Expat must be installed...";
+ exit 0; # ← exits BEFORE WriteMakefile() is called
+}
+WriteMakefile1( NAME => 'XML::Parser', DIR => ['Expat'], ... );
+```
+
+Because `exit 0` happens before `WriteMakefile()`, PerlOnJava's custom MakeMaker never runs,
+no `Makefile` is generated, and CPAN::Distribution aborts with "No 'Makefile' created".
+
+### Solution: Two-layer approach (Strategy D)
+
+**Layer 1 — Stub `Devel::CheckLib` in build directory** (`CPAN/Distribution.pm`):
+
+Before running `Makefile.PL`, detect `./inc/Devel/CheckLib.pm` in the build directory and
+replace it with a PerlOnJava stub that always succeeds:
+
+```perl
+package Devel::CheckLib;
+use Exporter; our @ISA = ('Exporter');
+our @EXPORT = qw(assert_lib check_lib_or_exit check_lib);
+sub assert_lib { 1 }
+sub check_lib_or_exit { 1 }
+sub check_lib { 1 }
+1;
+```
+
+This lets `Makefile.PL` proceed to `WriteMakefile()`, where PerlOnJava's custom MakeMaker
+detects XS files and installs `.pm` files via `_handle_xs_module()`.
+
+**Layer 2 — Fallback Makefile.PL generation** (`CPAN/Distribution.pm`):
+
+As a safety net, when `Makefile.PL` exits 0 but no `Makefile` is created, generate a
+synthetic `Makefile.PL` from `META.yml`/`META.json` metadata and re-run it. This catches
+any module that dies/exits before `WriteMakefile()` regardless of the reason.
+
+### Additional complications
+
+- **Non-standard layout**: `Parser.pm` lives at the distribution root, not in `lib/`.
+ MakeMaker's `_install_pure_perl()` must handle this (it already scans for `.pm` files
+ at the root for flat-layout dists).
+- **Subdirectory build**: `DIR => ['Expat']` causes recursion into `Expat/Makefile.PL`,
+ which also calls `Devel::CheckLib`. The stub handles this automatically.
+- **File::ShareDir::Install**: Uses `install_share dist => 'share'` for encoding `.enc`
+ files. These can be installed but are unused (Java handles encodings natively).
+- **CPAN `Expat/Expat.pm` vs JAR shim**: Our `jar:PERL5LIB` Expat.pm shim takes
+ precedence over the CPAN-installed version because MakeMaker's JAR-shim deduplication
+ (lines 269-281) skips `.pm` files that already exist in `jar:PERL5LIB`.
+
+## Test Suite Analysis
+
+XML::Parser 2.56 has **47 test files**. Expected results by category:
+
+### Expected to Pass (with Java SAX backend)
+
+| Category | Test Files | Count | Notes |
+|---|---|---|---|
+| Core parsing | `styles.t`, `cdata.t`, `file.t`, `stream.t`, `partial.t` | 5 | Basic parse/style tests |
+| Handlers | `decl.t`, `namespaces.t`, `skip.t`, `finish.t` | 4 | Handler dispatch |
+| DTD | `parament.t`, `parament_internal.t`, `foreign_dtd.t` | 3 | DTD processing |
+| Error handling | `xpcroak.t`, `xpcarp.t`, `parse_error_context.t`, `error_string.t`, `error_hint.t` | 5 | Error reporting |
+| External entities | `external_ent.t`, `extern_ent_lexical_glob.t`, `nolwp.t`, `get_base.t` | 4 | Entity resolution |
+| UTF-8 | `utf8_handling.t`, `utf8_stream.t`, `debug_multibyte.t` | 3 | Encoding |
+| Security | `security_api.t`, `deep_nesting.t` | 2 | May need stubs |
+| Misc | `xml_escape.t`, `g_void.t`, `subs_inherited.t`, `tree_entity_expand.t`, `combine_chars.t`, `defaulted.t`, `element_decl.t`, `stream_attr_escape.t`, `stream_localize.t`, `file_open_scalar.t`, `parsefile_base_restore.t`, `bare_glob_filehandle.t` | 12 | Various features |
+| Stress | `astress.t` | 1 | Large document |
+
+### Expected to Need Stubs/Workarounds
+
+| Test File | Issue | Strategy |
+|---|---|---|
+| `current_byte.t` | SAX Locator lacks byte offset | Approximate via UTF-8 byte counting, or skip |
+| `current_length.t` | SAX Locator lacks byte count | Approximate or skip |
+| `encoding.t` | Custom `.enc` encoding maps | Stub `LoadEncoding`, use Java charset support |
+| `expat_version.t` | Reports expat version string | Return synthetic version |
+| `position_overflow.t` | Tests byte offset overflow | Depends on byte tracking impl |
+| `memory_leak_symtab.t` | Tests symbol table cleanup | May need DESTROY (known limitation) |
+
+### Build/Config Tests (may need adaptation)
+
+| Test File | Issue |
+|---|---|
+| `checklib_findcc.t` | Tests Devel::CheckLib C compiler detection |
+| `checklib_tmpdir.t` | Tests Devel::CheckLib temp directory |
+
+## Implementation Plan
+
+### Phase 1: Infrastructure and Installation (estimated: 1-2 sessions)
+
+| Step | Description | File |
+|---|---|---|
+| 1a | Create `XMLParserExpat.java` skeleton extending `PerlModuleBase` | `src/main/java/.../perlmodule/XMLParserExpat.java` |
+| 1b | Implement `ParserCreate` — create `SAXParser` wrapper, store in hash | `XMLParserExpat.java` |
+| 1c | Implement all `Set*Handler` methods — store Perl coderefs | `XMLParserExpat.java` |
+| 1d | Create `XML/Parser/Expat.pm` shim for `jar:PERL5LIB` | `src/main/perl/lib/XML/Parser/Expat.pm` |
+| 1e | Fix installation path so `jcpan` can install Parser.pm and Style modules | `ExtUtils/MakeMaker.pm` or `Devel/CheckLib.pm` stub |
+| 1f | Run `make` to verify unit tests pass | — |
+
+**Result**: `use XML::Parser` loads without error; no parsing yet.
+
+### Phase 2: Core Parsing (estimated: 2-3 sessions)
+
+| Step | Description | File |
+|---|---|---|
+| 2a | Implement `ParseString` — feed string to SAX parser, dispatch callbacks | `XMLParserExpat.java` |
+| 2b | Implement `ParseStream` — read from Perl IO handle, feed to SAX | `XMLParserExpat.java` |
+| 2c | Implement Start/End/Char handler dispatch with Perl callback invocation | `XMLParserExpat.java` |
+| 2d | Implement Comment, PI, CdataStart/CdataEnd dispatch | `XMLParserExpat.java` |
+| 2e | Implement position tracking (`Locator` → `current_line`/`current_column`) | `XMLParserExpat.java` |
+| 2f | Implement `base()` get/set | `XMLParserExpat.java` |
+| 2g | Test: `styles.t`, `cdata.t`, `file.t`, basic parsing | — |
+
+**Result**: Basic XML parsing works with Tree/Debug/Stream/Subs/Objects styles.
+
+### Phase 3: DTD and Declarations (estimated: 1-2 sessions)
+
+| Step | Description | File |
+|---|---|---|
+| 3a | Implement Doctype/DoctypeFin via `LexicalHandler.startDTD/endDTD` | `XMLParserExpat.java` |
+| 3b | Implement Entity/Element/Attlist via `DeclHandler` | `XMLParserExpat.java` |
+| 3c | Implement Unparsed/Notation via `DTDHandler` | `XMLParserExpat.java` |
+| 3d | Implement ExternEnt/ExternEntFin via `EntityResolver` | `XMLParserExpat.java` |
+| 3e | Implement `ContentModel` construction from `DeclHandler.elementDecl` | `XMLParserExpat.java` or Expat.pm |
+| 3f | Implement XMLDecl handler (parse `<?xml ...?>` prolog) | `XMLParserExpat.java` |
+| 3g | Test: `decl.t`, `parament.t`, `external_ent.t`, `namespaces.t` | — |
+
+**Result**: DTD-heavy tests pass; XML::SAX::Expat can use our backend.
+
+### Phase 4: Advanced Features (estimated: 1-2 sessions)
+
+| Step | Description | File |
+|---|---|---|
+| 4a | Implement Default handler (catch-all for unhandled events) | `XMLParserExpat.java` |
+| 4b | Implement `ParsePartial`/`ParseDone` for ExpatNB incremental parsing | `XMLParserExpat.java` |
+| 4c | Implement `specified_attr()` and `element_index()` | `XMLParserExpat.java` |
+| 4d | Implement byte position tracking (approximate) | `XMLParserExpat.java` |
+| 4e | Stub security API methods (no-op) | `XMLParserExpat.java` |
+| 4f | Stub `ExpatVersion()`/`ExpatVersionInfo()` | `XMLParserExpat.java` |
+| 4g | Stub `LoadEncoding`/`FreeEncoding` (Java handles encodings natively) | `XMLParserExpat.java` |
+| 4h | Test: full test suite, count pass/fail/skip | — |
+
+**Result**: Near-complete XML::Parser support.
+
+### Phase 5: Polish and Downstream Modules (estimated: 1 session)
+
+| Step | Description |
+|---|---|
+| 5a | Fix remaining test failures discovered in Phase 4 |
+| 5b | Test XML::Simple with XML::Parser backend (`t/A_XMLParser.t`, `t/C_External_Entities.t`) |
+| 5c | Test XML::SAX::Expat integration |
+| 5d | Update `dev/modules/xml_simple.md` to reflect XML::Parser availability |
+| 5e | Update `dev/modules/README.md` with XML::Parser entry |
+
+**Result**: XML::Parser fully working, downstream modules benefit.
+
+## Known Limitations
+
+### SAX vs Expat Behavioral Differences
+
+| Feature | Expat (C) | JDK SAX | Impact |
+|---|---|---|---|
+| Byte offset/count | Exact | Not available | `current_byte()` returns approximate value or -1 |
+| Original string | Exact verbatim bytes | Not available | `original_string()` returns reconstructed or undef |
+| Recognized string | UTF-8 representation | Not available | `recognized_string()` returns reconstructed or undef |
+| Custom `.enc` maps | Binary encoding files | Java charset support | `load_encoding()` is a no-op; Java handles encodings |
+| Stream delimiter | Native support | Must be implemented in Java wrapper | Wrap InputStream to detect delimiter |
+| Entity expansion control | `NoExpand` option | SAX `external-general-entities` feature | Map to SAX feature flags |
+| Billion Laughs protection | libexpat 2.4.0+ API | Java SAX has its own limits | Stub the API; Java protects by default |
+
+### Tests Expected to Remain Failing
+
+| Test | Reason |
+|---|---|
+| `checklib_findcc.t` | Tests C compiler detection — not relevant on JVM |
+| `checklib_tmpdir.t` | Tests C compiler temp dirs — not relevant on JVM |
+| `memory_leak_symtab.t` | May test DESTROY behavior (known PerlOnJava limitation) |
+
+## Progress Tracking
+
+> See [`dev/design/xml_parser_xs.md`](../design/xml_parser_xs.md) for current progress.
+
+### Completed
+- [x] Investigation and API catalog (2025-04-07)
+- [x] Phase 1: Infrastructure and installation (2025-04-06)
+- [x] Phase 2: Core parsing (2025-04-06)
+- [x] Phase 3: DTD and declarations (2025-04-07)
+- [x] Phase 4 partial: Advanced features (2025-04-07)
+- 41/47 test files pass (95%)
+
+### Remaining
+- Phase 4 continued: Encoding conversion (x-sjis-unicode)
+- UseForeignDTD
+
+## Related Documents
+
+- `dev/modules/xml_simple.md` — XML::Simple (benefits from XML::Parser availability)
+- `dev/modules/xs_fallback.md` — XS fallback mechanism
+- `dev/modules/makemaker_perlonjava.md` — MakeMaker implementation
+- `dev/modules/xsloader.md` — XSLoader architecture
diff --git a/docs/about/changelog.md b/docs/about/changelog.md
index d73da6f64..076d4e123 100644
--- a/docs/about/changelog.md
+++ b/docs/about/changelog.md
@@ -12,7 +12,7 @@ Release history of PerlOnJava. See [Roadmap](roadmap.md) for future plans.
- Lexical warnings with `use warnings` and FATAL support
- Non-local control flow: `last`/`next`/`redo`/`goto LABEL`/`goto $EXPR`
- Tail call with trampoline for `goto &NAME` and `goto __SUB__`
-- Add modules: `CPAN`, `Time::Piece`, `TOML`, `DirHandle`, `Dumpvalue`, `Sys::Hostname`, `IO::Socket`, `IO::Socket::INET`, `IO::Socket::UNIX`, `IO::Zlib`, `Archive::Tar`, `Archive::Zip`, `Net::FTP`, `Net::Cmd`, `IPC::Open2`, `IPC::Open3`, `ExtUtils::MakeMaker`.
+- Add modules: `CPAN`, `Time::Piece`, `TOML`, `DirHandle`, `Dumpvalue`, `Sys::Hostname`, `IO::Socket`, `IO::Socket::INET`, `IO::Socket::UNIX`, `IO::Zlib`, `Archive::Tar`, `Archive::Zip`, `Net::FTP`, `Net::Cmd`, `IPC::Open2`, `IPC::Open3`, `ExtUtils::MakeMaker`, `XML::Parser`.
- Add operators: `flock`, `syscall`, `fcntl`, `ioctl`.
- Add `\&CORE::X` subroutine references: built-in functions can be used as first-class code refs (e.g., `\&CORE::push`, `\&CORE::length`) with correct prototypes and glob aliasing.
- Support for forking patterns with `exec`:
diff --git a/docs/reference/feature-matrix.md b/docs/reference/feature-matrix.md
index 56aa50152..64437eb7f 100644
--- a/docs/reference/feature-matrix.md
+++ b/docs/reference/feature-matrix.md
@@ -748,6 +748,7 @@ The `:encoding()` layer supports all encodings provided by Java's `Charset.forNa
- ✅ **JSON** module.
- ✅ **Text::CSV** module.
- ✅ **TOML** module.
+- ✅ **XML::Parser** module backed by JDK SAX (replaces native libexpat XS).
- ✅ **YAML::PP** module.
- ✅ **YAML** module.
diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java
index 88504ad23..becfcf01b 100644
--- a/src/main/java/org/perlonjava/core/Configuration.java
+++ b/src/main/java/org/perlonjava/core/Configuration.java
@@ -33,7 +33,7 @@ public final class Configuration {
* Automatically populated by Gradle/Maven during build.
* DO NOT EDIT MANUALLY - this value is replaced at build time.
*/
- public static final String gitCommitId = "596676cef";
+ public static final String gitCommitId = "28cf7aa5c";
/**
* Git commit date of the build (ISO format: YYYY-MM-DD).
diff --git a/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java b/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java
index 9fc9904bc..8a2782596 100644
--- a/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java
+++ b/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java
@@ -237,6 +237,28 @@ protected void parseVariableInterpolation(String sigil) {
}
}
+ // After ${...}, parse subscript access like ${$ref}{key} or ${$ref}[0]
+ // This matches Perl 5 where "${$hashref}{key}" = $hashref->{key}
+ //
+ // However, when ${var} uses explicit braces with a simple variable name,
+ // [...] and {...} should NOT be parsed as subscripts.
+ // Perl 5 rule: explicit braces terminate the variable name, so:
+ // In regex: ${var}[0] = scalar $var + char class [0]
+ // In string: "${var}[0]" = scalar $var + literal "[0]"
+ // vs: $var[0] = array element $var[0]
+ // Only deref expressions like ${$ref}[0] should parse subscripts after braces.
+ boolean isSimpleBracedVariable = !isArray
+ && operand instanceof OperatorNode opNode
+ && "$".equals(opNode.operator)
+ && opNode.operand instanceof IdentifierNode;
+ if (!isSimpleBracedVariable) {
+ try {
+ operand = parseArrayHashAccess(parser, operand, isRegex);
+ } catch (Exception e) {
+ // If array/hash access parsing fails, use operand as-is
+ }
+ }
+
if (CompilerOptions.DEBUG_ENABLED) ctx.logDebug("str operand " + operand);
} else {
// Parse simple variables using shared logic, but keep the exact same flow
diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/XMLParserExpat.java b/src/main/java/org/perlonjava/runtime/perlmodule/XMLParserExpat.java
new file mode 100644
index 000000000..5a4ae3e5d
--- /dev/null
+++ b/src/main/java/org/perlonjava/runtime/perlmodule/XMLParserExpat.java
@@ -0,0 +1,2195 @@
+package org.perlonjava.runtime.perlmodule;
+
+import org.perlonjava.runtime.operators.Readline;
+import org.perlonjava.runtime.operators.ReferenceOperators;
+import org.perlonjava.runtime.runtimetypes.*;
+
+import static org.perlonjava.runtime.runtimetypes.RuntimeScalarCache.*;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import org.xml.sax.*;
+import org.xml.sax.ext.Attributes2;
+import org.xml.sax.ext.DeclHandler;
+import org.xml.sax.ext.LexicalHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Java XS implementation of XML::Parser::Expat.
+ *
+ * Provides the XS functions called by the Perl Expat.pm shim.
+ * Uses JDK's built-in SAX parser (javax.xml.parsers.SAXParser) as the backend.
+ */
+public class XMLParserExpat extends PerlModuleBase {
+
+ // Version string published by this Java implementation.
+ // NOTE(review): presumably has to match the version the Perl-side Expat.pm expects - confirm.
+ public static final String XS_VERSION = "2.56";
+
+ // Namespace separator character (same as expat's NSDELIM = 0xFC)
+ private static final char NS_SEP = '\u00FC';
+
+ // Keys for storing Java objects in the Perl hash
+ // NOTE(review): ParserCreate hands the state back as its return value rather than
+ // storing it under this key - confirm this constant is still used further down.
+ private static final String PARSER_KEY = "_xml_parser_state";
+
+ /**
+ * Passes the Perl package name "XML::Parser::Expat" to PerlModuleBase.
+ * NOTE(review): the meaning of the second argument (false) is defined by
+ * PerlModuleBase and is not visible here - confirm before changing it.
+ */
+ public XMLParserExpat() {
+ super("XML::Parser::Expat", false);
+ }
+
+ /**
+  * Registers every XS entry point of this class with a fresh module instance.
+  *
+  * Registration stops at the first name that cannot be resolved; in that case a
+  * warning is written to stderr and the remaining names are not registered.
+  */
+ public static void initialize() {
+     XMLParserExpat module = new XMLParserExpat();
+
+     // Registration order is significant only in that a failure aborts the rest.
+     final String[] xsEntryPoints = {
+         // Core parser lifecycle
+         "ParserCreate", "ParserRelease", "ParserFree",
+         // Parsing methods
+         "ParseString", "ParseStream", "ParsePartial", "ParseDone",
+         // Handler setters - each returns the old handler
+         "SetStartElementHandler", "SetEndElementHandler", "SetCharacterDataHandler",
+         "SetProcessingInstructionHandler", "SetCommentHandler",
+         "SetStartCdataHandler", "SetEndCdataHandler", "SetDefaultHandler",
+         "SetUnparsedEntityDeclHandler", "SetNotationDeclHandler",
+         "SetExternalEntityRefHandler", "SetExtEntFinishHandler",
+         "SetEntityDeclHandler", "SetElementDeclHandler", "SetAttListDeclHandler",
+         "SetDoctypeHandler", "SetEndDoctypeHandler", "SetXMLDeclHandler",
+         // Position/info methods
+         "GetCurrentLineNumber", "GetCurrentColumnNumber", "GetCurrentByteIndex",
+         "GetCurrentByteCount", "GetSpecifiedAttributeCount", "ElementIndex",
+         // Base URI
+         "SetBase", "GetBase",
+         // String access
+         "RecognizedString", "OriginalString", "DefaultCurrent",
+         // Context/position
+         "PositionContext", "UnsetAllHandlers", "SkipUntil",
+         // Encoding
+         "LoadEncoding", "FreeEncoding",
+         // Version info
+         "ExpatVersion", "ExpatVersionInfo",
+         // Error
+         "ErrorString",
+         // Namespace helper
+         "GenerateNSName",
+         // Security stubs
+         "SetBillionLaughsAttackProtectionMaximumAmplification",
+         "SetBillionLaughsAttackProtectionActivationThreshold",
+         "SetAllocTrackerMaximumAmplification",
+         "SetAllocTrackerActivationThreshold",
+         "SetReparseDeferralEnabled",
+     };
+
+     try {
+         for (String entryPoint : xsEntryPoints) {
+             module.registerMethod(entryPoint, null);
+         }
+     } catch (NoSuchMethodException e) {
+         System.err.println("Warning: Missing XMLParserExpat method: " + e.getMessage());
+     }
+ }
+
+ // ================================================================
+ // Internal parser state stored as a Java object in the Perl hash
+ // ================================================================
+
+ /**
+  * Mutable per-parser state. ParserCreate builds one instance, wraps it in a
+  * RuntimeScalar and returns it to Perl as the opaque parser handle; the XS entry
+  * points recover it through getState().
+  */
+ static class ParserState {
+     // Handler coderefs stored as RuntimeScalar; null means no handler is installed
+     RuntimeScalar startHandler;
+     RuntimeScalar endHandler;
+     RuntimeScalar charHandler;
+     RuntimeScalar procHandler;
+     RuntimeScalar commentHandler;
+     RuntimeScalar startCdataHandler;
+     RuntimeScalar endCdataHandler;
+     RuntimeScalar defaultHandler;
+     RuntimeScalar unparsedHandler;
+     RuntimeScalar notationHandler;
+     RuntimeScalar externEntHandler;
+     RuntimeScalar externEntFinHandler;
+     RuntimeScalar entityDeclHandler;
+     RuntimeScalar elementDeclHandler;
+     RuntimeScalar attlistDeclHandler;
+     RuntimeScalar doctypeHandler;
+     RuntimeScalar endDoctypeHandler;
+     RuntimeScalar xmlDeclHandler;
+
+     // The Perl self object (Expat hash ref); also receives "ErrorMessage" on failure
+     RuntimeScalar selfRef;
+
+     // Position tracking (fallback values used when no SAX locator is available)
+     int currentLine = 0;
+     int currentColumn = 0;
+     long currentByteIndex = -1;
+     int currentByteCount = 0;
+     int specifiedAttributeCount = 0;
+     int elementIndex = 0;
+     int elementIndexCounter = 0; // monotonically increasing counter
+     // Explicit element type instead of a raw Deque, so stored indices come back as
+     // Integer without casts.
+     // NOTE(review): push/pop sites are outside this excerpt - confirm Integer.
+     java.util.Deque<Integer> elementIndexStack = new java.util.ArrayDeque<>();
+
+     // Base URI (null when unset)
+     String base;
+
+     // Last recognized/original string for reconstructing
+     String recognizedString = "";
+     String originalString = "";
+
+     // Entity expansion tracking for original_string
+     String currentEntityName = null;
+
+     // Track if the current element was self-closing
+     boolean lastWasSelfClosing = false;
+
+     // Skip until element index (-1 initially)
+     int skipUntilIndex = -1;
+
+     // Partial parsing state: chunks accumulated by ParsePartial until ParseDone
+     StringBuilder partialBuffer;
+     boolean partialMode = false;
+     boolean partialIsByteString = false;
+
+     // Namespace mode
+     boolean namespaces = false;
+
+     // NoExpand mode
+     boolean noExpand = false;
+
+     // Error message of the most recent failed parse
+     String errorMessage = "";
+
+     // SAX Locator for position tracking
+     Locator locator;
+
+     // Byte tracking - tracks byte offsets based on input
+     long bytesProcessed = 0;
+
+     // The raw input bytes for byte position tracking
+     byte[] inputBytes;
+     int inputScanPos = 0; // how far we've scanned
+
+     // Base URI from InputSource for un-resolving SAX systemIds
+     String parseBaseUri;
+
+     // Protocol encoding (e.g. "ISO-8859-1") from ParserCreate; null when none was given
+     String protocolEncoding;
+
+     // Foreign DTD content (for UseForeignDTD support)
+     byte[] foreignDtdContent;
+ }
+
+ // ================================================================
+ // Parser lifecycle
+ // ================================================================
+
+ /**
+  * ParserCreate(self_sv, enc_sv, namespaces) - build the Java-side parser state.
+  * Called from Expat.pm: $args{Parser} = ParserCreate($self, $enc, $ns)
+  *
+  * @param args [0] Expat object, [1] optional protocol encoding, [2] optional namespace flag
+  * @return a one-element list holding a scalar that wraps the new ParserState
+  */
+ public static RuntimeList ParserCreate(RuntimeArray args, int ctx) {
+     ParserState created = new ParserState();
+     created.selfRef = args.get(0);
+
+     // A missing or empty encoding argument leaves protocolEncoding at null.
+     String requestedEncoding = args.size() > 1 ? args.get(1).toString() : null;
+     if (requestedEncoding != null && !requestedEncoding.isEmpty()) {
+         created.protocolEncoding = requestedEncoding;
+     }
+
+     created.namespaces = args.size() > 2 && args.get(2).getBoolean();
+
+     // The state object itself is the opaque handle handed back to Perl.
+     return new RuntimeScalar(created).getList();
+ }
+
+ /**
+ * ParserRelease(parser) - Break circular references.
+ * Deliberately does nothing: the arguments are ignored and undef is returned.
+ */
+ public static RuntimeList ParserRelease(RuntimeArray args, int ctx) {
+ // No-op on JVM - GC handles circular refs
+ return scalarUndef.getList();
+ }
+
+ /**
+ * ParserFree(parser) - Free parser resources.
+ * Deliberately does nothing: the arguments are ignored and undef is returned.
+ */
+ public static RuntimeList ParserFree(RuntimeArray args, int ctx) {
+ // No-op on JVM
+ return scalarUndef.getList();
+ }
+
+ // ================================================================
+ // Helper to get ParserState from the opaque parser handle
+ // ================================================================
+
+ /**
+  * Unwraps the ParserState carried by an opaque parser handle.
+  *
+  * @param parser scalar expected to be of type JAVAOBJECT with a ParserState value
+  * @return the wrapped state
+  * @throws PerlCompilerException if the scalar is null or does not wrap a ParserState
+  */
+ private static ParserState getState(RuntimeScalar parser) {
+     boolean wrapsState = parser != null
+             && parser.type == RuntimeScalarType.JAVAOBJECT
+             && parser.value instanceof ParserState;
+     if (!wrapsState) {
+         throw new PerlCompilerException("Invalid parser object");
+     }
+     return (ParserState) parser.value;
+ }
+
+ // ================================================================
+ // Handler setter methods - each returns the old handler
+ // ================================================================
+
+ /**
+  * Shared implementation of all Set*Handler entry points.
+  *
+  * The accessor parameters carry explicit type arguments: with raw
+  * Function/BiConsumer types, getter.apply(state) yields Object and the callers'
+  * lambdas cannot reach ParserState fields, so nothing compiles.
+  *
+  * @param parser     opaque parser handle
+  * @param newHandler replacement handler; null, undef or any false value clears the slot
+  * @param getter     reads the handler slot from the state
+  * @param setter     writes the handler slot in the state
+  * @return the previously installed handler, or undef if the slot was empty
+  * @throws PerlCompilerException if the handle does not wrap a ParserState
+  */
+ private static RuntimeScalar setHandler(RuntimeScalar parser, RuntimeScalar newHandler,
+         java.util.function.Function<ParserState, RuntimeScalar> getter,
+         java.util.function.BiConsumer<ParserState, RuntimeScalar> setter) {
+     ParserState state = getState(parser);
+     RuntimeScalar old = getter.apply(state);
+     if (old == null) old = scalarUndef;
+
+     // Only a defined, true value counts as a handler; anything else empties the slot.
+     boolean install = newHandler != null
+             && newHandler.type != RuntimeScalarType.UNDEF
+             && newHandler.getBoolean();
+     setter.accept(state, install ? newHandler : null);
+     return old;
+ }
+
+ public static RuntimeList SetStartElementHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.startHandler, (s, h) -> s.startHandler = h).getList();
+ }
+
+ public static RuntimeList SetEndElementHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.endHandler, (s, h) -> s.endHandler = h).getList();
+ }
+
+ public static RuntimeList SetCharacterDataHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.charHandler, (s, h) -> s.charHandler = h).getList();
+ }
+
+ public static RuntimeList SetProcessingInstructionHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.procHandler, (s, h) -> s.procHandler = h).getList();
+ }
+
+ public static RuntimeList SetCommentHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.commentHandler, (s, h) -> s.commentHandler = h).getList();
+ }
+
+ public static RuntimeList SetStartCdataHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.startCdataHandler, (s, h) -> s.startCdataHandler = h).getList();
+ }
+
+ public static RuntimeList SetEndCdataHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.endCdataHandler, (s, h) -> s.endCdataHandler = h).getList();
+ }
+
+ public static RuntimeList SetDefaultHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.defaultHandler, (s, h) -> s.defaultHandler = h).getList();
+ }
+
+ public static RuntimeList SetUnparsedEntityDeclHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.unparsedHandler, (s, h) -> s.unparsedHandler = h).getList();
+ }
+
+ public static RuntimeList SetNotationDeclHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.notationHandler, (s, h) -> s.notationHandler = h).getList();
+ }
+
+ public static RuntimeList SetExternalEntityRefHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.externEntHandler, (s, h) -> s.externEntHandler = h).getList();
+ }
+
+ public static RuntimeList SetExtEntFinishHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.externEntFinHandler, (s, h) -> s.externEntFinHandler = h).getList();
+ }
+
+ public static RuntimeList SetEntityDeclHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.entityDeclHandler, (s, h) -> s.entityDeclHandler = h).getList();
+ }
+
+ public static RuntimeList SetElementDeclHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.elementDeclHandler, (s, h) -> s.elementDeclHandler = h).getList();
+ }
+
+ public static RuntimeList SetAttListDeclHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.attlistDeclHandler, (s, h) -> s.attlistDeclHandler = h).getList();
+ }
+
+ public static RuntimeList SetDoctypeHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.doctypeHandler, (s, h) -> s.doctypeHandler = h).getList();
+ }
+
+ public static RuntimeList SetEndDoctypeHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.endDoctypeHandler, (s, h) -> s.endDoctypeHandler = h).getList();
+ }
+
+ public static RuntimeList SetXMLDeclHandler(RuntimeArray args, int ctx) {
+ return setHandler(args.get(0), args.size() > 1 ? args.get(1) : null,
+ s -> s.xmlDeclHandler, (s, h) -> s.xmlDeclHandler = h).getList();
+ }
+
+ // ================================================================
+ // Position / info methods
+ // ================================================================
+
+ public static RuntimeList GetCurrentLineNumber(RuntimeArray args, int ctx) {
+ ParserState state = getState(args.get(0));
+ if (state.locator != null) {
+ return new RuntimeScalar(state.locator.getLineNumber()).getList();
+ }
+ return new RuntimeScalar(state.currentLine).getList();
+ }
+
+ /**
+  * Returns the 0-based column at which the current token starts.
+  *
+  * Without a locator the stored currentColumn is returned unchanged. With a locator,
+  * its 1-based "position after the token" is converted by subtracting one and the
+  * length of the recognized string, and the result is clamped at zero.
+  */
+ public static RuntimeList GetCurrentColumnNumber(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+     if (state.locator == null) {
+         return new RuntimeScalar(state.currentColumn).getList();
+     }
+
+     // SAX reports where the token ends; expat callers expect where it begins.
+     int afterToken = state.locator.getColumnNumber() - 1;
+     int tokenLength = (state.recognizedString == null) ? 0 : state.recognizedString.length();
+     int startColumn = Math.max(afterToken - tokenLength, 0);
+     return new RuntimeScalar(startColumn).getList();
+ }
+
+ /** Returns the state's currentByteIndex (initially -1). */
+ public static RuntimeList GetCurrentByteIndex(RuntimeArray args, int ctx) {
+     return new RuntimeScalar(getState(args.get(0)).currentByteIndex).getList();
+ }
+
+ /** Returns the state's currentByteCount. */
+ public static RuntimeList GetCurrentByteCount(RuntimeArray args, int ctx) {
+     return new RuntimeScalar(getState(args.get(0)).currentByteCount).getList();
+ }
+
+ /** Returns the state's specifiedAttributeCount. */
+ public static RuntimeList GetSpecifiedAttributeCount(RuntimeArray args, int ctx) {
+     return new RuntimeScalar(getState(args.get(0)).specifiedAttributeCount).getList();
+ }
+
+ /** Returns the state's elementIndex. */
+ public static RuntimeList ElementIndex(RuntimeArray args, int ctx) {
+     return new RuntimeScalar(getState(args.get(0)).elementIndex).getList();
+ }
+
+ // ================================================================
+ // Base URI
+ // ================================================================
+
+ /**
+  * Stores the base URI. An undef argument clears it; when no second argument is
+  * passed the stored value is left untouched. Always returns undef.
+  */
+ public static RuntimeList SetBase(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+     if (args.size() <= 1) {
+         return scalarUndef.getList();
+     }
+
+     RuntimeScalar uri = args.get(1);
+     state.base = (uri.type == RuntimeScalarType.UNDEF) ? null : uri.toString();
+     return scalarUndef.getList();
+ }
+
+ /** Returns the stored base URI, or undef when none is set. */
+ public static RuntimeList GetBase(RuntimeArray args, int ctx) {
+     String storedBase = getState(args.get(0)).base;
+     if (storedBase == null) {
+         return scalarUndef.getList();
+     }
+     return new RuntimeScalar(storedBase).getList();
+ }
+
+ // ================================================================
+ // String access methods
+ // ================================================================
+
+ /** Returns the state's recognizedString wrapped in a scalar. */
+ public static RuntimeList RecognizedString(RuntimeArray args, int ctx) {
+     return new RuntimeScalar(getState(args.get(0)).recognizedString).getList();
+ }
+
+ /** Returns the state's originalString wrapped in a scalar. */
+ public static RuntimeList OriginalString(RuntimeArray args, int ctx) {
+     return new RuntimeScalar(getState(args.get(0)).originalString).getList();
+ }
+
+ /**
+  * Passes the current recognized string to the default handler, if one is installed
+  * and the string is non-empty. Always returns undef.
+  *
+  * A null recognizedString is treated like an empty one, matching the null guard in
+  * GetCurrentColumnNumber, so this call cannot fail with a NullPointerException.
+  */
+ public static RuntimeList DefaultCurrent(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+     String token = state.recognizedString;
+     // Fire the default handler with the current recognized string
+     if (state.defaultHandler != null && token != null && !token.isEmpty()) {
+         fireCallback(state, state.defaultHandler, new RuntimeScalar(token));
+     }
+     return scalarUndef.getList();
+ }
+
+ /**
+  * Returns (context, linepos): the current input line plus up to args[1] lines on
+  * either side, and the offset in that text just past the current line.
+  *
+  * Returns (undef, 0) when no input bytes or no locator are available. The input is
+  * decoded as UTF-8 and split on "\n" only.
+  */
+ public static RuntimeList PositionContext(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+     int radius = args.size() > 1 ? args.get(1).getInt() : 0;
+
+     RuntimeArray result = new RuntimeArray();
+     if (state.inputBytes == null || state.locator == null) {
+         RuntimeArray.push(result, scalarUndef);
+         RuntimeArray.push(result, scalarZero);
+         return result.getList();
+     }
+
+     String[] rows = new String(state.inputBytes, StandardCharsets.UTF_8).split("\n", -1);
+     int lastRow = rows.length - 1;
+
+     // The locator is 1-based; clamp the 0-based row into the available range.
+     int currentRow = Math.max(0, Math.min(state.locator.getLineNumber() - 1, lastRow));
+     int firstShown = Math.max(0, currentRow - radius);
+     int lastShown = Math.min(lastRow, currentRow + radius);
+
+     StringBuilder context = new StringBuilder();
+     int linepos = 0;
+     int row = firstShown;
+     while (row <= lastShown) {
+         context.append(rows[row]);
+         if (row != lastShown) {
+             context.append("\n");
+         }
+         if (row == currentRow) {
+             // Offset after the current line (and its newline, when one was appended);
+             // Expat.pm inserts the "===^" pointer here.
+             linepos = context.length();
+         }
+         row++;
+     }
+
+     RuntimeArray.push(result, new RuntimeScalar(context.toString()));
+     RuntimeArray.push(result, new RuntimeScalar(linepos));
+     return result.getList();
+ }
+
+ // ================================================================
+ // Handler control
+ // ================================================================
+
+ /** Clears all eighteen handler slots of the parser state and returns undef. */
+ public static RuntimeList UnsetAllHandlers(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+
+     // Element and content callbacks
+     state.startHandler = state.endHandler = state.charHandler = null;
+     state.procHandler = state.commentHandler = null;
+     state.startCdataHandler = state.endCdataHandler = null;
+     state.defaultHandler = null;
+
+     // Declaration and entity callbacks
+     state.unparsedHandler = state.notationHandler = null;
+     state.externEntHandler = state.externEntFinHandler = null;
+     state.entityDeclHandler = state.elementDeclHandler = state.attlistDeclHandler = null;
+     state.doctypeHandler = state.endDoctypeHandler = null;
+     state.xmlDeclHandler = null;
+
+     return scalarUndef.getList();
+ }
+
+ public static RuntimeList SkipUntil(RuntimeArray args, int ctx) {
+ ParserState state = getState(args.get(0));
+ if (args.size() > 1) {
+ state.skipUntilIndex = args.get(1).getInt();
+ }
+ return scalarUndef.getList();
+ }
+
+ // ================================================================
+ // Encoding stubs (Java handles encodings natively)
+ // ================================================================
+
+ /**
+ * Stub kept for API compatibility: ignores its arguments and returns undef.
+ */
+ public static RuntimeList LoadEncoding(RuntimeArray args, int ctx) {
+ // No-op: Java handles encodings via java.nio.charset
+ return scalarUndef.getList();
+ }
+
+ /**
+ * Stub counterpart of LoadEncoding: ignores its arguments and returns undef.
+ */
+ public static RuntimeList FreeEncoding(RuntimeArray args, int ctx) {
+ return scalarUndef.getList();
+ }
+
+ // ================================================================
+ // Version info - emulate expat version format
+ // ================================================================
+
+ /**
+ * Returns the fixed version string "expat_2.6.4"; the arguments are ignored.
+ * The numbers agree with the major/minor/micro values returned by ExpatVersionInfo.
+ */
+ public static RuntimeList ExpatVersion(RuntimeArray args, int ctx) {
+ return new RuntimeScalar("expat_2.6.4").getList();
+ }
+
+ /**
+  * Returns the emulated expat version as a flat key/value list:
+  * ("major", 2, "minor", 6, "micro", 4).
+  */
+ public static RuntimeList ExpatVersionInfo(RuntimeArray args, int ctx) {
+     final String[] labels = {"major", "minor", "micro"};
+     final int[] numbers = {2, 6, 4};
+
+     RuntimeArray pairs = new RuntimeArray();
+     for (int i = 0; i < labels.length; i++) {
+         RuntimeArray.push(pairs, new RuntimeScalar(labels[i]));
+         RuntimeArray.push(pairs, new RuntimeScalar(numbers[i]));
+     }
+     return pairs.getList();
+ }
+
+ // ================================================================
+ // ErrorString - map error codes to descriptions
+ // ================================================================
+
+ // Error descriptions indexed by expat error code (enum XML_Error); ErrorString()
+ // looks codes up here and reports anything outside the table as unknown.
+ private static final String[] ERROR_STRINGS = {
+ "", // 0 - XML_ERROR_NONE
+ "out of memory", // 1 - XML_ERROR_NO_MEMORY
+ "syntax error", // 2 - XML_ERROR_SYNTAX
+ "no element found", // 3 - XML_ERROR_NO_ELEMENTS
+ "not well-formed (invalid token)", // 4 - XML_ERROR_INVALID_TOKEN
+ "unclosed token", // 5 - XML_ERROR_UNCLOSED_TOKEN
+ "partial character", // 6 - XML_ERROR_PARTIAL_CHAR
+ "mismatched tag", // 7 - XML_ERROR_TAG_MISMATCH
+ "duplicate attribute", // 8 - XML_ERROR_DUPLICATE_ATTRIBUTE
+ "junk after document element", // 9 - XML_ERROR_JUNK_AFTER_DOC_ELEMENT
+ "illegal parameter entity reference", // 10 - XML_ERROR_PARAM_ENTITY_REF
+ "undefined entity", // 11 - XML_ERROR_UNDEFINED_ENTITY
+ "recursive entity reference", // 12 - XML_ERROR_RECURSIVE_ENTITY_REF
+ "asynchronous entity", // 13 - XML_ERROR_ASYNC_ENTITY
+ "reference to invalid character number",// 14 - XML_ERROR_BAD_CHAR_REF
+ "reference to binary entity", // 15 - XML_ERROR_BINARY_ENTITY_REF
+ "reference to external entity in attribute", // 16 - XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF
+ "XML or text declaration not at start of entity", // 17 - XML_ERROR_MISPLACED_XML_PI
+ "unknown encoding", // 18 - XML_ERROR_UNKNOWN_ENCODING
+ "encoding specified in XML declaration is incorrect", // 19 - XML_ERROR_INCORRECT_ENCODING
+ "unclosed CDATA section", // 20 - XML_ERROR_UNCLOSED_CDATA_SECTION
+ "error in processing external entity reference", // 21 - XML_ERROR_EXTERNAL_ENTITY_HANDLING
+ "not standalone", // 22 - XML_ERROR_NOT_STANDALONE
+ };
+
+ public static RuntimeList ErrorString(RuntimeArray args, int ctx) {
+ if (args.size() > 0) {
+ int code = args.get(0).getInt();
+ if (code >= 0 && code < ERROR_STRINGS.length) {
+ return new RuntimeScalar(ERROR_STRINGS[code]).getList();
+ }
+ return new RuntimeScalar("unknown error code " + code).getList();
+ }
+ return scalarUndef.getList();
+ }
+
+ // ================================================================
+ // Namespace helper
+ // ================================================================
+
+ /**
+  * GenerateNSName(name, namespace, table, list)
+  * Produces the dualvar for a namespace-qualified name: string value = local name,
+  * integer value = namespace index, so int($name) yields the index and "$name" the
+  * local name, as with expat.
+  *
+  * When fewer than four arguments are supplied, the first argument is returned as is.
+  */
+ public static RuntimeList GenerateNSName(RuntimeArray args, int ctx) {
+     if (args.size() < 4) {
+         return args.get(0).getList();
+     }
+
+     // Arguments are converted left to right, then handed to the shared helper.
+     return generateNSNameInternal(
+             args.get(0).toString(),
+             args.get(1).toString(),
+             args.get(2).hashDeref(),
+             args.get(3).arrayDeref()).getList();
+ }
+
+ /**
+  * Builds the dualvar for a namespace-qualified name, registering the namespace on
+  * first use. Mirrors expat's gen_ns_name(), which creates a dual PV/IV scalar.
+  *
+  * @param name  local name (string half of the dualvar)
+  * @param ns    namespace URI
+  * @param table namespace URI to index; extended when ns is not yet present
+  * @param list  namespace URIs by index; ns is appended when it is new
+  * @return dualvar whose integer value is the namespace index
+  */
+ private static RuntimeScalar generateNSNameInternal(String name, String ns,
+         RuntimeHash table, RuntimeArray list) {
+     RuntimeScalar registered = table.get(ns);
+     boolean alreadyKnown = registered != null && registered.type != RuntimeScalarType.UNDEF;
+
+     int nsIndex;
+     if (alreadyKnown) {
+         nsIndex = registered.getInt();
+     } else {
+         // New namespace: its index is the current length of the list.
+         nsIndex = list.size();
+         RuntimeArray.push(list, new RuntimeScalar(ns));
+         table.put(ns, new RuntimeScalar(nsIndex));
+     }
+
+     // Create a dualvar: int = nsIndex, string = localname
+     RuntimeScalar qualified = new RuntimeScalar();
+     qualified.type = RuntimeScalarType.DUALVAR;
+     qualified.value = new DualVar(new RuntimeScalar(nsIndex), new RuntimeScalar(name));
+     return qualified;
+ }
+
+ // ================================================================
+ // Security API stubs - return 1 to indicate success
+ // ================================================================
+
+ /**
+ * Stub: ignores its arguments and returns true, so callers see the setting as accepted.
+ * No limit is actually configured here.
+ */
+ public static RuntimeList SetBillionLaughsAttackProtectionMaximumAmplification(RuntimeArray args, int ctx) {
+ return scalarTrue.getList();
+ }
+
+ /**
+ * Stub: ignores its arguments and returns true, so callers see the setting as accepted.
+ * No threshold is actually configured here.
+ */
+ public static RuntimeList SetBillionLaughsAttackProtectionActivationThreshold(RuntimeArray args, int ctx) {
+ return scalarTrue.getList();
+ }
+
+ /**
+ * Stub: ignores its arguments and returns true, so callers see the setting as accepted.
+ * No limit is actually configured here.
+ */
+ public static RuntimeList SetAllocTrackerMaximumAmplification(RuntimeArray args, int ctx) {
+ return scalarTrue.getList();
+ }
+
+ /**
+ * Stub: ignores its arguments and returns true, so callers see the setting as accepted.
+ * No threshold is actually configured here.
+ */
+ public static RuntimeList SetAllocTrackerActivationThreshold(RuntimeArray args, int ctx) {
+ return scalarTrue.getList();
+ }
+
+ /**
+ * Stub: ignores its arguments and returns true, so callers see the setting as accepted.
+ * Nothing is actually toggled here.
+ */
+ public static RuntimeList SetReparseDeferralEnabled(RuntimeArray args, int ctx) {
+ return scalarTrue.getList();
+ }
+
+ // ================================================================
+ // Core parsing methods
+ // ================================================================
+
+ /**
+  * ParseString(parser, string) - parse a complete XML document held in a scalar.
+  *
+  * @return true on success
+  * @throws PerlDieException with the formatted error on any parse failure; the raw
+  *         message is also stored in the state and in the object's "ErrorMessage" key
+  */
+ public static RuntimeList ParseString(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+     RuntimeScalar source = args.get(1);
+     String text = source.toString();
+
+     try {
+         // BYTE_STRING chars are raw byte values (0-255): ISO_8859_1 maps them back
+         // one-to-one and avoids double-encoding UTF-8 bytes. Any other scalar is
+         // encoded as UTF-8.
+         Charset wireCharset = (source.type == RuntimeScalarType.BYTE_STRING)
+                 ? StandardCharsets.ISO_8859_1
+                 : StandardCharsets.UTF_8;
+         byte[] document = convertEncoding(text.getBytes(wireCharset));
+
+         state.bytesProcessed = 0;
+         state.inputBytes = document;
+         state.inputScanPos = 0;
+         doParse(state, new ByteArrayInputStream(document));
+         return scalarTrue.getList();
+     } catch (PerlDieException die) {
+         // Already a Perl-level die: let it propagate untouched.
+         throw die;
+     } catch (Exception failure) {
+         String message = failure.getMessage();
+         state.errorMessage = (message == null) ? failure.toString() : message;
+         // Set error in Perl's ErrorMessage field
+         state.selfRef.hashDeref().put("ErrorMessage", new RuntimeScalar(state.errorMessage));
+         throw new PerlDieException(new RuntimeScalar(formatError(state, failure)));
+     }
+ }
+
+ /**
+  * ParseStream(parser, ioref, delim) - parse XML read from an IO handle.
+  *
+  * The whole input is buffered in memory before parsing. With a non-empty delimiter
+  * a single record is read via readline() and a trailing "\n" + delim + "\n" is
+  * removed; otherwise the handle is drained in fixed-size chunks until it returns
+  * undef or an empty string.
+  *
+  * @return true on success
+  * @throws PerlDieException with the formatted error on failure (including a handle
+  *         that is not a filehandle); the raw message is also stored in the state and
+  *         in the object's "ErrorMessage" key
+  */
+ public static RuntimeList ParseStream(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+     RuntimeScalar ioref = args.get(1);
+     String delim = args.size() > 2 ? args.get(2).toString() : null;
+
+     try {
+         // Read the IO handle into a byte array
+         RuntimeIO fh = RuntimeIO.getRuntimeIO(ioref);
+         if (fh == null) {
+             throw new PerlCompilerException("Not a filehandle");
+         }
+
+         ByteArrayOutputStream baos = new ByteArrayOutputStream();
+
+         if (delim != null && !delim.isEmpty()) {
+             // The Perl shim (Expat.pm) sets $/ to "\n$delim\n" before calling
+             // ParseStream. So readline() will read everything up to (and including)
+             // the delimiter, leaving the filehandle positioned right after it.
+             RuntimeScalar record = Readline.readline(fh);
+             if (record.type != RuntimeScalarType.UNDEF) {
+                 String recordStr = record.toString();
+                 // Strip the trailing "\n$delim\n" if present
+                 String suffix = "\n" + delim + "\n";
+                 if (recordStr.endsWith(suffix)) {
+                     recordStr = recordStr.substring(0, recordStr.length() - suffix.length());
+                 }
+                 java.nio.charset.Charset cs = (record.type == RuntimeScalarType.BYTE_STRING)
+                         ? StandardCharsets.ISO_8859_1 : StandardCharsets.UTF_8;
+                 baos.write(recordStr.getBytes(cs));
+             }
+         } else {
+             // No delimiter - read entire stream in chunks. Only the chunk size is
+             // needed, so no scratch byte[] is allocated.
+             final int chunkSize = 8192;
+             while (true) {
+                 RuntimeScalar result = fh.ioHandle.read(chunkSize);
+                 if (result.type == RuntimeScalarType.UNDEF) {
+                     break;
+                 }
+                 String chunk = result.toString();
+                 if (chunk.isEmpty()) {
+                     break;
+                 }
+                 // Byte strings carry one byte per char; other scalars are UTF-8 encoded.
+                 java.nio.charset.Charset cs = (result.type == RuntimeScalarType.BYTE_STRING)
+                         ? StandardCharsets.ISO_8859_1 : StandardCharsets.UTF_8;
+                 baos.write(chunk.getBytes(cs));
+             }
+         }
+
+         byte[] xmlBytes = baos.toByteArray();
+         xmlBytes = convertEncoding(xmlBytes);
+         state.bytesProcessed = 0;
+         state.inputBytes = xmlBytes;
+         state.inputScanPos = 0;
+         doParse(state, new ByteArrayInputStream(xmlBytes));
+         return scalarTrue.getList();
+     } catch (PerlDieException e) {
+         throw e;
+     } catch (Exception e) {
+         state.errorMessage = e.getMessage() != null ? e.getMessage() : e.toString();
+         RuntimeHash selfHash = state.selfRef.hashDeref();
+         selfHash.put("ErrorMessage", new RuntimeScalar(state.errorMessage));
+         throw new PerlDieException(new RuntimeScalar(formatError(state, e)));
+     }
+ }
+
+ /**
+  * ParsePartial(parser, string) - queue one chunk of a document fed piecewise.
+  *
+  * Nothing is parsed here: the chunk is appended to the partial buffer and the
+  * actual parse happens in ParseDone. Always returns true.
+  */
+ public static RuntimeList ParsePartial(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+     RuntimeScalar piece = args.get(1);
+     String text = piece.toString();
+
+     StringBuilder pending = state.partialBuffer;
+     if (pending == null) {
+         pending = new StringBuilder();
+         state.partialBuffer = pending;
+     }
+     pending.append(text);
+     state.partialMode = true;
+
+     // Remember whether any chunk was a BYTE_STRING; ParseDone uses this to pick
+     // the encoding.
+     state.partialIsByteString |= (piece.type == RuntimeScalarType.BYTE_STRING);
+
+     return scalarTrue.getList();
+ }
+
+ /**
+  * ParseDone(parser) - parse everything queued by ParsePartial.
+  *
+  * Returns true immediately when nothing was queued. The partial buffer and flags
+  * are reset before the parse starts.
+  *
+  * @throws PerlDieException with the formatted error on any parse failure; the raw
+  *         message is also stored in the state and in the object's "ErrorMessage" key
+  */
+ public static RuntimeList ParseDone(RuntimeArray args, int ctx) {
+     ParserState state = getState(args.get(0));
+
+     StringBuilder pending = state.partialBuffer;
+     if (pending == null) {
+         return scalarTrue.getList();
+     }
+
+     try {
+         String xml = pending.toString();
+         state.partialBuffer = null;
+         state.partialMode = false;
+
+         // If any chunk was a BYTE_STRING, map chars straight back to bytes to avoid
+         // double-encoding; otherwise encode as UTF-8.
+         Charset wireCharset = state.partialIsByteString
+                 ? StandardCharsets.ISO_8859_1
+                 : StandardCharsets.UTF_8;
+         byte[] document = convertEncoding(xml.getBytes(wireCharset));
+         state.partialIsByteString = false;
+
+         state.bytesProcessed = 0;
+         state.inputBytes = document;
+         state.inputScanPos = 0;
+         doParse(state, new ByteArrayInputStream(document));
+         return scalarTrue.getList();
+     } catch (PerlDieException die) {
+         throw die;
+     } catch (Exception failure) {
+         String message = failure.getMessage();
+         state.errorMessage = (message == null) ? failure.toString() : message;
+         state.selfRef.hashDeref().put("ErrorMessage", new RuntimeScalar(state.errorMessage));
+         throw new PerlDieException(new RuntimeScalar(formatError(state, failure)));
+     }
+ }
+
+ // ================================================================
+ // Encoding conversion utilities
+ // ================================================================
+
+ // Map of expat-specific encoding names to JDK charset names
+ private static final Map ENCODING_MAP = new HashMap<>();
+ static {
+ ENCODING_MAP.put("x-sjis-unicode", "Shift_JIS");
+ ENCODING_MAP.put("x-euc-jp-unicode", "EUC-JP");
+ }
+
+ // Pattern to extract encoding from XML/text declarations
+ private static final Pattern ENCODING_PATTERN = Pattern.compile(
+ "<\\?xml[^>]*?encoding\\s*=\\s*[\"']([^\"']+)[\"']");
+
+    /**
+     * Map an encoding name to a JDK-supported charset name.
+     * Returns the mapped name if in ENCODING_MAP, otherwise returns the original.
+     * A null argument yields null. The lookup is case-insensitive.
+     */
+    private static String mapToJdkCharset(String encoding) {
+        if (encoding == null) return null;
+        // Lower-case with Locale.ROOT: under a Turkish/Azeri default locale the
+        // no-argument toLowerCase() maps 'I' to dotless 'ı', so an upper-case
+        // spelling such as "X-SJIS-UNICODE" would never match the map keys.
+        String mapped = ENCODING_MAP.get(encoding.toLowerCase(java.util.Locale.ROOT));
+        return mapped != null ? mapped : encoding;
+    }
+
+ /**
+ * Extract the encoding name from an XML/text declaration in raw bytes.
+ * Scans the first 200 bytes (ASCII-safe) for .
+ */
+ private static String extractDeclaredEncoding(byte[] input) {
+ int len = Math.min(input.length, 200);
+ String header = new String(input, 0, len, StandardCharsets.ISO_8859_1);
+ Matcher m = ENCODING_PATTERN.matcher(header);
+ return m.find() ? m.group(1) : null;
+ }
+
+    /**
+     * Convert encoding if the declared encoding is a custom name not supported by JDK.
+     * Re-decodes the raw bytes using the correct charset and re-encodes as UTF-8,
+     * updating the encoding declaration to match.
+     * Returns original bytes if no conversion is needed (no declaration found, or the
+     * declared name is not a key of ENCODING_MAP) or if the conversion itself fails.
+     */
+    private static byte[] convertEncoding(byte[] input) {
+        String declared = extractDeclaredEncoding(input);
+        if (declared == null) return input;
+
+        // Locale.ROOT keeps the key stable; the default-locale toLowerCase() would
+        // turn 'I' into dotless 'ı' under Turkish/Azeri locales and miss the entry.
+        String jdkCharset = ENCODING_MAP.get(declared.toLowerCase(java.util.Locale.ROOT));
+        if (jdkCharset == null) return input; // not a custom encoding, let SAX handle it
+
+        try {
+            // Decode with the correct charset, re-encode as UTF-8
+            String content = new String(input, Charset.forName(jdkCharset));
+            // Rewrite the first encoding="..." pseudo-attribute so the declaration
+            // agrees with the UTF-8 bytes produced below.
+            content = content.replaceFirst(
+                    "encoding\\s*=\\s*[\"']" + Pattern.quote(declared) + "[\"']",
+                    "encoding=\"UTF-8\"");
+            return content.getBytes(StandardCharsets.UTF_8);
+        } catch (Exception e) {
+            // Deliberate best effort: hand the untouched bytes to the parser.
+            return input; // fallback to original
+        }
+    }
+
+ // ================================================================
+ // SAX parsing engine
+ // ================================================================
+
+    private static void doParse(ParserState state, InputStream input) throws Exception {
+        /*
+         * Parses `input` with a JAXP SAX reader configured from `state` and the
+         * Perl self hash, routing every SAX event to a new ExpatSAXHandler(state).
+         * Exceptions from parser construction or from reader.parse() propagate
+         * to the caller.
+         *
+         * Check if ParseParamEnt is enabled in the Perl self hash
+         */
+        RuntimeHash selfHash = state.selfRef.hashDeref();
+        RuntimeScalar parseParamEntSV = selfHash.get("ParseParamEnt");
+        boolean parseParamEnt = (parseParamEntSV != null && parseParamEntSV.getBoolean());
+
+        // UseForeignDTD: synthesize ExternEnt handler call and inject DOCTYPE
+        // for documents without a DOCTYPE declaration (per libexpat behavior)
+        RuntimeScalar useForeignDtdSV = selfHash.get("UseForeignDTD");
+        boolean useForeignDTD = (useForeignDtdSV != null && useForeignDtdSV.getBoolean());
+
+        if (useForeignDTD && parseParamEnt && state.externEntHandler != null
+                && state.inputBytes != null) {
+            // Check if document already has a DOCTYPE declaration (only the first 500 bytes are inspected)
+            String docPrefix = new String(state.inputBytes, 0,
+                    Math.min(500, state.inputBytes.length), StandardCharsets.UTF_8);
+            if (!docPrefix.contains("");
+                            if (endOfXmlDecl >= 0) {
+                                insertPos = endOfXmlDecl + 2;
+                                if (insertPos < docStr.length() && docStr.charAt(insertPos) == '\n') {
+                                    insertPos++;
+                                }
+                            }
+                        }
+                        String doctypeDecl = "\n";
+                        StringBuilder sb = new StringBuilder(docStr);
+                        sb.insert(insertPos, doctypeDecl);
+                        byte[] newBytes = sb.toString().getBytes(StandardCharsets.UTF_8);
+                        state.inputBytes = newBytes;
+                        input = new ByteArrayInputStream(newBytes);
+                    }
+                }
+            }
+        }
+
+        SAXParserFactory factory = SAXParserFactory.newInstance();
+        factory.setNamespaceAware(state.namespaces);
+        factory.setValidating(false);
+
+        // Enable features for DTD handling; a failing setFeature call is ignored so an unsupported feature does not abort the parse
+        try {
+            factory.setFeature("http://xml.org/sax/features/external-general-entities", true);
+        } catch (Exception ignored) {}
+        // Only enable parameter entity processing when ParseParamEnt is set
+        try {
+            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", parseParamEnt);
+        } catch (Exception ignored) {}
+        // Load external DTDs only when ParseParamEnt is set
+        try {
+            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", parseParamEnt);
+        } catch (Exception ignored) {}
+
+        SAXParser saxParser = factory.newSAXParser();
+        XMLReader reader = saxParser.getXMLReader();
+
+        // Remove JDK security limits that restrict deep nesting and entity expansion (each property is set to 0; failures are ignored)
+        try {
+            reader.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 0);
+        } catch (Exception ignored) {}
+        try {
+            reader.setProperty("http://www.oracle.com/xml/jaxp/properties/maxElementDepth", 0);
+        } catch (Exception ignored) {}
+        try {
+            reader.setProperty("http://www.oracle.com/xml/jaxp/properties/totalEntitySizeLimit", 0);
+        } catch (Exception ignored) {}
+
+        ExpatSAXHandler handler = new ExpatSAXHandler(state); // one handler object serves every SAX interface registered below
+        reader.setContentHandler(handler);
+        reader.setErrorHandler(handler);
+        reader.setDTDHandler(handler);
+
+        // Set LexicalHandler for comments, CDATA, DOCTYPE (optional SAX2 extension; failure is ignored)
+        try {
+            reader.setProperty("http://xml.org/sax/properties/lexical-handler", handler);
+        } catch (Exception ignored) {}
+
+        // Set DeclHandler for entity/element/attlist declarations (optional SAX2 extension; failure is ignored)
+        try {
+            reader.setProperty("http://xml.org/sax/properties/declaration-handler", handler);
+        } catch (Exception ignored) {}
+
+        // Always set EntityResolver - when ExternEnt handler is set, it bridges
+        // to Perl callbacks; otherwise, it returns empty content for unresolvable
+        // entities (preventing parse errors from missing external DTDs/PEs)
+        reader.setEntityResolver(handler);
+
+        InputSource inputSource = new InputSource(input);
+        // Set protocol encoding if specified, mapping custom names to JDK charsets
+        if (state.protocolEncoding != null) {
+            inputSource.setEncoding(mapToJdkCharset(state.protocolEncoding));
+        }
+        // Set systemId to the current working directory so SAX resolves relative URIs correctly.
+        // This also allows unresolveSysId to strip this prefix and recover relative paths.
+        String cwd = System.getProperty("user.dir");
+        String baseUri = new java.io.File(cwd, "dummy").toURI().toString();
+        baseUri = baseUri.substring(0, baseUri.lastIndexOf('/') + 1); // drop the "dummy" file name, keep the trailing '/'
+        inputSource.setSystemId(baseUri);
+        // Store the base URI for un-resolution in callbacks
+        state.parseBaseUri = baseUri;
+        reader.parse(inputSource);
+    }
+
+ // ================================================================
+ // SAX event handler that dispatches to Perl callbacks
+ // ================================================================
+
+ private static class ExpatSAXHandler extends DefaultHandler
+ implements LexicalHandler, DeclHandler, EntityResolver {
+
+        private final ParserState state; // parser state this handler reads and updates while dispatching events
+        private boolean inCDATA = false; // true between startCDATA() and endCDATA()
+        // Track if XMLDecl was detected
+        private boolean xmlDeclFired = false;
+        // Set to true by startDocument() (used for XMLDecl detection)
+        private boolean documentStarted = false;
+
+        ExpatSAXHandler(ParserState state) {
+            this.state = state;
+        }
+
+ // ---- Locator for position tracking ----
+
+        @Override
+        public void setDocumentLocator(Locator locator) {
+            state.locator = locator; // kept on the shared state; startElement() checks it is non-null before scanning inputBytes
+        }
+
+ // ---- Document lifecycle ----
+
+ @Override
+ public void startDocument() throws SAXException {
+ documentStarted = true;
+ // Fire XMLDecl handler if set - we detect the xml declaration
+ // by checking if the input starts with "= 5) {
+ String start = new String(state.inputBytes, 0,
+ Math.min(100, state.inputBytes.length), StandardCharsets.UTF_8);
+ if (start.startsWith("= decl.length()) return null;
+ char quote = decl.charAt(pos);
+ if (quote != '"' && quote != '\'') return null;
+ int end = decl.indexOf(quote, pos + 1);
+ if (end < 0) return null;
+ return decl.substring(pos + 1, end);
+ }
+
+ // ---- Namespace prefix mapping ----
+
+        /**
+         * Records a namespace prefix coming into scope; does nothing unless
+         * namespace processing is enabled on the parser state.
+         *
+         * The URI (or undef when null) is pushed onto the per-prefix stack kept in
+         * $self->{Prefix_Table}; a prefix without a usable stack gets a fresh one.
+         * The prefix is then appended to $self->{New_Prefixes}. A null or empty
+         * prefix is stored under the key "#default".
+         */
+        @Override
+        public void startPrefixMapping(String prefix, String uri) throws SAXException {
+            if (state.namespaces) {
+                final RuntimeHash self = state.selfRef.hashDeref();
+                final boolean isDefaultNamespace = prefix == null || prefix.isEmpty();
+                final String key = isDefaultNamespace ? "#default" : prefix;
+
+                // Prefix Table: $self->{Prefix_Table}{$prefix} = [$uri_stack]
+                final RuntimeScalar tableRef = self.get("Prefix_Table");
+                if (tableRef != null && tableRef.type != RuntimeScalarType.UNDEF) {
+                    final RuntimeHash table = tableRef.hashDeref();
+                    final RuntimeScalar current = table.get(key);
+                    final boolean stackExists = current != null
+                            && current.type != RuntimeScalarType.UNDEF
+                            && RuntimeScalarType.isReference(current);
+                    if (!stackExists) {
+                        RuntimeArray fresh = new RuntimeArray();
+                        RuntimeArray.push(fresh, (uri == null) ? scalarUndef : new RuntimeScalar(uri));
+                        table.put(key, fresh.createReference());
+                    } else {
+                        RuntimeArray uris = current.arrayDeref();
+                        RuntimeArray.push(uris, (uri == null) ? scalarUndef : new RuntimeScalar(uri));
+                    }
+                }
+
+                // New_Prefixes: push @{$self->{New_Prefixes}}, $prefix
+                final RuntimeScalar pendingRef = self.get("New_Prefixes");
+                if (pendingRef != null && pendingRef.type != RuntimeScalarType.UNDEF) {
+                    RuntimeArray.push(pendingRef.arrayDeref(), new RuntimeScalar(key));
+                }
+            }
+        }
+
+        /**
+         * Takes a namespace prefix out of scope; does nothing unless namespace
+         * processing is enabled. The newest URI is popped from the prefix's stack
+         * in $self->{Prefix_Table}; when the stack holds at most one entry the
+         * whole table entry is deleted instead.
+         */
+        @Override
+        public void endPrefixMapping(String prefix) throws SAXException {
+            if (!state.namespaces) {
+                return;
+            }
+
+            RuntimeHash self = state.selfRef.hashDeref();
+            String key = prefix;
+            if (key == null || key.isEmpty()) {
+                key = "#default";
+            }
+
+            RuntimeScalar tableRef = self.get("Prefix_Table");
+            if (tableRef == null || tableRef.type == RuntimeScalarType.UNDEF) {
+                return;
+            }
+
+            RuntimeHash table = tableRef.hashDeref();
+            RuntimeScalar stackRef = table.get(key);
+            if (stackRef == null || stackRef.type == RuntimeScalarType.UNDEF
+                    || !RuntimeScalarType.isReference(stackRef)) {
+                return;
+            }
+
+            RuntimeArray uris = stackRef.arrayDeref();
+            if (uris.size() <= 1) {
+                table.delete(key);
+            } else {
+                RuntimeArray.pop(uris);
+            }
+        }
+
+ // ---- ContentHandler ----
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 org.xml.sax.Attributes attributes)
+                throws SAXException {
+            state.elementIndexCounter++; // each start tag receives the next sequential element index
+            state.elementIndex = state.elementIndexCounter;
+            state.elementIndexStack.push(state.elementIndex); // popped again by endElement()
+
+            /*
+             * Overview: after assigning the element index this method pushes the
+             * element name on $self->{Context}, rebuilds an approximate start-tag
+             * string for recognizedString/originalString, and then calls the Start
+             * handler (or the Default handler when no Start handler is set) unless
+             * a skip_until range is active.
+             *
+             * Determine element name (as RuntimeScalar, possibly dualvar for namespaces)
+             */
+            RuntimeScalar elementNameScalar;
+            if (state.namespaces) {
+                if (uri != null && !uri.isEmpty()) {
+                    elementNameScalar = generateNSNameForElement(localName, uri);
+                } else {
+                    String name = localName.isEmpty() ? qName : localName;
+                    elementNameScalar = new RuntimeScalar(name);
+                }
+            } else {
+                elementNameScalar = new RuntimeScalar(qName);
+            }
+
+            // Update Perl's Context array: push @{$self->{Context}}, $elementName
+            RuntimeHash selfHash = state.selfRef.hashDeref();
+            RuntimeScalar contextRef = selfHash.get("Context");
+            if (contextRef != null && contextRef.type != RuntimeScalarType.UNDEF) {
+                RuntimeArray context = contextRef.arrayDeref();
+                RuntimeArray.push(context, elementNameScalar);
+            }
+
+            // Separate specified from defaulted attributes for specifiedAttributeCount
+            List specifiedIndices = new ArrayList<>();
+            List defaultedIndices = new ArrayList<>();
+            if (attributes instanceof Attributes2) {
+                Attributes2 attrs2 = (Attributes2) attributes;
+                for (int i = 0; i < attributes.getLength(); i++) {
+                    if (attrs2.isSpecified(i)) {
+                        specifiedIndices.add(i);
+                    } else {
+                        defaultedIndices.add(i);
+                    }
+                }
+            } else {
+                for (int i = 0; i < attributes.getLength(); i++) {
+                    specifiedIndices.add(i); // without Attributes2 every attribute is treated as specified
+                }
+            }
+            // Track specified attribute count (number of attribute name+value pairs)
+            state.specifiedAttributeCount = specifiedIndices.size() * 2;
+
+            // Update recognized string for original_string() approximation
+            StringBuilder sb = new StringBuilder("<");
+            sb.append(qName);
+            for (int i = 0; i < attributes.getLength(); i++) {
+                sb.append(" ").append(attributes.getQName(i)).append("=\"")
+                        .append(escapeXmlAttr(attributes.getValue(i))).append("\"");
+            }
+            // Detect self-closing tags (<foo/>) by scanning inputBytes.
+            // SAX treats <foo/> and <foo></foo> identically, but for column
+            // tracking we need to know the actual token length.
+            boolean selfClosing = false;
+            if (state.inputBytes != null && state.locator != null) {
+                // Scan forward to find "'
+                        for (int endPos = pos + tagStart.length; endPos < state.inputBytes.length; endPos++) {
+                            if (state.inputBytes[endPos] == '>') {
+                                if (endPos > 0 && state.inputBytes[endPos - 1] == '/') {
+                                    selfClosing = true;
+                                }
+                                state.inputScanPos = endPos + 1;
+                                break;
+                            }
+                        }
+                        break;
+                    }
+                }
+            }
+            if (selfClosing) {
+                sb.append("/>");
+                state.lastWasSelfClosing = true;
+            } else {
+                sb.append(">");
+                state.lastWasSelfClosing = false;
+            }
+            state.recognizedString = sb.toString();
+            state.originalString = state.recognizedString;
+            updateBytePosition(state);
+
+            // Skip if skip_until is active (elements with an index below skipUntilIndex fire no handler)
+            if (state.skipUntilIndex >= 0 && state.elementIndex < state.skipUntilIndex) {
+                return;
+            }
+
+            if (state.startHandler != null) {
+                // Build args: (expat, element, attr1, val1, attr2, val2, ...)
+                // Specified attributes first, then defaulted (expat convention)
+                RuntimeArray callArgs = new RuntimeArray();
+                RuntimeArray.push(callArgs, state.selfRef);
+                RuntimeArray.push(callArgs, elementNameScalar);
+                // Specified attributes first
+                for (int idx : specifiedIndices) {
+                    RuntimeArray.push(callArgs, makeAttrNameScalar(attributes, idx));
+                    RuntimeArray.push(callArgs, new RuntimeScalar(attributes.getValue(idx)));
+                }
+                // Defaulted attributes after
+                for (int idx : defaultedIndices) {
+                    RuntimeArray.push(callArgs, makeAttrNameScalar(attributes, idx));
+                    RuntimeArray.push(callArgs, new RuntimeScalar(attributes.getValue(idx)));
+                }
+                try {
+                    RuntimeCode.apply(state.startHandler, callArgs, RuntimeContextType.VOID);
+                } catch (PerlDieException e) {
+                    throw new SAXException(e); // wrap the Perl die in a SAXException, as the other callbacks in this handler do
+                }
+            } else if (state.defaultHandler != null) {
+                fireDefault(state, state.recognizedString);
+            }
+
+            // Clear New_Prefixes after start handler has been called
+            if (state.namespaces) {
+                RuntimeScalar newPrefRef = selfHash.get("New_Prefixes");
+                if (newPrefRef != null && newPrefRef.type != RuntimeScalarType.UNDEF) {
+                    RuntimeArray newPrefixes = newPrefRef.arrayDeref();
+                    // Empty the array in place by popping until no elements remain
+                    while (newPrefixes.size() > 0) {
+                        RuntimeArray.pop(newPrefixes);
+                    }
+                }
+            }
+        }
+
+        /**
+         * Builds the namespace-qualified name scalar for a local name by delegating
+         * to generateNSNameInternal with $self->{Namespace_Table} and
+         * $self->{Namespace_List}. When either entry is missing or undef, the plain
+         * local name is returned instead.
+         */
+        private RuntimeScalar generateNSNameForElement(String localName, String nsUri) {
+            RuntimeHash self = state.selfRef.hashDeref();
+            RuntimeScalar tableRef = self.get("Namespace_Table");
+            RuntimeScalar listRef = self.get("Namespace_List");
+
+            boolean haveTable = tableRef != null && tableRef.type != RuntimeScalarType.UNDEF;
+            boolean haveList = listRef != null && listRef.type != RuntimeScalarType.UNDEF;
+            if (haveTable && haveList) {
+                return generateNSNameInternal(localName, nsUri,
+                        tableRef.hashDeref(), listRef.arrayDeref());
+            }
+            return new RuntimeScalar(localName);
+        }
+
+        /**
+         * Produces the scalar passed to Perl as the name of attribute {@code index}.
+         * Without namespace processing this is the qualified name. With it, an
+         * attribute that has a namespace URI goes through generateNSNameForElement;
+         * otherwise the local name is used, falling back to the qualified name when
+         * the local name is empty.
+         */
+        private RuntimeScalar makeAttrNameScalar(org.xml.sax.Attributes attributes, int index) {
+            if (!state.namespaces) {
+                return new RuntimeScalar(attributes.getQName(index));
+            }
+
+            String nsUri = attributes.getURI(index);
+            String local = attributes.getLocalName(index);
+            if (nsUri == null || nsUri.isEmpty()) {
+                String plainName = local.isEmpty() ? attributes.getQName(index) : local;
+                return new RuntimeScalar(plainName);
+            }
+            return generateNSNameForElement(local, nsUri);
+        }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ // Restore elementIndex to match the corresponding startElement
+ if (!state.elementIndexStack.isEmpty()) {
+ state.elementIndex = state.elementIndexStack.pop();
+ }
+
+ RuntimeScalar elementNameScalar;
+ if (state.namespaces) {
+ if (uri != null && !uri.isEmpty()) {
+ elementNameScalar = generateNSNameForElement(localName, uri);
+ } else {
+ String name = localName.isEmpty() ? qName : localName;
+ elementNameScalar = new RuntimeScalar(name);
+ }
+ } else {
+ elementNameScalar = new RuntimeScalar(qName);
+ }
+
+ // For self-closing tags (), SAX fires endElement immediately after
+ // startElement. For column calculation: libexpat returns column AFTER the
+ // '>' for self-closing end handlers. Set recognizedString to empty so
+ // GetCurrentColumnNumber doesn't subtract anything.
+ if (state.lastWasSelfClosing) {
+ state.recognizedString = "";
+ state.originalString = "";
+ } else {
+ state.recognizedString = "" + qName + ">";
+ state.originalString = state.recognizedString;
+ }
+ // Always clear the flag after use
+ state.lastWasSelfClosing = false;
+ updateBytePosition(state);
+
+ if (state.skipUntilIndex >= 0 && state.elementIndex < state.skipUntilIndex) {
+ // Pop Context even when skipping
+ RuntimeHash selfHash = state.selfRef.hashDeref();
+ RuntimeScalar contextRef = selfHash.get("Context");
+ if (contextRef != null && contextRef.type != RuntimeScalarType.UNDEF) {
+ RuntimeArray context = contextRef.arrayDeref();
+ if (context.size() > 0) {
+ RuntimeArray.pop(context);
+ }
+ }
+ return;
+ }
+
+ // Reset skip after matching element
+ if (state.skipUntilIndex >= 0 && state.elementIndex >= state.skipUntilIndex) {
+ state.skipUntilIndex = -1;
+ }
+
+ if (state.endHandler != null) {
+ RuntimeArray callArgs = new RuntimeArray();
+ RuntimeArray.push(callArgs, state.selfRef);
+ RuntimeArray.push(callArgs, elementNameScalar);
+ try {
+ RuntimeCode.apply(state.endHandler, callArgs, RuntimeContextType.VOID);
+ } catch (PerlDieException e) {
+ throw new SAXException(e);
+ }
+ } else if (state.defaultHandler != null) {
+ fireDefault(state, state.recognizedString);
+ }
+
+ // Pop Perl's Context array AFTER the end handler (matches libexpat behavior)
+ RuntimeHash selfHash = state.selfRef.hashDeref();
+ RuntimeScalar contextRef = selfHash.get("Context");
+ if (contextRef != null && contextRef.type != RuntimeScalarType.UNDEF) {
+ RuntimeArray context = contextRef.arrayDeref();
+ if (context.size() > 0) {
+ RuntimeArray.pop(context);
+ }
+ }
+ }
+
+        /**
+         * Delivers character data. Ignored entirely while skip_until is active.
+         * recognizedString becomes the text; originalString becomes the text too,
+         * except that the first chunk after startEntity() gets the unexpanded
+         * reference ("&amp;name;") and consumes the pending entity name. The Char
+         * handler is called with the text, or the Default handler when no Char
+         * handler is set.
+         */
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            if (state.skipUntilIndex < 0) {
+                final String chunk = new String(ch, start, length);
+                state.recognizedString = chunk;
+
+                if (state.currentEntityName == null) {
+                    state.originalString = chunk;
+                } else {
+                    // Inside an entity expansion: report the reference as written
+                    // (e.g. "&draft.day;") and consume the name so that only the
+                    // first characters() call sees it.
+                    state.originalString = "&" + state.currentEntityName + ";";
+                    state.currentEntityName = null;
+                }
+                updateBytePosition(state);
+
+                if (state.charHandler == null) {
+                    if (state.defaultHandler != null) {
+                        fireDefault(state, chunk);
+                    }
+                } else {
+                    RuntimeArray argv = new RuntimeArray();
+                    RuntimeArray.push(argv, state.selfRef);
+                    RuntimeArray.push(argv, new RuntimeScalar(chunk));
+                    try {
+                        RuntimeCode.apply(state.charHandler, argv, RuntimeContextType.VOID);
+                    } catch (PerlDieException die) {
+                        throw new SAXException(die);
+                    }
+                }
+            }
+        }
+
+        /**
+         * Delivers a processing instruction. Ignored while skip_until is active.
+         * recognizedString/originalString are rebuilt as "&lt;?target data?&gt;"; the
+         * Proc handler receives (expat, target, data), or the Default handler gets
+         * the rebuilt string when no Proc handler is set.
+         *
+         * @param data instruction data; null is treated as the empty string
+         * @throws SAXException wrapping a PerlDieException raised by the Proc handler
+         */
+        @Override
+        public void processingInstruction(String target, String data) throws SAXException {
+            if (state.skipUntilIndex >= 0) return;
+
+            // Normalise once so the rebuilt markup and the handler argument agree
+            // (a null would otherwise be rendered as the text "null").
+            String piData = (data != null) ? data : "";
+            // A processing instruction opens with "<?" and closes with "?>".
+            state.recognizedString = "<?" + target + " " + piData + "?>";
+            state.originalString = state.recognizedString;
+            updateBytePosition(state);
+
+            if (state.procHandler != null) {
+                RuntimeArray callArgs = new RuntimeArray();
+                RuntimeArray.push(callArgs, state.selfRef);
+                RuntimeArray.push(callArgs, new RuntimeScalar(target));
+                RuntimeArray.push(callArgs, new RuntimeScalar(piData));
+                try {
+                    RuntimeCode.apply(state.procHandler, callArgs, RuntimeContextType.VOID);
+                } catch (PerlDieException e) {
+                    throw new SAXException(e);
+                }
+            } else if (state.defaultHandler != null) {
+                fireDefault(state, state.recognizedString);
+            }
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+            characters(ch, start, length); // ignorable whitespace is reported exactly like ordinary character data
+        }
+
+ // ---- DTDHandler ----
+
+        /**
+         * Reports an unparsed (NDATA) entity declaration.
+         *
+         * With an Unparsed handler set it is called with
+         * (expat, name, base, sysid, pubid, notation); a missing system id is passed
+         * as the empty string. Otherwise, when only an Entity handler is set, the
+         * declaration is routed through it as
+         * (expat, name, undef, sysid, pubid, ndata, 0). With neither handler set
+         * nothing happens.
+         */
+        @Override
+        public void unparsedEntityDecl(String name, String publicId, String systemId,
+                                       String notationName) throws SAXException {
+            if (state.unparsedHandler != null) {
+                RuntimeArray argv = new RuntimeArray();
+                RuntimeArray.push(argv, state.selfRef);
+                RuntimeArray.push(argv, new RuntimeScalar(name));
+                RuntimeArray.push(argv, state.base == null ? scalarUndef : new RuntimeScalar(state.base));
+                String sysId = unresolveSysId(systemId, state);
+                RuntimeArray.push(argv, new RuntimeScalar(sysId == null ? "" : sysId));
+                RuntimeArray.push(argv, publicId == null ? scalarUndef : new RuntimeScalar(publicId));
+                RuntimeArray.push(argv, new RuntimeScalar(notationName));
+                try {
+                    RuntimeCode.apply(state.unparsedHandler, argv, RuntimeContextType.VOID);
+                } catch (PerlDieException die) {
+                    throw new SAXException(die);
+                }
+                return;
+            }
+
+            if (state.entityDeclHandler == null) {
+                return;
+            }
+
+            // Per Expat.pm docs: "If both [Entity and Unparsed handlers] are set,
+            // then [Entity] handler will not be called for unparsed entities."
+            // When only Entity handler is set, route unparsed entities through it.
+            RuntimeArray argv = new RuntimeArray();
+            RuntimeArray.push(argv, state.selfRef);
+            RuntimeArray.push(argv, new RuntimeScalar(name));
+            RuntimeArray.push(argv, scalarUndef); // val (undef for external entities)
+            String sysId = unresolveSysId(systemId, state);
+            RuntimeArray.push(argv, sysId == null ? scalarUndef : new RuntimeScalar(sysId));
+            RuntimeArray.push(argv, publicId == null ? scalarUndef : new RuntimeScalar(publicId));
+            RuntimeArray.push(argv, new RuntimeScalar(notationName)); // ndata
+            RuntimeArray.push(argv, scalarZero); // is_param
+            try {
+                RuntimeCode.apply(state.entityDeclHandler, argv, RuntimeContextType.VOID);
+            } catch (PerlDieException die) {
+                throw new SAXException(die);
+            }
+        }
+
+        /**
+         * Reports a notation declaration to the Notation handler, if one is set,
+         * as (expat, name, base, sysid, pubid). Missing base, system id or public
+         * id values are passed as undef.
+         */
+        @Override
+        public void notationDecl(String name, String publicId, String systemId)
+                throws SAXException {
+            if (state.notationHandler == null) {
+                return;
+            }
+
+            RuntimeArray argv = new RuntimeArray();
+            RuntimeArray.push(argv, state.selfRef);
+            RuntimeArray.push(argv, new RuntimeScalar(name));
+            RuntimeArray.push(argv, state.base == null ? scalarUndef : new RuntimeScalar(state.base));
+            String sysId = unresolveSysId(systemId, state);
+            RuntimeArray.push(argv, sysId == null ? scalarUndef : new RuntimeScalar(sysId));
+            RuntimeArray.push(argv, publicId == null ? scalarUndef : new RuntimeScalar(publicId));
+            try {
+                RuntimeCode.apply(state.notationHandler, argv, RuntimeContextType.VOID);
+            } catch (PerlDieException die) {
+                throw new SAXException(die);
+            }
+        }
+
+ // ---- LexicalHandler ----
+
+        /**
+         * Delivers an XML comment. Ignored while skip_until is active.
+         * recognizedString/originalString are rebuilt as the full comment markup;
+         * the Comment handler receives only the comment text, while the Default
+         * handler (used when no Comment handler is set) receives the markup.
+         *
+         * @throws SAXException wrapping a PerlDieException raised by the Comment handler
+         */
+        @Override
+        public void comment(char[] ch, int start, int length) throws SAXException {
+            if (state.skipUntilIndex >= 0) return;
+
+            String text = new String(ch, start, length);
+            // Rebuild the token as it appears in the document. An empty string here
+            // would hand the Default handler nothing and give position tracking a
+            // zero-length token.
+            state.recognizedString = "<!--" + text + "-->";
+            state.originalString = state.recognizedString;
+            updateBytePosition(state);
+
+            if (state.commentHandler != null) {
+                RuntimeArray callArgs = new RuntimeArray();
+                RuntimeArray.push(callArgs, state.selfRef);
+                RuntimeArray.push(callArgs, new RuntimeScalar(text));
+                try {
+                    RuntimeCode.apply(state.commentHandler, callArgs, RuntimeContextType.VOID);
+                } catch (PerlDieException e) {
+                    throw new SAXException(e);
+                }
+            } else if (state.defaultHandler != null) {
+                fireDefault(state, state.recognizedString);
+            }
+        }
+
+        /**
+         * Marks the start of a CDATA section and, unless skip_until is active,
+         * calls the CdataStart handler with (expat) when one is set.
+         */
+        @Override
+        public void startCDATA() throws SAXException {
+            inCDATA = true;
+            if (state.skipUntilIndex >= 0 || state.startCdataHandler == null) {
+                return;
+            }
+
+            RuntimeArray argv = new RuntimeArray();
+            RuntimeArray.push(argv, state.selfRef);
+            try {
+                RuntimeCode.apply(state.startCdataHandler, argv, RuntimeContextType.VOID);
+            } catch (PerlDieException die) {
+                throw new SAXException(die);
+            }
+        }
+
+        /**
+         * Marks the end of a CDATA section and, unless skip_until is active,
+         * calls the CdataEnd handler with (expat) when one is set.
+         */
+        @Override
+        public void endCDATA() throws SAXException {
+            inCDATA = false;
+            if (state.skipUntilIndex >= 0 || state.endCdataHandler == null) {
+                return;
+            }
+
+            RuntimeArray argv = new RuntimeArray();
+            RuntimeArray.push(argv, state.selfRef);
+            try {
+                RuntimeCode.apply(state.endCdataHandler, argv, RuntimeContextType.VOID);
+            } catch (PerlDieException die) {
+                throw new SAXException(die);
+            }
+        }
+
+        /**
+         * Reports the start of the DOCTYPE declaration to the Doctype handler, if
+         * one is set, as (expat, name, sysid, pubid, internal); missing ids are
+         * passed as undef and the internal-subset flag is always true.
+         */
+        @Override
+        public void startDTD(String name, String publicId, String systemId) throws SAXException {
+            if (state.doctypeHandler == null) {
+                return;
+            }
+
+            RuntimeArray argv = new RuntimeArray();
+            RuntimeArray.push(argv, state.selfRef);
+            RuntimeArray.push(argv, new RuntimeScalar(name));
+            RuntimeArray.push(argv, systemId == null ? scalarUndef : new RuntimeScalar(systemId));
+            RuntimeArray.push(argv, publicId == null ? scalarUndef : new RuntimeScalar(publicId));
+            RuntimeArray.push(argv, scalarTrue); // internal subset
+            try {
+                RuntimeCode.apply(state.doctypeHandler, argv, RuntimeContextType.VOID);
+            } catch (PerlDieException die) {
+                throw new SAXException(die);
+            }
+        }
+
+        /**
+         * Reports the end of the DOCTYPE declaration by calling the DoctypeFin
+         * handler with (expat), when one is set.
+         */
+        @Override
+        public void endDTD() throws SAXException {
+            if (state.endDoctypeHandler == null) {
+                return;
+            }
+
+            RuntimeArray argv = new RuntimeArray();
+            RuntimeArray.push(argv, state.selfRef);
+            try {
+                RuntimeCode.apply(state.endDoctypeHandler, argv, RuntimeContextType.VOID);
+            } catch (PerlDieException die) {
+                throw new SAXException(die);
+            }
+        }
+
+ @Override
+ public void startEntity(String name) throws SAXException {
+ // Track entity name so characters() can set originalString correctly.
+ // JDK SAX fires: startEntity → endEntity → characters,
+ // so we use a "pending" approach: set the name here, consume in characters().
+ if (!name.startsWith("[")) { // Skip internal SAX entities like [dtd]
+ state.currentEntityName = name;
+ }
+ }
+
+        @Override
+        public void endEntity(String name) throws SAXException {
+            // Intentionally empty: don't clear state.currentEntityName here - characters() hasn't fired yet (JDK ordering) and consumes the name itself
+        }
+
+ // ---- DeclHandler ----
+
+ @Override
+ public void internalEntityDecl(String name, String value) throws SAXException {
+ if (state.entityDeclHandler != null) {
+ RuntimeArray callArgs = new RuntimeArray();
+ RuntimeArray.push(callArgs, state.selfRef);
+ RuntimeArray.push(callArgs, new RuntimeScalar(name));
+ RuntimeArray.push(callArgs, new RuntimeScalar(value)); // value
+ RuntimeArray.push(callArgs, scalarUndef); // sysid
+ RuntimeArray.push(callArgs, scalarUndef); // pubid
+ RuntimeArray.push(callArgs, scalarUndef); // notation
+ RuntimeArray.push(callArgs, new RuntimeScalar(name.startsWith("%") ? 1 : 0)); // is_param
+ try {
+ RuntimeCode.apply(state.entityDeclHandler, callArgs, RuntimeContextType.VOID);
+ } catch (PerlDieException e) {
+ throw new SAXException(e);
+ }
+ }
+ }
+
+ @Override
+ public void externalEntityDecl(String name, String publicId, String systemId)
+ throws SAXException {
+ if (state.entityDeclHandler != null) {
+ RuntimeArray callArgs = new RuntimeArray();
+ RuntimeArray.push(callArgs, state.selfRef);
+ RuntimeArray.push(callArgs, new RuntimeScalar(name));
+ RuntimeArray.push(callArgs, scalarUndef); // value (external entities have no inline value)
+ String rawExtSysId = unresolveSysId(systemId, state);
+ RuntimeArray.push(callArgs, rawExtSysId != null ? new RuntimeScalar(rawExtSysId) : scalarUndef);
+ RuntimeArray.push(callArgs, publicId != null ? new RuntimeScalar(publicId) : scalarUndef);
+ RuntimeArray.push(callArgs, scalarUndef); // notation
+ RuntimeArray.push(callArgs, new RuntimeScalar(name.startsWith("%") ? 1 : 0)); // is_param
+ try {
+ RuntimeCode.apply(state.entityDeclHandler, callArgs, RuntimeContextType.VOID);
+ } catch (PerlDieException e) {
+ throw new SAXException(e);
+ }
+ }
+ }
+
+        /**
+         * Reports an element declaration to the Element handler, if one is set, as
+         * (expat, name, model), where model is the content-model string parsed by
+         * parseContentModel.
+         */
+        @Override
+        public void elementDecl(String name, String model) throws SAXException {
+            if (state.elementDeclHandler == null) {
+                return;
+            }
+
+            RuntimeScalar parsedModel = parseContentModel(model);
+
+            RuntimeArray argv = new RuntimeArray();
+            RuntimeArray.push(argv, state.selfRef);
+            RuntimeArray.push(argv, new RuntimeScalar(name));
+            RuntimeArray.push(argv, parsedModel);
+            try {
+                RuntimeCode.apply(state.elementDeclHandler, argv, RuntimeContextType.VOID);
+            } catch (PerlDieException die) {
+                throw new SAXException(die);
+            }
+        }
+
+        /**
+         * Entry point for content-model parsing: trims the DTD content model string
+         * and hands the whole of it to parseModelExpr, which builds the blessed
+         * ContentModel hash (EMPTY, ANY, (#PCDATA), nested (a,b|c) with quantifiers).
+         */
+        private RuntimeScalar parseContentModel(String model) {
+            String trimmed = model.trim();
+            return parseModelExpr(trimmed, 0, trimmed.length());
+        }
+
+        /**
+         * Parse the slice model[start, end) of a DTD content model into a blessed
+         * ContentModel reference (see makeContentModel).
+         *
+         * Type codes used: 1 = EMPTY, 2 = ANY, 3 = MIXED, 4 = NAME, 5 = CHOICE,
+         * 6 = SEQ. One trailing occurrence indicator (*, + or ?) is recorded as the
+         * quantifier. A parenthesized group containing no ',' or '|' at all is
+         * parsed as its single child.
+         *
+         * Whether a group is a CHOICE or a SEQ is decided by the separator found
+         * outside any nested parentheses, so a group such as ((a,b)|c) is split
+         * on '|' rather than being misread because of the nested ','.
+         */
+        private RuntimeScalar parseModelExpr(String model, int start, int end) {
+            String s = model.substring(start, end).trim();
+
+            // EMPTY
+            if (s.equals("EMPTY")) {
+                return makeContentModel(1, null, null, null); // Type 1 = EMPTY
+            }
+            // ANY
+            if (s.equals("ANY")) {
+                return makeContentModel(2, null, null, null); // Type 2 = ANY
+            }
+
+            // Check for quantifier at the end
+            String quant = null;
+            if (s.endsWith("*") || s.endsWith("+") || s.endsWith("?")) {
+                quant = s.substring(s.length() - 1);
+                s = s.substring(0, s.length() - 1).trim();
+            }
+
+            // Anything that is not a parenthesized group is a simple NAME.
+            // The length check keeps a lone "(" from reaching substring(1, 0).
+            boolean isGroup = s.length() >= 2 && s.startsWith("(") && s.endsWith(")");
+            if (!isGroup) {
+                return makeContentModel(4, s, quant, null); // Type 4 = NAME
+            }
+
+            String inner = s.substring(1, s.length() - 1).trim();
+
+            // (#PCDATA...) = MIXED
+            if (inner.startsWith("#PCDATA")) {
+                return makeContentModel(3, null, quant, parseMixedChildren(inner));
+            }
+
+            // Single child with no separators anywhere: parse the child itself
+            if (!inner.contains(",") && !inner.contains("|")) {
+                return parseModelExpr(inner, 0, inner.length());
+            }
+
+            // ',' means SEQ, '|' means CHOICE - judged at nesting depth 0. When the
+            // group has no depth-0 separator (one nested child), fall back to the
+            // previous whole-text rule so those results stay as they were.
+            char sep = topLevelSeparator(inner);
+            boolean isChoice = sep == '|'
+                    || (sep == '\0' && inner.contains("|") && !inner.contains(","));
+            int type = isChoice ? 5 : 6; // 5=CHOICE, 6=SEQ
+
+            List<RuntimeScalar> children = new ArrayList<>();
+            for (String part : splitModelGroup(inner)) {
+                String child = part.trim();
+                children.add(parseModelExpr(child, 0, child.length()));
+            }
+            return makeContentModel(type, null, quant, children);
+        }
+
+        /**
+         * Children of a mixed-content group: the text inside the parentheses is
+         * split on '|' and every part other than #PCDATA becomes a NAME model.
+         */
+        private List<RuntimeScalar> parseMixedChildren(String inner) {
+            // (#PCDATA|foo|bar) - split on | and skip #PCDATA
+            List<RuntimeScalar> children = new ArrayList<>();
+            String[] parts = inner.split("\\|");
+            for (String part : parts) {
+                part = part.trim();
+                if (!part.equals("#PCDATA")) {
+                    children.add(makeContentModel(4, part, null, null));
+                }
+            }
+            return children;
+        }
+
+        /**
+         * Returns the first ',' or '|' that occurs outside any parentheses in
+         * the group text, or '\0' when there is none.
+         */
+        private char topLevelSeparator(String group) {
+            int depth = 0;
+            for (int i = 0; i < group.length(); i++) {
+                char c = group.charAt(i);
+                if (c == '(') depth++;
+                else if (c == ')') depth--;
+                else if (depth == 0 && (c == ',' || c == '|')) return c;
+            }
+            return '\0';
+        }
+
+        /**
+         * Split a model group respecting nested parentheses.
+         * E.g. "(a,(b|c)),d" → ["(a,(b|c))", "d"]
+         * The separator is the one found at depth 0; a group without one is
+         * returned as a single part.
+         */
+        private List<String> splitModelGroup(String group) {
+            List<String> parts = new ArrayList<>();
+            int depth = 0;
+            int start = 0;
+            char sep = topLevelSeparator(group);
+            for (int i = 0; i < group.length(); i++) {
+                char c = group.charAt(i);
+                if (c == '(') depth++;
+                else if (c == ')') depth--;
+                else if (c == sep && depth == 0) {
+                    parts.add(group.substring(start, i));
+                    start = i + 1;
+                }
+            }
+            parts.add(group.substring(start));
+            return parts;
+        }
+
+        /**
+         * Builds a hash with the keys Type, Tag and Quant (undef when the argument
+         * is null) plus Children (an array reference, only when the list is
+         * non-empty), and returns a reference to it blessed into
+         * XML::Parser::ContentModel.
+         *
+         * @param type     numeric model type stored under "Type"
+         * @param tag      element name, or null
+         * @param quant    occurrence indicator, or null
+         * @param children child models in order, or null/empty for none
+         */
+        private RuntimeScalar makeContentModel(int type, String tag, String quant,
+                                               List<RuntimeScalar> children) {
+            RuntimeHash model = new RuntimeHash();
+            model.put("Type", new RuntimeScalar(type));
+            model.put("Tag", tag != null ? new RuntimeScalar(tag) : scalarUndef);
+            model.put("Quant", quant != null ? new RuntimeScalar(quant) : scalarUndef);
+            if (children != null && !children.isEmpty()) {
+                RuntimeArray childArray = new RuntimeArray();
+                // The element type on the parameter lets this loop bind each child
+                // as a RuntimeScalar without a cast.
+                for (RuntimeScalar child : children) {
+                    RuntimeArray.push(childArray, child);
+                }
+                model.put("Children", childArray.createReference());
+            }
+            RuntimeScalar ref = model.createReference();
+            ReferenceOperators.bless(ref, new RuntimeScalar("XML::Parser::ContentModel"));
+            return ref;
+        }
+
+ @Override
+ public void attributeDecl(String eName, String aName, String type, String mode,
+ String value) throws SAXException {
+ if (state.attlistDeclHandler != null) {
+ // Fix type format: SAX reports "NOTATION (x|y|z)" with space,
+ // expat reports "NOTATION(x|y|z)" without space
+ String fixedType = type;
+ if (fixedType != null && fixedType.startsWith("NOTATION ")) {
+ fixedType = "NOTATION" + fixedType.substring(9);
+ }
+
+ // Compute default parameter per Perl API:
+ // "#REQUIRED", "#IMPLIED", or "'quoted_value'" (with quotes)
+ String defaultStr;
+ if ("#REQUIRED".equals(mode)) {
+ defaultStr = "#REQUIRED";
+ } else if ("#IMPLIED".equals(mode)) {
+ defaultStr = "#IMPLIED";
+ } else if (value != null) {
+ defaultStr = "'" + value + "'";
+ } else {
+ defaultStr = null;
+ }
+
+ RuntimeArray callArgs = new RuntimeArray();
+ RuntimeArray.push(callArgs, state.selfRef);
+ RuntimeArray.push(callArgs, new RuntimeScalar(eName));
+ RuntimeArray.push(callArgs, new RuntimeScalar(aName));
+ RuntimeArray.push(callArgs, new RuntimeScalar(fixedType));
+ RuntimeArray.push(callArgs, defaultStr != null ? new RuntimeScalar(defaultStr) : scalarUndef);
+ RuntimeArray.push(callArgs, new RuntimeScalar("#FIXED".equals(mode) ? 1 : 0));
+ try {
+ RuntimeCode.apply(state.attlistDeclHandler, callArgs, RuntimeContextType.VOID);
+ } catch (PerlDieException e) {
+ throw new SAXException(e);
+ }
+ }
+ }
+
+ /**
+  * Invoke the XMLDecl handler for a text declaration found in an external entity.
+  * libexpat calls the XMLDecl handler for text declarations of external parsed
+  * entities as well as for the document's own XML declaration (passing an
+  * undefined version); SAX does not, so the call is made by hand here.
+  * Nothing happens when no XMLDecl handler is set or when no encoding
+  * declaration can be extracted from {@code rawBytes}.
+  */
+ private void fireTextDeclHandler(byte[] rawBytes) throws SAXException {
+     if (state.xmlDeclHandler == null) {
+         return;
+     }
+     String declaredEncoding = extractDeclaredEncoding(rawBytes);
+     if (declaredEncoding == null) {
+         return;
+     }
+
+     // Argument order: self, version (undef for text declarations),
+     // encoding, standalone (undef).
+     RuntimeArray handlerArgs = new RuntimeArray();
+     RuntimeArray.push(handlerArgs, state.selfRef);
+     RuntimeArray.push(handlerArgs, scalarUndef);
+     RuntimeArray.push(handlerArgs, new RuntimeScalar(declaredEncoding));
+     RuntimeArray.push(handlerArgs, scalarUndef);
+     try {
+         RuntimeCode.apply(state.xmlDeclHandler, handlerArgs, RuntimeContextType.VOID);
+     } catch (PerlDieException e) {
+         throw new SAXException(e);
+     }
+ }
+
+ // ---- EntityResolver ----
+
+ /**
+  * Resolve an external entity for the SAX parser.
+  *
+  * Resolution order:
+  * 1. The synthetic foreign-DTD system ID is answered from
+  *    {@code state.foreignDtdContent}.
+  * 2. If an ExternEnt handler is set it is called with
+  *    (self, base, systemId, publicId). An undef result returns null.
+  *    A filehandle result is read to the end; any other non-empty result
+  *    is used as the entity text. In both cases the ExternEntFin handler
+  *    (if set) is called, then the text-declaration handler is fired and
+  *    the bytes are passed through {@code convertEncoding}.
+  * 3. Otherwise an empty source is returned to avoid network access.
+  *
+  * @throws SAXException wrapping a PerlDieException or IOException raised
+  *                      while calling the handlers or reading the entity
+  */
+ @Override
+ public InputSource resolveEntity(String publicId, String systemId) throws SAXException {
+     // Handle synthetic foreign DTD system ID (from UseForeignDTD injection)
+     if (systemId != null && systemId.contains("__perlonjava_foreign_dtd__")
+             && state.foreignDtdContent != null) {
+         return rawEntitySource(state.foreignDtdContent, systemId);
+     }
+
+     if (state.externEntHandler != null) {
+         RuntimeArray callArgs = new RuntimeArray();
+         RuntimeArray.push(callArgs, state.selfRef);
+         RuntimeArray.push(callArgs, state.base != null ? new RuntimeScalar(state.base) : scalarUndef);
+         String rawResSysId = unresolveSysId(systemId, state);
+         RuntimeArray.push(callArgs, rawResSysId != null ? new RuntimeScalar(rawResSysId) : scalarUndef);
+         RuntimeArray.push(callArgs, publicId != null ? new RuntimeScalar(publicId) : scalarUndef);
+         try {
+             RuntimeList result = RuntimeCode.apply(state.externEntHandler, callArgs,
+                     RuntimeContextType.SCALAR);
+             RuntimeScalar retVal = result.getFirst();
+
+             if (retVal.type == RuntimeScalarType.UNDEF) {
+                 // Handler returned undef - entity could not be resolved
+                 return null;
+             }
+
+             // Handler returned a filehandle: read its content as bytes
+             if (RuntimeScalarType.isReference(retVal) || retVal.type == RuntimeScalarType.GLOB) {
+                 RuntimeIO fh = RuntimeIO.getRuntimeIO(retVal);
+                 if (fh != null) {
+                     ByteArrayOutputStream entBaos = new ByteArrayOutputStream();
+                     while (true) {
+                         RuntimeScalar line = fh.ioHandle.read(8192);
+                         if (line.type == RuntimeScalarType.UNDEF) break;
+                         String s = line.toString();
+                         if (s.isEmpty()) break;
+                         entBaos.write(s.getBytes(entityCharsetFor(line)));
+                     }
+                     notifyExternEntFinished();
+                     return handlerEntitySource(entBaos.toByteArray(), systemId);
+                 }
+                 // Not a usable filehandle: fall through and use its string form
+             }
+
+             // String content
+             String content = retVal.toString();
+             if (!content.isEmpty()) {
+                 notifyExternEntFinished();
+                 // Convert to bytes for encoding handling (string may contain raw byte values)
+                 return handlerEntitySource(content.getBytes(entityCharsetFor(retVal)), systemId);
+             }
+         } catch (PerlDieException e) {
+             throw new SAXException(e);
+         } catch (IOException e) {
+             throw new SAXException(e);
+         }
+     }
+     // Return empty input source to avoid network access
+     return new InputSource(new StringReader(""));
+ }
+
+ /** Byte strings map 1:1 through ISO-8859-1; every other scalar is encoded as UTF-8. */
+ private java.nio.charset.Charset entityCharsetFor(RuntimeScalar scalar) {
+     return (scalar.type == RuntimeScalarType.BYTE_STRING)
+             ? StandardCharsets.ISO_8859_1 : StandardCharsets.UTF_8;
+ }
+
+ /** Call the ExternEntFin handler with (self) if one is set. */
+ private void notifyExternEntFinished() {
+     if (state.externEntFinHandler != null) {
+         RuntimeArray finArgs = new RuntimeArray();
+         RuntimeArray.push(finArgs, state.selfRef);
+         RuntimeCode.apply(state.externEntFinHandler, finArgs, RuntimeContextType.VOID);
+     }
+ }
+
+ /** Fire the text-declaration handler, convert the encoding, and wrap the result. */
+ private InputSource handlerEntitySource(byte[] entRawBytes, String systemId)
+         throws SAXException, IOException {
+     fireTextDeclHandler(entRawBytes);
+     return rawEntitySource(convertEncoding(entRawBytes), systemId);
+ }
+
+ /**
+  * Wrap bytes in an InputSource, preserving systemId (when non-null) so SAX
+  * can resolve relative references within the entity.
+  */
+ private InputSource rawEntitySource(byte[] bytes, String systemId) {
+     InputSource is = new InputSource(new ByteArrayInputStream(bytes));
+     if (systemId != null) {
+         is.setSystemId(systemId);
+     }
+     return is;
+ }
+
+ // ---- ErrorHandler ----
+
+ /**
+  * SAX warnings are deliberately dropped: nothing is recorded in the
+  * parser state and no exception is raised.
+  */
+ @Override
+ public void warning(SAXParseException e) throws SAXException {
+ // Ignore warnings
+ }
+
+ @Override
+ public void error(SAXParseException e) throws SAXException {
+ state.errorMessage = formatSAXError(e);
+ // Also set ErrorMessage in Perl hash for expat compatibility
+ RuntimeHash selfHash = state.selfRef.hashDeref();
+ selfHash.put("ErrorMessage", new RuntimeScalar(state.errorMessage));
+ throw e;
+ }
+
+ @Override
+ public void fatalError(SAXParseException e) throws SAXException {
+ state.errorMessage = formatSAXError(e);
+ // Also set ErrorMessage in Perl hash for expat compatibility
+ RuntimeHash selfHash = state.selfRef.hashDeref();
+ selfHash.put("ErrorMessage", new RuntimeScalar(state.errorMessage));
+ throw e;
+ }
+
+ private String formatSAXError(SAXParseException e) {
+ return "not well-formed (invalid token) at line " + e.getLineNumber()
+ + ", column " + e.getColumnNumber();
+ }
+ }
+
+ // ================================================================
+ // Utility methods
+ // ================================================================
+
+ /**
+  * Invoke a Perl handler in void context.
+  * The argument list is the expat self reference followed by {@code extraArgs}
+  * in the order given.
+  */
+ private static void fireCallback(ParserState state, RuntimeScalar handler, RuntimeScalar... extraArgs) {
+     RuntimeArray argv = new RuntimeArray();
+     RuntimeArray.push(argv, state.selfRef);
+     for (int i = 0; i < extraArgs.length; i++) {
+         RuntimeArray.push(argv, extraArgs[i]);
+     }
+     RuntimeCode.apply(handler, argv, RuntimeContextType.VOID);
+ }
+
+ /**
+  * Fire the Default handler with a string.
+  * Does nothing when no Default handler is set. The handler is called in
+  * void context with (self, text). A PerlDieException raised by the handler
+  * propagates to the caller unchanged; it is not wrapped in a SAXException
+  * here.
+  */
+ private static void fireDefault(ParserState state, String text) {
+     if (state.defaultHandler == null) {
+         return;
+     }
+     RuntimeArray callArgs = new RuntimeArray();
+     RuntimeArray.push(callArgs, state.selfRef);
+     RuntimeArray.push(callArgs, new RuntimeScalar(text));
+     RuntimeCode.apply(state.defaultHandler, callArgs, RuntimeContextType.VOID);
+ }
+
+ /**
+  * Advance the approximate byte position past the most recently recognized token.
+  * The current index becomes the running total so far, the current count becomes
+  * the token's UTF-8 length, and the running total grows by that length.
+  * No-op when there is no recognized string.
+  */
+ private static void updateBytePosition(ParserState state) {
+     String token = state.recognizedString;
+     if (token == null) {
+         return;
+     }
+     int tokenBytes = token.getBytes(StandardCharsets.UTF_8).length;
+     state.currentByteIndex = state.bytesProcessed;
+     state.currentByteCount = tokenBytes;
+     state.bytesProcessed += tokenBytes;
+ }
+
+ /**
+  * Escape special characters in XML attribute values.
+  * Replaces {@code &}, {@code <} and {@code "} with their predefined
+  * entity references. The ampersand is handled first so the ampersands
+  * introduced by the later replacements are not escaped again.
+  *
+  * @param value the raw attribute value (must not be null)
+  * @return the value, safe to place between double quotes
+  */
+ private static String escapeXmlAttr(String value) {
+     return value.replace("&", "&amp;")
+             .replace("<", "&lt;")
+             .replace("\"", "&quot;");
+ }
+
+ /**
+  * Recover the system ID as written in the document from the absolute URI
+  * that SAX hands to the entity resolver.
+  * SAX turns a relative system ID such as "logo.gif" into an absolute URI
+  * such as "file:///path/to/logo.gif"; expat passes the raw string through.
+  * The prefixes tried, in order, are: the parse base URI, the directory part
+  * of the explicit base, and finally the "file:" scheme plus the current
+  * working directory. If nothing matches, the input is returned unchanged.
+  */
+ private static String unresolveSysId(String systemId, ParserState state) {
+     if (systemId == null) {
+         return null;
+     }
+
+     // 1. Base URI that was set on the InputSource for this parse
+     String parseBase = state.parseBaseUri;
+     if (parseBase != null && systemId.startsWith(parseBase)) {
+         return systemId.substring(parseBase.length());
+     }
+
+     // 2. Explicit base, cut back to its last '/' when it does not end in one
+     if (state.base != null) {
+         String baseDir = state.base;
+         int lastSlash = baseDir.lastIndexOf('/');
+         if (!baseDir.endsWith("/") && lastSlash >= 0) {
+             baseDir = baseDir.substring(0, lastSlash + 1);
+         }
+         if (systemId.startsWith(baseDir)) {
+             return systemId.substring(baseDir.length());
+         }
+     }
+
+     // 3. file: URIs - drop the scheme, then the working directory if present
+     if (!systemId.startsWith("file:")) {
+         return systemId;
+     }
+     try {
+         String cwd = System.getProperty("user.dir");
+         // "file://..." loses 7 characters ("file:///p" -> "/p", "file://p" -> "p");
+         // every other form loses only "file:" ("file:/p" -> "/p", "file:p" -> "p").
+         int schemeLength = systemId.startsWith("file://") ? 7 : 5;
+         String filePath = systemId.substring(schemeLength);
+         if (cwd != null) {
+             String cwdPrefix = cwd.endsWith("/") ? cwd : cwd + "/";
+             if (filePath.startsWith(cwdPrefix)) {
+                 return filePath.substring(cwdPrefix.length());
+             }
+         }
+         return filePath;
+     } catch (Exception ignored) {
+         return systemId;
+     }
+ }
+
+ /**
+  * Format an error with line/column info, matching expat error format.
+  *
+  * A SAXException that wraps a PerlDieException is not formatted: the
+  * nested PerlDieException is rethrown so the original Perl error
+  * propagates. For a SAXParseException the text is either "undefined
+  * entity" (when the message reports an undeclared entity reference) or
+  * "not well-formed (invalid token)" followed by a hint about escaping,
+  * then the exception's line and column. Any other exception keeps its
+  * own message, with the locator position appended when a locator is set.
+  *
+  * @param state parser state; only {@code locator} is read
+  * @param e     the exception to describe
+  * @return the formatted message
+  */
+ private static String formatError(ParserState state, Exception e) {
+     // Unwrap SAXException wrapping PerlDieException
+     if (e instanceof SAXException) {
+         Exception nested = ((SAXException) e).getException();
+         if (nested instanceof PerlDieException) {
+             throw (PerlDieException) nested;
+         }
+     }
+     String msg = e.getMessage() != null ? e.getMessage() : e.toString();
+     // For SAXParseExceptions (XML parse errors), format like expat
+     if (e instanceof org.xml.sax.SAXParseException) {
+         org.xml.sax.SAXParseException spe = (org.xml.sax.SAXParseException) e;
+         StringBuilder sb = new StringBuilder();
+         // Detect specific error types and map to expat error messages
+         if (msg.contains("was referenced, but not declared")) {
+             sb.append("undefined entity");
+         } else {
+             sb.append("not well-formed (invalid token)");
+             sb.append("\n(Hint: \"not well-formed\" often indicates unescaped '<', '>' or '&'");
+             // The hint must name the escaped forms, not the raw characters.
+             sb.append(" in content \u2014 use &lt; &gt; or &amp; instead)");
+         }
+         sb.append("\nat line ").append(spe.getLineNumber());
+         sb.append(", column ").append(spe.getColumnNumber());
+         sb.append("\n");
+         return sb.toString();
+     }
+     if (state.locator != null) {
+         msg += "\nat line " + state.locator.getLineNumber()
+                 + ", column " + state.locator.getColumnNumber();
+     }
+     return msg;
+ }
+}
diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java
index c0e784df8..9f5ab65b6 100644
--- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java
+++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java
@@ -2067,7 +2067,16 @@ public static RuntimeList apply(RuntimeScalar runtimeScalar, RuntimeArray a, int
HintHashRegistry.pushCallerHintHash();
try {
// Cast the value to RuntimeCode and call apply()
- return code.apply(a, callContext);
+ RuntimeList result = code.apply(a, callContext);
+ // Handle tail calls (goto &func) — trampoline loop
+ // JVM-generated bytecode has its own trampoline; this handles calls from Java code
+ while (result instanceof RuntimeControlFlowList cfList
+ && cfList.getControlFlowType() == ControlFlowType.TAILCALL) {
+ RuntimeScalar tailCodeRef = cfList.getTailCallCodeRef();
+ RuntimeArray tailArgs = cfList.getTailCallArgs();
+ result = apply(tailCodeRef, tailArgs != null ? tailArgs : a, callContext);
+ }
+ return result;
} catch (PerlNonLocalReturnException e) {
// Non-local return from map/grep block
if (code.isMapGrepBlock || code.isEvalBlock) {
diff --git a/src/main/perl/lib/CPAN/Distribution.pm b/src/main/perl/lib/CPAN/Distribution.pm
index 90038b293..0d3faf2bd 100644
--- a/src/main/perl/lib/CPAN/Distribution.pm
+++ b/src/main/perl/lib/CPAN/Distribution.pm
@@ -2101,6 +2101,10 @@ sub prepare {
CPAN::Reporter::grade_PL( $self, $system, $output, $ret );
}
else {
+ # PerlOnJava: Stub out Devel::CheckLib in build directory
+ # so Makefile.PL can proceed to WriteMakefile even when
+ # native library checks would fail (we can't compile C).
+ $self->_stub_native_checkers_perlonjava();
$ret = system($system);
}
if ($ret != 0) {
@@ -2117,6 +2121,14 @@ sub prepare {
$self->store_persistent_state;
return $self->success("$system -- OK");
} else {
+ # PerlOnJava: When Makefile.PL exits 0 but no Makefile is created,
+ # generate a fallback Makefile.PL from META and re-run it.
+ if ($self->_try_perlonjava_fallback_pl($system)) {
+ $self->{writemakefile} = CPAN::Distrostatus->new("YES");
+ delete $self->{make_clean};
+ $self->store_persistent_state;
+ return $self->success("$system -- OK (PerlOnJava XS fallback)");
+ }
my $makefile = $self->{modulebuild} ? "Build" : "Makefile";
my $why = "No '$makefile' created";
$CPAN::Frontend->mywarn($why);
@@ -2130,6 +2142,86 @@ sub prepare {
return 1; # success
}
+#-> sub CPAN::Distribution::_stub_native_checkers_perlonjava
+# PerlOnJava: Replace a bundled inc/Devel/CheckLib.pm (relative to the
+# current directory) with a no-op stub whose assert_lib, check_lib_or_exit
+# and check_lib all return true. This allows Makefile.PL to proceed to
+# WriteMakefile() even when native library checks would fail.
+# Does nothing when the file is absent. A failure to open or write the
+# stub is reported through mywarn; the original file is then left for
+# Makefile.PL to use as-is.
+sub _stub_native_checkers_perlonjava {
+    my ($self) = @_;
+    my $checklib = "inc/Devel/CheckLib.pm";
+    return unless -f $checklib;
+
+    $CPAN::Frontend->myprint("PerlOnJava: Stubbing $checklib for XS module\n");
+    my $fh;
+    unless (open $fh, '>', $checklib) {
+        $CPAN::Frontend->mywarn("PerlOnJava: Cannot stub $checklib: $!\n");
+        return;
+    }
+    print {$fh} <<'STUB';
+package Devel::CheckLib;
+use strict;
+use Exporter;
+our @ISA = ('Exporter');
+our @EXPORT = qw(assert_lib check_lib_or_exit check_lib);
+sub assert_lib { 1 }
+sub check_lib_or_exit { 1 }
+sub check_lib { 1 }
+1;
+STUB
+    # Buffered write errors only surface at close.
+    close $fh
+        or $CPAN::Frontend->mywarn("PerlOnJava: Error writing $checklib: $!\n");
+    return;
+}
+
+
+#-> sub CPAN::Distribution::_try_perlonjava_fallback_pl
+# PerlOnJava: When Makefile.PL exits cleanly but creates no Makefile,
+# generate a minimal fallback Makefile.PL from META.yml/META.json
+# and re-run it so PerlOnJava's WriteMakefile can install .pm files.
+#
+# Arguments: $system - the configure command line to re-run.
+# Returns 1 when a Makefile exists after the re-run, 0 otherwise
+# (no dist name found in META, Makefile.PL not writable, command failed).
+# NOTE: this overwrites the distribution's own Makefile.PL in the current
+# directory.
+sub _try_perlonjava_fallback_pl {
+    my ($self, $system) = @_;
+
+    # Try to extract NAME and VERSION from META files. These are scraped
+    # with regexes rather than parsed, so only simple top-level scalars
+    # are recognised.
+    my ($name, $version);
+    META_FILE:
+    for my $meta_file ('META.yml', 'META.json') {
+        next META_FILE unless -f $meta_file;
+        open my $fh, '<', $meta_file or next META_FILE;
+        my $content = do { local $/; <$fh> };
+        close $fh;
+        next META_FILE unless defined $content;
+
+        if ($meta_file eq 'META.json') {
+            ($name)    = $content =~ /"name"\s*:\s*"([^"]+)"/;
+            ($version) = $content =~ /"version"\s*:\s*"([^"]+)"/;
+        } else {
+            # YAML scalars may be quoted; keep only the bare value.
+            ($name)    = $content =~ /^name:\s*['"]?([^'"\s]+)/m;
+            ($version) = $content =~ /^version:\s*['"]?(\S+?)['"]?\s*$/m;
+        }
+        last META_FILE if $name;
+    }
+
+    return 0 unless $name;
+    $version ||= '0';
+
+    # Convert dist name to module name (e.g., XML-Parser -> XML::Parser)
+    (my $module_name = $name) =~ s/-/::/g;
+
+    $CPAN::Frontend->myprint("PerlOnJava: Generating fallback Makefile.PL for $module_name $version\n");
+
+    # META content comes from the distribution and is not trusted: escape
+    # backslashes and single quotes so the values stay inside the
+    # single-quoted literals of the generated file.
+    (my $name_literal    = $module_name) =~ s/([\\'])/\\$1/g;
+    (my $version_literal = $version)     =~ s/([\\'])/\\$1/g;
+
+    # Write minimal Makefile.PL
+    open my $out, '>', 'Makefile.PL' or return 0;
+    my $written = print {$out} <<"FALLBACK";
+use ExtUtils::MakeMaker;
+WriteMakefile(
+    NAME => '$name_literal',
+    VERSION => '$version_literal',
+);
+FALLBACK
+    my $closed = close $out;
+    return 0 unless $written && $closed;
+
+    # Re-run Makefile.PL
+    my $ret = system($system);
+    return 0 if $ret != 0;
+    return -f "Makefile" ? 1 : 0;
+}
+
+
#-> sub CPAN::Distribution::shortcut_make ;
# return values: undef means don't shortcut; 0 means shortcut as fail;
# and 1 means shortcut as success
diff --git a/src/main/perl/lib/ExtUtils/MakeMaker.pm b/src/main/perl/lib/ExtUtils/MakeMaker.pm
index 7485c2074..e37cd78e4 100644
--- a/src/main/perl/lib/ExtUtils/MakeMaker.pm
+++ b/src/main/perl/lib/ExtUtils/MakeMaker.pm
@@ -240,9 +240,10 @@ sub _install_pure_perl {
# We derive the install subdirectory from the NAME parameter.
if (!%pm && $name) {
my @parts = split /::/, $name;
- pop @parts; # Remove BASEEXT (e.g. Crypt::RC4 -> Crypt)
+ my $baseext = pop @parts; # Remove BASEEXT (e.g. XML::Parser -> Parser)
my $parent_dir = @parts ? File::Spec->catdir(@parts) : '';
+ # Scan flat .pm files in current directory
opendir(my $dh, '.') or warn "Cannot opendir .: $!";
if ($dh) {
while (my $file = readdir($dh)) {
@@ -254,6 +255,22 @@ sub _install_pure_perl {
}
closedir($dh);
}
+
+ # Also scan BASEEXT directory recursively (standard MakeMaker PMLIBDIRS)
+ # e.g. for XML::Parser, scan Parser/ which contains Style/*.pm
+ if ($baseext && -d $baseext) {
+ find({
+ wanted => sub {
+ return unless -f && /$installable_re/;
+ my $src = $File::Find::name;
+ my $rel = $parent_dir
+ ? File::Spec->catfile($parent_dir, $src)
+ : $src;
+ $pm{$src} = File::Spec->catfile($INSTALL_BASE, $rel);
+ },
+ no_chdir => 1,
+ }, $baseext);
+ }
}
}
diff --git a/src/main/perl/lib/XML/Parser.pm b/src/main/perl/lib/XML/Parser.pm
new file mode 100644
index 000000000..ccdca2edd
--- /dev/null
+++ b/src/main/perl/lib/XML/Parser.pm
@@ -0,0 +1,889 @@
+# XML::Parser
+#
+# Copyright (c) 1998-2000 Larry Wall and Clark Cooper
+# All rights reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+
+package XML::Parser;
+
+use strict;
+
+our ( $VERSION, $LWP_load_failed );
+
+use Carp;
+
+# Load the Expat layer at compile time and refuse to continue when its
+# $VERSION string differs from this module's.
+BEGIN {
+ require XML::Parser::Expat;
+ $VERSION = '2.56';
+ die "Parser.pm and Expat.pm versions don't match"
+ unless $VERSION eq $XML::Parser::Expat::VERSION;
+}
+
+$LWP_load_failed = 0;
+
+# Constructor. Takes key/value options and returns the option hash itself,
+# blessed into $class.
+#
+# Besides storing the options it:
+#   * records in Non_Expat_Options the keys that parse/parse_start must
+#     not forward to the Expat constructor;
+#   * builds _HNDL_TYPES (the Expat handler types plus Init and Final),
+#     which setHandlers uses to validate handler names;
+#   * when a Style is given, fills every handler slot that was not set
+#     explicitly from a same-named sub in the style package;
+#   * installs a default external entity handler unless ExternEnt or
+#     ExternEntFin was supplied;
+#   * defaults Pkg to the calling package.
+sub new {
+ my ( $class, %args ) = @_;
+ my $style = $args{Style};
+
+ my $nonexopt = $args{Non_Expat_Options} ||= {};
+
+ $nonexopt->{Style} = 1;
+ $nonexopt->{Non_Expat_Options} = 1;
+ $nonexopt->{Handlers} = 1;
+ $nonexopt->{_HNDL_TYPES} = 1;
+ $nonexopt->{NoLWP} = 1;
+
+ $args{_HNDL_TYPES} = {%XML::Parser::Expat::Handler_Setters};
+ $args{_HNDL_TYPES}->{Init} = 1;
+ $args{_HNDL_TYPES}->{Final} = 1;
+
+ $args{Handlers} ||= {};
+ my $handlers = $args{Handlers};
+
+ if ( defined($style) ) {
+ my $stylepkg = $style;
+
+ # A style name without '::' is first tried as
+ # XML::Parser::Style::<Name> (first letter upper-cased).
+ if ( $stylepkg !~ /::/ ) {
+ $stylepkg = "\u$style";
+
+ eval {
+ my $fullpkg = "XML::Parser::Style::$stylepkg";
+ my $stylefile = $fullpkg;
+ $stylefile =~ s/::/\//g;
+ require "$stylefile.pm";
+ $stylepkg = $fullpkg;
+ };
+ if ($@) {
+
+ # fallback to old behaviour
+ $stylepkg = "XML::Parser::$stylepkg";
+ }
+ }
+
+ foreach my $htype ( keys %{ $args{_HNDL_TYPES} } ) {
+
+ # Handlers explicitly given override
+ # handlers from the Style package
+ unless ( defined( $handlers->{$htype} ) ) {
+
+ # A handler in the style package must either have
+ # exactly the right case as the type name or a
+ # completely lower case version of it.
+
+ my $hname = "${stylepkg}::$htype";
+ if ( defined(&$hname) ) {
+ $handlers->{$htype} = \&$hname;
+ next;
+ }
+
+ $hname = "${stylepkg}::\L$htype";
+ if ( defined(&$hname) ) {
+ $handlers->{$htype} = \&$hname;
+ next;
+ }
+ }
+ }
+ }
+
+ # Only pick a default entity handler when the caller supplied neither
+ # ExternEnt nor ExternEntFin.
+ unless ( defined( $handlers->{ExternEnt} )
+ or defined( $handlers->{ExternEntFin} ) ) {
+
+ if ( $args{NoLWP} or $LWP_load_failed ) {
+ $handlers->{ExternEnt} = \&file_ext_ent_handler;
+ $handlers->{ExternEntFin} = \&file_ext_ent_cleanup;
+ }
+ else {
+ # The following just bootstraps the real LWP external entity
+ # handler
+
+ $handlers->{ExternEnt} = \&initial_ext_ent_handler;
+
+ # No cleanup function available until LWPExternEnt.pl loaded
+ }
+ }
+
+ $args{Pkg} ||= caller;
+ bless \%args, $class;
+} # End of new
+
+# Install handlers given as (Type => coderef, ...) pairs.
+# Croaks on an odd-length list or on a type that is not in _HNDL_TYPES.
+# Returns a flat list of (Type => previous handler) pairs, in call order.
+sub setHandlers {
+    my ( $self, @pairs ) = @_;
+
+    croak('Uneven number of arguments to setHandlers method')
+      if @pairs % 2;
+
+    my @previous;
+    while ( my ( $type, $handler ) = splice( @pairs, 0, 2 ) ) {
+        if ( !defined( $self->{_HNDL_TYPES}->{$type} ) ) {
+            my @types = sort keys %{ $self->{_HNDL_TYPES} };
+
+            croak("Unknown Parser handler type: $type\n Valid types: @types");
+        }
+        push @previous, $type, $self->{Handlers}->{$type};
+        $self->{Handlers}->{$type} = $handler;
+    }
+
+    return @previous;
+}
+
+# Create a non-blocking parser (XML::Parser::ExpatNB) configured from this
+# object. Extra arguments are appended to the stored Expat options.
+# The Init handler, if any, is called on the new object before it is
+# returned; the Final handler is stored on it as FinalHandler.
+sub parse_start {
+    my ( $self, @extra_options ) = @_;
+
+    # Forward every stored option except the ones flagged as non-Expat.
+    my @expat_options;
+    foreach my $key ( keys %{$self} ) {
+        next if exists $self->{Non_Expat_Options}->{$key};
+        push @expat_options, $key, $self->{$key};
+    }
+
+    # Init and Final are handled here, not by Expat.
+    my %handlers = %{ $self->{Handlers} };
+    my ( $init, $final ) = delete @handlers{qw(Init Final)};
+
+    my $expatnb = XML::Parser::ExpatNB->new( @expat_options, @extra_options );
+    $expatnb->setHandlers(%handlers);
+
+    if ( defined $init ) {
+        $init->($expatnb);
+    }
+
+    $expatnb->{_State_} = 1;
+
+    if ( defined $final ) {
+        $expatnb->{FinalHandler} = $final;
+    }
+
+    return $expatnb;
+}
+
+# Parse a document with a fresh XML::Parser::Expat instance.
+#
+# The first argument is handed to Expat's parse method; remaining
+# arguments are appended to the stored Expat options.
+#
+# Return value: when the parse result is true and a Final handler exists,
+# the Final handler's result (called in the caller's list or scalar
+# context). Otherwise scalar context yields Expat's parse result and list
+# context yields an empty list. Nothing is returned in void context.
+# A parse error is re-thrown after the Expat object has been released.
+sub parse {
+ my $self = shift;
+ my $arg = shift;
+ my @expat_options = ();
+ # Forward every stored option except those flagged as non-Expat.
+ for my $key ( keys %{$self} ) {
+ push( @expat_options, $key, $self->{$key} )
+ unless exists $self->{Non_Expat_Options}->{$key};
+ }
+
+ my $expat = XML::Parser::Expat->new( @expat_options, @_ );
+ my %handlers = %{ $self->{Handlers} };
+ # Init and Final are run by this method, not registered with Expat.
+ my $init = delete $handlers{Init};
+ my $final = delete $handlers{Final};
+
+ $expat->setHandlers(%handlers);
+
+ if ( $self->{Base} ) {
+ $expat->base( $self->{Base} );
+ }
+
+ &$init($expat)
+ if defined($init);
+
+ my @result = ();
+ my $result;
+ eval { $result = $expat->parse($arg); };
+ # Copy $@ immediately; release may run code that resets it.
+ my $err = $@;
+ if ($err) {
+ $expat->release;
+ die $err;
+ }
+
+ if ( $result and defined($final) ) {
+ if (wantarray) {
+ @result = &$final($expat);
+ }
+ else {
+ $result = &$final($expat);
+ }
+ }
+
+ $expat->release;
+
+ return unless defined wantarray;
+ return wantarray ? @result : $result;
+}
+
+# Alias for parse: all arguments are passed through unchanged and the
+# result of parse is returned in the caller's context.
+sub parsestring {
+    my ( $self, @args ) = @_;
+    return $self->parse(@args);
+}
+
+# Open $file in binary mode and parse it; extra arguments are passed on
+# to parse. Croaks when the file cannot be opened.
+#
+# While parsing, $self->{Base} is set to the file name; the previous
+# value is restored and the handle closed before any parse error is
+# re-thrown. Returns what parse returns for the caller's context.
+sub parsefile {
+ my $self = shift;
+ my $file = shift;
+
+ open( my $fh, '<', $file ) or croak "Couldn't open $file:\n$!";
+ binmode($fh);
+ my @ret;
+ my $ret;
+
+ my $old_base = $self->{Base};
+ $self->{Base} = $file;
+
+ # Call parse in the same context this method was called in.
+ if (wantarray) {
+ eval { @ret = $self->parse( $fh, @_ ); };
+ }
+ else {
+ eval { $ret = $self->parse( $fh, @_ ); };
+ }
+ # Save the error before the cleanup below can overwrite $@.
+ my $err = $@;
+ $self->{Base} = $old_base;
+ close($fh);
+ die $err if $err;
+
+ return unless defined wantarray;
+ return wantarray ? @ret : $ret;
+}
+
+# Bootstrap ExternEnt handler. On its first call it replaces itself on
+# the parser object in $_[0] with either the LWP-based handlers (when
+# XML/Parser/LWPExternEnt.pl loads) or the file-based handlers, then
+# jumps to the chosen handler with the original arguments.
+# A failed load is remembered in $LWP_load_failed and warned about once.
+sub initial_ext_ent_handler {
+
+ # This just bootstraps in the real lwp_ext_ent_handler which
+ # also loads the URI and LWP modules.
+
+ unless ($LWP_load_failed) {
+ my $stat = do {
+ no warnings;
+ eval { require('XML/Parser/LWPExternEnt.pl'); };
+ };
+
+ if ($stat) {
+ $_[0]->setHandlers(
+ ExternEnt => \&lwp_ext_ent_handler,
+ ExternEntFin => \&lwp_ext_ent_cleanup
+ );
+
+ # goto &sub reuses the current @_, so @_ must stay untouched above.
+ goto &lwp_ext_ent_handler;
+ }
+
+ # Failed to load lwp handler, act as if NoLWP
+
+ $LWP_load_failed = 1;
+
+ my $cmsg = "Couldn't load LWP based external entity handler\n" . "Switching to file-based external entity handler\n" . " (To avoid this message, use NoLWP option to XML::Parser)\n";
+ warn($cmsg);
+ }
+
+ $_[0]->setHandlers(
+ ExternEnt => \&file_ext_ent_handler,
+ ExternEntFin => \&file_ext_ent_cleanup
+ );
+ goto &file_ext_ent_handler;
+
+}
+
+# File-based ExternEnt handler. Arguments: parser object, base, system ID.
+#
+# Returns an open IO::File handle for the entity, or undef after
+# appending an explanation to $xp->{ErrorMessage}. On success the
+# previous base and the handle are pushed on _BaseStack / _FhStack (popped
+# again by file_ext_ent_cleanup) and the parser's base becomes the
+# resolved path.
+sub file_ext_ent_handler {
+ my ( $xp, $base, $path ) = @_;
+
+ # Prepend base only for relative paths
+
+ if ( defined($base)
+ and not( $path =~ m!^(?:[\\/]|\w+:)! ) ) {
+ my $newpath = $base;
+ # Replace the last component of the base with the relative path.
+ $newpath =~ s![^\\/:]*$!$path!;
+ $path = $newpath;
+ }
+
+ # IO::File->new is called with the path alone below, so reject paths
+ # that begin with |, > or + or end with |.
+ if ( $path =~ /^\s*[|>+]/
+ or $path =~ /\|\s*$/ ) {
+ $xp->{ErrorMessage} .= "System ID ($path) contains Perl IO control characters";
+ return undef;
+ }
+
+ require IO::File;
+ my $fh = IO::File->new($path);
+ unless ( defined $fh ) {
+ $xp->{ErrorMessage} .= "Failed to open $path:\n$!";
+ return undef;
+ }
+
+ $xp->{_BaseStack} ||= [];
+ $xp->{_FhStack} ||= [];
+
+ push( @{ $xp->{_BaseStack} }, $base );
+ push( @{ $xp->{_FhStack} }, $fh );
+
+ $xp->base($path);
+
+ return $fh;
+}
+
+# ExternEntFin counterpart of file_ext_ent_handler: closes the most
+# recently pushed entity handle and restores the base that was saved
+# with it.
+sub file_ext_ent_cleanup {
+    my ($xp) = @_;
+
+    pop( @{ $xp->{_FhStack} } )->close;
+
+    $xp->base( pop( @{ $xp->{_BaseStack} } ) );
+}
+
+1;
+
+__END__
+
+=for markdown [](https://github.com/cpan-authors/XML-Parser/actions/workflows/testsuite.yml)
+
+=head1 NAME
+
+XML::Parser - A perl module for parsing XML documents
+
+=head1 SYNOPSIS
+
+ use XML::Parser;
+
+ $p1 = XML::Parser->new(Style => 'Debug');
+ $p1->parsefile('REC-xml-19980210.xml');
+ $p1->parse('<foo id="me">Hello World</foo>');
+
+ # Alternative
+ $p2 = XML::Parser->new(Handlers => {Start => \&handle_start,
+ End => \&handle_end,
+ Char => \&handle_char});
+ $p2->parse($socket);
+
+ # Another alternative
+ $p3 = XML::Parser->new(ErrorContext => 2);
+
+ $p3->setHandlers(Char => \&text,
+ Default => \&other);
+
+ open(my $fh, 'xmlgenerator |');
+ $p3->parse($fh, ProtocolEncoding => 'ISO-8859-1');
+ close($fh);
+
+ $p3->parsefile('junk.xml', ErrorContext => 3);
+
+=begin man
+.ds PI
+
+=end man
+
+=head1 DESCRIPTION
+
+This module provides ways to parse XML documents. It is built on top of
+L<XML::Parser::Expat>, which is a lower level interface to James Clark's
+expat library. Each call to one of the parsing methods creates a new
+instance of XML::Parser::Expat which is then used to parse the document.
+Expat options may be provided when the XML::Parser object is created.
+These options are then passed on to the Expat object on each parse call.
+They can also be given as extra arguments to the parse methods, in which
+case they override options given at XML::Parser creation time.
+
+The behavior of the parser is controlled either by C<L</STYLES>> and/or
+C<L</HANDLERS>> options, or by L</setHandlers> method. These all provide
+mechanisms for XML::Parser to set the handlers needed by XML::Parser::Expat.
+If neither C