From 3619568ab5476d69749d18abf13b50c3bffc57e8 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Tue, 21 Apr 2026 10:35:32 +0200 Subject: [PATCH 1/3] fix(interpreter): raise register cap so big eval STRINGs compile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interpreter bytecode compiler hard-capped registers at 65535 with a stale comment claiming registers were 16-bit. In reality the bytecode stream is int[] and the per-call register array is allocated exactly sized to maxRegisterEverUsed+1, so nothing structural limits registers to 65535 — it was a safety check left over from an older encoding. Large eval STRINGs (notably CPAN CHECKSUMS files read via Safe->reval(), which can be 50K+ lines and one giant hash literal) legitimately need ~200K registers because registers are only recycled between statements, not within a single expression. Raise the cap to 16M (keeps a sanity bound against runaway allocations: the register array would still fit in ~128MB in the worst case) and fix the stale "16-bit" comment on emitReg. Repro: use Safe; my $s = Safe->new; my $code = "{" . join("", map { qq["k$_" => { a=>1,b=>2,c=>3,d=>4,e=>5,f=>6 },] } 1..5000) . "}"; $s->reval($code); Before: "Too many registers: exceeded 65535 register limit" After: ok This unblocks `./jcpan -t <Module>` for any distribution whose author has a large CHECKSUMS file (PERLANCAR, etc.). 
Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../backend/bytecode/BytecodeCompiler.java | 19 ++++++++++++++----- .../org/perlonjava/core/Configuration.java | 4 ++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java index 094d841da..05135dfd7 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java @@ -4446,11 +4446,20 @@ public void visit(OperatorNode node) { CompileOperator.visitOperator(this, node); } + // Sanity cap on register count. The backing storage is fully dynamic — + // bytecode is an ArrayList during compile, and the runtime register + // array is allocated once per call as new RuntimeBase[maxRegisterEverUsed+1]. + // This cap only exists to catch pathological / runaway allocations; at 16M + // registers a single call frame would already need ~128MB for the register + // array, which is plenty of headroom for real code (e.g. CPAN CHECKSUMS + // files eval'd by Safe->reval, which can legitimately need 200K+ registers). + private static final int REGISTER_LIMIT = 16 * 1024 * 1024; + int allocateRegister() { int reg = nextRegister++; - if (reg > 65535) { - throwCompilerException("Too many registers: exceeded 65535 register limit. " + - "Consider breaking this code into smaller subroutines."); + if (reg > REGISTER_LIMIT) { + throwCompilerException("Too many registers: exceeded " + REGISTER_LIMIT + + " register limit. Consider breaking this code into smaller subroutines."); } // Track the highest register ever used for array sizing if (reg > maxRegisterEverUsed) { @@ -4665,8 +4674,8 @@ private void emitShort(int value) { } /** - * Emit a register index as a short value. 
- * Registers are now 16-bit (0-65535) instead of 8-bit (0-255). + * Emit a register index. Registers are ints (full 32-bit range, bounded by + * REGISTER_LIMIT) stored in one bytecode slot since the stream is int[]. */ void emitReg(int register) { bytecode.add(register); diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 6acefdaf7..fc68534b4 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "774dfd55f"; + public static final String gitCommitId = "d890c966c"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String buildTimestamp = "Apr 21 2026 10:24:56"; + public static final String buildTimestamp = "Apr 21 2026 10:32:18"; // Prevent instantiation private Configuration() { From 57b3c2940e11d1baa079f230e6dbc35bf03a3aa2 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Tue, 21 Apr 2026 10:43:46 +0200 Subject: [PATCH 2/3] fix(regex): use fold-safe placeholder marker for unsupported (?{..}) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Non-constant (?{ CODE }) blocks are replaced by the string-interpolation parser with a placeholder that RegexPreprocessor detects and reports as "not implemented" (or warns under JPERL_UNIMPLEMENTED=warn). 
The old placeholder was spelled "UNIMPLEMENTED_CODE_BLOCK" — that trailing 'K' participates in the preprocessor's /i single-char fold expansion (K ↔ k ↔ Kelvin sign U+212A), which rewrites the marker's final letter into (?:\Qk\E|\QK\E|\QK\E) *before* the marker-detection check runs. Under /i the detection therefore silently failed and the pattern containing the garbled placeholder was compiled as-is, producing no diagnostic and either failing to match or matching unrelated input. This also introduces a shared RegexMarkers class so StringSegmentParser and RegexPreprocessor cannot drift out of sync on the exact spelling of the placeholder. The (??{...}) recursive-pattern placeholder is moved into the same shared class for consistency (its spelling contains no fold-affected letters and is unchanged), and its handling is deliberately unchanged: the existing (??{...}) code path reduces non-constant bodies to an empty non-capturing group, a soft fallback that existing tests and CPAN modules rely on. Before (with /i): qr{(?{ $x })wk}ix silently compiled a broken pattern. After: "(?{...}) code blocks in regex not implemented" regardless of flags, matching the pre-existing non-/i behavior. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../org/perlonjava/core/Configuration.java | 4 +- .../frontend/parser/StringSegmentParser.java | 7 +-- .../runtime/regex/RegexMarkers.java | 46 +++++++++++++++++++ .../runtime/regex/RegexPreprocessor.java | 2 +- 4 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index fc68534b4..01a4d317f 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. 
* DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "d890c966c"; + public static final String gitCommitId = "ecabff69c"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String buildTimestamp = "Apr 21 2026 10:32:18"; + public static final String buildTimestamp = "Apr 21 2026 10:57:46"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java b/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java index c3313aaa7..ba7a6396b 100644 --- a/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java +++ b/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java @@ -8,6 +8,7 @@ import org.perlonjava.frontend.lexer.LexerToken; import org.perlonjava.frontend.lexer.LexerTokenType; import org.perlonjava.runtime.regex.CaptureNameEncoder; +import org.perlonjava.runtime.regex.RegexMarkers; import org.perlonjava.runtime.regex.UnicodeResolver; import org.perlonjava.runtime.runtimetypes.PerlCompilerException; import org.perlonjava.runtime.runtimetypes.RuntimeScalar; @@ -799,7 +800,7 @@ private void parseRegexCodeBlock(boolean isRecursive) { if (captureName == null) { // Encoding failed (e.g., name too long) - use fallback - segments.add(new StringNode("(?{UNIMPLEMENTED_CODE_BLOCK})", savedTokenIndex)); + segments.add(new StringNode(RegexMarkers.CODE_BLOCK, savedTokenIndex)); } else { // Encoding succeeded - create capture group StringNode captureNode = new StringNode("(?<" + captureName + ">)", savedTokenIndex); @@ -809,9 +810,9 @@ private void parseRegexCodeBlock(boolean isRecursive) { } else { // Not a constant - use unimplemented marker if (isRecursive) { - segments.add(new 
StringNode("(??{UNIMPLEMENTED_RECURSIVE_PATTERN})", savedTokenIndex)); + segments.add(new StringNode(RegexMarkers.RECURSIVE_PATTERN, savedTokenIndex)); } else { - segments.add(new StringNode("(?{UNIMPLEMENTED_CODE_BLOCK})", savedTokenIndex)); + segments.add(new StringNode(RegexMarkers.CODE_BLOCK, savedTokenIndex)); } } } diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java b/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java new file mode 100644 index 000000000..b73e3f37d --- /dev/null +++ b/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java @@ -0,0 +1,46 @@ +package org.perlonjava.runtime.regex; + +/** + * Shared placeholder markers used by the string-interpolation parser to + * stand in for regex constructs that PerlOnJava cannot compile literally + * (because they require features unsupported by the underlying Java regex + * engine — e.g. arbitrary {@code (?{ CODE })} code blocks and + * {@code (??{ CODE })} recursive/dynamic patterns). + * + *

The markers are emitted by {@code StringSegmentParser} when a code + * block can't be constant-folded. {@link RegexPreprocessor} detects the + * {@link #CODE_BLOCK} marker and reports a clean "not implemented" error + * (or warning under {@code JPERL_UNIMPLEMENTED=warn}). The + * {@link #RECURSIVE_PATTERN} marker is handled by the generic + * {@code (??{ ... })} code path, which for non-constant bodies reduces + * them to an empty non-capturing group (a deliberate soft-fallback many + * existing tests and modules rely on). + * + *

Why these specific spellings? The preprocessor performs some + * {@code /i} case-fold expansions (notably for {@code K}↔{@code k}↔ + * Kelvin sign U+212A, {@code µ}↔U+00B5↔U+03BC, and {@code Å}↔ + * U+212B↔{@code å}) by rewriting matching code points into alternations. + * If the marker contained any of these "problem" letters it would be + * silently rewritten under {@code /i}, bypassing the detection check and + * leaving a garbled placeholder embedded in the compiled pattern (observed + * bug: {@code (?{UNIMPLEMENTED_CODE_BLOC(?:\QK\E|\Qk\E|\QK\E)})}). Keeping + * the markers free of {@code k}, {@code K}, {@code µ}, {@code å} (and + * their Unicode counterparts) guarantees the detection check always + * matches regardless of flags. + */ +public final class RegexMarkers { + /** + * Marker for a {@code (?{ CODE })} code block that could not be + * constant-folded at parse time. Contains no fold-affected letters. + */ + public static final String CODE_BLOCK = "(?{UNIMPLEMENTED_CODE_BLOC})"; + + /** + * Marker for a {@code (??{ CODE })} recursive/dynamic pattern that + * could not be constant-folded at parse time. Contains no + * fold-affected letters. + */ + public static final String RECURSIVE_PATTERN = "(??{UNIMPLEMENTED_RECURSIVE_PATTERN})"; + + private RegexMarkers() {} +} diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java index 035d8332f..2352e47de 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java @@ -1025,7 +1025,7 @@ private static int handleParentheses(String s, int offset, int length, StringBui regexUnimplemented(s, offset + 3, "Sequence (?@...) 
not implemented"); } else if (c3 == '{') { // Check if this is our special unimplemented marker - if (s.startsWith("(?{UNIMPLEMENTED_CODE_BLOCK})", offset)) { + if (s.startsWith(RegexMarkers.CODE_BLOCK, offset)) { regexUnimplemented(s, offset + 2, "(?{...}) code blocks in regex not implemented"); } // Handle (?{ ... }) code blocks - try constant folding From 75db071a06c2f01cdca1d35c3f72ab5ad23eb3bd Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Tue, 21 Apr 2026 11:07:38 +0200 Subject: [PATCH 3/3] fix(regex): warn-and-continue for unsupported (??{..}), don't silently hide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the (?{...}) fold-safe marker fix. (??{ CODE }) recursive/dynamic patterns are not supported by the underlying Java regex engine. Previously the preprocessor silently substituted an empty non-capturing group for non-constant bodies, producing wrong match semantics with no diagnostic whatsoever. That silent fallback was masking real problems: tests and CPAN modules that used (??{...}) compiled "successfully" and sometimes matched by coincidence (because an empty pattern happens to match at position 0 or because the rest of the regex was what actually mattered), giving the false impression that the feature worked. JPERL_UNIMPLEMENTED=warn exists so test runs can keep going past unsupported features — it is NOT meant to hide problems. Bring (??{...}) in line with that principle: - Under default die mode: throw a clean "(??{...}) recursive/dynamic regex patterns not implemented" error, same as (?{...}). - Under JPERL_UNIMPLEMENTED=warn: emit the warning *and* fall through to the existing non-constant (??{...}) code path (which appends "(?:" for a soft empty-group fallback). The user sees a diagnostic but the surrounding pattern still compiles and test harnesses don't abort mid-run. 
Implementation: new regexUnimplementedSoft() helper that picks the die-vs-warn behavior based on $ENV{JPERL_UNIMPLEMENTED}. Unlike the RuntimeRegex-level catch (which replaces the whole pattern with a never-matching sentinel), this helper warns directly and lets the caller emit its fallback, so only the unsupported construct is degraded — the rest of the pattern keeps its semantics. Tests in perl5_t/t/re/reg_eval.t and re/regexp.t that previously "passed" only because the silent fallback happened to give the same match result will now correctly report as failing, surfacing the real state of (??{...}) support. That is the intended outcome. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../org/perlonjava/core/Configuration.java | 4 +- .../runtime/regex/RegexMarkers.java | 19 ++++--- .../runtime/regex/RegexPreprocessor.java | 49 +++++++++++++++++++ 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 01a4d317f..e707a84bc 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "ecabff69c"; + public static final String gitCommitId = "57b3c2940"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. 
*/ - public static final String buildTimestamp = "Apr 21 2026 10:57:46"; + public static final String buildTimestamp = "Apr 21 2026 11:07:07"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java b/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java index b73e3f37d..d9c051c9e 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java @@ -8,13 +8,18 @@ * {@code (??{ CODE })} recursive/dynamic patterns). * *

The markers are emitted by {@code StringSegmentParser} when a code - * block can't be constant-folded. {@link RegexPreprocessor} detects the - * {@link #CODE_BLOCK} marker and reports a clean "not implemented" error - * (or warning under {@code JPERL_UNIMPLEMENTED=warn}). The - * {@link #RECURSIVE_PATTERN} marker is handled by the generic - * {@code (??{ ... })} code path, which for non-constant bodies reduces - * them to an empty non-capturing group (a deliberate soft-fallback many - * existing tests and modules rely on). + * block can't be constant-folded. {@link RegexPreprocessor} detects them + * and reports "not implemented": + *

    + *
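  • {@link #CODE_BLOCK} — a hard error in the preprocessor (under {@code JPERL_UNIMPLEMENTED=warn} the RuntimeRegex-level handler reports it as a warning and replaces the whole pattern with a never-matching sentinel); the surrounding regex can't + * usefully run without the code block.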
  • + *
  • {@link #RECURSIVE_PATTERN} — a hard error under default die mode, + * or a warning under {@code JPERL_UNIMPLEMENTED=warn} followed by + * the soft {@code (?:} fallback so the surrounding pattern still + * compiles (many CPAN modules build dynamic patterns that happen + * to work with the empty-group fallback; under warn mode we want + * tests to continue but the user must see a diagnostic).
  • + *
* *

Why these specific spellings? The preprocessor performs some * {@code /i} case-fold expansions (notably for {@code K}↔{@code k}↔ diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java index 2352e47de..eaf15a371 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java @@ -1,8 +1,11 @@ package org.perlonjava.runtime.regex; import com.ibm.icu.lang.UCharacter; +import org.perlonjava.runtime.operators.WarnDie; +import org.perlonjava.runtime.runtimetypes.GlobalVariable; import org.perlonjava.runtime.runtimetypes.PerlCompilerException; import org.perlonjava.runtime.runtimetypes.PerlJavaUnimplementedException; +import org.perlonjava.runtime.runtimetypes.RuntimeScalar; import java.util.LinkedHashSet; import java.util.Map; @@ -1031,6 +1034,15 @@ private static int handleParentheses(String s, int offset, int length, StringBui // Handle (?{ ... }) code blocks - try constant folding offset = handleCodeBlock(s, offset, length, sb, regexFlags); } else if (c3 == '?' && c4 == '{') { + // Check if this is the unimplemented marker for (??{...}). + // Under JPERL_UNIMPLEMENTED=warn, warn and fall through to the + // existing non-constant handling (which appends "(?:"); under + // die mode, abort with a clean diagnostic. Either way the user + // sees the issue — silent substitution would be a lie. + if (s.startsWith(RegexMarkers.RECURSIVE_PATTERN, offset)) { + regexUnimplementedSoft(s, offset + 3, + "(??{...}) recursive/dynamic regex patterns not implemented"); + } // Handle (??{ ... 
}) recursive/dynamic regex patterns // These insert a regex pattern at runtime based on code execution @@ -1559,6 +1571,43 @@ static void regexUnimplemented(String s, int offset, String errMsg) { before + marker + after + "/"); } + /** + * Soft variant of {@link #regexUnimplemented}: under + * {@code JPERL_UNIMPLEMENTED=warn} the caller emits a best-effort fallback + * construct (e.g. an empty non-capturing group) and this method warns so + * the user still sees the issue, letting compilation of the surrounding + * pattern continue. Under the default die mode it behaves identically to + * {@code regexUnimplemented} and throws, aborting the regex compile. + * + *

The intent of {@code JPERL_UNIMPLEMENTED=warn} is to let test runs + * keep going past unsupported features, not to silently hide them. Using + * this helper (instead of silent fallback) ensures the user always sees a + * diagnostic. + */ + static void regexUnimplementedSoft(String s, int offset, String errMsg) { + if (!isUnimplementedWarnMode()) { + regexUnimplemented(s, offset, errMsg); + return; + } + + if (offset > s.length()) { + offset = s.length(); + } + String before = s.substring(0, offset); + String after = s.substring(offset); + String marker = after.isEmpty() ? " <-- HERE" : " <-- HERE "; + + String message = errMsg + " in regex; marked by <-- HERE in m/" + + before + marker + after + "/\n"; + WarnDie.warn(new RuntimeScalar(message), new RuntimeScalar()); + } + + private static boolean isUnimplementedWarnMode() { + return "warn".equals( + GlobalVariable.getGlobalHash("main::ENV") + .get("JPERL_UNIMPLEMENTED").toString()); + } + /** * Calculates the maximum length a pattern can match. * Returns -1 if the pattern can match unlimited length.