diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java index 094d841da..05135dfd7 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java @@ -4446,11 +4446,20 @@ public void visit(OperatorNode node) { CompileOperator.visitOperator(this, node); } + // Sanity cap on register count. The backing storage is fully dynamic — + // bytecode is an ArrayList during compile, and the runtime register + // array is allocated once per call as new RuntimeBase[maxRegisterEverUsed+1]. + // This cap only exists to catch pathological / runaway allocations; at 16M + // registers a single call frame would already need ~128MB for the register + // array, which is plenty of headroom for real code (e.g. CPAN CHECKSUMS + // files eval'd by Safe->reval, which can legitimately need 200K+ registers). + private static final int REGISTER_LIMIT = 16 * 1024 * 1024; + int allocateRegister() { int reg = nextRegister++; - if (reg > 65535) { - throwCompilerException("Too many registers: exceeded 65535 register limit. " + - "Consider breaking this code into smaller subroutines."); + if (reg > REGISTER_LIMIT) { + throwCompilerException("Too many registers: exceeded " + REGISTER_LIMIT + + " register limit. Consider breaking this code into smaller subroutines."); } // Track the highest register ever used for array sizing if (reg > maxRegisterEverUsed) { @@ -4665,8 +4674,8 @@ private void emitShort(int value) { } /** - * Emit a register index as a short value. - * Registers are now 16-bit (0-65535) instead of 8-bit (0-255). + * Emit a register index. Registers are ints (full 32-bit range, bounded by + * REGISTER_LIMIT) stored in one bytecode slot since the stream is int[]. 
*/ void emitReg(int register) { bytecode.add(register); diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 6acefdaf7..e707a84bc 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "774dfd55f"; + public static final String gitCommitId = "57b3c2940"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String buildTimestamp = "Apr 21 2026 10:24:56"; + public static final String buildTimestamp = "Apr 21 2026 11:07:07"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java b/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java index c3313aaa7..ba7a6396b 100644 --- a/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java +++ b/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java @@ -8,6 +8,7 @@ import org.perlonjava.frontend.lexer.LexerToken; import org.perlonjava.frontend.lexer.LexerTokenType; import org.perlonjava.runtime.regex.CaptureNameEncoder; +import org.perlonjava.runtime.regex.RegexMarkers; import org.perlonjava.runtime.regex.UnicodeResolver; import org.perlonjava.runtime.runtimetypes.PerlCompilerException; import org.perlonjava.runtime.runtimetypes.RuntimeScalar; @@ -799,7 +800,7 @@ private void parseRegexCodeBlock(boolean isRecursive) { if (captureName == null) { // Encoding failed (e.g., name too long) - use fallback - segments.add(new 
StringNode("(?{UNIMPLEMENTED_CODE_BLOCK})", savedTokenIndex)); + segments.add(new StringNode(RegexMarkers.CODE_BLOCK, savedTokenIndex)); } else { // Encoding succeeded - create capture group StringNode captureNode = new StringNode("(?<" + captureName + ">)", savedTokenIndex); @@ -809,9 +810,9 @@ private void parseRegexCodeBlock(boolean isRecursive) { } else { // Not a constant - use unimplemented marker if (isRecursive) { - segments.add(new StringNode("(??{UNIMPLEMENTED_RECURSIVE_PATTERN})", savedTokenIndex)); + segments.add(new StringNode(RegexMarkers.RECURSIVE_PATTERN, savedTokenIndex)); } else { - segments.add(new StringNode("(?{UNIMPLEMENTED_CODE_BLOCK})", savedTokenIndex)); + segments.add(new StringNode(RegexMarkers.CODE_BLOCK, savedTokenIndex)); } } } diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java b/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java new file mode 100644 index 000000000..d9c051c9e --- /dev/null +++ b/src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java @@ -0,0 +1,51 @@ +package org.perlonjava.runtime.regex; + +/** + * Shared placeholder markers used by the string-interpolation parser to + * stand in for regex constructs that PerlOnJava cannot compile literally + * (because they require features unsupported by the underlying Java regex + * engine — e.g. arbitrary {@code (?{ CODE })} code blocks and + * {@code (??{ CODE })} recursive/dynamic patterns). + * + *

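<p>The markers are emitted by {@code StringSegmentParser} when a code + * block can't be constant-folded. {@link RegexPreprocessor} detects them + * and reports "not implemented": + * <ul> + * <li>{@link #CODE_BLOCK} — "(?{...}) code blocks in regex not implemented"</li> + * <li>{@link #RECURSIVE_PATTERN} — "(??{...}) recursive/dynamic regex patterns not implemented"</li> + * </ul> + * + * <p>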
Why these specific spellings? The preprocessor performs some + * {@code /i} case-fold expansions (notably for {@code K}↔{@code k}↔ + * Kelvin sign U+212A, {@code µ}↔U+00B5↔U+03BC, and {@code Å}↔ + * U+212B↔{@code å}) by rewriting matching code points into alternations. + * If the marker contained any of these "problem" letters it would be + * silently rewritten under {@code /i}, bypassing the detection check and + * leaving a garbled placeholder embedded in the compiled pattern (observed + * bug: {@code (?{UNIMPLEMENTED_CODE_BLOC(?:\QK\E|\Qk\E|\QK\E)})}). Keeping + * the markers free of {@code k}, {@code K}, {@code µ}, {@code å} (and + * their Unicode counterparts) guarantees the detection check always + * matches regardless of flags. + */ +public final class RegexMarkers { + /** + * Marker for a {@code (?{ CODE })} code block that could not be + * constant-folded at parse time. Contains no fold-affected letters. + */ + public static final String CODE_BLOCK = "(?{UNIMPLEMENTED_CODE_BLOC})"; + + /** + * Marker for a {@code (??{ CODE })} recursive/dynamic pattern that + * could not be constant-folded at parse time. Contains no + * fold-affected letters. 
+ */ + public static final String RECURSIVE_PATTERN = "(??{UNIMPLEMENTED_RECURSIVE_PATTERN})"; + + private RegexMarkers() {} +} diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java index 035d8332f..eaf15a371 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java @@ -1,8 +1,11 @@ package org.perlonjava.runtime.regex; import com.ibm.icu.lang.UCharacter; +import org.perlonjava.runtime.operators.WarnDie; +import org.perlonjava.runtime.runtimetypes.GlobalVariable; import org.perlonjava.runtime.runtimetypes.PerlCompilerException; import org.perlonjava.runtime.runtimetypes.PerlJavaUnimplementedException; +import org.perlonjava.runtime.runtimetypes.RuntimeScalar; import java.util.LinkedHashSet; import java.util.Map; @@ -1025,12 +1028,21 @@ private static int handleParentheses(String s, int offset, int length, StringBui regexUnimplemented(s, offset + 3, "Sequence (?@...) not implemented"); } else if (c3 == '{') { // Check if this is our special unimplemented marker - if (s.startsWith("(?{UNIMPLEMENTED_CODE_BLOCK})", offset)) { + if (s.startsWith(RegexMarkers.CODE_BLOCK, offset)) { regexUnimplemented(s, offset + 2, "(?{...}) code blocks in regex not implemented"); } // Handle (?{ ... }) code blocks - try constant folding offset = handleCodeBlock(s, offset, length, sb, regexFlags); } else if (c3 == '?' && c4 == '{') { + // Check if this is the unimplemented marker for (??{...}). + // Under JPERL_UNIMPLEMENTED=warn, warn and fall through to the + // existing non-constant handling (which appends "(?:"); under + // die mode, abort with a clean diagnostic. Either way the user + // sees the issue — silent substitution would be a lie. 
+ if (s.startsWith(RegexMarkers.RECURSIVE_PATTERN, offset)) { + regexUnimplementedSoft(s, offset + 3, + "(??{...}) recursive/dynamic regex patterns not implemented"); + } // Handle (??{ ... }) recursive/dynamic regex patterns // These insert a regex pattern at runtime based on code execution @@ -1559,6 +1571,43 @@ static void regexUnimplemented(String s, int offset, String errMsg) { before + marker + after + "/"); } + /** + * Soft variant of {@link #regexUnimplemented}: under + * {@code JPERL_UNIMPLEMENTED=warn} the caller emits a best-effort fallback + * construct (e.g. an empty non-capturing group) and this method warns so + * the user still sees the issue, letting compilation of the surrounding + * pattern continue. Under the default die mode it behaves identically to + * {@code regexUnimplemented} and throws, aborting the regex compile. + * + *
<p>
The intent of {@code JPERL_UNIMPLEMENTED=warn} is to let test runs + * keep going past unsupported features, not to silently hide them. Using + * this helper (instead of silent fallback) ensures the user always sees a + * diagnostic. + */ + static void regexUnimplementedSoft(String s, int offset, String errMsg) { + if (!isUnimplementedWarnMode()) { + regexUnimplemented(s, offset, errMsg); + return; + } + + if (offset > s.length()) { + offset = s.length(); + } + String before = s.substring(0, offset); + String after = s.substring(offset); + String marker = after.isEmpty() ? " <-- HERE" : " <-- HERE "; + + String message = errMsg + " in regex; marked by <-- HERE in m/" + + before + marker + after + "/\n"; + WarnDie.warn(new RuntimeScalar(message), new RuntimeScalar()); + } + + private static boolean isUnimplementedWarnMode() { + return "warn".equals( + GlobalVariable.getGlobalHash("main::ENV") + .get("JPERL_UNIMPLEMENTED").toString()); + } + /** * Calculates the maximum length a pattern can match. * Returns -1 if the pattern can match unlimited length.