Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4446,11 +4446,20 @@ public void visit(OperatorNode node) {
CompileOperator.visitOperator(this, node);
}

// Sanity cap on register indices: allocateRegister() reports a compiler error
// once a freshly allocated index is greater than this value.
//
// The backing storage itself is fully dynamic: bytecode is an
// ArrayList<Integer> during compile, and the runtime register array is
// allocated once per call as new RuntimeBase[maxRegisterEverUsed+1].
// The cap therefore only exists to catch pathological / runaway allocations.
// At 16M registers a single call frame would already need ~128MB for the
// register array (NOTE(review): assumes 8-byte references; confirm), which
// leaves plenty of headroom for real code (e.g. CPAN CHECKSUMS files eval'd
// by Safe->reval, which can legitimately need 200K+ registers).
private static final int REGISTER_LIMIT = 16 * 1024 * 1024;

int allocateRegister() {
int reg = nextRegister++;
if (reg > 65535) {
throwCompilerException("Too many registers: exceeded 65535 register limit. " +
"Consider breaking this code into smaller subroutines.");
if (reg > REGISTER_LIMIT) {
throwCompilerException("Too many registers: exceeded " + REGISTER_LIMIT +
" register limit. Consider breaking this code into smaller subroutines.");
}
// Track the highest register ever used for array sizing
if (reg > maxRegisterEverUsed) {
Expand Down Expand Up @@ -4665,8 +4674,8 @@ private void emitShort(int value) {
}

/**
* Emit a register index as a short value.
* Registers are now 16-bit (0-65535) instead of 8-bit (0-255).
* Emit a register index. Registers are ints (full 32-bit range, bounded by
* REGISTER_LIMIT) stored in one bytecode slot since the stream is int[].
*/
void emitReg(int register) {
bytecode.add(register);
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/perlonjava/core/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public final class Configuration {
* Automatically populated by Gradle/Maven during build.
* DO NOT EDIT MANUALLY - this value is replaced at build time.
*/
public static final String gitCommitId = "774dfd55f";
public static final String gitCommitId = "57b3c2940";

/**
* Git commit date of the build (ISO format: YYYY-MM-DD).
Expand All @@ -48,7 +48,7 @@ public final class Configuration {
* Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at"
* DO NOT EDIT MANUALLY - this value is replaced at build time.
*/
public static final String buildTimestamp = "Apr 21 2026 10:24:56";
public static final String buildTimestamp = "Apr 21 2026 11:07:07";

// Prevent instantiation
private Configuration() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.perlonjava.frontend.lexer.LexerToken;
import org.perlonjava.frontend.lexer.LexerTokenType;
import org.perlonjava.runtime.regex.CaptureNameEncoder;
import org.perlonjava.runtime.regex.RegexMarkers;
import org.perlonjava.runtime.regex.UnicodeResolver;
import org.perlonjava.runtime.runtimetypes.PerlCompilerException;
import org.perlonjava.runtime.runtimetypes.RuntimeScalar;
Expand Down Expand Up @@ -799,7 +800,7 @@ private void parseRegexCodeBlock(boolean isRecursive) {

if (captureName == null) {
// Encoding failed (e.g., name too long) - use fallback
segments.add(new StringNode("(?{UNIMPLEMENTED_CODE_BLOCK})", savedTokenIndex));
segments.add(new StringNode(RegexMarkers.CODE_BLOCK, savedTokenIndex));
} else {
// Encoding succeeded - create capture group
StringNode captureNode = new StringNode("(?<" + captureName + ">)", savedTokenIndex);
Expand All @@ -809,9 +810,9 @@ private void parseRegexCodeBlock(boolean isRecursive) {
} else {
// Not a constant - use unimplemented marker
if (isRecursive) {
segments.add(new StringNode("(??{UNIMPLEMENTED_RECURSIVE_PATTERN})", savedTokenIndex));
segments.add(new StringNode(RegexMarkers.RECURSIVE_PATTERN, savedTokenIndex));
} else {
segments.add(new StringNode("(?{UNIMPLEMENTED_CODE_BLOCK})", savedTokenIndex));
segments.add(new StringNode(RegexMarkers.CODE_BLOCK, savedTokenIndex));
}
}
}
Expand Down
51 changes: 51 additions & 0 deletions src/main/java/org/perlonjava/runtime/regex/RegexMarkers.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.perlonjava.runtime.regex;

/**
 * Placeholder strings shared between the string-interpolation parser and the
 * regex preprocessor.
 *
 * <p>Some Perl regex constructs cannot be compiled literally by PerlOnJava
 * because the underlying Java regex engine has no equivalent: arbitrary
 * {@code (?{ CODE })} code blocks and {@code (??{ CODE })} recursive/dynamic
 * patterns. When {@code StringSegmentParser} meets such a construct and
 * cannot constant-fold it, it emits one of the markers defined here instead.
 * {@link RegexPreprocessor} later recognises the marker and reports
 * "not implemented":
 * <ul>
 *   <li>{@link #CODE_BLOCK} is always a hard error, because the surrounding
 *       regex cannot usefully run without the code block.</li>
 *   <li>{@link #RECURSIVE_PATTERN} is a hard error under the default die
 *       mode. Under {@code JPERL_UNIMPLEMENTED=warn} it produces a warning
 *       and then the soft {@code (?:} fallback is used so the surrounding
 *       pattern still compiles. Many CPAN modules build dynamic patterns
 *       that happen to work with the empty-group fallback; in warn mode
 *       tests should keep running, but the user must still see a
 *       diagnostic.</li>
 * </ul>
 *
 * <p><b>Why the markers are spelled this way.</b> Under {@code /i} the
 * preprocessor expands certain case folds by rewriting the affected code
 * points into alternations. Notable groups are {@code K} / {@code k} /
 * Kelvin sign U+212A, micro sign U+00B5 / Greek mu U+03BC, and
 * A-with-ring / a-with-ring / Angstrom sign U+212B. A marker containing any
 * of those letters would itself be rewritten under {@code /i}; the detection
 * check would then no longer match, and a garbled placeholder would remain
 * in the compiled pattern. The bug that was actually observed looked like
 * {@code (?{UNIMPLEMENTED_CODE_BLOC(?:\QK\E|\Qk\E|\Q<U+212A>\E)})}.
 * Keeping every marker free of {@code k}, {@code K}, the micro sign,
 * a-with-ring and their Unicode counterparts guarantees that detection
 * works regardless of the regex flags in effect.
 */
public final class RegexMarkers {

    // Pure constants holder: instantiation is deliberately impossible.
    private RegexMarkers() {}

    /**
     * Stand-in for a {@code (??{ CODE })} recursive/dynamic pattern that
     * could not be constant-folded at parse time. The text contains none of
     * the fold-affected letters.
     */
    public static final String RECURSIVE_PATTERN = "(??{UNIMPLEMENTED_RECURSIVE_PATTERN})";

    /**
     * Stand-in for a {@code (?{ CODE })} code block that could not be
     * constant-folded at parse time. The trailing {@code K} of "BLOCK" is
     * intentionally absent so the text contains none of the fold-affected
     * letters.
     */
    public static final String CODE_BLOCK = "(?{UNIMPLEMENTED_CODE_BLOC})";
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package org.perlonjava.runtime.regex;

import com.ibm.icu.lang.UCharacter;
import org.perlonjava.runtime.operators.WarnDie;
import org.perlonjava.runtime.runtimetypes.GlobalVariable;
import org.perlonjava.runtime.runtimetypes.PerlCompilerException;
import org.perlonjava.runtime.runtimetypes.PerlJavaUnimplementedException;
import org.perlonjava.runtime.runtimetypes.RuntimeScalar;

import java.util.LinkedHashSet;
import java.util.Map;
Expand Down Expand Up @@ -1025,12 +1028,21 @@ private static int handleParentheses(String s, int offset, int length, StringBui
regexUnimplemented(s, offset + 3, "Sequence (?@...) not implemented");
} else if (c3 == '{') {
// Check if this is our special unimplemented marker
if (s.startsWith("(?{UNIMPLEMENTED_CODE_BLOCK})", offset)) {
if (s.startsWith(RegexMarkers.CODE_BLOCK, offset)) {
regexUnimplemented(s, offset + 2, "(?{...}) code blocks in regex not implemented");
}
// Handle (?{ ... }) code blocks - try constant folding
offset = handleCodeBlock(s, offset, length, sb, regexFlags);
} else if (c3 == '?' && c4 == '{') {
// Check if this is the unimplemented marker for (??{...}).
// Under JPERL_UNIMPLEMENTED=warn, warn and fall through to the
// existing non-constant handling (which appends "(?:"); under
// die mode, abort with a clean diagnostic. Either way the user
// sees the issue — silent substitution would be a lie.
if (s.startsWith(RegexMarkers.RECURSIVE_PATTERN, offset)) {
regexUnimplementedSoft(s, offset + 3,
"(??{...}) recursive/dynamic regex patterns not implemented");
}
// Handle (??{ ... }) recursive/dynamic regex patterns
// These insert a regex pattern at runtime based on code execution

Expand Down Expand Up @@ -1559,6 +1571,43 @@ static void regexUnimplemented(String s, int offset, String errMsg) {
before + marker + after + "/");
}

/**
 * Reports an unsupported regex construct without necessarily aborting the
 * regex compile.
 *
 * <p>When {@code JPERL_UNIMPLEMENTED=warn} is in effect, a warning carrying
 * the usual {@code <-- HERE} position marker is issued and the method
 * returns normally, so the caller can emit a best-effort fallback (for
 * example an empty non-capturing group) and continue compiling the rest of
 * the pattern. In every other mode the call is forwarded unchanged to
 * {@link #regexUnimplemented}, which aborts the compile by throwing.
 *
 * <p>Warn mode exists so that test runs can proceed past unsupported
 * features, not so that those features are hidden; routing fallbacks
 * through this helper guarantees the user always gets a diagnostic.
 *
 * @param s      the regex source text being preprocessed
 * @param offset position in {@code s} at which the {@code <-- HERE} marker
 *               is placed; values beyond the end of {@code s} are treated
 *               as the end of {@code s}
 * @param errMsg description of the unsupported construct, used as the start
 *               of the diagnostic
 */
static void regexUnimplementedSoft(String s, int offset, String errMsg) {
    if (isUnimplementedWarnMode()) {
        // Never point past the end of the pattern.
        int here = Math.min(offset, s.length());
        String head = s.substring(0, here);
        String tail = s.substring(here);

        // A trailing space separates the marker from any remaining pattern text.
        String hereMarker;
        if (tail.isEmpty()) {
            hereMarker = " <-- HERE";
        } else {
            hereMarker = " <-- HERE ";
        }

        StringBuilder text = new StringBuilder();
        text.append(errMsg)
            .append(" in regex; marked by <-- HERE in m/")
            .append(head)
            .append(hereMarker)
            .append(tail)
            .append("/\n");
        WarnDie.warn(new RuntimeScalar(text.toString()), new RuntimeScalar());
    } else {
        // Default (die) mode: identical behaviour to the hard variant.
        regexUnimplemented(s, offset, errMsg);
    }
}

/**
 * Tells whether the {@code JPERL_UNIMPLEMENTED} entry of the Perl-level
 * {@code %ENV} hash ({@code main::ENV}) currently stringifies to exactly
 * {@code "warn"}.
 *
 * @return {@code true} only for the exact value {@code "warn"}
 */
private static boolean isUnimplementedWarnMode() {
    String mode = GlobalVariable.getGlobalHash("main::ENV")
            .get("JPERL_UNIMPLEMENTED")
            .toString();
    return "warn".equals(mode);
}

/**
* Calculates the maximum length a pattern can match.
* Returns -1 if the pattern can match unlimited length.
Expand Down
Loading