From 5a6b1522a942e5988cc6c14ac4a7a2d160bd5013 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 28 Feb 2026 20:37:36 +0100 Subject: [PATCH 1/5] Store capture groups in lastCaptureGroups array instead of relying on globalMatcher.group() The old code read $1/$2/etc from globalMatcher.group() which could throw IllegalStateException after certain match sequences (caught silently in ScalarSpecialVariable). This was fragile and made capture persistence behavior implicit. Now captures are explicitly stored in a String[] array on each successful capturing match, cleared on successful non-capturing matches and failed capturing matches, and preserved across failed non-capturing matches. RegexState saves/restores the array for dynamic scoping. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin --- .../runtime/regex/RuntimeRegex.java | 40 ++++++++++++++----- .../runtime/runtimetypes/RegexState.java | 5 ++- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java index 48cf2a194..84d4b9920 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java +++ b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java @@ -56,6 +56,9 @@ protected boolean removeEldestEntry(Map.Entry eldest) { public static String lastSuccessfulMatchString = null; // ${^LAST_SUCCESSFUL_PATTERN} public static RuntimeRegex lastSuccessfulPattern = null; + // Capture groups from the last successful match that had captures. + // They persist across failed non-capturing matches; successful non-capturing and failed capturing matches clear them. 
+ public static String[] lastCaptureGroups = null; // Indicates if \G assertion is used private final boolean useGAssertion = false; // Compiled regex pattern @@ -462,12 +465,17 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc // Always initialize $1, $2, @+, @-, $`, $&, $' for every successful match globalMatcher = matcher; globalMatchString = inputStr; - // Store match information to avoid IllegalStateException later + if (captureCount > 0) { + lastCaptureGroups = new String[captureCount]; + for (int i = 0; i < captureCount; i++) { + lastCaptureGroups[i] = matcher.group(i + 1); + } + } else { + lastCaptureGroups = null; + } lastMatchedString = matcher.group(0); lastMatchStart = matcher.start(); lastMatchEnd = matcher.end(); - // System.err.println("DEBUG: Set globalMatcher for match at position " + matcher.start() + "-" + matcher.end()); - // System.err.println("DEBUG: Stored match info - matched: '" + lastMatchedString + "', start: " + lastMatchStart + ", end: " + lastMatchEnd); if (regex.regexFlags.isGlobalMatch() && captureCount < 1 && ctx == RuntimeContextType.LIST) { // Global match and no captures, in list context return the matched string @@ -546,6 +554,9 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc lastMatchedString = null; lastMatchStart = -1; lastMatchEnd = -1; + if (matcher.groupCount() > 0) { + lastCaptureGroups = null; + } } if (found) { @@ -715,7 +726,14 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar // Initialize $1, $2, @+, @- only when we have a match globalMatcher = matcher; globalMatchString = inputStr; - // Store match information + if (matcher.groupCount() > 0) { + lastCaptureGroups = new String[matcher.groupCount()]; + for (int i = 0; i < matcher.groupCount(); i++) { + lastCaptureGroups[i] = matcher.group(i + 1); + } + } else { + lastCaptureGroups = null; + } lastMatchedString = matcher.group(0); lastMatchStart = matcher.start(); 
lastMatchEnd = matcher.end(); @@ -812,6 +830,7 @@ public static void initialize() { lastSuccessfulMatchStart = -1; lastSuccessfulMatchEnd = -1; lastSuccessfulMatchString = null; + lastCaptureGroups = null; // Reset regex cache matched flags reset(); @@ -844,18 +863,21 @@ public static String postMatchString() { } public static String captureString(int group) { - if (globalMatcher == null || group < 0 || group > globalMatcher.groupCount()) { + if (group <= 0) { + if (globalMatcher == null) return null; + return globalMatcher.group(0); + } + if (lastCaptureGroups == null || group > lastCaptureGroups.length) { return null; } - return globalMatcher.group(group); + return lastCaptureGroups[group - 1]; } public static String lastCaptureString() { - if (globalMatcher == null) { + if (lastCaptureGroups == null || lastCaptureGroups.length == 0) { return null; } - int lastGroup = globalMatcher.groupCount(); - return globalMatcher.group(lastGroup); + return lastCaptureGroups[lastCaptureGroups.length - 1]; } public static RuntimeScalar matcherStart(int group) { diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RegexState.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RegexState.java index 83b1c95b5..d2bdcb392 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RegexState.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RegexState.java @@ -19,9 +19,9 @@ public class RegexState { public final int lastSuccessfulMatchEnd; public final String lastSuccessfulMatchString; public final RuntimeRegex lastSuccessfulPattern; + public final String[] lastCaptureGroups; public RegexState() { - // Save all the static fields from RuntimeRegex this.globalMatcher = RuntimeRegex.globalMatcher; this.globalMatchString = RuntimeRegex.globalMatchString; this.lastMatchedString = RuntimeRegex.lastMatchedString; @@ -32,10 +32,10 @@ public RegexState() { this.lastSuccessfulMatchEnd = RuntimeRegex.lastSuccessfulMatchEnd; this.lastSuccessfulMatchString = 
RuntimeRegex.lastSuccessfulMatchString; this.lastSuccessfulPattern = RuntimeRegex.lastSuccessfulPattern; + this.lastCaptureGroups = RuntimeRegex.lastCaptureGroups; } public void restore() { - // Restore all the static fields to RuntimeRegex RuntimeRegex.globalMatcher = this.globalMatcher; RuntimeRegex.globalMatchString = this.globalMatchString; RuntimeRegex.lastMatchedString = this.lastMatchedString; @@ -46,5 +46,6 @@ public void restore() { RuntimeRegex.lastSuccessfulMatchEnd = this.lastSuccessfulMatchEnd; RuntimeRegex.lastSuccessfulMatchString = this.lastSuccessfulMatchString; RuntimeRegex.lastSuccessfulPattern = this.lastSuccessfulPattern; + RuntimeRegex.lastCaptureGroups = this.lastCaptureGroups; } } From 9abb2320dfbd033744bf7f7877db40f15abef8ea Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 28 Feb 2026 20:56:27 +0100 Subject: [PATCH 2/5] Implement dynamic scoping for regex match variables across subroutine calls In Perl 5, $1/$2/etc are dynamically scoped - regex matches inside called functions don't affect the caller's capture variables. PerlOnJava stored these globally, so any function doing a regex match would clobber the caller's $1. Fix: save/restore RegexState around every subroutine call in both JVM compiled (RuntimeCode.apply) and interpreted (InterpretedCode.apply) code paths. Before restoring, materialize any ScalarSpecialVariable values in the return list so that "return $1" works correctly. This fixes ExifTool test 2 where GetTagTable's internal regex clobbered $1 before HandleTag could read it. 
Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin --- .../backend/bytecode/InterpretedCode.java | 28 ++++++----- .../runtime/runtimetypes/RuntimeCode.java | 46 +++++++++++++------ .../runtimetypes/ScalarSpecialVariable.java | 2 +- 3 files changed, 49 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java b/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java index a9352ffb3..6268f233a 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java +++ b/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java @@ -130,22 +130,26 @@ public InterpretedCode(int[] bytecode, Object[] constants, String[] stringPool, */ @Override public RuntimeList apply(RuntimeArray args, int callContext) { - // Dispatch to interpreter (not compiled bytecode) - return BytecodeInterpreter.execute(this, args, callContext); + RegexState savedRegexState = new RegexState(); + try { + RuntimeList result = BytecodeInterpreter.execute(this, args, callContext); + RuntimeCode.materializeSpecialVarsInResult(result); + return result; + } finally { + savedRegexState.restore(); + } } - /** - * Override RuntimeCode.apply() with subroutine name. 
- * - * @param subroutineName The subroutine name (for stack traces) - * @param args The arguments array (@_) - * @param callContext The calling context - * @return RuntimeList containing the result - */ @Override public RuntimeList apply(String subroutineName, RuntimeArray args, int callContext) { - // Dispatch to interpreter with subroutine name for stack traces - return BytecodeInterpreter.execute(this, args, callContext, subroutineName); + RegexState savedRegexState = new RegexState(); + try { + RuntimeList result = BytecodeInterpreter.execute(this, args, callContext, subroutineName); + RuntimeCode.materializeSpecialVarsInResult(result); + return result; + } finally { + savedRegexState.restore(); + } } /** diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java index 83623ab9e..f21064ca8 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java @@ -1717,20 +1717,22 @@ public boolean defined() { */ public RuntimeList apply(RuntimeArray a, int callContext) { if (constantValue != null) { - // Alternative way to create constants like: `$constant::{_CAN_PCS} = \$const` return new RuntimeList(constantValue); } + RegexState savedRegexState = new RegexState(); try { - // Wait for the compilerThread to finish if it exists if (this.compilerSupplier != null) { - this.compilerSupplier.get(); // Wait for the task to finish + this.compilerSupplier.get(); } + RuntimeList result; if (isStatic) { - return (RuntimeList) this.methodHandle.invoke(a, callContext); + result = (RuntimeList) this.methodHandle.invoke(a, callContext); } else { - return (RuntimeList) this.methodHandle.invoke(this.codeObject, a, callContext); + result = (RuntimeList) this.methodHandle.invoke(this.codeObject, a, callContext); } + materializeSpecialVarsInResult(result); + return result; } catch (NullPointerException e) { 
if (this.methodHandle == null) { @@ -1738,11 +1740,8 @@ public RuntimeList apply(RuntimeArray a, int callContext) { } else if (this.codeObject == null && !isStatic) { throw new PerlCompilerException("Subroutine exists but has null code object at "); } else { - // Original NPE from somewhere else throw new PerlCompilerException("Null pointer exception in subroutine call: " + e.getMessage() + " at "); } - - //throw new PerlCompilerException("Undefined subroutine called at "); } catch (InvocationTargetException e) { Throwable targetException = e.getTargetException(); if (!(targetException instanceof RuntimeException)) { @@ -1751,26 +1750,29 @@ public RuntimeList apply(RuntimeArray a, int callContext) { throw (RuntimeException) targetException; } catch (Throwable e) { throw new RuntimeException(e); + } finally { + savedRegexState.restore(); } } public RuntimeList apply(String subroutineName, RuntimeArray a, int callContext) { if (constantValue != null) { - // Alternative way to create constants like: `$constant::{_CAN_PCS} = \$const` return new RuntimeList(constantValue); } + RegexState savedRegexState = new RegexState(); try { - // Wait for the compilerThread to finish if it exists if (this.compilerSupplier != null) { - // System.out.println("Waiting for compiler thread to finish..."); - this.compilerSupplier.get(); // Wait for the task to finish + this.compilerSupplier.get(); } + RuntimeList result; if (isStatic) { - return (RuntimeList) this.methodHandle.invoke(a, callContext); + result = (RuntimeList) this.methodHandle.invoke(a, callContext); } else { - return (RuntimeList) this.methodHandle.invoke(this.codeObject, a, callContext); + result = (RuntimeList) this.methodHandle.invoke(this.codeObject, a, callContext); } + materializeSpecialVarsInResult(result); + return result; } catch (NullPointerException e) { throw new PerlCompilerException("Undefined subroutine &" + subroutineName + " called at "); } catch (InvocationTargetException e) { @@ -1781,6 +1783,22 @@ 
public RuntimeList apply(String subroutineName, RuntimeArray a, int callContext) throw (RuntimeException) targetException; } catch (Throwable e) { throw new RuntimeException(e); + } finally { + savedRegexState.restore(); + } + } + + public static void materializeSpecialVarsInResult(RuntimeList result) { + List elems = result.elements; + for (int i = 0; i < elems.size(); i++) { + RuntimeBase elem = elems.get(i); + if (elem instanceof ScalarSpecialVariable ssv) { + RuntimeScalar resolved = ssv.getValueAsScalar(); + RuntimeScalar concrete = new RuntimeScalar(); + concrete.type = resolved.type; + concrete.value = resolved.value; + elems.set(i, concrete); + } } } diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/ScalarSpecialVariable.java b/src/main/java/org/perlonjava/runtime/runtimetypes/ScalarSpecialVariable.java index cfea89ab4..be64a1ae6 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/ScalarSpecialVariable.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/ScalarSpecialVariable.java @@ -108,7 +108,7 @@ public RuntimeScalar addToScalar(RuntimeScalar var) { * * @return The RuntimeScalar value of the special variable, or null if not available. */ - private RuntimeScalar getValueAsScalar() { + RuntimeScalar getValueAsScalar() { try { RuntimeScalar result = switch (variableId) { case CAPTURE -> { From cf2142dfe8d1f6c1de51142423457756035d5c35 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 28 Feb 2026 21:28:28 +0100 Subject: [PATCH 3/5] Add block-level regex state scoping to match Perl 5 behavior In Perl 5, all blocks ({ }, if, while, for, do) scope $1/$2/etc. Previously PerlOnJava only scoped regex state at subroutine call boundaries. This adds save/restore at block level using a RegexUsageDetector visitor to only emit the overhead for blocks that actually contain regex operations. 
Changes: - New RegexUsageDetector: iterative AST walker that detects matchRegex, replaceRegex, =~, !~, split (stops at sub boundaries) - New SAVE_REGEX_STATE/RESTORE_REGEX_STATE bytecode opcodes - BytecodeCompiler.visit(BlockNode): emit save/restore when needed - EmitBlock.emitBlock(): JVM path save/restore for block bodies - EmitForeach: save/restore for inlined foreach loop bodies Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin --- .../backend/bytecode/BytecodeCompiler.java | 14 ++++ .../backend/bytecode/BytecodeInterpreter.java | 23 ++++++ .../perlonjava/backend/bytecode/Opcodes.java | 8 ++ .../org/perlonjava/backend/jvm/EmitBlock.java | 21 +++++- .../perlonjava/backend/jvm/EmitForeach.java | 17 +++++ .../frontend/analysis/RegexUsageDetector.java | 74 +++++++++++++++++++ 6 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/perlonjava/frontend/analysis/RegexUsageDetector.java diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java index 3d752fe18..e357f4266 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java @@ -1,5 +1,6 @@ package org.perlonjava.backend.bytecode; +import org.perlonjava.frontend.analysis.RegexUsageDetector; import org.perlonjava.frontend.analysis.Visitor; import org.perlonjava.backend.jvm.EmitterMethodCreator; import org.perlonjava.backend.jvm.EmitterContext; @@ -726,6 +727,14 @@ public void visit(BlockNode node) { && node.elements.get(0) instanceof OperatorNode localOp && localOp.operator.equals("local"); + int regexStateReg = -1; + if (!(node instanceof AbstractNode an && an.getBooleanAnnotation("blockIsSubroutine")) + && RegexUsageDetector.containsRegexOperation(node)) { + regexStateReg = allocateRegister(); + emit(Opcodes.SAVE_REGEX_STATE); + emitReg(regexStateReg); + } + // If the first 
statement is a scoped package (package Foo { }), // save the DynamicVariableManager level before the block body so PUSH_PACKAGE is restored. int scopedPackageLevelReg = -1; @@ -790,6 +799,11 @@ public void visit(BlockNode node) { emitReg(scopedPackageLevelReg); } + if (regexStateReg >= 0) { + emit(Opcodes.RESTORE_REGEX_STATE); + emitReg(regexStateReg); + } + // Set lastResultReg to the outer register (or -1 if VOID context) lastResultReg = outerResultReg; } diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java index 17d020b5b..fa05e2fd0 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java @@ -75,6 +75,8 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c java.util.Stack labeledBlockStack = new java.util.Stack<>(); // Each entry is [labelStringPoolIdx, exitPc] + java.util.ArrayList regexStateStack = null; + try { outer: while (true) { @@ -1589,6 +1591,27 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c break; } + case Opcodes.SAVE_REGEX_STATE: { + int rd = bytecode[pc++]; + if (regexStateStack == null) regexStateStack = new java.util.ArrayList<>(); + int level = regexStateStack.size(); + regexStateStack.add(new RegexState()); + registers[rd] = new RuntimeScalar(level); + break; + } + + case Opcodes.RESTORE_REGEX_STATE: { + int rs = bytecode[pc++]; + int level = ((RuntimeScalar) registers[rs]).getInt(); + if (regexStateStack != null && level < regexStateStack.size()) { + regexStateStack.get(level).restore(); + while (regexStateStack.size() > level) { + regexStateStack.remove(regexStateStack.size() - 1); + } + } + break; + } + // ================================================================= // LIST OPERATIONS // ================================================================= diff --git 
a/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java b/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java index 83b1f0062..415402b44 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java +++ b/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java @@ -1173,5 +1173,13 @@ public class Opcodes { * Format: POP_LABELED_BLOCK */ public static final short POP_LABELED_BLOCK = 355; + /** Save regex state into register rd. + * Format: SAVE_REGEX_STATE rd */ + public static final short SAVE_REGEX_STATE = 356; + + /** Restore regex state from register rs. + * Format: RESTORE_REGEX_STATE rs */ + public static final short RESTORE_REGEX_STATE = 357; + private Opcodes() {} // Utility class - no instantiation } diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitBlock.java b/src/main/java/org/perlonjava/backend/jvm/EmitBlock.java index 6761b2324..044a4294d 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitBlock.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitBlock.java @@ -3,7 +3,9 @@ import org.objectweb.asm.Label; import org.objectweb.asm.MethodVisitor; import org.objectweb.asm.Opcodes; -import org.perlonjava.backend.jvm.astrefactor.LargeBlockRefactorer;import org.perlonjava.frontend.analysis.EmitterVisitor; +import org.perlonjava.backend.jvm.astrefactor.LargeBlockRefactorer; +import org.perlonjava.frontend.analysis.EmitterVisitor; +import org.perlonjava.frontend.analysis.RegexUsageDetector; import org.perlonjava.frontend.astnode.*; import org.perlonjava.runtime.runtimetypes.RuntimeContextType; @@ -143,6 +145,17 @@ public static void emitBlock(EmitterVisitor emitterVisitor, BlockNode node) { // Setup 'local' environment if needed Local.localRecord localRecord = Local.localSetup(emitterVisitor.ctx, node, mv); + int regexStateLocal = -1; + if (!node.getBooleanAnnotation("blockIsSubroutine") + && RegexUsageDetector.containsRegexOperation(node)) { + regexStateLocal = emitterVisitor.ctx.symbolTable.allocateLocalVariable(); + 
mv.visitTypeInsn(Opcodes.NEW, "org/perlonjava/runtime/runtimetypes/RegexState"); + mv.visitInsn(Opcodes.DUP); + mv.visitMethodInsn(Opcodes.INVOKESPECIAL, + "org/perlonjava/runtime/runtimetypes/RegexState", "", "()V", false); + mv.visitVarInsn(Opcodes.ASTORE, regexStateLocal); + } + // Add redo label mv.visitLabel(redoLabel); @@ -249,6 +262,12 @@ public static void emitBlock(EmitterVisitor emitterVisitor, BlockNode node) { Local.localTeardown(localRecord, mv); + if (regexStateLocal >= 0) { + mv.visitVarInsn(Opcodes.ALOAD, regexStateLocal); + mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, + "org/perlonjava/runtime/runtimetypes/RegexState", "restore", "()V", false); + } + emitterVisitor.ctx.symbolTable.exitScope(scopeIndex); emitterVisitor.ctx.logDebug("generateCodeBlock end"); } diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitForeach.java b/src/main/java/org/perlonjava/backend/jvm/EmitForeach.java index 491a814aa..8526e04ec 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitForeach.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitForeach.java @@ -4,6 +4,7 @@ import org.objectweb.asm.MethodVisitor; import org.objectweb.asm.Opcodes; import org.perlonjava.frontend.analysis.EmitterVisitor; +import org.perlonjava.frontend.analysis.RegexUsageDetector; import org.perlonjava.frontend.astnode.*; import org.perlonjava.runtime.perlmodule.Warnings; import org.perlonjava.runtime.runtimetypes.RuntimeContextType; @@ -517,6 +518,16 @@ public static void emitFor1(EmitterVisitor emitterVisitor, For1Node node) { int bodyScopeIndex = emitterVisitor.ctx.symbolTable.enterScope(); Local.localRecord bodyLocalRecord = Local.localSetup(emitterVisitor.ctx, blockNode, mv); + int regexStateLocal = -1; + if (RegexUsageDetector.containsRegexOperation(blockNode)) { + regexStateLocal = emitterVisitor.ctx.symbolTable.allocateLocalVariable(); + mv.visitTypeInsn(Opcodes.NEW, "org/perlonjava/runtime/runtimetypes/RegexState"); + mv.visitInsn(Opcodes.DUP); + 
mv.visitMethodInsn(Opcodes.INVOKESPECIAL, + "org/perlonjava/runtime/runtimetypes/RegexState", "", "()V", false); + mv.visitVarInsn(Opcodes.ASTORE, regexStateLocal); + } + pushGotoLabelsForBlock(emitterVisitor, blockNode); java.util.List list = blockNode.elements; @@ -544,6 +555,12 @@ public static void emitFor1(EmitterVisitor emitterVisitor, For1Node node) { popGotoLabelsForBlock(emitterVisitor, blockNode); + if (regexStateLocal >= 0) { + mv.visitVarInsn(Opcodes.ALOAD, regexStateLocal); + mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, + "org/perlonjava/runtime/runtimetypes/RegexState", "restore", "()V", false); + } + Local.localTeardown(bodyLocalRecord, mv); emitterVisitor.ctx.symbolTable.exitScope(bodyScopeIndex); } else { diff --git a/src/main/java/org/perlonjava/frontend/analysis/RegexUsageDetector.java b/src/main/java/org/perlonjava/frontend/analysis/RegexUsageDetector.java new file mode 100644 index 000000000..5b8a87963 --- /dev/null +++ b/src/main/java/org/perlonjava/frontend/analysis/RegexUsageDetector.java @@ -0,0 +1,74 @@ +package org.perlonjava.frontend.analysis; + +import org.perlonjava.frontend.astnode.*; + +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.List; + +public class RegexUsageDetector { + + private static final java.util.Set REGEX_OPERATORS = + java.util.Set.of("matchRegex", "replaceRegex"); + private static final java.util.Set REGEX_BINARY_OPERATORS = + java.util.Set.of("=~", "!~", "split"); + + public static boolean containsRegexOperation(Node root) { + if (root == null) return false; + Deque stack = new ArrayDeque<>(); + stack.push(root); + while (!stack.isEmpty()) { + Node node = stack.pop(); + if (node == null) continue; + if (node instanceof SubroutineNode) continue; + if (node instanceof OperatorNode op) { + if (REGEX_OPERATORS.contains(op.operator)) return true; + if (op.operand != null) stack.push(op.operand); + } else if (node instanceof BinaryOperatorNode bop) { + if 
(REGEX_BINARY_OPERATORS.contains(bop.operator)) return true; + if (bop.left != null) stack.push(bop.left); + if (bop.right != null) stack.push(bop.right); + } else if (node instanceof BlockNode bn) { + pushAll(stack, bn.elements); + } else if (node instanceof ListNode ln) { + pushAll(stack, ln.elements); + if (ln.handle != null) stack.push(ln.handle); + } else if (node instanceof IfNode ifn) { + if (ifn.condition != null) stack.push(ifn.condition); + if (ifn.thenBranch != null) stack.push(ifn.thenBranch); + if (ifn.elseBranch != null) stack.push(ifn.elseBranch); + } else if (node instanceof For1Node f1) { + if (f1.variable != null) stack.push(f1.variable); + if (f1.list != null) stack.push(f1.list); + if (f1.body != null) stack.push(f1.body); + if (f1.continueBlock != null) stack.push(f1.continueBlock); + } else if (node instanceof For3Node f3) { + if (f3.initialization != null) stack.push(f3.initialization); + if (f3.condition != null) stack.push(f3.condition); + if (f3.increment != null) stack.push(f3.increment); + if (f3.body != null) stack.push(f3.body); + if (f3.continueBlock != null) stack.push(f3.continueBlock); + } else if (node instanceof TernaryOperatorNode tern) { + if (tern.condition != null) stack.push(tern.condition); + if (tern.trueExpr != null) stack.push(tern.trueExpr); + if (tern.falseExpr != null) stack.push(tern.falseExpr); + } else if (node instanceof TryNode tryN) { + if (tryN.tryBlock != null) stack.push(tryN.tryBlock); + if (tryN.catchBlock != null) stack.push(tryN.catchBlock); + if (tryN.finallyBlock != null) stack.push(tryN.finallyBlock); + } else if (node instanceof HashLiteralNode hn) { + pushAll(stack, hn.elements); + } else if (node instanceof ArrayLiteralNode an) { + pushAll(stack, an.elements); + } + } + return false; + } + + private static void pushAll(Deque stack, List elements) { + for (int i = elements.size() - 1; i >= 0; i--) { + Node e = elements.get(i); + if (e != null) stack.push(e); + } + } +} From 
7f031cce7c7a61f1428c66a7e95a72543af59500 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 28 Feb 2026 21:54:59 +0100 Subject: [PATCH 4/5] Move regex save/restore from apply() into generated code Instead of saving/restoring RegexState in RuntimeCode.apply() and InterpretedCode.apply() (runtime wrapper), move it into the generated subroutine code itself, matching how local variable teardown works: JVM path: save at method entry, materialize+restore at returnLabel (same join point where Local.localTeardown runs) Bytecode path: save at execute() entry, restore in finally block; materialize at RETURN opcode before the return This removes the try/finally overhead from apply() and unifies regex scoping with the existing local-variable cleanup mechanism. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin --- .../backend/bytecode/BytecodeInterpreter.java | 7 +++++-- .../backend/bytecode/InterpretedCode.java | 18 ++---------------- .../backend/jvm/EmitterMethodCreator.java | 18 ++++++++++++++++++ .../runtime/runtimetypes/RuntimeCode.java | 8 -------- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java index fa05e2fd0..e99e0aa6c 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java @@ -77,6 +77,7 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c java.util.ArrayList regexStateStack = null; + RegexState savedRegexState = new RegexState(); try { outer: while (true) { @@ -106,7 +107,9 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c if (retVal == null) { return new RuntimeList(); } - return retVal.getList(); + RuntimeList retList = retVal.getList(); + RuntimeCode.materializeSpecialVarsInResult(retList); + return retList; } 
case Opcodes.GOTO: { @@ -2341,7 +2344,7 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c } } // end outer while } finally { - // Always pop the interpreter state + savedRegexState.restore(); InterpreterState.pop(); } } diff --git a/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java b/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java index 6268f233a..2c72cd45b 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java +++ b/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java @@ -130,26 +130,12 @@ public InterpretedCode(int[] bytecode, Object[] constants, String[] stringPool, */ @Override public RuntimeList apply(RuntimeArray args, int callContext) { - RegexState savedRegexState = new RegexState(); - try { - RuntimeList result = BytecodeInterpreter.execute(this, args, callContext); - RuntimeCode.materializeSpecialVarsInResult(result); - return result; - } finally { - savedRegexState.restore(); - } + return BytecodeInterpreter.execute(this, args, callContext); } @Override public RuntimeList apply(String subroutineName, RuntimeArray args, int callContext) { - RegexState savedRegexState = new RegexState(); - try { - RuntimeList result = BytecodeInterpreter.execute(this, args, callContext, subroutineName); - RuntimeCode.materializeSpecialVarsInResult(result); - return result; - } finally { - savedRegexState.restore(); - } + return BytecodeInterpreter.execute(this, args, callContext, subroutineName); } /** diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java index 6151612e7..2de336371 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java @@ -650,6 +650,13 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean // Setup local variables and environment 
for the method Local.localRecord localRecord = Local.localSetup(ctx, ast, mv); + int regexStateSlot = ctx.symbolTable.allocateLocalVariable(); + mv.visitTypeInsn(Opcodes.NEW, "org/perlonjava/runtime/runtimetypes/RegexState"); + mv.visitInsn(Opcodes.DUP); + mv.visitMethodInsn(Opcodes.INVOKESPECIAL, + "org/perlonjava/runtime/runtimetypes/RegexState", "", "()V", false); + mv.visitVarInsn(Opcodes.ASTORE, regexStateSlot); + // Store the computed RuntimeList return value in a dedicated local slot. // This keeps the operand stack empty at join labels (endCatch), avoiding // inconsistent stack map frames when multiple control-flow paths merge. @@ -1041,6 +1048,17 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean mv.visitVarInsn(Opcodes.ALOAD, returnListSlot); } + // Materialize special vars ($1 etc.) in the return list before restoring regex state + mv.visitInsn(Opcodes.DUP); + mv.visitMethodInsn(Opcodes.INVOKESTATIC, + "org/perlonjava/runtime/runtimetypes/RuntimeCode", + "materializeSpecialVarsInResult", + "(Lorg/perlonjava/runtime/runtimetypes/RuntimeList;)V", false); + + mv.visitVarInsn(Opcodes.ALOAD, regexStateSlot); + mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, + "org/perlonjava/runtime/runtimetypes/RegexState", "restore", "()V", false); + // Teardown local variables and environment after the return value is materialized Local.localTeardown(localRecord, mv); diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java index f21064ca8..90cb537e0 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java @@ -1719,7 +1719,6 @@ public RuntimeList apply(RuntimeArray a, int callContext) { if (constantValue != null) { return new RuntimeList(constantValue); } - RegexState savedRegexState = new RegexState(); try { if (this.compilerSupplier != null) { 
this.compilerSupplier.get(); @@ -1731,7 +1730,6 @@ public RuntimeList apply(RuntimeArray a, int callContext) { } else { result = (RuntimeList) this.methodHandle.invoke(this.codeObject, a, callContext); } - materializeSpecialVarsInResult(result); return result; } catch (NullPointerException e) { @@ -1750,8 +1748,6 @@ public RuntimeList apply(RuntimeArray a, int callContext) { throw (RuntimeException) targetException; } catch (Throwable e) { throw new RuntimeException(e); - } finally { - savedRegexState.restore(); } } @@ -1759,7 +1755,6 @@ public RuntimeList apply(String subroutineName, RuntimeArray a, int callContext) if (constantValue != null) { return new RuntimeList(constantValue); } - RegexState savedRegexState = new RegexState(); try { if (this.compilerSupplier != null) { this.compilerSupplier.get(); @@ -1771,7 +1766,6 @@ public RuntimeList apply(String subroutineName, RuntimeArray a, int callContext) } else { result = (RuntimeList) this.methodHandle.invoke(this.codeObject, a, callContext); } - materializeSpecialVarsInResult(result); return result; } catch (NullPointerException e) { throw new PerlCompilerException("Undefined subroutine &" + subroutineName + " called at "); @@ -1783,8 +1777,6 @@ public RuntimeList apply(String subroutineName, RuntimeArray a, int callContext) throw (RuntimeException) targetException; } catch (Throwable e) { throw new RuntimeException(e); - } finally { - savedRegexState.restore(); } } From b7aa802521ff5a048bc3ce3daa0ec23ab9ae362d Mon Sep 17 00:00:00 2001 From: "Flavio S. 
Glock" Date: Sat, 28 Feb 2026 22:42:40 +0100 Subject: [PATCH 5/5] Fix regex state regression: blockIsSubroutine for named subs, safe matcher access - Set blockIsSubroutine annotation in createRuntimeCode() so named subroutine blocks skip redundant block-level regex save/restore - Use lastMatchedString in captureString(0) instead of globalMatcher.group(0) to avoid IllegalStateException after s///g - Use saved lastMatchStart/lastMatchEnd for matcherStart/matcherEnd group 0 - Add try-catch for matcherStart/matcherEnd on non-zero groups - Remove all debug logging (RegexState id/label, EmitBlock/EmitterMethodCreator/ RuntimeCode System.err.println) re/subst.t: 183/281 (matches master) in both JVM and interpreter modes. All 154 unit tests pass. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin --- .../backend/bytecode/BytecodeCompiler.java | 5 +++ .../backend/bytecode/BytecodeInterpreter.java | 21 +++++++++ .../backend/bytecode/InterpretedCode.java | 5 ++- .../perlonjava/backend/bytecode/Opcodes.java | 14 ++++-- .../org/perlonjava/backend/jvm/EmitBlock.java | 4 ++ .../perlonjava/backend/jvm/EmitForeach.java | 3 ++ .../backend/jvm/EmitterMethodCreator.java | 12 ++++- .../frontend/analysis/RegexUsageDetector.java | 20 +++++++++ .../runtime/regex/RuntimeRegex.java | 45 +++++++++++-------- .../runtime/runtimetypes/RegexState.java | 27 ++++++++++- .../runtime/runtimetypes/RuntimeCode.java | 13 +++++- 11 files changed, 141 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java index e357f4266..d6f446fae 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java @@ -727,6 +727,11 @@ public void visit(BlockNode node) { && node.elements.get(0) instanceof OperatorNode localOp && localOp.operator.equals("local"); + // Perl 5 block-level 
regex state scoping: save $1, $&, etc. on entry, restore on exit. + // Skip if blockIsSubroutine: the subroutine-level save in BytecodeInterpreter.execute() + // (savedRegexState + finally) already handles this, so block-level would be redundant. + // If last/next/redo jumps past the RESTORE opcode, the interpreter's truncation logic + // in RESTORE_REGEX_STATE handles cleanup of orphaned stack entries. int regexStateReg = -1; if (!(node instanceof AbstractNode an && an.getBooleanAnnotation("blockIsSubroutine")) && RegexUsageDetector.containsRegexOperation(node)) { diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java index e99e0aa6c..856e06f57 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java @@ -75,8 +75,15 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c java.util.Stack labeledBlockStack = new java.util.Stack<>(); // Each entry is [labelStringPoolIdx, exitPc] + // Block-level regex state stack, used by SAVE_REGEX_STATE/RESTORE_REGEX_STATE opcodes. + // Each block containing regex ops pushes a snapshot; the matching restore pops it. + // Lazily initialized because most subroutines don't have nested regex-using blocks. java.util.ArrayList regexStateStack = null; + // Subroutine-level regex state: unconditionally saved on entry, restored in the + // finally block. This implements Perl 5 semantics where $1, $&, etc. are + // dynamically scoped per subroutine. The finally block guarantees restoration + // even when the sub exits via return, die, or exception. RegexState savedRegexState = new RegexState(); try { outer: @@ -108,6 +115,10 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c return new RuntimeList(); } RuntimeList retList = retVal.getList(); + // Materialize $1, $&, etc. 
into concrete scalars BEFORE returning. + // The finally block will call savedRegexState.restore(), which overwrites + // global regex state. Any lazy ScalarSpecialVariable references in the + // return list must be resolved while this sub's regex state is still active. RuntimeCode.materializeSpecialVarsInResult(retList); return retList; } @@ -1595,6 +1606,11 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c } case Opcodes.SAVE_REGEX_STATE: { + // Block-level regex state save. Snapshot current regex state and + // store the stack level in register rd. The level is used by + // RESTORE_REGEX_STATE to find the correct snapshot and truncate + // any orphaned entries (e.g., if inner blocks were skipped by + // last/next/redo/die). int rd = bytecode[pc++]; if (regexStateStack == null) regexStateStack = new java.util.ArrayList<>(); int level = regexStateStack.size(); @@ -1604,6 +1620,9 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c } case Opcodes.RESTORE_REGEX_STATE: { + // Block-level regex state restore. Restore snapshot at the saved + // level and discard all entries above it (handles cases where inner + // RESTORE opcodes were skipped by last/next/redo/die). int rs = bytecode[pc++]; int level = ((RuntimeScalar) registers[rs]).getInt(); if (regexStateStack != null && level < regexStateStack.size()) { @@ -2344,6 +2363,8 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c } } // end outer while } finally { + // Restore the caller's regex state. Runs after any return/die/exception, + // ensuring the caller sees its own $1, $&, etc. regardless of how the sub exited. 
savedRegexState.restore(); InterpreterState.pop(); } diff --git a/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java b/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java index 2c72cd45b..a77d0a5ac 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java +++ b/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java @@ -121,9 +121,12 @@ public InterpretedCode(int[] bytecode, Object[] constants, String[] stringPool, /** * Override RuntimeCode.apply() to dispatch to interpreter. * - * This is the ONLY method that differs from compiled RuntimeCode. + *

<p>This is the ONLY method that differs from compiled RuntimeCode. * The API signature is IDENTICAL, ensuring perfect compatibility. * + *

<p>Regex state save/restore is handled inside {@code BytecodeInterpreter.execute()} + * (via {@code savedRegexState}/finally), not here. + * * @param args The arguments array (@_) * @param callContext The calling context (VOID/SCALAR/LIST) * @return RuntimeList containing the result (may be RuntimeControlFlowList) diff --git a/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java b/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java index 415402b44..02a22b473 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java +++ b/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java @@ -1173,12 +1173,18 @@ public class Opcodes { * Format: POP_LABELED_BLOCK */ public static final short POP_LABELED_BLOCK = 355; - /** Save regex state into register rd. - * Format: SAVE_REGEX_STATE rd */ + /** Save regex state (Perl 5 dynamic scoping of $1, $&, etc.) into register rd. + * The register receives an integer index into the interpreter's regexStateStack. + * Emitted at block entry for blocks containing regex operations. + * @see org.perlonjava.runtime.runtimetypes.RegexState + * Format: SAVE_REGEX_STATE rd */ public static final short SAVE_REGEX_STATE = 356; - /** Restore regex state from register rs. - * Format: RESTORE_REGEX_STATE rs */ + /** Restore regex state from the level stored in register rs, undoing all + * regex state changes made within the block. Also truncates any orphaned + * stack entries (from inner blocks skipped by last/next/redo/die). + * Emitted at block exit. 
+ * Format: RESTORE_REGEX_STATE rs */ public static final short RESTORE_REGEX_STATE = 357; private Opcodes() {} // Utility class - no instantiation diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitBlock.java b/src/main/java/org/perlonjava/backend/jvm/EmitBlock.java index 044a4294d..a1c02eb1c 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitBlock.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitBlock.java @@ -145,6 +145,9 @@ public static void emitBlock(EmitterVisitor emitterVisitor, BlockNode node) { // Setup 'local' environment if needed Local.localRecord localRecord = Local.localSetup(emitterVisitor.ctx, node, mv); + // Perl 5 block-level regex state scoping: save $1, $&, etc. on entry, restore on exit. + // Skip if blockIsSubroutine: EmitterMethodCreator already emits subroutine-level + // save/restore (regexStateSlot), so block-level would be redundant. int regexStateLocal = -1; if (!node.getBooleanAnnotation("blockIsSubroutine") && RegexUsageDetector.containsRegexOperation(node)) { @@ -262,6 +265,7 @@ public static void emitBlock(EmitterVisitor emitterVisitor, BlockNode node) { Local.localTeardown(localRecord, mv); + // Restore block-level regex state (counterpart to the save above) if (regexStateLocal >= 0) { mv.visitVarInsn(Opcodes.ALOAD, regexStateLocal); mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitForeach.java b/src/main/java/org/perlonjava/backend/jvm/EmitForeach.java index 8526e04ec..3a1f22675 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitForeach.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitForeach.java @@ -518,6 +518,8 @@ public static void emitFor1(EmitterVisitor emitterVisitor, For1Node node) { int bodyScopeIndex = emitterVisitor.ctx.symbolTable.enterScope(); Local.localRecord bodyLocalRecord = Local.localSetup(emitterVisitor.ctx, blockNode, mv); + // Perl 5 regex state scoping for foreach body. Each iteration saves/restores + // independently. 
No blockIsSubroutine check needed: foreach body is never a sub. int regexStateLocal = -1; if (RegexUsageDetector.containsRegexOperation(blockNode)) { regexStateLocal = emitterVisitor.ctx.symbolTable.allocateLocalVariable(); @@ -555,6 +557,7 @@ public static void emitFor1(EmitterVisitor emitterVisitor, For1Node node) { popGotoLabelsForBlock(emitterVisitor, blockNode); + // Restore block-level regex state at end of each iteration if (regexStateLocal >= 0) { mv.visitVarInsn(Opcodes.ALOAD, regexStateLocal); mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java index 2de336371..6e3b85b36 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java @@ -650,6 +650,9 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean // Setup local variables and environment for the method Local.localRecord localRecord = Local.localSetup(ctx, ast, mv); + // Subroutine-level regex state scoping (Perl 5 semantics): unconditionally save + // the caller's $1, $&, etc. on entry. Restored at returnLabel before ARETURN. + // This is separate from block-level scoping (EmitBlock/EmitForeach + RegexUsageDetector). int regexStateSlot = ctx.symbolTable.allocateLocalVariable(); mv.visitTypeInsn(Opcodes.NEW, "org/perlonjava/runtime/runtimetypes/RegexState"); mv.visitInsn(Opcodes.DUP); @@ -1048,13 +1051,16 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean mv.visitVarInsn(Opcodes.ALOAD, returnListSlot); } - // Materialize special vars ($1 etc.) in the return list before restoring regex state + // Materialize $1, $&, etc. into concrete scalars BEFORE restoring regex state. 
+ // The return list may contain lazy ScalarSpecialVariable references; if we + // restored first, they would resolve to the caller's (stale) values. mv.visitInsn(Opcodes.DUP); mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/runtimetypes/RuntimeCode", "materializeSpecialVarsInResult", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeList;)V", false); + // Restore caller's regex state (counterpart to the save at method entry) mv.visitVarInsn(Opcodes.ALOAD, regexStateSlot); mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, "org/perlonjava/runtime/runtimetypes/RegexState", "restore", "()V", false); @@ -1520,6 +1526,10 @@ public static Class loadBytecode(EmitterContext ctx, byte[] classData) { */ public static RuntimeCode createRuntimeCode( EmitterContext ctx, Node ast, boolean useTryCatch) { + // Ensure block-level regex save/restore is skipped for the outermost block of a sub/method. + // For anonymous subs this is set by SubroutineNode constructor, but for named subs the block + // is passed directly here without going through SubroutineNode. + ast.setAnnotation("blockIsSubroutine", true); try { // Try compiler path Class generatedClass = createClassWithMethod(ctx, ast, useTryCatch); diff --git a/src/main/java/org/perlonjava/frontend/analysis/RegexUsageDetector.java b/src/main/java/org/perlonjava/frontend/analysis/RegexUsageDetector.java index 5b8a87963..e2337f8e2 100644 --- a/src/main/java/org/perlonjava/frontend/analysis/RegexUsageDetector.java +++ b/src/main/java/org/perlonjava/frontend/analysis/RegexUsageDetector.java @@ -6,13 +6,32 @@ import java.util.Deque; import java.util.List; +/** + * Detects whether an AST subtree contains regex operations (=~, !~, split, m//, s///). + * Used as an optimization gate: block-level regex state save/restore is only emitted + * for blocks that actually use regex, avoiding unnecessary snapshots of {@code RegexState}. + * + *

The walk stops at {@link SubroutineNode} boundaries because nested subroutines + * get their own subroutine-level regex state save/restore (in {@code EmitterMethodCreator} + * / {@code BytecodeInterpreter}), so their regex usage should not trigger block-level + * save/restore in the enclosing scope. + * + * @see org.perlonjava.runtime.runtimetypes.RegexState + */ public class RegexUsageDetector { + /** Unary operators that perform regex matching/substitution. */ private static final java.util.Set REGEX_OPERATORS = java.util.Set.of("matchRegex", "replaceRegex"); + /** Binary operators that perform regex matching (=~, !~) or use regex internally (split). */ private static final java.util.Set REGEX_BINARY_OPERATORS = java.util.Set.of("=~", "!~", "split"); + /** + * Returns true if the AST rooted at {@code root} contains any regex operation, + * excluding nested subroutine bodies (which have their own regex state scope). + * Uses iterative DFS to avoid StackOverflow on deeply nested ASTs. + */ public static boolean containsRegexOperation(Node root) { if (root == null) return false; Deque stack = new ArrayDeque<>(); @@ -20,6 +39,7 @@ public static boolean containsRegexOperation(Node root) { while (!stack.isEmpty()) { Node node = stack.pop(); if (node == null) continue; + // Stop at subroutine boundaries: nested subs have their own regex state scope if (node instanceof SubroutineNode) continue; if (node instanceof OperatorNode op) { if (REGEX_OPERATORS.contains(op.operator)) return true; diff --git a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java index 84d4b9920..9c6159ca3 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java +++ b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java @@ -864,8 +864,7 @@ public static String postMatchString() { public static String captureString(int group) { if (group <= 0) { - if (globalMatcher == null) return null; - return 
globalMatcher.group(0); + return lastMatchedString; } if (lastCaptureGroups == null || group > lastCaptureGroups.length) { return null; @@ -881,35 +880,45 @@ public static String lastCaptureString() { } public static RuntimeScalar matcherStart(int group) { - if (globalMatcher == null) { - return scalarUndef; + if (group == 0) { + return lastMatchStart >= 0 ? getScalarInt(lastMatchStart) : scalarUndef; } - if (group < 0 || group > globalMatcher.groupCount()) { + if (globalMatcher == null) { return scalarUndef; } - int start = globalMatcher.start(group); - // If the group didn't participate in the match, start() returns -1 - // Perl returns undef in this case - if (start == -1) { + try { + if (group < 0 || group > globalMatcher.groupCount()) { + return scalarUndef; + } + int start = globalMatcher.start(group); + if (start == -1) { + return scalarUndef; + } + return getScalarInt(start); + } catch (IllegalStateException e) { return scalarUndef; } - return getScalarInt(start); } public static RuntimeScalar matcherEnd(int group) { - if (globalMatcher == null) { - return scalarUndef; + if (group == 0) { + return lastMatchEnd >= 0 ? 
getScalarInt(lastMatchEnd) : scalarUndef; } - if (group < 0 || group > globalMatcher.groupCount()) { + if (globalMatcher == null) { return scalarUndef; } - int end = globalMatcher.end(group); - // If the group didn't participate in the match, end() returns -1 - // Perl returns undef in this case - if (end == -1) { + try { + if (group < 0 || group > globalMatcher.groupCount()) { + return scalarUndef; + } + int end = globalMatcher.end(group); + if (end == -1) { + return scalarUndef; + } + return getScalarInt(end); + } catch (IllegalStateException e) { return scalarUndef; } - return getScalarInt(end); } public static int matcherSize() { diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RegexState.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RegexState.java index d2bdcb392..64262f659 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RegexState.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RegexState.java @@ -5,8 +5,31 @@ import java.util.regex.Matcher; /** - * RegexState holds a snapshot of all regex-related state. - * Used to save and restore regex state when entering/exiting eval blocks. + * Immutable snapshot of all regex-related global state (Perl's $1, $&, $`, $', etc.). + * + *

<p>In Perl 5, regex match variables are dynamically scoped: each subroutine and + * each block that contains regex operations saves the current state on entry and + * restores it on exit. This ensures that a caller's match variables are not + * clobbered by callees or inner blocks. + *

<p>Two levels of scoping use this class: + * <ul> + * <li>Subroutine-level (unconditional): saved at method entry, restored at exit. + * In JVM-compiled code: {@code EmitterMethodCreator} ({@code regexStateSlot}). + * In interpreted code: {@code BytecodeInterpreter} ({@code savedRegexState} + finally block).</li> + * <li>Block-level (conditional, gated by {@link org.perlonjava.frontend.analysis.RegexUsageDetector}): + * saved/restored around blocks that contain regex ops. + * In JVM-compiled code: {@code EmitBlock} / {@code EmitForeach}. + * In interpreted code: {@code SAVE_REGEX_STATE} / {@code RESTORE_REGEX_STATE} opcodes.</li> + * </ul> + * + *

<p>Important ordering constraint: When a subroutine returns a value containing + * lazy {@link org.perlonjava.runtime.specialvariables.ScalarSpecialVariable} references (e.g., $1), + * those must be materialized via {@link RuntimeCode#materializeSpecialVarsInResult} + * BEFORE calling {@link #restore()}, otherwise the restored (caller's) state would be + * read instead of the subroutine's state. + * + * @see org.perlonjava.runtime.regex.RuntimeRegex for the global static fields being snapshotted */ public class RegexState { public final Matcher globalMatcher; diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java index 90cb537e0..77469eda3 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java @@ -1708,8 +1708,12 @@ public boolean defined() { } /** - * Method to apply (execute) a subroutine reference. - * Invokes the method associated with the code object, passing the RuntimeArray and RuntimeContextType as arguments. + * Invokes the JVM-compiled method associated with this code object. + *

<p>Regex state scoping ($1, $&, etc.) is NOT handled here. For JVM-compiled code + * it is emitted directly into the generated method by {@code EmitterMethodCreator} + * ({@code regexStateSlot} save/restore). For interpreted code, {@code InterpretedCode} + * overrides this method and delegates to {@code BytecodeInterpreter.execute()}. * * @param a the RuntimeArray containing the arguments for the subroutine * @param callContext the context in which the subroutine is called @@ -1780,6 +1784,11 @@ public RuntimeList apply(String subroutineName, RuntimeArray a, int callContext) } } + /** + * Replace lazy {@link ScalarSpecialVariable} references ($1, $&, etc.) in a return list + * with concrete {@link RuntimeScalar} copies. Must be called BEFORE {@link RegexState#restore()} + * so that the values reflect the subroutine's regex state, not the caller's. + */ public static void materializeSpecialVarsInResult(RuntimeList result) { List elems = result.elements; for (int i = 0; i < elems.size(); i++) {