diff --git a/src/main/java/org/perlonjava/codegen/EmitLogicalOperator.java b/src/main/java/org/perlonjava/codegen/EmitLogicalOperator.java index ed6bd473c..b80192f60 100644 --- a/src/main/java/org/perlonjava/codegen/EmitLogicalOperator.java +++ b/src/main/java/org/perlonjava/codegen/EmitLogicalOperator.java @@ -311,15 +311,10 @@ private static void emitLogicalOperatorSimple(EmitterVisitor emitterVisitor, Bin rewritten = true; } - // For RUNTIME context, preserve it; otherwise use SCALAR for boolean evaluation - int operandContext = emitterVisitor.ctx.contextType == RuntimeContextType.RUNTIME - ? RuntimeContextType.RUNTIME - : RuntimeContextType.SCALAR; - resultRef = emitterVisitor.ctx.javaClassInfo.acquireSpillRefOrAllocate(emitterVisitor.ctx.symbolTable); - // Evaluate LHS and store it. - node.left.accept(emitterVisitor.with(operandContext)); + // Evaluate LHS in SCALAR context (for boolean test) and store it. + node.left.accept(emitterVisitor.with(RuntimeContextType.SCALAR)); emitterVisitor.ctx.javaClassInfo.storeSpillRef(mv, resultRef); // Boolean test on the stored LHS. @@ -327,8 +322,12 @@ private static void emitLogicalOperatorSimple(EmitterVisitor emitterVisitor, Bin mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, "org/perlonjava/runtime/RuntimeBase", getBoolean, "()Z", false); mv.visitJumpInsn(compareOpcode, endLabel); - // LHS didn't short-circuit: evaluate RHS, overwrite result. - node.right.accept(emitterVisitor.with(operandContext)); + // LHS didn't short-circuit: evaluate RHS in current context (may be RUNTIME at sub exit). + // For RUNTIME context, preserve it; otherwise use SCALAR for boolean evaluation. + int rhsContext = emitterVisitor.ctx.contextType == RuntimeContextType.RUNTIME + ? RuntimeContextType.RUNTIME + : RuntimeContextType.SCALAR; + node.right.accept(emitterVisitor.with(rhsContext)); emitterVisitor.ctx.javaClassInfo.storeSpillRef(mv, resultRef); // Return whichever side won the short-circuit. diff --git a/src/main/java/org/perlonjava/regex/RegexPreprocessor.java b/src/main/java/org/perlonjava/regex/RegexPreprocessor.java index 9dd956897..78d4d8871 100644 --- a/src/main/java/org/perlonjava/regex/RegexPreprocessor.java +++ b/src/main/java/org/perlonjava/regex/RegexPreprocessor.java @@ -77,6 +77,10 @@ static String preProcessRegex(String s, RegexFlags regexFlags) { captureGroupCount = 0; deferredUnicodePropertyEncountered = false; + // First, escape invalid quantifier braces (Perl compatibility) + // DISABLED: Causes test regressions - needs more work + // s = escapeInvalidQuantifierBraces(s); + s = convertPythonStyleGroups(s); s = transformSimpleConditionals(s); s = removeUnderscoresFromEscapes(s); @@ -93,6 +97,214 @@ static String preProcessRegex(String s, RegexFlags regexFlags) { return result; } + /** + * Escape unescaped braces that don't form valid quantifiers. + * Perl allows invalid quantifier braces and treats them as literals. + * Java Pattern.compile() rejects them, so we must escape them. + * + * Valid quantifiers: {n}, {n,}, {n,m} where n and m are non-negative integers + * Invalid quantifiers: {(.*?)}, {abc}, {}, {,5}, etc. + * + * IMPORTANT: This is a high-risk preprocessing step that modifies brace characters. + * Known edge cases that must be handled correctly: + * + * 1. ESCAPE SEQUENCES WITH BRACES (must NOT be escaped): + * - \N{name} - Named Unicode character (e.g., \N{LATIN SMALL LETTER A}) + * - \x{...} - Hexadecimal character code (e.g., \x{1F600}) + * - \o{...} - Octal character code (e.g., \o{777}) + * - \p{...} - Unicode property (e.g., \p{Letter}) + * - \P{...} - Negated Unicode property (e.g., \P{Number}) + * - \g{...} - Named or relative backreference (e.g., \g{name}, \g{-1}) + * Currently handled: N, x, o, p, P, g + * + * 2. CHARACTER CLASSES (braces inside [...] are always literal): + * - [a{3}] means "match 'a', '{', '3', or '}'" not "match 'aaa'" + * - Nested classes like [a-z[0-9]{3}] must track nesting depth + * + * 3. VALID QUANTIFIERS (must NOT be escaped): + * - {n} - exactly n times (e.g., a{3}) + * - {n,} - n or more times (e.g., a{2,}) + * - {n,m} - between n and m times (e.g., a{2,5}) + * + * 4. ALREADY ESCAPED BRACES (must NOT be double-escaped): + * - \{ and \} should remain as-is + * - Track backslash escaping carefully to avoid double-escaping + * + * 5. POSSESSIVE AND LAZY QUANTIFIERS: + * - {n}+ (possessive) and {n}? (lazy) should work with valid quantifiers + * + * POTENTIAL ISSUES NOT YET HANDLED: + * - Extended bracketed character classes: (?[...]) may contain braces + * - Conditional patterns: (?(condition){yes}{no}) uses braces for branches + * - Subroutine definitions: (?(DEFINE)(?...)) may have complex nesting + * - Code blocks: (?{...}) and (??{...}) use braces but are handled elsewhere + * - Named capture definitions: (?...) - are braces allowed in names? + * - Unicode named sequences: \N{...} may contain nested braces in some contexts + * + * If new regex features are added that use braces, this function MUST be updated. + * Test changes thoroughly with unit/regex/unescaped_braces.t and regex test suite. + */ + private static String escapeInvalidQuantifierBraces(String pattern) { + StringBuilder result = new StringBuilder(); + boolean inCharClass = false; + boolean escaped = false; + + for (int i = 0; i < pattern.length(); i++) { + char c = pattern.charAt(i); + + // Handle escape sequences + if (escaped) { + result.append(c); + + // Check if this is an escape sequence that uses braces: \N{...}, \x{...}, \o{...}, \p{...}, \P{...}, \g{...} + if ((c == 'N' || c == 'x' || c == 'o' || c == 'p' || c == 'P' || c == 'g') && + i + 1 < pattern.length() && pattern.charAt(i + 1) == '{') { + // Skip the entire escape sequence with braces + result.append('{'); + i++; // Move past '{' + int braceDepth = 1; + i++; // Move to first character inside braces + + while (i < pattern.length() && braceDepth > 0) { + char ch = pattern.charAt(i); + result.append(ch); + if (ch == '\\' && i + 1 < pattern.length()) { + // Skip escaped character inside the escape sequence + i++; + if (i < pattern.length()) { + result.append(pattern.charAt(i)); + } + } else if (ch == '{') { + braceDepth++; + } else if (ch == '}') { + braceDepth--; + } + i++; + } + i--; // Back up one since the loop will increment + } + + escaped = false; + continue; + } + + if (c == '\\') { + result.append(c); + escaped = true; + continue; + } + + // Track character class boundaries (braces inside [...] are always literal) + if (c == '[') { + inCharClass = true; + result.append(c); + continue; + } + if (c == ']') { + inCharClass = false; + result.append(c); + continue; + } + + // Only process braces outside character classes + if (!inCharClass && c == '{') { + // Look ahead to check if this is a valid quantifier + int closePos = findMatchingCloseBraceForEscape(pattern, i); + if (closePos > 0 && isValidQuantifierContent(pattern, i + 1, closePos)) { + result.append(c); // Keep valid quantifier as-is + } else { + result.append("\\{"); // Escape invalid quantifier + } + } else if (!inCharClass && c == '}') { + // Check if this closes a quantifier that we kept unescaped + if (!closesValidQuantifier(result, pattern, i)) { + result.append("\\}"); // Escape unmatched closing brace + } else { + result.append(c); + } + } else { + result.append(c); + } + } + + return result.toString(); + } + + /** + * Find the position of closing brace that matches opening brace at pos. + * Returns -1 if no matching brace found. + */ + private static int findMatchingCloseBraceForEscape(String pattern, int openPos) { + for (int i = openPos + 1; i < pattern.length(); i++) { + char c = pattern.charAt(i); + if (c == '\\') { + i++; // Skip escaped character + continue; + } + if (c == '}') { + return i; + } + } + return -1; // No closing brace found + } + + /** + * Check if content between braces forms a valid quantifier. + * Valid: {n}, {n,}, {n,m} where n and m are non-negative integers + * Invalid: {(.*?)}, {abc}, {}, {,5}, etc. + */ + private static boolean isValidQuantifierContent(String pattern, int start, int end) { + if (start >= end) { + return false; // Empty braces {} + } + + String content = pattern.substring(start, end); + + // Check for {n}, {n,}, or {n,m} pattern + if (content.matches("\\d+")) { + return true; // {n} + } + if (content.matches("\\d+,")) { + return true; // {n,} + } + if (content.matches("\\d+,\\d+")) { + return true; // {n,m} + } + + return false; + } + + /** + * Check if closing brace at position closePos closes a valid quantifier + * that we kept unescaped in the result buffer. + */ + private static boolean closesValidQuantifier(StringBuilder result, String pattern, int closePos) { + // Find the most recent unescaped opening brace in result + int openPos = -1; + for (int i = result.length() - 1; i >= 0; i--) { + if (result.charAt(i) == '{') { + // Check if it's escaped + int backslashCount = 0; + for (int j = i - 1; j >= 0 && result.charAt(j) == '\\'; j--) { + backslashCount++; + } + if (backslashCount % 2 == 0) { + // Even number of backslashes (or zero) means { is not escaped + openPos = i; + break; + } + } + } + + if (openPos < 0) { + return false; // No unescaped opening brace found + } + + // Extract content and validate + String content = result.substring(openPos + 1); + return content.matches("\\d+") || content.matches("\\d+,") || content.matches("\\d+,\\d+"); + } + /** * Expand characters with multi-character case folds into alternations. * For example: ß → (?:ß|ss|SS|Ss|sS) diff --git a/src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java b/src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java index 71b6b2915..2cc6db9be 100644 --- a/src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java +++ b/src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java @@ -19,49 +19,53 @@ static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset) char nextChar = s.charAt(offset); // Check for numeric backreferences vs octal escapes - // In Perl: \400, \600, \777 are octals (> 255), not backreferences - // But \1-\9 followed by non-octal digits are backreferences + // In Perl: + // - \1 through \9 are backreferences (when groups exist) + // - \10, \11, etc. are also backreferences (when groups exist) + // - \0 through \377 (up to 3 digits) are octal escapes (values 0-255) + // - \400 and above are octal escapes (values > 255) + // - If no groups exist, \1-\9 are treated as octals, not errors + // + // Key insight: A sequence like \337 is a 3-digit octal (decimal 223 = ß) + // It should NOT be treated as backreference \3 followed by literal "37" + // + // Strategy: + // 1. Check if we have a valid 3-digit octal sequence -> always treat as octal + // 2. If we have 1-2 digits starting with \1-\9: + // - If capture groups exist -> treat as backreference + // - If no capture groups exist -> treat as octal boolean isOctalNotBackref = false; - if (nextChar >= '1' && nextChar <= '9') { - // Check if this might be a 3-digit octal > 255 - if (nextChar >= '1' && nextChar <= '7' && offset + 2 < length) { - int d1 = nextChar - '0'; + if (nextChar >= '0' && nextChar <= '7') { + // Potential octal - check if we have 2 more octal digits + if (offset + 2 < length) { char c2 = s.charAt(offset + 1); - char c3 = offset + 2 < length ? s.charAt(offset + 2) : '\0'; + char c3 = s.charAt(offset + 2); if (c2 >= '0' && c2 <= '7' && c3 >= '0' && c3 <= '7') { - int octalValue = d1 * 64 + (c2 - '0') * 8 + (c3 - '0'); - if (octalValue > 255) { - // This is an octal escape, not a backreference - // Fall through to octal handling below at line ~320 - // Leave the backslash in sb for the octal handler to manage - // offset stays pointing to the first octal digit ('4' in \400) - isOctalNotBackref = true; - } - // else: It's a 3-digit octal <= 255, treat as backreference - // (Perl's behavior: \1-\377 are backreferences if groups exist) + // We have 3 octal digits - this is ALWAYS an octal escape + // Example: \337, \123, \400, etc. + isOctalNotBackref = true; } } + // Note: If we have fewer than 3 octal digits, we'll check for backreferences below + // Example: \1, \12 could be backreferences if groups exist, octals if not } if (!isOctalNotBackref && nextChar >= '1' && nextChar <= '9') { - // This is a backreference like \1, \2, etc. - int refNum = nextChar - '0'; - - // Check if we have ANY capture groups at all - // If there are no groups, this is always an error - // But if there are groups, allow forward references + // Check if we have capture groups if (RegexPreprocessor.captureGroupCount == 0) { - sb.setLength(sb.length() - 1); // Remove the backslash - RegexPreprocessor.regexError(s, offset + 1, "Reference to nonexistent group"); + // No capture groups - treat as octal + // Fall through to octal handling below + isOctalNotBackref = true; + } else { + // This is a backreference like \1, \2, etc. + // Forward references are allowed when there are capture groups + // Perl allows forward references like (\3|b)\2(a) where \3 refers to group 3 + // which hasn't been captured yet. This is valid and the reference just won't match + // until group 3 is actually captured. + sb.append(nextChar); + return offset; } - // Forward references are allowed when there are capture groups - // Perl allows forward references like (\3|b)\2(a) where \3 refers to group 3 - // which hasn't been captured yet. This is valid and the reference just won't match - // until group 3 is actually captured. - - sb.append(nextChar); - return offset; } if (nextChar == 'k' && offset + 1 < length && s.charAt(offset + 1) == '\'') { // Handle \k'name' backreference (Perl syntax) @@ -374,21 +378,22 @@ static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset) sb.setLength(sb.length() - 1); // Remove the backslash sb.append(String.format("\\x{%X}", octalValue)); offset += octalLength - 1; // -1 because caller will increment - } else if (octalValue <= 255 && octalLength == 3) { - // Standard 3-digit octal, prepend 0 for Java + } else if (octalLength == 3) { + // 3-digit octal, prepend 0 for Java + // Java requires \0nnn format sb.append('0'); sb.append(Character.toChars(c2)); - } else if (c2 == '0' && octalLength == 1) { - // Single \0 becomes \00 - sb.append('0'); - sb.append('0'); - } else if (c2 >= '1' && c2 <= '3' && octalLength == 3) { - // 3-digit octal starting with 1-3, prepend 0 - sb.append('0'); - sb.append(Character.toChars(c2)); - } else { - // Short octal or single digit, pass through - sb.append(Character.toChars(c2)); + // The remaining 2 digits will be added by caller's loop + } else if (octalLength == 2) { + // 2-digit octal like \12 becomes \012 + sb.setLength(sb.length() - 1); // Remove the backslash + sb.append(String.format("\\0%o", octalValue)); + offset += octalLength - 1; // Skip the second digit + } else if (octalLength == 1) { + // Single digit octal: \0 through \7 + // Convert to 2-digit format for Java: \00 through \07 + sb.setLength(sb.length() - 1); // Remove the backslash + sb.append(String.format("\\0%o", octalValue)); } } else if (c2 == '8' || c2 == '9') { // \8 and \9 are not valid octals - treat as literal digits @@ -610,25 +615,23 @@ static int handleRegexCharacterClassEscape(int offset, String s, StringBuilder s sb.append(String.format("x{%X}", octalValue)); offset += octalLength - 1; // -1 because outer loop will increment lastChar = octalValue; - } else if (octalValue <= 255 && octalLength == 3) { - // Standard 3-digit octal, prepend 0 for Java + } else if (octalLength == 3) { + // 3-digit octal, prepend 0 for Java sb.append('0'); sb.append(Character.toChars(c2)); lastChar = octalValue; - } else if (c2 == '0' && octalLength == 1) { - // Single \0 becomes \00 - sb.append('0'); - sb.append('0'); - lastChar = 0; - } else if (c2 >= '1' && c2 <= '3' && octalLength == 3) { - // 3-digit octal starting with 1-3, prepend 0 - sb.append('0'); - sb.append(Character.toChars(c2)); + } else if (octalLength == 2) { + // 2-digit octal like \12 becomes \012 + sb.setLength(sb.length() - 1); // Remove the backslash + sb.append(String.format("\\0%o", octalValue)); + offset += octalLength - 1; // Skip the second digit + lastChar = octalValue; + } else if (octalLength == 1) { + // Single digit octal: \0 through \7 + // Convert to 2-digit format for Java: \00 through \07 + sb.setLength(sb.length() - 1); // Remove the backslash + sb.append(String.format("\\0%o", octalValue)); lastChar = octalValue; - } else { - // Short octal or single digit, pass through - sb.append(Character.toChars(c2)); - lastChar = c2; } } else if (c2 == '8' || c2 == '9') { // \8 and \9 are not valid octals - treat as literal digits diff --git a/src/main/java/org/perlonjava/regex/RuntimeRegex.java b/src/main/java/org/perlonjava/regex/RuntimeRegex.java index a65296840..6e29f039e 100644 --- a/src/main/java/org/perlonjava/regex/RuntimeRegex.java +++ b/src/main/java/org/perlonjava/regex/RuntimeRegex.java @@ -25,6 +25,9 @@ */ public class RuntimeRegex extends RuntimeBase implements RuntimeScalarReference { + // Debug flag for regex compilation (set at class load time) + private static final boolean DEBUG_REGEX = System.getenv("DEBUG_REGEX") != null; + // Constants for regex pattern flags private static final int CASE_INSENSITIVE = Pattern.CASE_INSENSITIVE; private static final int MULTILINE = Pattern.MULTILINE; @@ -80,11 +83,20 @@ public RuntimeRegex() { * @throws IllegalStateException if regex compilation fails. */ public static RuntimeRegex compile(String patternString, String modifiers) { + // Debug logging + if (DEBUG_REGEX) { + System.err.println("RuntimeRegex.compile: pattern=" + patternString + " modifiers=" + modifiers); + System.err.println(" caller stack: " + Thread.currentThread().getStackTrace()[2]); + } + String cacheKey = patternString + "/" + modifiers; // Check if the regex is already cached RuntimeRegex regex = regexCache.get(cacheKey); if (regex == null) { + if (DEBUG_REGEX) { + System.err.println(" cache miss, compiling new regex"); + } regex = new RuntimeRegex(); if (patternString != null && patternString.contains("\\Q")) { @@ -102,6 +114,11 @@ public static RuntimeRegex compile(String patternString, String modifiers) { try { javaPattern = preProcessRegex(patternString, regex.regexFlags); + // Debug logging + if (DEBUG_REGEX) { + System.err.println(" preprocessed pattern=" + javaPattern); + } + // Track if preprocessing deferred user-defined Unicode properties. // These need to be resolved later, once the corresponding Perl subs are defined. regex.deferredUserDefinedUnicodeProperties = RegexPreprocessor.hadDeferredUnicodePropertyEncountered(); @@ -149,6 +166,11 @@ public static RuntimeRegex compile(String patternString, String modifiers) { if (regexCache.size() < MAX_REGEX_CACHE_SIZE) { regexCache.put(cacheKey, regex); } + } else { + // Debug logging for cache hit + if (DEBUG_REGEX) { + System.err.println(" cache hit, reusing cached regex"); + } } return regex; } @@ -357,7 +379,8 @@ public static RuntimeBase matchRegex(RuntimeScalar quotedRegex, RuntimeScalar st } // Fast path: no alarm active, use direct matching - return matchRegexDirect(quotedRegex, string, ctx); + RuntimeBase result = matchRegexDirect(quotedRegex, string, ctx); + return result; } /** @@ -367,6 +390,12 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc RuntimeRegex regex = resolveRegex(quotedRegex); regex = ensureCompiledForRuntime(regex); + // Debug logging + if (DEBUG_REGEX) { + System.err.println("matchRegexDirect: pattern=" + regex.pattern.pattern() + + " input=" + string.toString() + " ctx=" + ctx); + } + if (regex.regexFlags.isMatchExactlyOnce() && regex.matched) { // m?PAT? already matched once; now return false if (ctx == RuntimeContextType.LIST) { @@ -503,6 +532,11 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc posScalar.set(scalarUndef); } + // Debug logging + if (DEBUG_REGEX) { + System.err.println(" match result: found=" + found); + } + if (!found) { // No match: scalar match vars ($`, $&, $') should become undef. // Keep lastSuccessful* and the previous globalMatcher intact so @-/@+ do not get clobbered @@ -540,6 +574,11 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc } if (ctx == RuntimeContextType.LIST) { + // In LIST context: return captured groups, or (1) for success with no captures (non-global) + if (found && result.elements.isEmpty() && !regex.regexFlags.isGlobalMatch()) { + // Non-global match with no captures in LIST context returns (1) + result.elements.add(RuntimeScalarCache.getScalarInt(1)); + } return result; } else if (ctx == RuntimeContextType.SCALAR) { return RuntimeScalarCache.getScalarBoolean(found);