From 4894ef8a8555ca65194f16e96e404b2aab8b8ffd Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 25 Apr 2026 09:51:53 +0200 Subject: [PATCH] fix(regex): \cX inside character class consumes next char even if special MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a Perl regex character class, \cX is the control-character escape that consumes the next source character regardless of what it is, including regex-significant ones like ], \, [, ^, _ (e.g. \c] = U+001D, \c\ = U+001C). The preprocessor's character-class bracket-end finder treated `\c` as a plain one-char escape, so `\c]` ended the character class prematurely. The class-content emitter likewise emitted `\c` literally, leaving Java's regex engine to reinterpret sequences like `\c\[` as control-backslash followed by a (nested) opening bracket — producing a misleading "Unclosed character class" error from Pattern.compile. Two fixes: - handleCharacterClass(): when scanning for the closing ], skip the character following a `\c` escape so `\c]` is not mistaken for a class terminator. - handleRegexCharacterClassEscape(): when emitting an escape, recognise `\cX` and convert it to `\x{HH}` (uc(X) XOR 0x40, low byte) so the Java regex engine sees a plain hex escape and never has to reason about the Perl-specific consume-next-char semantics. This unblocks XML::Dumper's strip-control-chars regex on line 685 of XML/Dumper.pm and removes a cascading failure that aborted 8 of the 32 test files in `jcpan -t Data::Serializer` mid-run. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../org/perlonjava/core/Configuration.java | 4 ++-- .../runtime/regex/RegexPreprocessor.java | 5 +++++ .../regex/RegexPreprocessorHelper.java | 20 +++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 1d55383c5..17f7e77cd 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "6c981ee03"; + public static final String gitCommitId = "016a235c7"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String buildTimestamp = "Apr 25 2026 09:42:01"; + public static final String buildTimestamp = "Apr 25 2026 10:11:42"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java index eaf15a371..db5582c46 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java @@ -1379,6 +1379,11 @@ private static int handleCharacterClass(String s, boolean flag_xx, StringBuilder char ch = s.charAt(bracketEnd); if (inEscape) { inEscape = false; + // \cX consumes the next character (control-character escape), + // even if that character is ']' or '\' (e.g., \c], \c\). + if (ch == 'c' && bracketEnd + 1 < length) { + bracketEnd++; // skip the control-char target + } } else if (ch == '\\') { inEscape = true; } else if (ch == '[') { diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java index 6e5d9b676..e3d601b59 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java @@ -848,6 +848,26 @@ static int handleRegexCharacterClassEscape(int offset, String s, StringBuilder s lastChar = 0x08; // Backspace character for range validation first = false; afterCaret = false; + } else if (s.codePointAt(offset) == 'c' && offset + 1 < length) { + // \cX control-character escape: consume the next character + // and convert to a hex escape so Java doesn't mis-parse + // sequences like \c\, \c[, \c] (where the next char is + // special inside a character class). + // Perl semantics: ord(uc(X)) XOR 0x40, low 8 bits. + int ctrl = s.codePointAt(offset + 1); + if (ctrl >= 'a' && ctrl <= 'z') { + ctrl = ctrl - 'a' + 'A'; // upper-case ASCII letter + } + int value = (ctrl ^ 0x40) & 0xFF; + // Remove the backslash that was already appended + sb.setLength(sb.length() - 1); + sb.append(String.format("\\x{%X}", value)); + offset++; // skip past the control-char target (outer loop bumps past it) + lastChar = value; + first = false; + afterCaret = false; + wasEscape = true; + break; } else { int c2 = s.codePointAt(offset); if (c2 >= '0' && c2 <= '7') {