diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 1b6139207..1c8294666 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "b037509d0"; + public static final String gitCommitId = "4aafb6057"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). diff --git a/src/main/java/org/perlonjava/frontend/parser/ParseHeredoc.java b/src/main/java/org/perlonjava/frontend/parser/ParseHeredoc.java index c66b04e5f..972bf6ed4 100644 --- a/src/main/java/org/perlonjava/frontend/parser/ParseHeredoc.java +++ b/src/main/java/org/perlonjava/frontend/parser/ParseHeredoc.java @@ -10,10 +10,12 @@ import org.perlonjava.frontend.lexer.LexerTokenType; import org.perlonjava.runtime.runtimetypes.PerlCompilerException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import static org.perlonjava.frontend.parser.StringParser.parseRawString; +import static org.perlonjava.runtime.perlmodule.Strict.HINT_UTF8; public class ParseHeredoc { static OperatorNode parseHeredoc(Parser parser, String tokenText) { @@ -212,6 +214,16 @@ else if (currentIndex >= tokens.size() || String string = content.toString(); if (CompilerOptions.DEBUG_ENABLED) parser.ctx.logDebug("Final heredoc content: <<" + string + ">>"); + // Without `use utf8`, convert Unicode chars back to UTF-8 byte values, + // matching Perl 5's treatment of source bytes as Latin-1/octets. + // Skip if source is already ISO-8859-1 (isByteStringSource) — chars already + // represent raw byte values and need no conversion. + if (!parser.ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8) + && !parser.ctx.compilerOptions.isUnicodeSource + && !parser.ctx.compilerOptions.isByteStringSource) { + string = convertToOctets(string); + } + // Rewrite the heredoc node, according to the delimiter Node operand = null; switch (delimiter) { @@ -293,4 +305,19 @@ public static void restoreHeredocStateIfNeeded(Parser parser, List parser.getHeredocNodes().addAll(savedHeredocNodes); } } + + /** + * Convert a Unicode string back to UTF-8 byte values. + * Without `use utf8`, Perl treats source bytes as Latin-1/octets. + * Since Java reads source files as UTF-8 and decodes multi-byte sequences + * into single characters, we need to reverse this for Perl compatibility. + */ + private static String convertToOctets(String str) { + byte[] utf8Bytes = str.getBytes(StandardCharsets.UTF_8); + StringBuilder octetString = new StringBuilder(utf8Bytes.length); + for (byte b : utf8Bytes) { + octetString.append((char) (b & 0xFF)); + } + return octetString.toString(); + } } diff --git a/src/main/java/org/perlonjava/frontend/parser/Variable.java b/src/main/java/org/perlonjava/frontend/parser/Variable.java index 971b0c0f1..afe97e754 100644 --- a/src/main/java/org/perlonjava/frontend/parser/Variable.java +++ b/src/main/java/org/perlonjava/frontend/parser/Variable.java @@ -925,8 +925,9 @@ public static Node parseBracedVariable(Parser parser, String sigil, boolean isSt if (TokenUtils.peek(parser).text.equals("}")) { TokenUtils.consume(parser, LexerTokenType.OPERATOR, "}"); - // Issue ambiguity warning if needed - if (isAmbiguous) { + // Issue ambiguity warning if needed (not inside string interpolation, + // matching Perl 5 which only warns in code context) + if (isAmbiguous && !isStringInterpolation) { String accessType = ""; if (operand instanceof BinaryOperatorNode binOp) { if (binOp.operator.equals("[")) { diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java b/src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java index 1f96ff5a1..1faa7423a 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java @@ -349,27 +349,10 @@ public static RuntimeList valid(RuntimeArray args, int ctx) { String string = scalar.toString(); if (scalar.type == BYTE_STRING) { - // For byte strings, check if the bytes form valid UTF-8 - // Extract raw byte values and try to decode as UTF-8 - byte[] bytes = new byte[string.length()]; - for (int i = 0; i < string.length(); i++) { - char c = string.charAt(i); - if (c > 0xFF) { - // Byte string should not contain chars > 0xFF - // This is an inconsistent state - return RuntimeScalarCache.scalarFalse.getList(); - } - bytes[i] = (byte) c; - } - CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - try { - decoder.decode(ByteBuffer.wrap(bytes)); - return RuntimeScalarCache.scalarTrue.getList(); - } catch (CharacterCodingException e) { - return RuntimeScalarCache.scalarFalse.getList(); - } + // For byte strings (UTF-8 flag off), Perl always returns true. + // The bytes are not claiming to be UTF-8, so they are considered + // valid in their native encoding (Latin-1/bytes). + return RuntimeScalarCache.scalarTrue.getList(); } else { // For character strings (UTF-8 flag on), check if all characters are valid // Unicode code points. Java strings contain UTF-16 code units, which