From 613e45a4a5aaa5d78cff4c6e90a26b7fc817052f Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Wed, 29 Apr 2026 22:31:51 +0200 Subject: [PATCH] fix(pack): byte-counted slash construct must not re-encode as UTF-8 pack("V/Z*", $bytes) and the other "len/string" constructs computed the length prefix from str.getBytes(UTF-8), while the actual payload was emitted by PackWriter.writeString() using ISO-8859-1 (one byte per Latin-1 char). For a string of high-Latin-1 bytes (e.g. UTF-8 output of utf8::encode), the length prefix came out doubled and the payload was followed by zero padding to match. Concretely, for a 12-byte string this produced: 19000000 c3a9...c3a9 00 0000000000000000000000 instead of the correct: 0d000000 c3a9...c3a9 00 Match the byte/char count writeString actually uses: in byte mode use the ISO-8859-1 byte count, otherwise use the Java char count (which matches writeString's per-char output for plain Latin-1 and its per-codepoint output for high-Unicode strings). This is exactly what BSON::PP triggers via pack("V/Z*", $utf8_encoded) when round-tripping any non-ASCII string through BSON. The fix takes `./jcpan -t BSON` from 71/72 failing test files down to 12/72. Adds src/test/resources/unit/pack/slash_string.t covering V/Z*, V/a*, V/A*, n/Z*, round-trip, and the exact BSON wire-format shape. 
Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../org/perlonjava/core/Configuration.java | 6 +- .../operators/pack/PackGroupHandler.java | 12 ++-- src/test/resources/unit/pack/slash_string.t | 59 +++++++++++++++++++ 3 files changed, 70 insertions(+), 7 deletions(-) create mode 100644 src/test/resources/unit/pack/slash_string.t diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 287225589..5dd980f0f 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,14 +33,14 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "d5085fda8"; + public static final String gitCommitId = "723dfee80"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitDate = "2026-04-29"; + public static final String gitCommitDate = "2026-04-30"; /** * Build timestamp in Perl 5 "Compiled at" format (e.g., "Apr 7 2026 11:20:00"). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. 
*/ - public static final String buildTimestamp = "Apr 30 2026 08:15:13"; + public static final String buildTimestamp = "Apr 30 2026 10:12:03"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/runtime/operators/pack/PackGroupHandler.java b/src/main/java/org/perlonjava/runtime/operators/pack/PackGroupHandler.java index fe8d91f43..a85e18f9d 100644 --- a/src/main/java/org/perlonjava/runtime/operators/pack/PackGroupHandler.java +++ b/src/main/java/org/perlonjava/runtime/operators/pack/PackGroupHandler.java @@ -340,10 +340,14 @@ public static GroupResult handleSlashConstruct(String template, int position, in if (stringCount >= 0) { effectiveCount = stringCount; } else { - byte[] strBytes = byteMode - ? str.getBytes(StandardCharsets.ISO_8859_1) - : str.getBytes(StandardCharsets.UTF_8); - effectiveCount = strBytes.length; + // Match the byte/char count used by PackWriter.writeString: + // - byte mode (U0): raw bytes via ISO-8859-1 + // - normal mode: one unit per Java character (writeString writes + // one byte per ISO-8859-1 char, or one codepoint for high Unicode) + int unitCount = byteMode + ? str.getBytes(StandardCharsets.ISO_8859_1).length + : str.length(); + effectiveCount = unitCount; if (stringFormat == 'Z') { effectiveCount++; // Include null terminator in count } diff --git a/src/test/resources/unit/pack/slash_string.t b/src/test/resources/unit/pack/slash_string.t new file mode 100644 index 000000000..bdf2a48c5 --- /dev/null +++ b/src/test/resources/unit/pack/slash_string.t @@ -0,0 +1,59 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Test::More tests => 7; + +# Tests for the "len/Z*" / "len/a*" / "len/A*" pack constructs with byte +# strings (Latin-1 high bytes). Regression for a bug where the slash +# construct used getBytes(UTF-8) to compute the length prefix while +# writeString() emitted ISO-8859-1 bytes, producing a wrong (doubled) +# length and trailing zero padding. 
This is what BSON::PP triggers via +# pack("V/Z*", $utf8_encoded_string). + +my $latin = "\xc3\xa9\xc3\xa9\xc3\xa9"; # 6 bytes, no utf8 flag + +is( + unpack("H*", pack("V/Z*", $latin)), + "07000000c3a9c3a9c3a900", + 'V/Z* length prefix counts bytes (not re-encoded UTF-8) for Z*', +); + +is( + unpack("H*", pack("V/a*", $latin)), + "06000000c3a9c3a9c3a9", + 'V/a* length prefix counts bytes for a*', +); + +is( + unpack("H*", pack("V/A*", $latin)), + "06000000c3a9c3a9c3a9", + 'V/A* length prefix counts bytes for A*', +); + +is( + unpack("H*", pack("n/Z*", $latin)), + "0007c3a9c3a9c3a900", + 'n/Z* length prefix counts bytes for Z*', +); + +# Round-trip through unpack +{ + my $p = pack("V/a*", $latin); + my ($got) = unpack("V/a*", $p); + is($got, $latin, 'V/a* round-trips a Latin-1 byte string'); +} + +# Mirror BSON::PP's exact use: a 0x02 (string) field in a tiny BSON doc. +# Field value is "éééééé" already utf8-encoded to 12 bytes. The BSON +# string framing should report length=13 (12 bytes + NUL) and emit +# exactly 13 bytes of payload. +{ + my $v = "\xc3\xa9" x 6; # 12 bytes, utf8 flag off + my $p = pack("V/Z*", $v); + is(length($p), 4 + 13, 'V/Z* total length is 4 (len prefix) + bytes + NUL'); + is( + unpack("H*", $p), + "0d000000" . ("c3a9" x 6) . "00", + 'V/Z* matches BSON wire format', + ); +}