From 1216197940551f81633d667fa0f460a6b84f8826 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Thu, 9 Apr 2026 10:44:46 +0200 Subject: [PATCH 1/4] docs: Perl::Tidy test support plan - 5/44 files pass, path to fix Investigation of ./jcpan -t Perl::Tidy (v20260204) identified 5 blockers: 1. DESTROY singleton (Critical): Formatter and Tokenizer use closure counters decremented in DESTROY. Since PerlOnJava does not call DESTROY, the 2nd+ perltidy() call per process dies. Affects 36/44 test files (~555 subtests). Fix: 2-line overlay in Perl/Tidy.pm. 2. Option parsing (Moderate): perltidyrc string ref options (-dac, -bl) not applied. Possibly Getopt::Long negatable boolean handling. 3. Wide char alignment (Low): Unicode display width miscalculation. 4. EOL handling (Low): t/test-eol.t produces no output. 5. DEBUG output (Low): debugfile scalar ref returns undef. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/modules/perl_tidy.md | 278 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 dev/modules/perl_tidy.md diff --git a/dev/modules/perl_tidy.md b/dev/modules/perl_tidy.md new file mode 100644 index 000000000..e00e92e78 --- /dev/null +++ b/dev/modules/perl_tidy.md @@ -0,0 +1,278 @@ +# Perl::Tidy Support Plan + +## Goal + +Make `./jcpan -t Perl::Tidy` run without errors on PerlOnJava. + +## Current Status + +**Version:** Perl-Tidy-20260204 (SHANCOCK/Perl-Tidy-20260204.tar.gz) +**Install:** Succeeds — 16 files installed to `~/.perlonjava/lib/` +**Tests:** 5/44 files pass, 39/44 fail (14/49 subtests fail, but most files die mid-run) + +### Test Results Summary + +| Category | Files | Result | +|----------|-------|--------| +| Passing | t/filter_example.t, t/test.t, t/testsa.t, t/testss.t, t/zero.t | 5 OK | +| Snippet tests (DESTROY) | t/snippets1.t–t/snippets33.t (33 files) | 33 FAIL | +| Wide char tests (DESTROY + alignment) | t/testwide.t, t/testwide-passthrough.t, t/testwide-tidy.t | 3 FAIL | +| Filter/option tests | t/atee.t | 1 FAIL (1/2 subtests) | +| EOL tests | t/test-eol.t | 1 FAIL (0/4 subtests ran) | +| Debug output test | t/test_DEBUG.t | 1 FAIL (1/2 subtests) | + +## Blockers + +### Blocker 1: Missing DESTROY Breaks Singleton Guards (Critical — 36+ test files) + +**Symptom:** +``` +Attempt to create more than 1 object in Perl::Tidy::Formatter, which is not a true class yet + at .../Perl/Tidy/Formatter.pm line 1108. +``` + +This error kills the 2nd (and all subsequent) calls to `perltidy()` within a +single process. Since each snippet test file calls `perltidy()` 8–20 times in +a loop, only the first test passes per file. + +**Root cause:** `Perl::Tidy::Formatter` and `Perl::Tidy::Tokenizer` use +closure-scoped instance counters that are incremented in `new()` and +decremented in `DESTROY()`. PerlOnJava does not call `DESTROY`, so the counter +never resets to 0. + +**Formatter.pm singleton pattern:** +```perl +{ ## begin closure to count instances + my $_count = 0; + sub _increment_count { return ++$_count } + sub _decrement_count { return --$_count } +} + +sub DESTROY { + my $self = shift; + _decrement_count(); + return; +} + +sub new { + ... + if ( _increment_count() > 1 ) { + confess "Attempt to create more than 1 object..."; + } + ... +} +``` + +`Perl::Tidy::Tokenizer` has an identical pattern (lines 271–284, guard at +line 676). + +**Other DESTROY methods in Perl::Tidy:** 10 other classes have empty DESTROY +methods (only to prevent AUTOLOAD dispatch) — these are safe with missing +DESTROY. Only `Formatter` and `Tokenizer` have functional DESTROY code. + +**Impact:** ~555 subtests across 36 test files never run. + +**Fix (Bundled overlay — Perl/Tidy.pm):** + +Patch `Perl::Tidy.pm`'s `perltidy()` function to explicitly call +`_decrement_count()` on Formatter and Tokenizer before returning. This +compensates for the missing DESTROY call with a 2-line surgical change. + +In `Perl/Tidy.pm`, add before the final return in `perltidy()` (~line 1395): +```perl +# PerlOnJava: DESTROY not called on JVM — manually reset singleton counters +Perl::Tidy::Formatter::_decrement_count(); +Perl::Tidy::Tokenizer::_decrement_count(); +``` + +This works because `_decrement_count` is a package-scoped sub (not a lexical), +so it's callable from outside the class. + +**Alternative approaches:** +- **Patch Formatter.pm and Tokenizer.pm** — change `new()` to not rely on + DESTROY (e.g., reset `$_count` before incrementing). Requires patching 2 + files instead of 1. +- **Implement DESTROY in PerlOnJava** — the most complete fix but a very large + undertaking (see `dev/design/destroy_weaken_plan.md`). + +**Effort:** Low — 2 lines added to one file. + +### Blocker 2: perltidyrc String Ref Option Parsing (Moderate — affects first test in some snippet files + t/atee.t) + +**Symptom (t/atee.t):** +``` +# Test 1 got: "# block comment\n\n=pod\nsome pod\n=cut\n\nprint \"hello world\\n\";\n$xx++; # side comment\n" +# Expected: "\nprint \"hello world\\n\";\n$xx++;\n" +``` + +The test passes options via `perltidyrc => \$params` where `$params` is a +string like `"-dac -tac -D -g"`. The formatter runs (whitespace is adjusted) +but option-specific features (delete comments, brace placement) don't take +effect. + +**Root cause hypothesis:** The options are passed through +`expand_command_abbreviations` and then `Getopt::Long::GetOptions`. Possible +PerlOnJava issues: +1. **`Getopt::Long` `!` (negatable boolean) handling** — delete options use + `'delete-block-comments' => '!'`. If PerlOnJava's Getopt::Long doesn't + handle `!` correctly, options are silently ignored. +2. **Abbreviation expansion** — `-dac` must be expanded to + `--delete-block-comments --delete-side-comments --delete-pod`. If the + expansion regex fails, unexpanded `-dac` is passed to GetOptions which + doesn't recognize it. +3. **Hash key population** — The expanded options may not populate + `$rOpts->{'delete-block-comments'}` correctly. + +**Affected tests:** +- t/atee.t (test 1 of 2) — `-dac -tac -D -g` options +- t/snippets18.t (test 1 — braces.braces1) — `-bl -asbl` (Allman brace style) +- t/snippets22.t (test 1 — bbhb.bbhb2) — `-bbhb` (break before hash brace) +- t/snippets30.t, t/snippets31.t — first test also fails with formatting diffs + +**Investigation needed:** +```bash +# Test if atomic options work (no abbreviation expansion needed): +./jperl -e ' + use Perl::Tidy; + my $src = "# comment\nprint 1;\n"; + my $out; + Perl::Tidy::perltidy( + source => \$src, destination => \$out, + perltidyrc => \"--delete-block-comments", + argv => "" + ); + print $out; +' +# Expected: "\nprint 1;\n" (comment deleted) +# If comment remains: Getopt::Long ! handling is broken +# If comment deleted: abbreviation expansion is broken +``` + +**Fix approach:** Once root cause is confirmed, either: +- Fix `Getopt::Long` `!` handling in PerlOnJava's bundled Getopt::Long +- Fix abbreviation expansion regex in Perl::Tidy (less likely needed) +- Patch Perl::Tidy.pm's option processing to work around the issue + +**Effort:** Medium — requires debugging the option parsing pipeline. + +### Blocker 3: Wide Character Alignment (Low — 3 test files) + +**Symptom (t/testwide.t):** +```perl +# Got: 4-space indent (standard formatting) +# Expected: right-aligned to opening parenthesis +``` + +The formatter doesn't compute correct display widths for multi-byte Unicode +characters (Cyrillic, Polish, German umlauts). This causes hash value +alignment to use standard indentation instead of right-aligning to the `(` +column. + +**Affected tests:** t/testwide.t (3/3 fail), t/testwide-passthrough.t (3/4 +fail), t/testwide-tidy.t (3/4 fail) + +**Root cause hypothesis:** Perl::Tidy measures string widths using `length()` +which in Perl returns codepoint count. For alignment purposes, display width +matters. Wide characters (CJK) take 2 columns, while most Latin/Cyrillic +take 1. The issue may be in how PerlOnJava handles `length()` on decoded +Unicode strings, or in how Perl::Tidy's alignment calculations interact with +PerlOnJava's string handling. + +Note: Tests 1-2 in testwide.t both fail with the same alignment issue, and +test 3 fails due to the DESTROY singleton bug (Blocker 1). Fixing Blocker 1 +won't fix tests 1-2 but will allow test 3 to run. + +**Effort:** Medium — requires investigating string width calculations. + +### Blocker 4: EOL Handling (Low — 1 test file) + +**Symptom (t/test-eol.t):** All 4 subtests produce no output (0 tests ran). + +The test likely checks for correct handling of different line endings (CR, LF, +CRLF). The test may die early or produce no TAP output at all. + +**Investigation needed:** +```bash +cd /path/to/Perl-Tidy-build && PERL5LIB="./blib/lib:./blib/arch" \ + /path/to/jperl t/test-eol.t 2>&1 +``` + +**Effort:** Low–Medium — needs investigation. + +### Blocker 5: DEBUG File Output (Low — 1 test file) + +**Symptom (t/test_DEBUG.t):** Test 2 expects debug output via `debugfile => +\$string` but gets `undef`. + +**Root cause:** The `debugfile` parameter to `perltidy()` writes debug/token +type information. The output goes to a `Perl::Tidy::Debugger` object which +writes to the file handle. The output may not be reaching the scalar ref. + +**Effort:** Low — likely a minor I/O or filehandle issue. + +## Implementation Plan + +### Phase 1: Fix DESTROY Singleton (unblocks ~555 subtests) + +1. **Create bundled overlay** of `Perl/Tidy.pm` + - Copy upstream `Perl/Tidy.pm` (v20260204) to `src/main/perl/lib/Perl/Tidy.pm` + - Add `_decrement_count()` calls before `perltidy()`'s return points + - Mark changes with `# PerlOnJava:` comments + - Store diff in `dev/patches/cpan/Perl-Tidy-20260204/` + +2. **Verify:** Re-run `./jcpan -t Perl::Tidy` — expect snippet tests to + progress past first test case in each file. + +3. **Run `make`** — ensure no regressions in PerlOnJava's own tests. + +### Phase 2: Fix Option Parsing (unblocks ~5 first-test failures) + +1. **Investigate** Getopt::Long `!` negatable boolean handling + - Test atomic options (no abbreviation needed) vs abbreviated options + - If `!` handling is broken, fix in `src/main/perl/lib/Getopt/Long.pm` + - If abbreviation expansion is broken, investigate Perl::Tidy's regex + +2. **Verify:** t/atee.t test 1 passes, snippets18/22/30/31 first tests pass + +### Phase 3: Fix Wide Character Alignment (nice to have) + +1. **Investigate** string width computation for Unicode characters +2. May require changes to PerlOnJava's `length()` or Perl::Tidy's alignment code +3. **Verify:** t/testwide.t, t/testwide-passthrough.t, t/testwide-tidy.t + +### Phase 4: Fix EOL and DEBUG (nice to have) + +1. Investigate t/test-eol.t — likely minor I/O issue +2. Investigate t/test_DEBUG.t — debug file handle output + +## Expected Results After Phase 1 + +With the DESTROY fix alone, the test results should improve dramatically: + +| Before | After (estimated) | +|--------|-------------------| +| 5/44 files pass | ~35/44 files pass | +| 14/49 subtests fail | TBD (most snippet tests should fully pass) | +| Result: FAIL | Closer to PASS | + +The snippet tests that also have first-test formatting failures +(snippets18, 22, 28, 30, 31, 32, 33) may still fail a few subtests due to +Blocker 2, but they will progress past the first test case. + +## Dependency on Other Work + +- **DESTROY implementation** (`dev/design/destroy_weaken_plan.md`): Would fix + this and all other DESTROY-dependent CPAN modules generically. However, it's + a large project. The targeted Perl::Tidy.pm overlay is the pragmatic + short-term fix. +- **Perl::Critic** (`dev/modules/perl_critic.md`): Already installed + (99.9% pass rate). Its `RequireTidyCode` policy fails because Perl::Tidy's + `Formatter::initialize_self_vars` exceeds the JVM 255-argument method + signature limit. That issue is separate from the test suite failures + documented here. + +## Related Documents + +- [cpan_patch_plan.md](cpan_patch_plan.md) — CPAN patching strategy (Option A: Bundled Overlays) +- [perl_critic.md](perl_critic.md) — Perl::Critic support (uses Perl::Tidy optionally) +- `dev/design/destroy_weaken_plan.md` — DESTROY/weaken implementation design From 8aebbb300545fbae0f0d3170d9acbe6eac2463b2 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Thu, 9 Apr 2026 11:34:42 +0200 Subject: [PATCH 2/4] =?UTF-8?q?fix:=20\G=20regex=20anchor=20=E2=80=94=20re?= =?UTF-8?q?spect=20pos()=20in=20all=20match=20contexts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for the \G assertion in the regex engine: 1. \G when pos() is undef: Previously, \G was only checked when pos() was defined (isPosDefined). When pos() was undef, the \G anchor check was skipped entirely, allowing the regex to scan forward and match at any position. Now \G always anchors at startPos (which defaults to 0 when pos() is undef), matching Perl behavior. 2. \G in non-/g matches: Previously, pos() was only looked up for /g matches. \G in non-/g matches (e.g. $str =~ /\Gfoo/) always started from position 0 regardless of pos(). Now pos() is looked up whenever \G is present in the pattern, matching Perl behavior where \G anchors at pos() even without /g. These fixes are critical for Perl::Tidy compatibility: - Fix 1 corrects option parsing (parse_args uses \G/gc tokenizer) - Fix 2 corrects the tokenizer signature detection (\G in non-/g) Perl::Tidy test results improve from 5/44 to 7/44 passing files. All remaining failures are DESTROY-related (singleton counter). Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../org/perlonjava/core/Configuration.java | 4 +- .../runtime/regex/RuntimeRegex.java | 10 ++- src/test/resources/unit/regex/regex_g_pos.t | 76 +++++++++++++++++++ 3 files changed, 84 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 3e56195dd..eeb0e13bc 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "6858b39e6"; + public static final String gitCommitId = "121619794"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String buildTimestamp = "Apr 9 2026 10:09:12"; + public static final String buildTimestamp = "Apr 9 2026 11:21:54"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java index e4fc5171e..6a1800b28 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java +++ b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java @@ -598,12 +598,13 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc // hexPrinter(inputStr); - // Only look up pos() for /g matches - non-/g matches always start from 0 + // Look up pos() for /g matches and for non-/g matches that use \G. + // In Perl, \G anchors at pos() even in non-/g matches (e.g. $str =~ /\Gfoo/). RuntimeScalar posScalar = null; boolean isPosDefined = false; int startPos = 0; - if (regex.regexFlags.isGlobalMatch()) { + if (regex.regexFlags.isGlobalMatch() || regex.useGAssertion) { // Use RuntimePosLvalue to get the current position posScalar = RuntimePosLvalue.pos(string); isPosDefined = posScalar.getDefinedBoolean(); @@ -611,7 +612,7 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc // Check if previous call had zero-length match at this position (for SCALAR context) // This prevents infinite loops in: while ($str =~ /pat/g) - if (ctx == RuntimeContextType.SCALAR) { + if (regex.regexFlags.isGlobalMatch() && ctx == RuntimeContextType.SCALAR) { String patternKey = regex.patternString; if (RuntimePosLvalue.hadZeroLengthMatchAt(string, startPos, patternKey)) { // Previous match was zero-length at this position - fail to break loop @@ -647,7 +648,8 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc try { while (matcher.find()) { // If \G is used, ensure the match starts at the expected position - if (regex.useGAssertion && isPosDefined && matcher.start() != startPos) { + // When pos() is undefined, \G anchors at position 0 (startPos defaults to 0) + if (regex.useGAssertion && matcher.start() != startPos) { break; } diff --git a/src/test/resources/unit/regex/regex_g_pos.t b/src/test/resources/unit/regex/regex_g_pos.t index 9d637f5e4..6550d4f82 100644 --- a/src/test/resources/unit/regex/regex_g_pos.t +++ b/src/test/resources/unit/regex/regex_g_pos.t @@ -79,6 +79,82 @@ $pattern = qr/\G(\d{3})/; # Use a capture group # Non-global match should not use \G $string =~ /$pattern/; ok(!($1 ne '123'), 'Non-global match does not use \\G, matched \'123\''); +################### +# \G anchoring when pos() is undefined +# \G should anchor at position 0 when pos is undef, not scan forward + +# \G(\s+) should NOT match "-dac -tac" at pos 0 (no space at pos 0) +my $cfg = "-dac -tac"; +if ($cfg =~ /\G(\s+)/gc) { + ok(0, '\\G(\\s+) should not match when no space at pos 0'); +} else { + ok(1, '\\G(\\s+) correctly fails when pos is undef and no space at pos 0'); +} + +# \G([a-z]+) should NOT match "-dac -tac" at pos 0 (dash at pos 0) +pos($cfg) = undef; +if ($cfg =~ /\G([a-z]+)/gc) { + ok(0, '\\G([a-z]+) should not match when no letter at pos 0'); +} else { + ok(1, '\\G([a-z]+) correctly fails when pos is undef and no letter at pos 0'); +} + +# \G(-) SHOULD match "-dac -tac" at pos 0 (dash at pos 0) +pos($cfg) = undef; +if ($cfg =~ /\G(-)/gc) { + ok($1 eq '-' && pos($cfg) == 1, '\\G(-) correctly matches dash at pos 0'); +} else { + ok(0, '\\G(-) should match dash at pos 0'); +} + +# Simulate parse_args pattern: multiple \G/gc alternations on same string +pos($cfg) = undef; +my @tokens; +my $part = ""; +while (1) { + if ($cfg =~ /\G([\"\'])/gc) { + # quote + } + elsif ($cfg =~ /\G(\s+)/gc) { + push @tokens, $part if length($part); + $part = ""; + } + elsif ($cfg =~ /\G(.)/gc) { + $part .= $1; + } + else { + push @tokens, $part if length($part); + last; + } +} +ok(scalar(@tokens) == 2 && $tokens[0] eq '-dac' && $tokens[1] eq '-tac', + '\\G/gc tokenizer correctly splits "-dac -tac" into two tokens'); + +################### +# \G in non-/g matches should still anchor at pos() +# This is used by Perl::Tidy's tokenizer for signature detection + +my $line = "sub foo(\$bar) { }"; +pos($line) = 7; # after "sub foo" +ok($line =~ /\G\s*\(/, '\\G in non-/g match anchors at pos() - matches ( at pos 7'); + +pos($line) = 7; +ok(!($line =~ /\Gx/), '\\G in non-/g match anchors at pos() - fails when char does not match'); + +# \G in non-/g should not change pos() +pos($line) = 7; +$line =~ /\G\s*\(/; +ok(pos($line) == 7, '\\G non-/g match does not change pos()'); + +# \G with capture in non-/g match +my $data = "hello world"; +pos($data) = 6; +if ($data =~ /\G(\w+)/) { + ok($1 eq 'world', '\\G non-/g match with capture works at pos 6'); +} else { + ok(0, '\\G non-/g match with capture should match at pos 6'); +} + ################### # End of Perl `pos` and `\G` Tests From 35c9ee0d5bf4b433def26999d6c4f5b38c35f381 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Thu, 9 Apr 2026 11:35:28 +0200 Subject: [PATCH 3/4] docs: update Perl::Tidy plan with \G fix results (5/44 -> 7/44) Updated plan doc to reflect the two \G regex fixes applied: 1. \G when pos() is undef - anchors at 0 instead of scanning forward 2. \G in non-/g matches - now respects pos() like Perl does Documented remaining DESTROY blocker as the sole remaining issue blocking 33+ test files (snippet tests, wide char, EOL). Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/modules/perl_tidy.md | 205 ++++++++++----------------------------- 1 file changed, 51 insertions(+), 154 deletions(-) diff --git a/dev/modules/perl_tidy.md b/dev/modules/perl_tidy.md index e00e92e78..c1bc211f0 100644 --- a/dev/modules/perl_tidy.md +++ b/dev/modules/perl_tidy.md @@ -8,22 +8,60 @@ Make `./jcpan -t Perl::Tidy` run without errors on PerlOnJava. **Version:** Perl-Tidy-20260204 (SHANCOCK/Perl-Tidy-20260204.tar.gz) **Install:** Succeeds — 16 files installed to `~/.perlonjava/lib/` -**Tests:** 5/44 files pass, 39/44 fail (14/49 subtests fail, but most files die mid-run) +**Tests:** 7/44 files pass, 37/44 fail -### Test Results Summary +### Test Results Summary (after \G fixes) | Category | Files | Result | |----------|-------|--------| -| Passing | t/filter_example.t, t/test.t, t/testsa.t, t/testss.t, t/zero.t | 5 OK | +| Passing | t/atee.t, t/filter_example.t, t/test.t, t/test_DEBUG.t, t/testsa.t, t/testss.t, t/zero.t | 7 OK | | Snippet tests (DESTROY) | t/snippets1.t–t/snippets33.t (33 files) | 33 FAIL | -| Wide char tests (DESTROY + alignment) | t/testwide.t, t/testwide-passthrough.t, t/testwide-tidy.t | 3 FAIL | -| Filter/option tests | t/atee.t | 1 FAIL (1/2 subtests) | -| EOL tests | t/test-eol.t | 1 FAIL (0/4 subtests ran) | -| Debug output test | t/test_DEBUG.t | 1 FAIL (1/2 subtests) | +| Wide char tests (DESTROY) | t/testwide.t (2/3 pass), t/testwide-passthrough.t (2/6), t/testwide-tidy.t (2/6) | 3 FAIL | +| EOL tests (DESTROY) | t/test-eol.t (1/4 pass) | 1 FAIL | -## Blockers +### Progress Tracking -### Blocker 1: Missing DESTROY Breaks Singleton Guards (Critical — 36+ test files) +| Date | Milestone | Tests Passing | +|------|-----------|---------------| +| 2025-04-08 | Initial investigation | 5/44 | +| 2025-04-09 | \G regex fixes (pos undef + non-/g) | 7/44 | + +## Fixes Applied + +### Fix 1: \G Regex Anchor — pos() undef case (2025-04-09) + +**File:** `src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java` (line 651) + +**Problem:** When `pos()` was undef, the `\G` anchor check was skipped entirely +(`if (regex.useGAssertion && isPosDefined && matcher.start() != startPos)`). +This allowed `\G(\s+)` to scan forward and match whitespace anywhere in the +string, even though `\G` should anchor at position 0 when pos is undef. + +**Impact:** Perl::Tidy's `parse_args` function uses `\G/gc` patterns to +tokenize option strings. With broken `\G`, options like `-dac` were silently +dropped, causing t/atee.t to fail. + +**Fix:** Removed `isPosDefined` from the condition. When pos is undef, +`startPos` defaults to 0, so `\G` correctly anchors at 0. + +### Fix 2: \G in Non-/g Matches (2025-04-09) + +**File:** `src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java` (line 607) + +**Problem:** `pos()` was only looked up for `/g` matches. In Perl, `\G` +should anchor at `pos()` even in non-`/g` matches (e.g. `$str =~ /\Gfoo/`). +PerlOnJava was ignoring pos entirely for non-/g matches containing `\G`. + +**Impact:** Perl::Tidy's tokenizer uses `\G` in non-/g matches for signature +detection (line 10060: `$input_line =~ /\G\s*\(/`). Without this fix, +subroutine signatures like `sub foo($bar, %opts)` were misidentified as +prototypes, causing t/filter_example.t to fail. + +**Fix:** Changed the pos() lookup condition from `isGlobalMatch()` to +`isGlobalMatch() || useGAssertion`, so pos is looked up whenever `\G` is +present in the pattern. + +## Remaining Blocker: Missing DESTROY (33+ test files) **Symptom:** ``` @@ -70,7 +108,7 @@ line 676). methods (only to prevent AUTOLOAD dispatch) — these are safe with missing DESTROY. Only `Formatter` and `Tokenizer` have functional DESTROY code. -**Impact:** ~555 subtests across 36 test files never run. +**Impact:** ~555 subtests across 33+ test files never run. **Fix (Bundled overlay — Perl/Tidy.pm):** @@ -85,131 +123,8 @@ Perl::Tidy::Formatter::_decrement_count(); Perl::Tidy::Tokenizer::_decrement_count(); ``` -This works because `_decrement_count` is a package-scoped sub (not a lexical), -so it's callable from outside the class. - -**Alternative approaches:** -- **Patch Formatter.pm and Tokenizer.pm** — change `new()` to not rely on - DESTROY (e.g., reset `$_count` before incrementing). Requires patching 2 - files instead of 1. -- **Implement DESTROY in PerlOnJava** — the most complete fix but a very large - undertaking (see `dev/design/destroy_weaken_plan.md`). - **Effort:** Low — 2 lines added to one file. -### Blocker 2: perltidyrc String Ref Option Parsing (Moderate — affects first test in some snippet files + t/atee.t) - -**Symptom (t/atee.t):** -``` -# Test 1 got: "# block comment\n\n=pod\nsome pod\n=cut\n\nprint \"hello world\\n\";\n$xx++; # side comment\n" -# Expected: "\nprint \"hello world\\n\";\n$xx++;\n" -``` - -The test passes options via `perltidyrc => \$params` where `$params` is a -string like `"-dac -tac -D -g"`. The formatter runs (whitespace is adjusted) -but option-specific features (delete comments, brace placement) don't take -effect. - -**Root cause hypothesis:** The options are passed through -`expand_command_abbreviations` and then `Getopt::Long::GetOptions`. Possible -PerlOnJava issues: -1. **`Getopt::Long` `!` (negatable boolean) handling** — delete options use - `'delete-block-comments' => '!'`. If PerlOnJava's Getopt::Long doesn't - handle `!` correctly, options are silently ignored. -2. **Abbreviation expansion** — `-dac` must be expanded to - `--delete-block-comments --delete-side-comments --delete-pod`. If the - expansion regex fails, unexpanded `-dac` is passed to GetOptions which - doesn't recognize it. -3. **Hash key population** — The expanded options may not populate - `$rOpts->{'delete-block-comments'}` correctly. - -**Affected tests:** -- t/atee.t (test 1 of 2) — `-dac -tac -D -g` options -- t/snippets18.t (test 1 — braces.braces1) — `-bl -asbl` (Allman brace style) -- t/snippets22.t (test 1 — bbhb.bbhb2) — `-bbhb` (break before hash brace) -- t/snippets30.t, t/snippets31.t — first test also fails with formatting diffs - -**Investigation needed:** -```bash -# Test if atomic options work (no abbreviation expansion needed): -./jperl -e ' - use Perl::Tidy; - my $src = "# comment\nprint 1;\n"; - my $out; - Perl::Tidy::perltidy( - source => \$src, destination => \$out, - perltidyrc => \"--delete-block-comments", - argv => "" - ); - print $out; -' -# Expected: "\nprint 1;\n" (comment deleted) -# If comment remains: Getopt::Long ! handling is broken -# If comment deleted: abbreviation expansion is broken -``` - -**Fix approach:** Once root cause is confirmed, either: -- Fix `Getopt::Long` `!` handling in PerlOnJava's bundled Getopt::Long -- Fix abbreviation expansion regex in Perl::Tidy (less likely needed) -- Patch Perl::Tidy.pm's option processing to work around the issue - -**Effort:** Medium — requires debugging the option parsing pipeline. - -### Blocker 3: Wide Character Alignment (Low — 3 test files) - -**Symptom (t/testwide.t):** -```perl -# Got: 4-space indent (standard formatting) -# Expected: right-aligned to opening parenthesis -``` - -The formatter doesn't compute correct display widths for multi-byte Unicode -characters (Cyrillic, Polish, German umlauts). This causes hash value -alignment to use standard indentation instead of right-aligning to the `(` -column. - -**Affected tests:** t/testwide.t (3/3 fail), t/testwide-passthrough.t (3/4 -fail), t/testwide-tidy.t (3/4 fail) - -**Root cause hypothesis:** Perl::Tidy measures string widths using `length()` -which in Perl returns codepoint count. For alignment purposes, display width -matters. Wide characters (CJK) take 2 columns, while most Latin/Cyrillic -take 1. The issue may be in how PerlOnJava handles `length()` on decoded -Unicode strings, or in how Perl::Tidy's alignment calculations interact with -PerlOnJava's string handling. - -Note: Tests 1-2 in testwide.t both fail with the same alignment issue, and -test 3 fails due to the DESTROY singleton bug (Blocker 1). Fixing Blocker 1 -won't fix tests 1-2 but will allow test 3 to run. - -**Effort:** Medium — requires investigating string width calculations. - -### Blocker 4: EOL Handling (Low — 1 test file) - -**Symptom (t/test-eol.t):** All 4 subtests produce no output (0 tests ran). - -The test likely checks for correct handling of different line endings (CR, LF, -CRLF). The test may die early or produce no TAP output at all. - -**Investigation needed:** -```bash -cd /path/to/Perl-Tidy-build && PERL5LIB="./blib/lib:./blib/arch" \ - /path/to/jperl t/test-eol.t 2>&1 -``` - -**Effort:** Low–Medium — needs investigation. - -### Blocker 5: DEBUG File Output (Low — 1 test file) - -**Symptom (t/test_DEBUG.t):** Test 2 expects debug output via `debugfile => -\$string` but gets `undef`. - -**Root cause:** The `debugfile` parameter to `perltidy()` writes debug/token -type information. The output goes to a `Perl::Tidy::Debugger` object which -writes to the file handle. The output may not be reaching the scalar ref. - -**Effort:** Low — likely a minor I/O or filehandle issue. - ## Implementation Plan ### Phase 1: Fix DESTROY Singleton (unblocks ~555 subtests) @@ -225,40 +140,22 @@ writes to the file handle. The output may not be reaching the scalar ref. 3. **Run `make`** — ensure no regressions in PerlOnJava's own tests. -### Phase 2: Fix Option Parsing (unblocks ~5 first-test failures) - -1. **Investigate** Getopt::Long `!` negatable boolean handling - - Test atomic options (no abbreviation needed) vs abbreviated options - - If `!` handling is broken, fix in `src/main/perl/lib/Getopt/Long.pm` - - If abbreviation expansion is broken, investigate Perl::Tidy's regex - -2. **Verify:** t/atee.t test 1 passes, snippets18/22/30/31 first tests pass - -### Phase 3: Fix Wide Character Alignment (nice to have) +### Phase 2: Wide Character Alignment (nice to have) 1. **Investigate** string width computation for Unicode characters 2. May require changes to PerlOnJava's `length()` or Perl::Tidy's alignment code 3. **Verify:** t/testwide.t, t/testwide-passthrough.t, t/testwide-tidy.t -### Phase 4: Fix EOL and DEBUG (nice to have) - -1. Investigate t/test-eol.t — likely minor I/O issue -2. Investigate t/test_DEBUG.t — debug file handle output - ## Expected Results After Phase 1 With the DESTROY fix alone, the test results should improve dramatically: | Before | After (estimated) | |--------|-------------------| -| 5/44 files pass | ~35/44 files pass | -| 14/49 subtests fail | TBD (most snippet tests should fully pass) | +| 7/44 files pass | ~38/44 files pass | +| 4/53 subtests fail | TBD (most snippet tests should fully pass) | | Result: FAIL | Closer to PASS | -The snippet tests that also have first-test formatting failures -(snippets18, 22, 28, 30, 31, 32, 33) may still fail a few subtests due to -Blocker 2, but they will progress past the first test case. - ## Dependency on Other Work - **DESTROY implementation** (`dev/design/destroy_weaken_plan.md`): Would fix From 7d0f1e5a08f206b7ad125c40a2c662953367c3b1 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Thu, 9 Apr 2026 12:33:10 +0200 Subject: [PATCH 4/4] =?UTF-8?q?fix:=20-T=20file=20test=20=E2=80=94=20UTF-8?= =?UTF-8?q?=20aware=20text/binary=20detection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The -T heuristic was too strict for UTF-8 files. It treated all bytes >= 128 as non-text, requiring > 70% printable ASCII to classify as text. Files with significant UTF-8 content (e.g. Cyrillic, Polish, German umlauts) were misclassified as binary, causing Perl::Tidy to skip them with "Non-text (override with -f)". Now matches Perl pp_fttext heuristic: - Valid UTF-8 multi-byte sequences are treated as text (not odd) - Invalid high-bit bytes are counted as odd - Binary if odd * 3 > length (same 1/3 threshold as Perl) Fixes testwide-passthrough.t and testwide-tidy.t file-to-file tests. All 48 subtests that run now pass (0 failures). Remaining 37/44 test file failures are purely DESTROY-related. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../org/perlonjava/core/Configuration.java | 4 +- .../runtime/operators/FileTestOperator.java | 64 ++++++++++++++++--- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index eeb0e13bc..ad63a9f40 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "121619794"; + public static final String gitCommitId = "35c9ee0d5"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). @@ -48,7 +48,7 @@ public final class Configuration { * Parsed by App::perlbrew and other tools via: perl -V | grep "Compiled at" * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String buildTimestamp = "Apr 9 2026 11:21:54"; + public static final String buildTimestamp = "Apr 9 2026 12:17:38"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/runtime/operators/FileTestOperator.java b/src/main/java/org/perlonjava/runtime/operators/FileTestOperator.java index 03dacada5..82163d26a 100644 --- a/src/main/java/org/perlonjava/runtime/operators/FileTestOperator.java +++ b/src/main/java/org/perlonjava/runtime/operators/FileTestOperator.java @@ -777,23 +777,67 @@ private static RuntimeScalar isTextOrBinaryFromHandle(CustomFileChannel cfc, boo /** * Common heuristic for text/binary detection. + * Matches Perl's pp_fttext heuristic from pp_sys.c: + * - "odd" chars = null bytes, invalid high-bit bytes, and control chars + * (0-31) except \n, \r, \t, \b (8), \f (12), and ESC (27) + * - Valid UTF-8 multi-byte sequences are treated as text (not odd) + * - File is binary if odd * 3 > length (more than 1/3 odd characters) */ private static RuntimeScalar analyzeTextBinary(byte[] buffer, int length, boolean checkForText) { - int textChars = 0; - int totalChars = 0; + int odd = 0; for (int i = 0; i < length; i++) { - if (buffer[i] == 0) { - return checkForText ? scalarFalse : scalarTrue; // Binary file - } - if ((buffer[i] >= 32 && buffer[i] <= 126) || buffer[i] == '\n' || buffer[i] == '\r' || buffer[i] == '\t') { - textChars++; + int b = buffer[i] & 0xFF; // treat as unsigned + if (b == 0) { + // Null byte — immediately binary + return checkForText ? scalarFalse : scalarTrue; + } else if (b >= 128) { + // Check if this starts a valid UTF-8 multi-byte sequence + int seqLen = utf8SequenceLength(buffer, i, length); + if (seqLen > 1) { + // Valid UTF-8 sequence — skip remaining bytes, not odd + i += seqLen - 1; + } else { + // Invalid high-bit byte — odd + odd++; + } + } else if (b < 32 + && b != '\n' && b != '\r' && b != '\t' + && b != '\b' && b != '\f' && b != 27) { + // Control characters (except common whitespace and ESC) are "odd" + odd++; } - totalChars++; } - double textRatio = (double) textChars / totalChars; - return getScalarBoolean(checkForText ? textRatio > 0.7 : textRatio <= 0.7); + // Perl: odd * 3 > len means binary + boolean isBinary = odd * 3 > length; + return getScalarBoolean(checkForText ? !isBinary : isBinary); + } + + /** + * Determines the length of a valid UTF-8 sequence starting at the given position. + * Returns the sequence length (2-4) if valid, or 1 if invalid. + */ + private static int utf8SequenceLength(byte[] buffer, int pos, int length) { + int b = buffer[pos] & 0xFF; + int seqLen; + + if ((b & 0xE0) == 0xC0) { + seqLen = 2; // 110xxxxx — 2-byte sequence + } else if ((b & 0xF0) == 0xE0) { + seqLen = 3; // 1110xxxx — 3-byte sequence + } else if ((b & 0xF8) == 0xF0) { + seqLen = 4; // 11110xxx — 4-byte sequence + } else { + return 1; // Not a valid UTF-8 start byte + } + + // Verify continuation bytes (10xxxxxx) + if (pos + seqLen > length) return 1; // Not enough bytes + for (int j = 1; j < seqLen; j++) { + if ((buffer[pos + j] & 0xC0) != 0x80) return 1; // Invalid continuation + } + return seqLen; } /**