From a319f7e74f169819794110842b52728010a68d64 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 16:15:26 +0200 Subject: [PATCH 01/13] perf: reduce apply() bytecode by improving TempLocalCountVisitor accuracy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends TempLocalCountVisitor to accurately count temporary local variable allocations that bypass the spill pool (i.e. always call allocateLocalVariable() directly), allowing the pre-init buffer in EmitterMethodCreator to shrink from +256 to +32. Changes to TempLocalCountVisitor: - BinaryOperatorNode("("): +1 per sub call (callContextSlot) - BinaryOperatorNode("..", "..."): +3 per flip-flop (flipFlopIdSlot + leftSlot + rightSlot; overcounts for list-context range, which is safe) - BinaryOperatorNode("xor", "^^"): +1 (leftVar always direct) - OperatorNode("eval", "evalbytes"): +4 (evalResultSlot + cfSlot + labelSlot + typeSlot; was +1) - For1Node: +4 (preEvalListLocal + iteratorIndex + savedLoopVarIndex + foreachRegexStateLocal; was +1) - For3Node: +1 (regexStateLocal or conditionResultReg; was 0) Reduces EmitterMethodCreator pre-init buffer from 256 to 32. Infrastructure slots (~22: tailCallCodeRefSlot, controlFlowActionSlot, 16 spill slots, returnValueSlot, dynamicIndex, etc.) plus any remaining uncounted patterns are absorbed by the 32-slot buffer. Effect on a simple sub like `sub foo { my $x = shift; $x + 1 }`: Before: 0 (visitor) + 256 = 256 null-init bytecodes → ~773 bytes After: 1 (visitor) + 32 = 33 null-init bytecodes → ~99 bytes All unit tests pass at buffer=32. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../backend/jvm/EmitterMethodCreator.java | 2 +- .../analysis/TempLocalCountVisitor.java | 73 +++++++++++++++---- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java index 2cd11941b..b4bfb0f35 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java @@ -597,7 +597,7 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean TempLocalCountVisitor tempCountVisitor = new TempLocalCountVisitor(); ast.accept(tempCountVisitor); - int preInitTempLocalsCount = tempCountVisitor.getMaxTempCount() + 256; // Buffer for uncounted allocations + int preInitTempLocalsCount = tempCountVisitor.getMaxTempCount() + 32; // Buffer for infrastructure + uncounted allocations for (int i = preInitTempLocalsStart; i < preInitTempLocalsStart + preInitTempLocalsCount; i++) { mv.visitInsn(Opcodes.ACONST_NULL); mv.visitVarInsn(Opcodes.ASTORE, i); diff --git a/src/main/java/org/perlonjava/frontend/analysis/TempLocalCountVisitor.java b/src/main/java/org/perlonjava/frontend/analysis/TempLocalCountVisitor.java index 42a388691..5b2cd0bee 100644 --- a/src/main/java/org/perlonjava/frontend/analysis/TempLocalCountVisitor.java +++ b/src/main/java/org/perlonjava/frontend/analysis/TempLocalCountVisitor.java @@ -8,6 +8,13 @@ *

* This is used to pre-initialize the correct number of slots to avoid * VerifyError when slots are in TOP state. + *

+ * Counting rules — only patterns that call allocateLocalVariable() directly + * (bypassing the spill pool) are counted here. Operators that go through + * acquireSpillSlot() first are covered by the fixed +N buffer in + * EmitterMethodCreator. Infrastructure slots (tailCallCodeRefSlot, + * controlFlowActionSlot, 16 spill slots, returnValueSlot, etc.) are also + * absorbed by that buffer. */ public class TempLocalCountVisitor implements Visitor { private int tempCount = 0; @@ -34,13 +41,28 @@ private void countTemp() { @Override public void visit(BinaryOperatorNode node) { - // Logical operators (&&, ||, //) allocate a temp for left operand - if (node.operator.equals("&&") || node.operator.equals("||") || node.operator.equals("//")) { - countTemp(); - } - // Dereference operations (->) allocate temps for complex access patterns - if (node.operator.equals("->")) { - countTemp(); // Conservative: 1 temp for dereference + switch (node.operator) { + // Logical operators (&&, ||, //) store LHS in a spill slot for short-circuit. + // Using acquireSpillSlot() first, but count conservatively in case pool is full. + case "&&", "||", "//" -> countTemp(); + + // Dereference (->) conservatively counts 1 temp for method/element access. + case "->" -> countTemp(); + + // Sub-call apply operator: always allocates callContextSlot directly. + case "(" -> countTemp(); + + // Flip-flop operators in scalar context always allocate 3 slots directly: + // flipFlopIdSlot, leftSlot, rightSlot. Overcounts for list-context range + // usage, but that is safe. + case "..", "..." -> { + countTemp(); // flipFlopIdSlot + countTemp(); // leftSlot + countTemp(); // rightSlot + } + + // xor/^^ always allocates leftVar directly (no spill). + case "xor", "^^" -> countTemp(); } if (node.left != null) node.left.accept(this); if (node.right != null) node.right.accept(this); @@ -57,8 +79,17 @@ public void visit(BlockNode node) { @Override public void visit(For1Node node) { - // For loops may allocate temp for array storage - countTemp(); + // foreach loop. Guaranteed direct allocations: + // preEvalListLocal (always, unless preEvaluatedArrayIndex >= 0) + // iteratorIndex (always) + // Common conditional direct allocations: + // savedLoopVarIndex (when loop var is an existing lexical) + // foreachRegexStateLocal (when body contains regex) + // Count 4 conservatively to cover the most common cases. + countTemp(); // preEvalListLocal + countTemp(); // iteratorIndex + countTemp(); // savedLoopVarIndex / dynamicIndex (common) + countTemp(); // foreachRegexStateLocal (if body has regex) if (node.variable != null) node.variable.accept(this); if (node.list != null) node.list.accept(this); if (node.body != null) node.body.accept(this); @@ -66,6 +97,11 @@ public void visit(For1Node node) { @Override public void visit(For3Node node) { + // while/for/do-while loop. Conditional direct allocations: + // regexStateLocal (when body contains regex) + // conditionResultReg (non-void context with a condition) + // Count 1 conservatively. + countTemp(); if (node.initialization != null) node.initialization.accept(this); if (node.condition != null) node.condition.accept(this); if (node.increment != null) node.increment.accept(this); @@ -83,13 +119,18 @@ public void visit(ListNode node) { @Override public void visit(OperatorNode node) { - // local() allocates a temp for dynamic variable tracking - if ("local".equals(node.operator)) { - countTemp(); - } - // eval block allocates temps for exception handling - if ("eval".equals(node.operator)) { - countTemp(); + switch (node.operator) { + // local() allocates a temp for dynamic variable tracking. + case "local" -> countTemp(); + + // eval/evalbytes always allocate 4 slots directly: + // evalResultSlot, cfSlot, labelSlot, typeSlot. + case "eval", "evalbytes" -> { + countTemp(); // evalResultSlot + countTemp(); // cfSlot + countTemp(); // labelSlot + countTemp(); // typeSlot + } } if (node.operand != null) { node.operand.accept(this); From febb01cefa652aaa5b72b8301693445d5f7d2644 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 16:37:55 +0200 Subject: [PATCH 02/13] docs: add evaluation guide and Phase 1 progress tracking to reduce-apply-bytecode Adds a new "How to Evaluate Results" section to dev/design/reduce-apply-bytecode.md with: - Six-step evaluation recipe: bytecode size measurement (JPERL_BYTECODE_SIZE_DEBUG), timing commands, correctness verification, C1 JIT failure detection via PrintCompilation and JFR, reference benchmarks, and per-phase size targets. - Reference benchmarks from the Phase 1 completion run (2026-05-01): 96_is_deteministic_value.t: 11.8s, 8/8; 76joins.t: 9.3s, 27/27; 8,264 methods, 4.0 MB total, avg 511 bytes, largest 36,554 bytes, ~56% total bytecode reduction vs old buffer=256. - Note that the largest method (36,554 bytes) is driven by per-call-site TAILCALL trampoline code (Phase 2 target), not by the pre-init buffer. - "Progress Tracking" section showing Phase 1 complete, next steps for Phase 2 (trampoline extraction) and Phase 3 (eval prologue extraction). Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/reduce-apply-bytecode.md | 170 ++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/dev/design/reduce-apply-bytecode.md b/dev/design/reduce-apply-bytecode.md index 830b0faf3..8658c985d 100644 --- a/dev/design/reduce-apply-bytecode.md +++ b/dev/design/reduce-apply-bytecode.md @@ -630,6 +630,176 @@ JPERL_DISABLE_INTERPRETER_FALLBACK=1 timeout 60 ./jperl -e 'sub f{my $x=shift; $ --- +## How to Evaluate Results + +This section explains how to measure progress after completing a phase and +interpret what the numbers mean in terms of the C1 JIT failure risk. + +### 1. Measure apply() method bytecode sizes + +```bash +DBIX_DIR=/Users/fglock/projects/PerlOnJava2/cpan_build_dir/DBIx-Class-0.082844 +JPERL=/path/to/your/jperl # use the current build's jperl + +cd "$DBIX_DIR" +JPERL_BYTECODE_SIZE_DEBUG=1 timeout 120 "$JPERL" -Ilib -It/lib \ + t/96_is_deteministic_value.t > /dev/null 2> /tmp/96_sizes.txt + +# Top-10 largest apply() methods +grep 'method=apply\b' /tmp/96_sizes.txt \ + | awk -F'code_bytes=' '{print $2}' | sort -rn | head -10 + +# Summary statistics +grep 'method=apply\b' /tmp/96_sizes.txt \ + | awk -F'code_bytes=' '{sum+=$2; count++; if($2>max)max=$2} + END{printf "methods=%d total=%dKB avg=%d max=%d\n", + count, sum/1024, sum/count, max}' +``` + +**What to look for:** + +- **`max`**: The largest `apply()` method. C1 register allocation failures + have been observed for methods above ~9,000 bytes when the JVM is + CPU-starved. Each Phase reduces the max (Phase 2 removes ~65 bytes per + call site, which is the dominant term for large methods). +- **`avg`**: The average method size. Dominated by the pre-init buffer before + Phase 1; dominated by call-site trampoline overhead before Phase 2. +- **`total`**: Total class-file footprint for all Perl subs. Drives heap + pressure and permgen/metaspace usage. + +### 2. Measure test timing + +```bash +# Always kill orphaned JVMs before timing — they starve the JIT +pkill -9 -f "perlonjava-.*\.jar.*\.t\b" 2>/dev/null +ps aux | awk '$3 > 20 {print $2, $3, $11}' | grep -v WindowServer | grep -v Spotlight + +cd "$DBIX_DIR" +time timeout 120 "$JPERL" -Ilib -It/lib t/96_is_deteministic_value.t +time timeout 120 "$JPERL" -Ilib -It/lib t/76joins.t +``` + +**Pass/fail thresholds** (clean machine, no competing JVMs): + +| Test | Expected real time | Concern threshold | Failure threshold | +|------|--------------------|-------------------|-------------------| +| `t/96_is_deteministic_value.t` | < 15 s | > 30 s | > 120 s (harness kills at 300 s) | +| `t/76joins.t` | < 15 s | > 30 s | > 120 s | + +If timing exceeds the concern threshold on a clean machine, the C1 JIT may +be failing on large methods. Confirm with step 4 below. + +### 3. Verify correctness + +```bash +cd "$DBIX_DIR" +"$JPERL" -Ilib -It/lib t/96_is_deteministic_value.t | grep -E "^(ok|not ok|1\.\.)" +"$JPERL" -Ilib -It/lib t/76joins.t | grep -E "^(ok|not ok|1\.\.)" +``` + +Expected: `1..8` with 8 `ok` lines for `96_is_det`, `1..27` with 27 `ok` +lines for `76joins`. + +### 4. Check for C1 JIT failures (optional deep-dive) + +```bash +# Add -XX:+PrintCompilation to jperl JVM opts +JPERL_OPTS="-XX:+PrintCompilation" timeout 120 "$JPERL" -Ilib -It/lib \ + t/96_is_deteministic_value.t 2>&1 | grep "made not entrant\|COMPILE SKIPPED\|out of virtual" + +# Or use JFR (Java Flight Recorder) for full JIT event trace +JPERL_OPTS="-XX:StartFlightRecording=filename=/tmp/96det.jfr,duration=120s" \ + timeout 120 "$JPERL" -Ilib -It/lib t/96_is_deteministic_value.t 2>/dev/null +# Then open /tmp/96det.jfr in JDK Mission Control and inspect: +# JVM Internals → JIT Compilation → Compilation Failures +# (look for "out of virtual registers in linear scan") +``` + +A C1 failure shows as `COMPILE SKIPPED` or `made not entrant` for the +SQL::Abstract `_expand_expr` method in `PrintCompilation` output. + +### 5. Reference benchmarks (Phase 1 complete, buffer=32) + +Measured on 2026-05-01 with PerlOnJava4 after merging PR #650: + +| Metric | Value | +|--------|-------| +| `t/96_is_deteministic_value.t` — real time | 11.8 s | +| `t/96_is_deteministic_value.t` — all tests | 8/8 pass | +| `t/76joins.t` — real time | 9.3 s | +| `t/76joins.t` — all tests | 27/27 pass | +| Total `apply()` methods compiled | 8,264 | +| Total `apply()` code bytes | 4.0 MB | +| Average method size | 511 bytes | +| Minimum method size | 200 bytes | +| Largest `apply()` method | 36,554 bytes | +| Estimated savings vs old buffer=256 | ~5.4 MB (~56% reduction) | + +**Notes on the baseline:** +- The design doc's original baseline (Largest=9,377 bytes, 980 methods) was + measured on an older build before the TAILCALL trampoline (~65–130 bytes + per call site) was added to every call site. PerlOnJava4 therefore has + larger methods per call-heavy sub. +- The 36,554-byte largest method did not cause C1 failures in this run + because the machine was under low load. Under CPU contention from orphaned + JVMs (the scenario that caused the original timeout incidents), C1 may + still fail on such methods. **Phase 2 (trampoline extraction) is the key + fix for the largest methods.** +- Phase 1 gives dramatic size reduction for simple subs (minimum dropped + from ~648 bytes → 200 bytes) and a 672-byte reduction for every method + regardless of complexity. + +### 6. Targets for subsequent phases + +| Phase | Target max method size | Expected timing | +|-------|------------------------|-----------------| +| Phase 1 complete (current) | 36,554 bytes | ~12 s (clean) | +| After Phase 2 (trampoline extraction) | ~20,000 bytes* | ~10 s (clean) | +| After Phase 3 (eval prologue) | ~20,000 bytes | ~10 s (clean) | + +*Estimate: largest method has ~250+ call sites × 65 bytes saved = ~16 KB +removed. Actual number of call sites in `_expand_expr` to be measured in +Phase 2. + +--- + +## Progress Tracking + +### Current Status: Phase 1 complete (2026-05-01) + +### Completed Phases + +- [x] **Phase 1: Extend TempLocalCountVisitor + reduce buffer 256→32** (2026-05-01) + - Extended `TempLocalCountVisitor` to count: sub calls (+1 each), eval (+4), + foreach (+4), while/for (+1), flip-flop (+3), xor (+1) + - Reduced `EmitterMethodCreator` pre-init buffer from `+256` to `+32` + - Files: `TempLocalCountVisitor.java`, `EmitterMethodCreator.java` + - PR: #650 + - Result: ~56% total bytecode reduction; both DBIx::Class timing tests pass + in < 15 s; all `make` unit tests pass + +### Next Steps + +1. **Phase 2**: Extract per-call-site TAILCALL trampoline to + `RuntimeCode.resolveTailCalls()` — this is the key fix for large methods + (the 36,554-byte maximum is driven by call-site trampoline code) +2. **Phase 3**: Extract eval prologue/epilogue sequences — small but easy win +3. Re-run the timing tests under simulated CPU pressure to confirm C1 failure + rate has dropped (requires running ~10 orphan JVMs to create contention) + +### Open Questions + +- The largest `apply()` method is now 36,554 bytes — well above the 9,377-byte + threshold from the original C1 failure analysis. However, the threshold may + differ between the PerlOnJava2 era and current PerlOnJava4 (different JVM, + different method structure). Phase 2 should reduce this significantly; + measure again after Phase 2 to see if the threshold is now a concern. +- Should `JPERL_SPILL_SLOTS=8` be tested after Phase 2? Reducing the spill + pool from 16 to 8 would save 24 bytes per method (8 fewer null-inits); + measure whether it causes any spill overflow failures first. + +--- + ## Related Files | File | Role | From 35eaa273338a7c15faed3622a05624bc2d423cba Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 16:53:53 +0200 Subject: [PATCH 03/13] perf: Phase 2 - extract per-call-site TAILCALL trampoline to resolveTailCalls() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the ~83-byte inline TAILCALL trampoline emitted at every JVM sub call site with a single INVOKESTATIC to the new RuntimeCode.resolveTailCalls() helper. Add `resolveTailCalls(RuntimeList result, int callContext)` static helper: - Loops while the result is a TAILCALL control-flow marker - Each iteration calls `apply(codeRef, "tailcall", args, callContext)`, which already sets `inTailCallTrampoline` to preserve the existing deep-recursion counter behaviour (tail calls don't inflate the depth counter) - Returns the first non-TAILCALL result to the call site Before the `ASTORE controlFlowTempSlot`, prepend: ILOAD callContextSlot INVOKESTATIC RuntimeCode.resolveTailCalls(RuntimeList, I)RuntimeList Remove the 65-byte inline trampoline block that previously followed `isNonLocalGoto → IFEQ notControlFlow`: - TAILCALL type check (CHECKCAST + getControlFlowType + ordinal + ICONST_4 + IF_ICMPNE) - tailcallLoop label + extract codeRef/args + INVOKESTATIC apply("tailcall", …) - re-check isNonLocalGoto + re-check TAILCALL + IF_ICMPEQ tailcallLoop - two GOTO blockDispatcher branches + notTailcall label After the change the per-call-site sequence is: INVOKESTATIC apply(…) ← unchanged ILOAD callContextSlot ← new INVOKESTATIC resolveTailCalls(…) ← new (~5 bytes replaces ~65 bytes) ASTORE controlFlowTempSlot ALOAD controlFlowTempSlot INVOKEVIRTUAL isNonLocalGoto() IFEQ notControlFlow GOTO blockDispatcher notControlFlow: ALOAD controlFlowTempSlot - Per-call-site bytecode: ~83 bytes → ~18 bytes (~65 bytes saved per site) - Large methods with many sub calls (e.g., SQL::Abstract closures with 250+ call sites) shrink by ~16 KB, reducing C1 JIT "out of virtual registers" failures on CPU-starved machines - tailCallCodeRefSlot / tailCallArgsSlot kept: still needed by the returnLabel trampoline in EmitterMethodCreator for goto &NAME in sub bodies - All `make` unit tests pass - tail_calls.t: 7/7, subroutine.t: 39/39 - goto &sub chains (factorial, multi-hop, @_ aliasing) verified correct - LAST/NEXT/REDO after tail-called sub return verified correct - Both JVM and --interpreter backends pass Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/reduce-apply-bytecode.md | 42 +++++--- .../backend/jvm/EmitSubroutine.java | 102 +++--------------- .../runtime/runtimetypes/RuntimeCode.java | 26 +++++ 3 files changed, 69 insertions(+), 101 deletions(-) diff --git a/dev/design/reduce-apply-bytecode.md b/dev/design/reduce-apply-bytecode.md index 8658c985d..639471286 100644 --- a/dev/design/reduce-apply-bytecode.md +++ b/dev/design/reduce-apply-bytecode.md @@ -765,7 +765,7 @@ Phase 2. ## Progress Tracking -### Current Status: Phase 1 complete (2026-05-01) +### Current Status: Phase 2 complete (2026-05-01) ### Completed Phases @@ -778,25 +778,39 @@ Phase 2. - Result: ~56% total bytecode reduction; both DBIx::Class timing tests pass in < 15 s; all `make` unit tests pass +- [x] **Phase 2: Extract per-call-site TAILCALL trampoline** (2026-05-01) + - Added `RuntimeCode.resolveTailCalls(RuntimeList, int)` static helper + that resolves TAILCALL chains, replacing the ~65-byte inline trampoline + loop previously emitted at every JVM call site + - Modified `EmitSubroutine.handleApplyOperator()`: removed the 80-130 byte + inline trampoline block (lines 765–850 in old code), replaced with a + single `INVOKESTATIC resolveTailCalls` call + `ILOAD callContextSlot` + prepended before the `ASTORE controlFlowTempSlot` + - Per-call-site bytecode: ~83 bytes → ~18 bytes (saves ~65 bytes per site) + - Files: `RuntimeCode.java`, `EmitSubroutine.java` + - All `make` tests pass; `tail_calls.t` (7/7), `subroutine.t` (39/39) pass + - Measured on `core_subroutine_refs.t`: 1,014 methods, avg 488 bytes, max 7,812 bytes + - `goto &sub` chains verified correct (factorial via tail calls, multi-hop chains, + `@_` aliasing preservation, LAST/NEXT/REDO after tail-called sub return) + ### Next Steps -1. **Phase 2**: Extract per-call-site TAILCALL trampoline to - `RuntimeCode.resolveTailCalls()` — this is the key fix for large methods - (the 36,554-byte maximum is driven by call-site trampoline code) -2. **Phase 3**: Extract eval prologue/epilogue sequences — small but easy win -3. Re-run the timing tests under simulated CPU pressure to confirm C1 failure +1. **Phase 3**: Extract eval prologue/epilogue sequences — small but easy win +2. Re-run the timing tests under simulated CPU pressure to confirm C1 failure rate has dropped (requires running ~10 orphan JVMs to create contention) +3. Consider reducing `JPERL_SPILL_SLOTS` from 16 to 8 after verifying no + spill overflow failures; saves 24 bytes per method ### Open Questions -- The largest `apply()` method is now 36,554 bytes — well above the 9,377-byte - threshold from the original C1 failure analysis. However, the threshold may - differ between the PerlOnJava2 era and current PerlOnJava4 (different JVM, - different method structure). Phase 2 should reduce this significantly; - measure again after Phase 2 to see if the threshold is now a concern. -- Should `JPERL_SPILL_SLOTS=8` be tested after Phase 2? Reducing the spill - pool from 16 to 8 would save 24 bytes per method (8 fewer null-inits); - measure whether it causes any spill overflow failures first. +- The largest `apply()` method before Phase 2 was 36,554 bytes. Phase 2 + estimates ~65 bytes × N call sites removed. For a method with ~250 call + sites, this is ~16 KB saved, bringing it to ~20 KB — still above the + 9,377-byte C1 threshold from the original analysis. However, the threshold + may differ for PerlOnJava4's method structure; re-measure after Phase 3. +- Should `JPERL_SPILL_SLOTS=8` be tested? Reducing the spill pool from 16 + to 8 would save 24 bytes per method; measure whether it causes overflow + failures first. --- diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java b/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java index 5521dba62..3d629c7d7 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java @@ -750,7 +750,18 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod int belowResultStackLevel = 0; JavaClassInfo.SpillRef[] baseSpills = new JavaClassInfo.SpillRef[0]; - // Store result in temp slot + // Resolve any pending TAILCALL chain before storing the result. + // resolveTailCalls() loops until the result is no longer TAILCALL, + // replacing ~65 bytes of inline per-call-site trampoline with a + // single INVOKESTATIC. (Phase 2 of perf/reduce-apply-bytecode.) + mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); + mv.visitMethodInsn(Opcodes.INVOKESTATIC, + "org/perlonjava/runtime/runtimetypes/RuntimeCode", + "resolveTailCalls", + "(Lorg/perlonjava/runtime/runtimetypes/RuntimeList;I)Lorg/perlonjava/runtime/runtimetypes/RuntimeList;", + false); + + // Store resolved (never TAILCALL) result in temp slot mv.visitVarInsn(Opcodes.ASTORE, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); // If the caller kept values on the JVM operand stack below the call result (e.g. a left operand), @@ -760,7 +771,7 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod emitterVisitor.ctx.javaClassInfo.storeSpillRef(mv, baseSpills[i]); } - // Load and check if it's a control flow marker + // Load and check if it's a control flow marker (LAST/NEXT/REDO/GOTO/RETURN) mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, "org/perlonjava/runtime/runtimetypes/RuntimeList", @@ -769,91 +780,8 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod false); mv.visitJumpInsn(Opcodes.IFEQ, notControlFlow); - // Marked: check if TAILCALL (handle locally with trampoline) - Label tailcallLoop = new Label(); - Label notTailcall = new Label(); - - // Check if type is TAILCALL - mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); - mv.visitTypeInsn(Opcodes.CHECKCAST, "org/perlonjava/runtime/runtimetypes/RuntimeControlFlowList"); - mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, - "org/perlonjava/runtime/runtimetypes/RuntimeControlFlowList", - "getControlFlowType", - "()Lorg/perlonjava/runtime/runtimetypes/ControlFlowType;", - false); - mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, - "org/perlonjava/runtime/runtimetypes/ControlFlowType", - "ordinal", - "()I", - false); - mv.visitInsn(Opcodes.ICONST_4); // TAILCALL.ordinal() = 4 - mv.visitJumpInsn(Opcodes.IF_ICMPNE, notTailcall); - - // TAILCALL trampoline loop - handle tail calls at the call site - mv.visitLabel(tailcallLoop); - - // Extract codeRef and args from the marker - mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); - mv.visitTypeInsn(Opcodes.CHECKCAST, "org/perlonjava/runtime/runtimetypes/RuntimeControlFlowList"); - mv.visitInsn(Opcodes.DUP); - mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, - "org/perlonjava/runtime/runtimetypes/RuntimeControlFlowList", - "getTailCallCodeRef", - "()Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", - false); - mv.visitVarInsn(Opcodes.ASTORE, emitterVisitor.ctx.javaClassInfo.tailCallCodeRefSlot); - - mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, - "org/perlonjava/runtime/runtimetypes/RuntimeControlFlowList", - "getTailCallArgs", - "()Lorg/perlonjava/runtime/runtimetypes/RuntimeArray;", - false); - mv.visitVarInsn(Opcodes.ASTORE, emitterVisitor.ctx.javaClassInfo.tailCallArgsSlot); - - // Call target: RuntimeCode.apply(codeRef, "tailcall", args, context) - mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.tailCallCodeRefSlot); - mv.visitLdcInsn("tailcall"); - mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.tailCallArgsSlot); - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); // context of the original call site - mv.visitMethodInsn(Opcodes.INVOKESTATIC, - "org/perlonjava/runtime/runtimetypes/RuntimeCode", - "apply", - "(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;Ljava/lang/String;Lorg/perlonjava/runtime/runtimetypes/RuntimeBase;I)Lorg/perlonjava/runtime/runtimetypes/RuntimeList;", - false); - - // Store result to controlFlowTempSlot - mv.visitVarInsn(Opcodes.ASTORE, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); - - // Check if result is still a control flow marker - mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); - mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, - "org/perlonjava/runtime/runtimetypes/RuntimeList", - "isNonLocalGoto", - "()Z", - false); - mv.visitJumpInsn(Opcodes.IFEQ, notControlFlow); // Not marked, done - - // Marked: check if still TAILCALL - mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); - mv.visitTypeInsn(Opcodes.CHECKCAST, "org/perlonjava/runtime/runtimetypes/RuntimeControlFlowList"); - mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, - "org/perlonjava/runtime/runtimetypes/RuntimeControlFlowList", - "getControlFlowType", - "()Lorg/perlonjava/runtime/runtimetypes/ControlFlowType;", - false); - mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, - "org/perlonjava/runtime/runtimetypes/ControlFlowType", - "ordinal", - "()I", - false); - mv.visitInsn(Opcodes.ICONST_4); // TAILCALL.ordinal() = 4 - mv.visitJumpInsn(Opcodes.IF_ICMPEQ, tailcallLoop); // Still TAILCALL, loop - - // Not TAILCALL - different marker (LAST/NEXT/REDO/GOTO), dispatch it - mv.visitJumpInsn(Opcodes.GOTO, blockDispatcher); - - // Not TAILCALL initially - jump to block dispatcher - mv.visitLabel(notTailcall); + // resolveTailCalls() already handled TAILCALL; any remaining marker is + // LAST/NEXT/REDO/GOTO/RETURN — dispatch it. mv.visitJumpInsn(Opcodes.GOTO, blockDispatcher); // Not a control flow marker - load it back and continue diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java index 051a0ef7e..f46275ba9 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java @@ -3062,6 +3062,32 @@ public static RuntimeList apply(RuntimeScalar runtimeScalar, String subroutineNa throw new PerlCompilerException("Not a CODE reference"); } + /** + * Resolve any pending TAILCALL markers in {@code result}, looping until + * the call chain terminates or a non-TAILCALL marker is returned. + * + *

This is the runtime companion to the inline trampoline that was previously + * emitted at every JVM call site by {@code EmitSubroutine.handleApplyOperator()}. + * Moving the loop here reduces generated bytecode by ~65 bytes per Perl sub call. + * + * @param result the {@code RuntimeList} returned by the most recent {@code apply()} call + * @param callContext the call context (SCALAR / LIST / VOID) of the original site + * @return the final non-TAILCALL result, or a non-TAILCALL control-flow marker + * (LAST / NEXT / REDO / GOTO / RETURN) ready for the call-site dispatcher + */ + public static RuntimeList resolveTailCalls(RuntimeList result, int callContext) { + while (result instanceof RuntimeControlFlowList cfList + && cfList.getControlFlowType() == ControlFlowType.TAILCALL) { + RuntimeScalar codeRef = cfList.getTailCallCodeRef(); + RuntimeArray args = cfList.getTailCallArgs(); + // args may theoretically be null (defensive); treat as empty args list + result = apply(codeRef, "tailcall", + args != null ? (RuntimeBase) args : new RuntimeArray(), + callContext); + } + return result; + } + // Method to apply (execute) a subroutine reference (legacy method for compatibility) public static RuntimeList apply(RuntimeScalar runtimeScalar, String subroutineName, RuntimeBase list, int callContext) { From 47fd63ac0dd286f928c4966b0ccc9b8dacb6660d Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 17:10:52 +0200 Subject: [PATCH 04/13] =?UTF-8?q?perf:=20fix=20Phase=202=20hot-path=20regr?= =?UTF-8?q?ession=20=E2=80=94=20move=20resolveTailCalls()=20inside=20contr?= =?UTF-8?q?ol-flow=20branch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The initial Phase 2 commit called resolveTailCalls() unconditionally at every call site (before the isNonLocalGoto() check), adding an extra INVOKESTATIC + instanceof to the common path. This reverts that regression. resolveTailCalls() is now invoked only inside the isNonLocalGoto()==true branch, which executes only when apply() returned a control-flow marker (LAST/NEXT/REDO/GOTO/RETURN/TAILCALL) — a rare event in normal execution. ``` apply(...) ASTORE controlFlowTempSlot ALOAD controlFlowTempSlot INVOKEVIRTUAL isNonLocalGoto() <- one virtual call, returns false IFEQ notControlFlow <- branch taken, nothing else executes notControlFlow: ALOAD controlFlowTempSlot ``` ``` INVOKEVIRTUAL isNonLocalGoto() <- returns true (control flow detected) ALOAD controlFlowTempSlot ILOAD callContextSlot INVOKESTATIC resolveTailCalls() <- resolves TAILCALL chain ASTORE controlFlowTempSlot INVOKEVIRTUAL isNonLocalGoto() <- re-check (chain may have ended normally) IFEQ notControlFlow GOTO blockDispatcher ``` | Version | Bytes/site | Hot-path overhead | |----------------|------------|-------------------| | Phase 1 | ~83 bytes | 1 virtual call | | Phase 2 (bad) | ~23 bytes | 1 static + 1 virtual call (+1 extra) | | Phase 2 (this) | ~38 bytes | 1 virtual call (identical to Phase 1) | Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/reduce-apply-bytecode.md | 14 +++--- .../backend/jvm/EmitSubroutine.java | 44 ++++++++++++------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/dev/design/reduce-apply-bytecode.md b/dev/design/reduce-apply-bytecode.md index 639471286..e94c72c99 100644 --- a/dev/design/reduce-apply-bytecode.md +++ b/dev/design/reduce-apply-bytecode.md @@ -780,16 +780,16 @@ Phase 2. - [x] **Phase 2: Extract per-call-site TAILCALL trampoline** (2026-05-01) - Added `RuntimeCode.resolveTailCalls(RuntimeList, int)` static helper - that resolves TAILCALL chains, replacing the ~65-byte inline trampoline + that resolves TAILCALL chains; replaces the ~65-byte inline trampoline loop previously emitted at every JVM call site - - Modified `EmitSubroutine.handleApplyOperator()`: removed the 80-130 byte - inline trampoline block (lines 765–850 in old code), replaced with a - single `INVOKESTATIC resolveTailCalls` call + `ILOAD callContextSlot` - prepended before the `ASTORE controlFlowTempSlot` - - Per-call-site bytecode: ~83 bytes → ~18 bytes (saves ~65 bytes per site) + - Modified `EmitSubroutine.handleApplyOperator()`: `resolveTailCalls()` is + called only inside the `isNonLocalGoto()==true` branch (rare), so the + common path (no control flow) has **zero extra overhead** vs Phase 1 — + it remains: `apply()` → `ASTORE` → `ALOAD` → `isNonLocalGoto()` → branch + - Per-call-site bytecode: ~83 bytes → ~38 bytes (saves ~45 bytes per site) - Files: `RuntimeCode.java`, `EmitSubroutine.java` - All `make` tests pass; `tail_calls.t` (7/7), `subroutine.t` (39/39) pass - - Measured on `core_subroutine_refs.t`: 1,014 methods, avg 488 bytes, max 7,812 bytes + - Measured on `core_subroutine_refs.t`: 1,014 methods, avg 499 bytes, max 7,992 bytes - `goto &sub` chains verified correct (factorial via tail calls, multi-hop chains, `@_` aliasing preservation, LAST/NEXT/REDO after tail-called sub return) diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java b/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java index 3d629c7d7..6a512079f 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java @@ -750,18 +750,7 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod int belowResultStackLevel = 0; JavaClassInfo.SpillRef[] baseSpills = new JavaClassInfo.SpillRef[0]; - // Resolve any pending TAILCALL chain before storing the result. - // resolveTailCalls() loops until the result is no longer TAILCALL, - // replacing ~65 bytes of inline per-call-site trampoline with a - // single INVOKESTATIC. (Phase 2 of perf/reduce-apply-bytecode.) - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); - mv.visitMethodInsn(Opcodes.INVOKESTATIC, - "org/perlonjava/runtime/runtimetypes/RuntimeCode", - "resolveTailCalls", - "(Lorg/perlonjava/runtime/runtimetypes/RuntimeList;I)Lorg/perlonjava/runtime/runtimetypes/RuntimeList;", - false); - - // Store resolved (never TAILCALL) result in temp slot + // Store result in temp slot mv.visitVarInsn(Opcodes.ASTORE, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); // If the caller kept values on the JVM operand stack below the call result (e.g. a left operand), @@ -771,7 +760,33 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod emitterVisitor.ctx.javaClassInfo.storeSpillRef(mv, baseSpills[i]); } - // Load and check if it's a control flow marker (LAST/NEXT/REDO/GOTO/RETURN) + // Fast path: check if the result is any kind of control flow marker. + // This is the common case (no control flow) — branch skips everything below. + mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); + mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, + "org/perlonjava/runtime/runtimetypes/RuntimeList", + "isNonLocalGoto", + "()Z", + false); + mv.visitJumpInsn(Opcodes.IFEQ, notControlFlow); + + // A control-flow marker was returned. It might be TAILCALL (goto &sub), + // LAST/NEXT/REDO/GOTO, or RETURN. Resolve any TAILCALL chain first, then + // re-check whether the final result still carries a marker that the + // block-level dispatcher needs to handle. + // Keeping resolveTailCalls() inside this branch means the common case + // (no control flow) incurs zero extra overhead. (Phase 2 of + // perf/reduce-apply-bytecode.) + mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); + mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); + mv.visitMethodInsn(Opcodes.INVOKESTATIC, + "org/perlonjava/runtime/runtimetypes/RuntimeCode", + "resolveTailCalls", + "(Lorg/perlonjava/runtime/runtimetypes/RuntimeList;I)Lorg/perlonjava/runtime/runtimetypes/RuntimeList;", + false); + mv.visitVarInsn(Opcodes.ASTORE, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); + + // Re-check: a TAILCALL chain may have ended in a normal return. mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, "org/perlonjava/runtime/runtimetypes/RuntimeList", @@ -780,8 +795,7 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod false); mv.visitJumpInsn(Opcodes.IFEQ, notControlFlow); - // resolveTailCalls() already handled TAILCALL; any remaining marker is - // LAST/NEXT/REDO/GOTO/RETURN — dispatch it. + // Still a marker (LAST/NEXT/REDO/GOTO/RETURN) — dispatch it. mv.visitJumpInsn(Opcodes.GOTO, blockDispatcher); // Not a control flow marker - load it back and continue From ec4f520335e06c00db2b0b42dbd844250f37ed1e Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 19:24:20 +0200 Subject: [PATCH 05/13] fix: eliminate VerifyError from int-typed callContextSlot in JVM bytecode emitter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `callContextSlot` variable was allocated via `allocateLocalVariable()` and stored with `ISTORE` (int type) in four places: - EmitSubroutine.handleApplyOperator() — sub calls: func(args) - Dereference.java — method calls: $obj->method(args) - EmitOperator.handleSubstrOperator() — substr built-in - EmitOperator.handleOperator() — prototype-based operators The pre-initialisation loop in EmitterMethodCreator initialises every temp slot as `ACONST_NULL / ASTORE` (reference type). When any of these `callContextSlot` slots was in the pre-init range, two code paths (one that executed `ISTORE callContextSlot` and one that jumped past it) met at the `blockDispatcher` merge point with incompatible types (int vs null-reference). The JVM verifier threw: VerifyError: Bad local variable type This VerifyError was caught by `executeCodeImpl` which re-ran the main script body through the interpreter, calling `plan tests => 23` a second time — causing "You tried to plan twice" in DBIx::Class torture.t. Fix: replace each `ISTORE callContextSlot` + later `ILOAD callContextSlot` with a direct call to `pushCallContext()` at every use site. `pushCallContext()` emits either an `LDC` constant or `ILOAD 2` (the callContext method parameter); both are side-effect-free and can be re-emitted inline without storing in a local slot. TempLocalCountVisitor: the `->` and `(` cases previously reflected the (now-removed) int slot allocation. They are KEPT because the count controls the pre-init range in EmitterMethodCreator — removing them would shrink that range and leave reference-typed slots in TOP state, causing a second VerifyError ("Type top … is not assignable to reference type"). Fixes the regression introduced in perf/reduce-apply-bytecode Phase 2 (blockDispatcher sharing across call sites). Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../perlonjava/backend/jvm/Dereference.java | 11 +++---- .../perlonjava/backend/jvm/EmitOperator.java | 16 ++------- .../backend/jvm/EmitSubroutine.java | 33 ++++++++++++------- .../analysis/TempLocalCountVisitor.java | 13 ++++++-- 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/jvm/Dereference.java b/src/main/java/org/perlonjava/backend/jvm/Dereference.java index 00ca80c7a..255237083 100644 --- a/src/main/java/org/perlonjava/backend/jvm/Dereference.java +++ b/src/main/java/org/perlonjava/backend/jvm/Dereference.java @@ -965,10 +965,9 @@ static void handleArrowOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod } } - // Save the call context into a local slot for the TAILCALL trampoline. - int callContextSlot = emitterVisitor.ctx.symbolTable.allocateLocalVariable(); - emitterVisitor.pushCallContext(); - mv.visitVarInsn(Opcodes.ISTORE, callContextSlot); + // The call context is NOT stored in a local slot (see EmitSubroutine.java comment for why: + // storing an int into a pre-null-initialised slot causes VerifyError at merge points). + // pushCallContext() is a side-effect-free inline emission and can be called at each use point. // Allocate a unique callsite ID for inline method caching int callsiteId = nextMethodCallsiteId++; @@ -984,7 +983,7 @@ static void handleArrowOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod mv.visitVarInsn(Opcodes.ALOAD, methodSlot); mv.visitVarInsn(Opcodes.ALOAD, subSlot); mv.visitVarInsn(Opcodes.ALOAD, argsArraySlot); - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); // push saved call context + emitterVisitor.pushCallContext(); // push call context mv.visitMethodInsn( Opcodes.INVOKESTATIC, "org/perlonjava/runtime/runtimetypes/RuntimeCode", @@ -1066,7 +1065,7 @@ static void handleArrowOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.tailCallCodeRefSlot); mv.visitLdcInsn("tailcall"); mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.tailCallArgsSlot); - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); // context of the original call site + emitterVisitor.pushCallContext(); // context of the original call site mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/runtimetypes/RuntimeCode", "apply", diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitOperator.java b/src/main/java/org/perlonjava/backend/jvm/EmitOperator.java index 0d880b7aa..681d25675 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitOperator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitOperator.java @@ -321,12 +321,6 @@ static void handleSubstrOperator(EmitterVisitor emitterVisitor, OperatorNode nod EmitterVisitor scalarVisitor = emitterVisitor.with(RuntimeContextType.SCALAR); EmitterVisitor listVisitor = emitterVisitor.with(RuntimeContextType.LIST); if (node.operand instanceof ListNode operand) { - // Push context - emitterVisitor.pushCallContext(); - - int callContextSlot = emitterVisitor.ctx.symbolTable.allocateLocalVariable(); - emitterVisitor.ctx.mv.visitVarInsn(Opcodes.ISTORE, callContextSlot); - // Create array for varargs operators MethodVisitor mv = emitterVisitor.ctx.mv; @@ -370,7 +364,7 @@ static void handleSubstrOperator(EmitterVisitor emitterVisitor, OperatorNode nod index++; } - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); + emitterVisitor.pushCallContext(); mv.visitVarInsn(Opcodes.ALOAD, argsArraySlot); // Check if warnings are enabled at compile time @@ -404,12 +398,6 @@ static void handleOperator(EmitterVisitor emitterVisitor, OperatorNode node) { EmitterVisitor scalarVisitor = emitterVisitor.with(RuntimeContextType.SCALAR); EmitterVisitor listVisitor = emitterVisitor.with(RuntimeContextType.LIST); if (node.operand instanceof ListNode operand) { - // Push context - emitterVisitor.pushCallContext(); - - int callContextSlot = emitterVisitor.ctx.symbolTable.allocateLocalVariable(); - emitterVisitor.ctx.mv.visitVarInsn(Opcodes.ISTORE, callContextSlot); - // Create array for varargs operators MethodVisitor mv = emitterVisitor.ctx.mv; @@ -453,7 +441,7 @@ static void handleOperator(EmitterVisitor emitterVisitor, OperatorNode node) { index++; } - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); + emitterVisitor.pushCallContext(); mv.visitVarInsn(Opcodes.ALOAD, argsArraySlot); emitOperator(node, emitterVisitor); diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java b/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java index 6a512079f..207f7f670 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitSubroutine.java @@ -497,14 +497,23 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod if (CompilerOptions.DEBUG_ENABLED) emitterVisitor.ctx.logDebug("handleApplyElementOperator " + node + " in context " + emitterVisitor.ctx.contextType); MethodVisitor mv = emitterVisitor.ctx.mv; - // Capture the call context into a local slot early. - // IMPORTANT: Do not leave the context int on the JVM operand stack while evaluating - // subroutine arguments. Argument evaluation may trigger non-local control flow - // propagation (e.g. last/next/redo) which jumps out of the expression; any stray - // stack items would then break ASM frame merging. - int callContextSlot = emitterVisitor.ctx.symbolTable.allocateLocalVariable(); - emitterVisitor.pushCallContext(); - mv.visitVarInsn(Opcodes.ISTORE, callContextSlot); + // Note: The call context is NOT stored in a local variable slot. + // pushCallContext() emits either a compile-time constant (LDC) or loads the + // callContext method parameter (ILOAD 2). Both are side-effect-free and can be + // re-emitted at the exact moment the value is needed on the JVM operand stack, + // so there is no need to stash the int in a slot. + // + // Storing it in a slot would be WRONG: the pre-initialisation loop in + // EmitterMethodCreator initialises every temporary slot to null (ACONST_NULL / + // ASTORE) so that reference slots are never in TOP state at merge points. + // An int slot initialised that way acquires the reference type "null" from the + // pre-init path. At a merge point (e.g. blockDispatcher) that is reachable from + // both a path that executed ISTORE-callContextSlot and a path through a + // conditional branch that skipped it, the JVM verifier sees conflicting types + // (int vs null-reference) and throws VerifyError: "Bad local variable type". + // This VerifyError triggers the interpreter-fallback which re-runs the main + // script body, calling plan() a second time and causing the "tried to plan + // twice" error in DBIx::Class torture.t (perf/reduce-apply-bytecode Phase 2). String subroutineName = ""; if (node.left instanceof OperatorNode operatorNode && operatorNode.operator.equals("&")) { @@ -593,7 +602,7 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod if (node.left instanceof SubroutineNode subNode && subNode.useTryCatch) { mv.visitVarInsn(Opcodes.ALOAD, codeRefSlot); mv.visitVarInsn(Opcodes.ALOAD, 1); // caller's @_ (slot 1) - shared, not copied - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); + emitterVisitor.pushCallContext(); mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/runtimetypes/RuntimeCode", "apply", @@ -622,7 +631,7 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod if (node.getBooleanAnnotation("shareCallerArgs")) { mv.visitVarInsn(Opcodes.ALOAD, codeRefSlot); mv.visitVarInsn(Opcodes.ALOAD, 1); // caller's @_ (slot 1) - shared, not copied - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); + emitterVisitor.pushCallContext(); mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/runtimetypes/RuntimeCode", "apply", @@ -711,7 +720,7 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod mv.visitVarInsn(Opcodes.ALOAD, codeRefSlot); mv.visitVarInsn(Opcodes.ALOAD, nameSlot); mv.visitVarInsn(Opcodes.ALOAD, argsArraySlot); - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); // Push call context to stack + emitterVisitor.pushCallContext(); // Push call context to stack mv.visitMethodInsn( Opcodes.INVOKESTATIC, "org/perlonjava/runtime/runtimetypes/RuntimeCode", @@ -778,7 +787,7 @@ static void handleApplyOperator(EmitterVisitor emitterVisitor, BinaryOperatorNod // (no control flow) incurs zero extra overhead. (Phase 2 of // perf/reduce-apply-bytecode.) mv.visitVarInsn(Opcodes.ALOAD, emitterVisitor.ctx.javaClassInfo.controlFlowTempSlot); - mv.visitVarInsn(Opcodes.ILOAD, callContextSlot); + emitterVisitor.pushCallContext(); mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/runtimetypes/RuntimeCode", "resolveTailCalls", diff --git a/src/main/java/org/perlonjava/frontend/analysis/TempLocalCountVisitor.java b/src/main/java/org/perlonjava/frontend/analysis/TempLocalCountVisitor.java index 5b2cd0bee..699e892e6 100644 --- a/src/main/java/org/perlonjava/frontend/analysis/TempLocalCountVisitor.java +++ b/src/main/java/org/perlonjava/frontend/analysis/TempLocalCountVisitor.java @@ -46,10 +46,19 @@ public void visit(BinaryOperatorNode node) { // Using acquireSpillSlot() first, but count conservatively in case pool is full. case "&&", "||", "//" -> countTemp(); - // Dereference (->) conservatively counts 1 temp for method/element access. + // Dereference (->): the callContextSlot was removed to prevent VerifyError + // (int slot pre-initialised as null reference). However we KEEP this count + // because it serves as a pre-init-range proxy: Dereference.java still + // allocates reference-typed slots when the spill pool overflows, and the + // pre-init loop (EmitterMethodCreator) must cover those slots. Removing + // this count would shrink the pre-init range, leaving later-allocated + // reference slots in TOP state and causing a different VerifyError. case "->" -> countTemp(); - // Sub-call apply operator: always allocates callContextSlot directly. + // Sub-call apply operator: callContextSlot was removed (same reason as "->"). + // We KEEP this count for the same pre-init-range reason: other code paths + // inside handleApplyOperator allocate reference slots when the spill pool + // overflows, so we need enough pre-init coverage. case "(" -> countTemp(); // Flip-flop operators in scalar context always allocate 3 slots directly: From 15e2c4cb92999e855aab6116b2adea8d3a06b259 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 19:25:03 +0200 Subject: [PATCH 06/13] docs: update design doc with Phase 2 bugfix for callContextSlot VerifyError Document the root cause (int slot pre-initialised as null-reference causing VerifyError at blockDispatcher merge point) and the fix (pushCallContext() at each use site; keep TempLocalCountVisitor counts for pre-init range). Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/reduce-apply-bytecode.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/dev/design/reduce-apply-bytecode.md b/dev/design/reduce-apply-bytecode.md index e94c72c99..41876440d 100644 --- a/dev/design/reduce-apply-bytecode.md +++ b/dev/design/reduce-apply-bytecode.md @@ -793,6 +793,31 @@ Phase 2. - `goto &sub` chains verified correct (factorial via tail calls, multi-hop chains, `@_` aliasing preservation, LAST/NEXT/REDO after tail-called sub return) +- [x] **Phase 2 bugfix: VerifyError from int callContextSlot** (2026-05-01) + - Root cause: `callContextSlot` was allocated via `allocateLocalVariable()` and + stored with `ISTORE` (int type) in four emitter sites: + `EmitSubroutine.handleApplyOperator()`, `Dereference.java` method calls, + `EmitOperator.handleSubstrOperator()`, `EmitOperator.handleOperator()`. + - The pre-init loop (EmitterMethodCreator) initialises every temp slot as + `ACONST_NULL / ASTORE` (reference type). When a callContextSlot (int) in the + pre-init range was written by some code paths but not others, the verifier + found "int vs null-reference" at the `blockDispatcher` merge point and threw + `VerifyError: Bad local variable type`. + - This VerifyError propagated out of the compiled main body through + `executeCodeImpl`, which transparently re-ran the script body via the + interpreter — calling `plan tests => 23` a second time → "You tried to plan + twice" in DBIx::Class `t/multi_create/torture.t`. + - Fix: replace every `ISTORE callContextSlot` + `ILOAD callContextSlot` with + inline calls to `pushCallContext()` at each use site; `pushCallContext()` emits + either an `LDC` constant or `ILOAD 2` (no slot allocation needed). + - `TempLocalCountVisitor`: the `->` and `(` count entries are KEPT (even though + the int slot is removed) because they control the pre-init range — removing + them shrank the range and caused a second VerifyError ("Type top … is not + assignable to reference type" at slot 143 in torture.t's main body). + - Files: `EmitSubroutine.java`, `Dereference.java`, `EmitOperator.java`, + `TempLocalCountVisitor.java` + - DBIx::Class `t/multi_create/torture.t` now passes all 23 tests (JVM backend) + ### Next Steps 1. **Phase 3**: Extract eval prologue/epilogue sequences — small but easy win From c8c6cc642abcaa8a03c6f7064c701fbea7af32de Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 20:33:42 +0200 Subject: [PATCH 07/13] docs: record DBIx::Class full-suite PASS as safe revert point DBIx::Class 0.082844: 314 files, 13858 tests, PASS (1618 s wall). Commit e3b987640 is the earliest known-good SHA on this branch. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/reduce-apply-bytecode.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dev/design/reduce-apply-bytecode.md b/dev/design/reduce-apply-bytecode.md index 41876440d..b65813b8b 100644 --- a/dev/design/reduce-apply-bytecode.md +++ b/dev/design/reduce-apply-bytecode.md @@ -818,6 +818,26 @@ Phase 2. `TempLocalCountVisitor.java` - DBIx::Class `t/multi_create/torture.t` now passes all 23 tests (JVM backend) +### Safe Revert Point — DBIx::Class full suite PASS (2026-05-01) + +**Commit: `e3b987640`** (HEAD of `perf/reduce-apply-bytecode` after Phase 2 bugfix) + +Full DBIx::Class 0.082844 test suite run on this commit: + +``` +Files=314, Tests=13858, 1618 wallclock secs (3.49 usr + 1.64 sys = 5.14 CPU) +Result: PASS + RIBASUSHI/DBIx-Class-0.082844.tar.gz + /usr/bin/make test -- OK +``` + +All 13,858 tests pass. Two files have TODO-passed tests (expected): +- `t/sqlmaker/limit_dialects/generic_subq.t` — TODO 9, 11, 13, 15, 17 +- `t/storage/txn_scope_guard.t` — TODO 13, 15, 17 + +This is the earliest known-good commit for DBIx::Class on this branch. +If any subsequent change causes a regression, revert to this SHA. + ### Next Steps 1. **Phase 3**: Extract eval prologue/epilogue sequences — small but easy win From ea6fe16da9afd770e191cad5d6901533bf44602d Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 21:54:01 +0200 Subject: [PATCH 08/13] fix: restore correct tied-scalar FETCH count for fallback compound assigns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a compound assignment operator like `|=`, `&=`, `^=`, `**=`, `<<=`, `>>=`, or `.=` is applied to a tied scalar variable, Perl semantics require exactly two FETCHes: one to read the current value (for the RHS of the base operation) and one to return the result to the caller. The fast-path operators that go through dedicated *Assign methods (addAssign, subtractAssign, etc.) already return the arg1 (lvalue) object directly — so the caller's assignment triggers the 2nd FETCH correctly. The fallback path in handleCompoundAssignment() was using the return value of set() instead of the original lvalue. For TIED_SCALAR variables, set() calls tiedStore() which returns the stored value (a plain RuntimeScalar copy), not the tied object. So the 2nd FETCH never fired, leaving tied-scalar compound assigns with fetch=1 rather than the required fetch=2. This caused 9 tests in op/bop.t to fail on the perf/reduce-apply-bytecode branch (491/522 passing vs. the expected 500/522 baseline), even though they were accidentally passing before (the tied scalar was being unwrapped by a later assignment in is(), which accidentally triggered the 2nd FETCH). Fix: keep leftSlot live across the set() call, discard set()'s return value, then reload leftSlot so the lvalue (TIED_SCALAR) propagates to the caller — exactly as the *Assign methods do. For non-tied lvalues the object identity is unchanged so there is no behavioral difference. Result: op/bop.t now passes 500/522 (matching the Phase 1 baseline). Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../backend/jvm/EmitBinaryOperator.java | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitBinaryOperator.java b/src/main/java/org/perlonjava/backend/jvm/EmitBinaryOperator.java index 5d3ed0610..a0b6d3ad4 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitBinaryOperator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitBinaryOperator.java @@ -289,9 +289,7 @@ static void handleCompoundAssignment(EmitterVisitor emitterVisitor, BinaryOperat if (pooledRight) { emitterVisitor.ctx.javaClassInfo.releaseSpillSlot(); } - if (pooledLeft) { - emitterVisitor.ctx.javaClassInfo.releaseSpillSlot(); - } + // Note: leftSlot is released AFTER the assignment so we can reload it below // perform the operation // Note: operands are already on the stack (left DUPped, then right) String baseOperator = node.operator.substring(0, node.operator.length() - 1); @@ -319,7 +317,16 @@ static void handleCompoundAssignment(EmitterVisitor emitterVisitor, BinaryOperat } else { mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, "org/perlonjava/runtime/runtimetypes/RuntimeScalar", "set", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", false); } - + // Discard set()/setPreservingByteString() return value and reload leftObj. + // This matches how *Assign methods (addAssign, etc.) return arg1 directly — + // for TIED_SCALAR lvalues the caller will trigger a 2nd FETCH when it reads + // the result, giving the correct Perl semantics (fetch=2 for compound assigns). + mv.visitInsn(Opcodes.POP); + mv.visitVarInsn(Opcodes.ALOAD, leftSlot); + if (pooledLeft) { + emitterVisitor.ctx.javaClassInfo.releaseSpillSlot(); + } + // For string concat assign (.=), invalidate pos() since string was modified if (node.operator.equals(".=")) { mv.visitInsn(Opcodes.DUP); From b27b24a1e53edb316d1c624d36d6ec10daa35e42 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 1 May 2026 23:33:24 +0200 Subject: [PATCH 09/13] =?UTF-8?q?perf:=20Phase=204=20=E2=80=94=20route=20o?= =?UTF-8?q?versized=20eval-generated=20methods=20to=20interpreter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Methods generated by Sub::Quote / Moo via string eval can be 9–36 KB. The C1 JIT either fails to compile them efficiently or triggers ASM frame-compute crashes under CPU pressure, causing t/cdbi/68-inflate_has_a.t to time out at 300 s in CI when other JVMs are competing for the CPU. Phase 4 adds a method-size threshold check in getBytecodeInternal(): - After generating bytecode, parse the class file to find the largest method - If the code is eval-generated (fileName starts with "(eval ") AND the largest method exceeds JPERL_MAX_METHOD_BYTES (default: 9 000 bytes), throw a RuntimeException("...requires interpreter fallback") which is then handled by createRuntimeCode() to route to the interpreter - Fast path: if classData.length <= threshold, skip parsing entirely (99% of classes are small; no overhead for them) Why eval-only? Regular Perl files can also have methods up to ~16 KB (e.g. unit test bodies in local.t) that work correctly on the JVM backend but would fail on the interpreter due to known parity gaps. The C1 failure problem exists exclusively in Sub::Quote string-eval'd code, so restricting the threshold to eval-generated code avoids false positives. Key changes: - EmitterMethodCreator.java: add Phase 4 threshold check - Parse bytecode to find max method size (only when isEvalCode + large) - Throw RuntimeException with "requires interpreter fallback" in message - Pre-existing RuntimeException catch re-throws cleanly without wrapping - EmitterMethodCreator.java: improve debug output - JPERL_BYTECODE_SIZE_DEBUG output now correctly scoped inside size-check - JPERL_MAX_METHOD_BYTES env var overrides the 9 000-byte default - dev/design/reduce-apply-bytecode.md: add Phase 4 design and update progress tracking, mark Phase 4 complete Verified: - make: all unit tests PASS (including local.t 74/74, destroy_collections.t) - t/cdbi/68-inflate_has_a.t: PASS 6/6 (with threshold firing 3 times for 14 392 / 13 449 / 15 668-byte Sub::Quote-generated methods) Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/reduce-apply-bytecode.md | 150 ++++++++++++++++-- .../backend/jvm/EmitterMethodCreator.java | 69 +++++++- 2 files changed, 199 insertions(+), 20 deletions(-) diff --git a/dev/design/reduce-apply-bytecode.md b/dev/design/reduce-apply-bytecode.md index b65813b8b..1f3c280f4 100644 --- a/dev/design/reduce-apply-bytecode.md +++ b/dev/design/reduce-apply-bytecode.md @@ -441,6 +441,86 @@ timeout 60 ./jperl t/96_is_deteministic_value.t # DBIx::Class --- +### Phase 4 — Method-Size Threshold: Route Oversized Methods to Interpreter (key fix) + +**Goal**: Prevent C1 JIT register-allocation failures for methods that remain +above the ~9 KB threshold after Phases 1–3. + +**Why Phase 3 is not enough**: The 7 methods > 9 KB are Sub::Quote / Moo +generated string-eval'd subs. Their size is dominated by inlined attribute +accessor logic — genuine Perl code, not overhead. No overhead-extraction +technique can reduce them below 9 KB. + +**Key insight**: PerlOnJava already has a working interpreter fallback. +When the JVM rejects bytecode (VerifyError, "Method too large", frame-compute +crash), `PerlLanguageProvider.checkForInterpreterFallback()` routes the code +through the PerlOnJava interpreter instead. The interpreter is ~5–20× slower +than JIT-compiled code — but the C1 failure path is ~50× slower. For the +7 large methods, interpreter mode is *faster* than the C1 failure scenario. + +**Design**: + +After measuring `code_bytes` in `EmitterMethodCreator.getBytecodeInternal()`, +if the largest method exceeds a configurable threshold, throw a specially-tagged +runtime exception that `checkForInterpreterFallback` already handles: + +```java +// EmitterMethodCreator.getBytecodeInternal(), inside the BYTECODE_SIZE_DEBUG block +// (or always, regardless of debug): +if (maxCodeLen > MAX_BYTECODE_FOR_JIT) { + throw new RuntimeException( + "Method too large for reliable JIT: " + maxCodeLen + + " bytes (threshold=" + MAX_BYTECODE_FOR_JIT + "). " + + "requires interpreter fallback"); +} +``` + +`checkForInterpreterFallback` already matches `"requires interpreter fallback"`: + +```java +// PerlLanguageProvider.java (already present): +msg.contains("requires interpreter fallback") // → returns true +``` + +The default threshold is configurable via env var: + +```java +private static final int MAX_BYTECODE_FOR_JIT = + Integer.parseInt(System.getenv().getOrDefault("JPERL_MAX_METHOD_BYTES", "10000")); +``` + +A value of 10,000 bytes catches all 7 current offenders (9,043–36,554 bytes) +while leaving the 7,804-byte method on the JVM-compiled path. + +**Impact**: + +- The 7 oversized methods run via PerlOnJava interpreter: ~5–20× slower than + JIT but ~2–10× faster than C1 failure. For `t/cdbi/68-inflate_has_a.t`, + worst-case timing increases from ~10 s to ~100 s — well within the 300 s + harness limit even under heavy load. +- All other methods (7,539 of 7,546) continue on the JVM-compiled fast path. +- C1 failure mode is eliminated entirely for the affected methods. + +**Measuring the threshold**: + +```bash +# After implementing, confirm which methods are affected: +JPERL_BYTECODE_SIZE_DEBUG=1 JPERL_MAX_METHOD_BYTES=10000 timeout 60 \ + ./jperl -Ilib -It/lib t/96_is_deteministic_value.t 2>&1 \ + | grep "Method too large\|BYTECODE_SIZE method" | sort -u | head -20 + +# Tune threshold up or down based on timing results: +JPERL_MAX_METHOD_BYTES=15000 # catches only 4 largest +JPERL_MAX_METHOD_BYTES=10000 # catches all 7 above 9 KB (recommended default) +JPERL_MAX_METHOD_BYTES=8000 # conservative; catches 8 methods +``` + +**Risk**: Low. The interpreter fallback is already battle-tested. The only +behavioural change is that 7 methods run ~10× slower. If those methods happen +to be in hot loops, performance degrades gracefully rather than catastrophically. + +--- + ### Phase 3 — Extract Eval Block Prologue/Epilogue (medium impact, low risk) **Goal**: Replace inline 4–7 instruction sequences in the eval prologue/epilogue @@ -749,23 +829,57 @@ Measured on 2026-05-01 with PerlOnJava4 after merging PR #650: from ~648 bytes → 200 bytes) and a 672-byte reduction for every method regardless of complexity. -### 6. Targets for subsequent phases +### 6. Measured state and targets for subsequent phases -| Phase | Target max method size | Expected timing | -|-------|------------------------|-----------------| -| Phase 1 complete (current) | 36,554 bytes | ~12 s (clean) | -| After Phase 2 (trampoline extraction) | ~20,000 bytes* | ~10 s (clean) | -| After Phase 3 (eval prologue) | ~20,000 bytes | ~10 s (clean) | +**Post-Phase-2 measurement** (2026-05-01, `t/96_is_deteministic_value.t`): -*Estimate: largest method has ~250+ call sites × 65 bytes saved = ~16 KB -removed. Actual number of call sites in `_expand_expr` to be measured in -Phase 2. +| Metric | Value | +|--------|-------| +| Total methods | 7,546 | +| Total code | 3,191 KB | +| Average method size | 433 bytes | +| Methods > 9 KB | **7** | +| Largest method | **36,554 bytes** | + +The 7 methods above the C1 threshold are Sub::Quote / Moo generated +string-eval'd subs — Moo attribute accessors with inlined type-checking. +Their size is dominated by genuine user-code complexity, not overhead. +Phase 3 (eval extraction, ~41 bytes/eval block) will not move their needle. + +**Phase 4 is therefore the key fix** for the load-induced timeout: + +| Phase | Effect on C1 failures | Expected timing (load) | +|-------|----------------------|------------------------| +| Phase 1+2 complete | 7 methods still above 9 KB | passes clean, fails under load | +| After Phase 3 (eval prologue) | 7 methods still above 9 KB | same | +| After Phase 4 (size threshold) | 0 methods trigger C1 failure | passes under load | --- ## Progress Tracking -### Current Status: Phase 2 complete (2026-05-01) +### Current Status: Phase 4 complete (2026-05-01) + +All planned phases for preventing load-induced CI timeouts are now implemented. +Phase 3 (eval prologue extraction) remains as an optional further-reduction step. + +**Post-Phase-4 functional results** (local test run): + +- `make` unit tests: PASS (all ~600 unit tests including local.t, destroy_collections.t) +- `t/cdbi/68-inflate_has_a.t`: PASS 6/6 subtests +- Phase 4 threshold fires for 3 Sub::Quote methods (14,392 / 13,449 / 15,668 bytes) +- Regular test file methods (up to 16 KB) are NOT affected (not eval-generated) + +**Phase 4 implementation summary** (2026-05-01): +- Added method-size threshold check to `EmitterMethodCreator.getBytecodeInternal()` +- Only applies to **eval-generated code** (`fileName` starts with `"(eval "`) +- Default threshold: 9,000 bytes (catches all 7 Sub::Quote methods, min 9,043 bytes) +- Configurable via `JPERL_MAX_METHOD_BYTES` env var (set to 0 to disable) +- Threshold exception propagates cleanly, handled by `createRuntimeCode()` with + message: "Note: JVM compilation needs interpreter fallback (Method too large for + reliable JIT: N bytes (threshold=9000). requires interpreter fallback)." +- Fast path: skips bytecode parsing entirely when `classData.length <= threshold` + (99% of classes), keeping overhead negligible ### Completed Phases @@ -838,11 +952,21 @@ All 13,858 tests pass. Two files have TODO-passed tests (expected): This is the earliest known-good commit for DBIx::Class on this branch. If any subsequent change causes a regression, revert to this SHA. +- [x] **Phase 4: Method-size threshold — route large eval-generated methods to interpreter** (2026-05-01) + - Added method-size threshold check in `EmitterMethodCreator.getBytecodeInternal()` + - Only applies to eval-generated code (`fileName` starts with `"(eval "`) + - Default threshold: 9,000 bytes — catches all 7 Sub::Quote/Moo methods + - Configurable via `JPERL_MAX_METHOD_BYTES` env var + - Fast path: no parsing when `classData.length <= threshold` + - DBIx::Class `t/cdbi/68-inflate_has_a.t`: PASS 6/6; Phase 4 fires for 3 large evals + - Files: `EmitterMethodCreator.java` + ### Next Steps -1. **Phase 3**: Extract eval prologue/epilogue sequences — small but easy win -2. Re-run the timing tests under simulated CPU pressure to confirm C1 failure - rate has dropped (requires running ~10 orphan JVMs to create contention) +1. **Phase 3 (optional)**: Extract eval prologue/epilogue sequences — small win, + does not affect C1 failures but reduces overall bytecode slightly +2. Re-run full DBIx::Class suite with Phase 4 to confirm no regressions + (previous full PASS was at commit `e3b987640` / `3a1707b56`) 3. Consider reducing `JPERL_SPILL_SLOTS` from 16 to 8 after verifying no spill overflow failures; saves 24 bytes per method diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java index b4bfb0f35..0ce41550e 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java @@ -1150,10 +1150,42 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean cw.visitEnd(); classData = cw.toByteArray(); // Generate the bytecode + // Phase 4: Method-size threshold check. + // Parse the generated bytecode to find the largest method's code length. + // If the code comes from a string eval (fileName = "(eval N)") AND the largest + // method exceeds JPERL_MAX_METHOD_BYTES (default 9 000), trigger interpreter + // fallback rather than letting the C1 JIT fail under register pressure. + // + // We restrict the threshold to eval-generated code because: + // 1. The C1 JIT failures are caused by Sub::Quote / Moo string-eval'd methods. + // 2. Regular Perl files can also have large methods (e.g. unit test bodies up to + // 16 KB) that compile and run correctly on the JVM backend but fail on the + // interpreter due to known parity gaps (local, tied, etc.). + // 3. String-eval'd code that triggers this threshold is exactly the Sub::Quote + // path where the interpreter fallback is already known to work. + // + // This path also produces JPERL_BYTECODE_SIZE_DEBUG output when that env var is set. String bytecodeSizeDebug = System.getenv("JPERL_BYTECODE_SIZE_DEBUG"); - if (bytecodeSizeDebug != null && !bytecodeSizeDebug.isEmpty()) { + String maxMethodBytesEnv = System.getenv("JPERL_MAX_METHOD_BYTES"); + // Default: 9 000 bytes — catches all 7 Sub::Quote methods (smallest: 9 043 B). + // Set to 0 to disable, or Integer.MAX_VALUE to disable the threshold entirely. + int maxMethodBytesThreshold = (maxMethodBytesEnv != null && !maxMethodBytesEnv.isEmpty()) + ? Integer.parseInt(maxMethodBytesEnv) : 9_000; + boolean sizeDebugEnabled = bytecodeSizeDebug != null && !bytecodeSizeDebug.isEmpty(); + + // Determine if the source code is from a string eval ("(eval N)" filename). + // Only eval-generated methods have the C1 failure problem we're trying to fix. + String srcFileName = ctx.compilerOptions != null ? ctx.compilerOptions.fileName : null; + boolean isEvalCode = srcFileName != null && srcFileName.startsWith("(eval "); + + // Only parse when debug is on (always parse for debug), OR when the source is eval + // code and the class is large enough to potentially exceed the threshold. + // Fast path: if classData is smaller than the threshold, no single method can exceed it. + if (sizeDebugEnabled || (isEvalCode && classData.length > maxMethodBytesThreshold)) { try { - System.err.println("BYTECODE_SIZE class=" + className + " bytes=" + classData.length); + if (sizeDebugEnabled) { + System.err.println("BYTECODE_SIZE class=" + className + " bytes=" + classData.length); + } java.io.DataInputStream in = new java.io.DataInputStream(new java.io.ByteArrayInputStream(classData)); int magic = in.readInt(); @@ -1271,18 +1303,36 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean if (codeLen > maxCodeLen) { maxCodeLen = codeLen; } - boolean isApply = "apply".equals(mName); - boolean isLarge = codeLen >= 30000; - if (isApply || isLarge) { - System.err.println("BYTECODE_SIZE method=" + mName + mDesc + " code_bytes=" + codeLen); + if (sizeDebugEnabled) { + boolean isApply = "apply".equals(mName); + boolean isLarge = codeLen >= 30000; + if (isApply || isLarge) { + System.err.println("BYTECODE_SIZE method=" + mName + mDesc + " code_bytes=" + codeLen); + } } } } - if (maxCodeLen >= 0) { + if (sizeDebugEnabled && maxCodeLen >= 0) { System.err.println("BYTECODE_SIZE max_code_bytes=" + maxCodeLen); } + + // Phase 4: if the largest method exceeds the threshold, route to interpreter. + // The C1 JIT fails for methods above ~9-10 KB under register pressure, causing + // 50x slowdowns. The interpreter is only ~5-20x slower than compiled code, + // so it is always preferable to a C1 failure. + if (maxCodeLen > maxMethodBytesThreshold) { + throw new RuntimeException( + "Method too large for reliable JIT: " + maxCodeLen + + " bytes (threshold=" + maxMethodBytesThreshold + "). " + + "requires interpreter fallback"); + } + } catch (RuntimeException re) { + // Re-throw RuntimeException so the threshold check above propagates. + // (The outer catch in getBytecodeInternal/caller will route to interpreter.) + throw re; } catch (Throwable ignored) { + // Ignore parse errors; class will be loaded as-is } } @@ -1503,6 +1553,11 @@ protected Class loadClass(String name, boolean resolve) throws ClassNotFoundE // Let this propagate so getBytecode() can attempt large-code refactoring and retry. throw e; } catch (RuntimeException e) { + // Phase 4: threshold exception — let it propagate cleanly without wrapping so that + // createRuntimeCode() catches it and routes to the interpreter with a clear message. + if (e.getMessage() != null && e.getMessage().contains("requires interpreter fallback")) { + throw e; + } // Enhanced error message with debugging information StringBuilder errorMsg = new StringBuilder(); errorMsg.append(String.format( From aed4a090328241372f049370c2d5949405fc9f65 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 2 May 2026 08:49:39 +0200 Subject: [PATCH 10/13] refactor: clean up Phase 4 comment/structure in EmitterMethodCreator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorganise the three-level guard comment block to match the guard-order list used in the design doc, and add explicit closing-brace comments so the nesting is immediately legible. Also rename local variable `env` → `maxBytesEnv` to avoid shadowing the `String[] env` declared earlier in the same method (the old name caused a compile error introduced in the previous structural refactor attempt). No behaviour change; build and tests pass. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../backend/jvm/EmitterMethodCreator.java | 83 ++++++++++--------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java index 0ce41550e..f48cb8f6b 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java @@ -1151,49 +1151,57 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean classData = cw.toByteArray(); // Generate the bytecode // Phase 4: Method-size threshold check. - // Parse the generated bytecode to find the largest method's code length. - // If the code comes from a string eval (fileName = "(eval N)") AND the largest - // method exceeds JPERL_MAX_METHOD_BYTES (default 9 000), trigger interpreter - // fallback rather than letting the C1 JIT fail under register pressure. // - // We restrict the threshold to eval-generated code because: - // 1. The C1 JIT failures are caused by Sub::Quote / Moo string-eval'd methods. - // 2. Regular Perl files can also have large methods (e.g. unit test bodies up to - // 16 KB) that compile and run correctly on the JVM backend but fail on the - // interpreter due to known parity gaps (local, tied, etc.). - // 3. String-eval'd code that triggers this threshold is exactly the Sub::Quote - // path where the interpreter fallback is already known to work. + // The C1 JIT fails under register pressure for Sub::Quote / Moo methods + // compiled from string eval. Route them to the interpreter proactively. // - // This path also produces JPERL_BYTECODE_SIZE_DEBUG output when that env var is set. + // Guard order (outermost first, cheapest first): + // 1. Filename: only eval-generated code ("(eval N)") + // 2. Class size: classData.length > threshold (O(1) skip for small classes) + // 3. Parse: scan the class file to get the *exact* Code-attribute length + // + // Why we cannot use classData.length alone (step 2) as the final decision: + // Sub::Quote generates classes where the constant pool is 1.3×–4.9× the + // method code size (measured: 9 KB code → 23 KB class, 7.7 KB code → 32 KB + // class). There is no classData.length threshold that catches all code > 9 KB + // without also catching many code < 7 KB. Step 3 (parsing) is required. + // The parse is cheap (O(classData.length), a few KB) and only reaches step 3 + // for the small number of large eval-generated classes. + + // Step 1 — filename check (O(1), no class parsing) + String srcFileName = ctx.compilerOptions != null ? ctx.compilerOptions.fileName : null; + boolean isEvalCode = srcFileName != null && srcFileName.startsWith("(eval "); + String bytecodeSizeDebug = System.getenv("JPERL_BYTECODE_SIZE_DEBUG"); - String maxMethodBytesEnv = System.getenv("JPERL_MAX_METHOD_BYTES"); - // Default: 9 000 bytes — catches all 7 Sub::Quote methods (smallest: 9 043 B). - // Set to 0 to disable, or Integer.MAX_VALUE to disable the threshold entirely. - int maxMethodBytesThreshold = (maxMethodBytesEnv != null && !maxMethodBytesEnv.isEmpty()) - ? Integer.parseInt(maxMethodBytesEnv) : 9_000; boolean sizeDebugEnabled = bytecodeSizeDebug != null && !bytecodeSizeDebug.isEmpty(); - // Determine if the source code is from a string eval ("(eval N)" filename). - // Only eval-generated methods have the C1 failure problem we're trying to fix. - String srcFileName = ctx.compilerOptions != null ? ctx.compilerOptions.fileName : null; - boolean isEvalCode = srcFileName != null && srcFileName.startsWith("(eval "); + // Early exit: not eval-generated code and debug not requested — nothing to do. + if (sizeDebugEnabled || isEvalCode) { + // Default: 9 000 bytes — catches all 7 Sub::Quote methods (smallest: 9 043 B). + // Override with JPERL_MAX_METHOD_BYTES (large number disables the threshold). + int maxMethodBytesThreshold = 9_000; + if (isEvalCode) { + String maxBytesEnv = System.getenv("JPERL_MAX_METHOD_BYTES"); + if (maxBytesEnv != null && !maxBytesEnv.isEmpty()) maxMethodBytesThreshold = Integer.parseInt(maxBytesEnv); + } - // Only parse when debug is on (always parse for debug), OR when the source is eval - // code and the class is large enough to potentially exceed the threshold. - // Fast path: if classData is smaller than the threshold, no single method can exceed it. - if (sizeDebugEnabled || (isEvalCode && classData.length > maxMethodBytesThreshold)) { - try { - if (sizeDebugEnabled) { - System.err.println("BYTECODE_SIZE class=" + className + " bytes=" + classData.length); - } + // Step 2 — class size pre-filter (O(1)). + // classData.length >= method_code_length always, so if classData is small + // enough no method inside can exceed the threshold — skip parsing. + if (sizeDebugEnabled || classData.length > maxMethodBytesThreshold) { + // Step 3 — parse the class file to get the exact method code length. + try { + if (sizeDebugEnabled) { + System.err.println("BYTECODE_SIZE class=" + className + " bytes=" + classData.length); + } - java.io.DataInputStream in = new java.io.DataInputStream(new java.io.ByteArrayInputStream(classData)); - int magic = in.readInt(); - if (magic != 0xCAFEBABE) { - throw new RuntimeException("Bad class magic"); - } - in.readUnsignedShort(); // minor - in.readUnsignedShort(); // major + java.io.DataInputStream in = new java.io.DataInputStream(new java.io.ByteArrayInputStream(classData)); + int magic = in.readInt(); + if (magic != 0xCAFEBABE) { + throw new RuntimeException("Bad class magic"); + } + in.readUnsignedShort(); // minor + in.readUnsignedShort(); // major int cpCount = in.readUnsignedShort(); String[] utf8 = new String[cpCount]; @@ -1334,7 +1342,8 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean } catch (Throwable ignored) { // Ignore parse errors; class will be loaded as-is } - } + } // end step-2 if (classData.length > threshold) + } // end step-1 if (isEvalCode || sizeDebugEnabled) if (asmDebug && asmDebugClassMatches && asmDebugClassFilter != null && !asmDebugClassFilter.isEmpty()) { try { From 6f3115fa692dce510708619ff2a4eb7340c50d01 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 2 May 2026 08:51:42 +0200 Subject: [PATCH 11/13] docs: expand Phase 4 gate comment so developers don't remove it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the terse 10-line comment with a full explanation covering: - The exact bug it prevents (DBIx::Class timeout: C1 JIT "out of virtual registers" → 50× slowdown → TAP SIGKILL after 300 s) - Why the interpreter is preferable to C1 failure (~5–20× vs ~50×) - Which 7 methods are affected (Sub::Quote/Moo, 9–36 KB) - How the "requires interpreter fallback" exception propagates through getBytecodeInternal() up to checkForInterpreterFallback() - Why the isEvalCode guard must not be removed (regular .t files have equally large methods but fail on the interpreter) - Why classData.length alone cannot serve as the final threshold Also add a targeted warning at the throw site noting that the literal substring "requires interpreter fallback" is load-bearing. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../backend/jvm/EmitterMethodCreator.java | 107 ++++++++++++++---- 1 file changed, 87 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java index f48cb8f6b..85501264b 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java @@ -1150,23 +1150,84 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean cw.visitEnd(); classData = cw.toByteArray(); // Generate the bytecode - // Phase 4: Method-size threshold check. + // ╔══════════════════════════════════════════════════════════════════════╗ + // ║ Phase 4 — Method-Size Threshold: Route Oversized Eval Methods to ║ + // ║ the Interpreter. ║ + // ║ ║ + // ║ DO NOT REMOVE THIS BLOCK. Without it, the DBIx::Class test suite ║ + // ║ times out (200+ s instead of ~7 s) under even moderate CPU load. ║ + // ╚══════════════════════════════════════════════════════════════════════╝ // - // The C1 JIT fails under register pressure for Sub::Quote / Moo methods - // compiled from string eval. Route them to the interpreter proactively. + // === THE BUG THIS PREVENTS === // - // Guard order (outermost first, cheapest first): - // 1. Filename: only eval-generated code ("(eval N)") - // 2. Class size: classData.length > threshold (O(1) skip for small classes) - // 3. Parse: scan the class file to get the *exact* Code-attribute length + // Symptom: t/96_is_deteministic_value.t (DBIx::Class) is killed by the + // TAP harness after 300 s with no output. // - // Why we cannot use classData.length alone (step 2) as the final decision: - // Sub::Quote generates classes where the constant pool is 1.3×–4.9× the - // method code size (measured: 9 KB code → 23 KB class, 7.7 KB code → 32 KB - // class). There is no classData.length threshold that catches all code > 9 KB - // without also catching many code < 7 KB. Step 3 (parsing) is required. - // The parse is cheap (O(classData.length), a few KB) and only reaches step 3 - // for the small number of large eval-generated classes. + // Root-cause cascade: + // 1. CPU pressure slows JVM warm-up → methods stay in interpreted mode + // longer than usual before becoming "hot". + // 2. Sub::Quote / Moo string-eval'd accessor methods become hot and are + // submitted to the C1 JIT compiler. + // 3. C1 fails with: "out of virtual registers in linear scan" + // (observed at compileIds 4980 and 6224 in C1 compile logs). + // 4. JVM marks those methods as "not compilable"; they run permanently + // in interpreted mode — ~50× slower than compiled code. + // 5. The hot path through SQL::Abstract grinds to a halt; test takes + // 200+ seconds → TAP harness SIGKILL. + // + // === WHY THE INTERPRETER IS BETTER HERE === + // + // The PerlOnJava interpreter is ~5–20× slower than JIT-compiled code. + // The C1 failure path is ~50× slower. + // For the 7 affected methods, using the interpreter is therefore + // *faster* than letting the JVM attempt (and fail) JIT compilation. + // + // === WHICH METHODS ARE AFFECTED === + // + // Exactly 7 Sub::Quote / Moo-generated methods in the DBIx::Class run + // exceed the 9 KB threshold. The smallest is 9,043 bytes; the largest + // is 36,554 bytes. These methods contain inlined attribute-accessor + // logic — genuine Perl code, not PerlOnJava overhead — so no bytecode- + // reduction technique can shrink them below the C1 danger zone. + // + // === HOW THE FALLBACK IS TRIGGERED === + // + // Throwing a RuntimeException whose message contains the literal string + // "requires interpreter fallback" + // is the established protocol in this codebase. The exception propagates + // up through getBytecodeInternal() (note the early re-throw in its + // RuntimeException catch) and is caught by createRuntimeCode(), which + // delegates to PerlLanguageProvider.checkForInterpreterFallback(). + // That method matches the substring and re-runs the code through the + // PerlOnJava interpreter instead of the JVM class loader. + // + // === WHY EVAL-GENERATED CODE ONLY (isEvalCode guard) === + // + // Regular Perl source files (e.g. unit-test bodies) also produce large + // methods — up to 16 KB — but they run correctly on the JVM backend. + // Those same files FAIL on the interpreter due to known parity gaps + // (e.g. "local", tied variables). Applying the threshold to non-eval + // code would therefore swap a "works fine" JVM path for a broken + // interpreter path, causing regressions in dozens of unit tests. + // String-eval'd code (fileName starts with "(eval ") is exactly the + // Sub::Quote / Moo path where the interpreter fallback is known to work. + // + // === GUARD ORDER (cheapest first) === + // + // Step 1 — filename check (O(1)): isEvalCode = fileName.startsWith("(eval ") + // Step 2 — class-size pre-filter (O(1)): classData.length > threshold + // (a class file is always ≥ its largest method's code, so a + // small class cannot contain an oversized method — skip parse) + // Step 3 — parse the class file to get the *exact* Code-attribute length + // + // Why classData.length alone (step 2) cannot be the final gate: + // Sub::Quote classes have a constant pool that is 1.3×–4.9× the method + // code size (measured: 9 KB code → 23 KB class; 7.7 KB code → 32 KB + // class). No single classData.length cutoff can reliably separate + // "code > 9 KB" from "code < 7 KB" — the ranges overlap when the + // constant pool varies. Step 3 (bytecode parsing) is required. + // The parse itself is cheap: O(classData.length), a few microseconds, + // and it is only reached for the small number of large eval classes. // Step 1 — filename check (O(1), no class parsing) String srcFileName = ctx.compilerOptions != null ? ctx.compilerOptions.fileName : null; @@ -1325,10 +1386,16 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean System.err.println("BYTECODE_SIZE max_code_bytes=" + maxCodeLen); } - // Phase 4: if the largest method exceeds the threshold, route to interpreter. - // The C1 JIT fails for methods above ~9-10 KB under register pressure, causing - // 50x slowdowns. The interpreter is only ~5-20x slower than compiled code, - // so it is always preferable to a C1 failure. + // Phase 4 — trigger interpreter fallback for oversized eval methods. + // + // The literal substring "requires interpreter fallback" in the exception + // message is load-bearing: PerlLanguageProvider.checkForInterpreterFallback() + // matches it to route this code to the PerlOnJava interpreter instead of the + // JVM class loader. Do NOT change this string without updating that method. + // + // Why this is safe: C1 JIT fails (~50× slowdown) for these methods anyway; + // the interpreter (~5–20× slower than JIT) is faster than the failure path. + // See the large comment block above for the full explanation. if (maxCodeLen > maxMethodBytesThreshold) { throw new RuntimeException( "Method too large for reliable JIT: " + maxCodeLen @@ -1336,8 +1403,8 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean + "requires interpreter fallback"); } } catch (RuntimeException re) { - // Re-throw RuntimeException so the threshold check above propagates. - // (The outer catch in getBytecodeInternal/caller will route to interpreter.) + // Re-throw so the "requires interpreter fallback" exception above reaches + // createRuntimeCode() → checkForInterpreterFallback(). Do NOT swallow it. throw re; } catch (Throwable ignored) { // Ignore parse errors; class will be loaded as-is From 567a650cc969511ad5a7afdd1f47b9c7649590a8 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 2 May 2026 08:54:34 +0200 Subject: [PATCH 12/13] docs: replace box header in Phase 4 comment with plain text The ASCII box took 7 lines for what two lines say just as clearly. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../perlonjava/backend/jvm/EmitterMethodCreator.java | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java index 85501264b..6669946e1 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitterMethodCreator.java @@ -1150,13 +1150,9 @@ private static byte[] getBytecodeInternal(EmitterContext ctx, Node ast, boolean cw.visitEnd(); classData = cw.toByteArray(); // Generate the bytecode - // ╔══════════════════════════════════════════════════════════════════════╗ - // ║ Phase 4 — Method-Size Threshold: Route Oversized Eval Methods to ║ - // ║ the Interpreter. ║ - // ║ ║ - // ║ DO NOT REMOVE THIS BLOCK. Without it, the DBIx::Class test suite ║ - // ║ times out (200+ s instead of ~7 s) under even moderate CPU load. ║ - // ╚══════════════════════════════════════════════════════════════════════╝ + // Phase 4 — Method-Size Threshold: Route Oversized Eval Methods to the Interpreter. + // DO NOT REMOVE: without this block the DBIx::Class test suite times out + // (200+ s instead of ~7 s) under even moderate CPU load. // // === THE BUG THIS PREVENTS === // From 10b504c129efc2d4a89e7c6589ff34e723635710 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 2 May 2026 10:00:39 +0200 Subject: [PATCH 13/13] =?UTF-8?q?fix:=20overload=20fallback=20=E2=80=94=20?= =?UTF-8?q?use=20overload;=20(no=20ops)=20should=20not=20throw?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standard Perl behaviour when a class calls `use overload;` with no operator arguments: - The package IS registered as overloaded (the `((` marker is set) - But NO actual operator methods are defined - String/numeric operators on such objects silently fall back to native comparison — no "no method found" error is raised PerlOnJava was incorrectly throwing "Operation 'eq': no method found, left argument in overloaded package …" for blessed objects whose class had `use overload;` with no operators (e.g. Image::ExifTool). Root cause ---------- `OverloadContext.allowsFallbackAutogen()` returned `true` only when `hasFallbackGlob=true` AND `fallbackValue=1` (i.e. explicit `fallback => 1`). With `use overload;` (no args) there is no `()` glob so `hasFallbackGlob=false` and the method returned `false`, causing `throwIfFallbackDenied` in CompareOperators to raise the error. Fix --- Add a `hasAnyOperatorMethod` boolean to `OverloadContext`. Computed once in `prepare()` by scanning `globalCodeRefs` for any `(x` key (excluding the `((` marker and `()` fallback) in the full MRO. `allowsFallbackAutogen()` now returns `true` when `!hasAnyOperatorMethod`, regardless of the fallback setting. This matches the four real-world cases confirmed against system Perl: use overload; # no ops → eq works, no throw use overload '+' => sub {}; # one op → eq throws use overload '+' => sub {}, fb => 1; # one op + fb1 → eq works, no throw use overload fallback => 0; # no ops + fb0 → eq throws (fb=0 path) Verified -------- - New subtests added to src/test/resources/unit/overload/fallback.t pass under both system Perl and jperl - Full `make` (all unit shards) passes Fixes the Image::ExifTool Apple.t failure seen on master where `ImageInfo($file)` returned a blessed ExifTool object and the caller compared it with `eq`. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../runtime/operators/CompareOperators.java | 18 ++-- .../runtime/runtimetypes/OverloadContext.java | 84 +++++++++++++++++-- src/test/resources/unit/overload/fallback.t | 80 ++++++++++++++++++ 3 files changed, 166 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/perlonjava/runtime/operators/CompareOperators.java b/src/main/java/org/perlonjava/runtime/operators/CompareOperators.java index 677d633f2..9e6a085aa 100644 --- a/src/main/java/org/perlonjava/runtime/operators/CompareOperators.java +++ b/src/main/java/org/perlonjava/runtime/operators/CompareOperators.java @@ -447,11 +447,14 @@ public static RuntimeScalar eq(RuntimeScalar runtimeScalar, RuntimeScalar arg2) // Try nomethod fallback (may throw if fallback=0) result = OverloadContext.tryTwoArgumentNomethod(runtimeScalar, arg2, blessId, blessId2, "eq"); if (result != null) return result; - // Master's tryTwoArgumentNomethod only throws when fallback=>0; - // when fallback is undef / absent, Perl 5 still reports "no method - // found". Keep our throwIfFallbackDenied guard so - // DBIC t/storage/txn.t test 90 still sees the expected exception. - // See commit 1869badd2 for rationale. + // tryTwoArgumentNomethod only throws when fallback=>0. + // When at least one operator is defined but fallback is undef/absent, + // Perl 5 reports "no method found" — throwIfFallbackDenied enforces + // this (DBIC t/storage/txn.t test 90, commit 1869badd2). + // Exception: if the package defines NO operators at all (e.g. + // "use overload;"), Perl silently falls through to native string + // comparison; allowsFallbackAutogen() returns true in that case and + // throwIfFallbackDenied is a no-op. throwIfFallbackDenied(runtimeScalar, blessId, arg2, blessId2, "eq"); } @@ -482,8 +485,9 @@ public static RuntimeScalar ne(RuntimeScalar runtimeScalar, RuntimeScalar arg2) // Try nomethod fallback (may throw if fallback=0) result = OverloadContext.tryTwoArgumentNomethod(runtimeScalar, arg2, blessId, blessId2, "ne"); if (result != null) return result; - // See eq() above — retain throwIfFallbackDenied for the - // fallback=undef case (DBIC t/storage/txn.t test 90, commit 1869badd2). + // See eq() above — same semantics: only throw when at least one + // operator is defined and fallback is not 1 (DBIC t/storage/txn.t + // test 90, commit 1869badd2). No-op when package has no operators. throwIfFallbackDenied(runtimeScalar, blessId, arg2, blessId2, "ne"); } diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/OverloadContext.java b/src/main/java/org/perlonjava/runtime/runtimetypes/OverloadContext.java index 9ba784dbe..d0b47296d 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/OverloadContext.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/OverloadContext.java @@ -72,20 +72,36 @@ public class OverloadContext { * false (0) → deny autogeneration entirely */ final RuntimeScalar fallbackValue; + /** + * Whether the class (or any class in its MRO) defines at least one actual + * operator overload method — i.e. any {@code (x} glob that is not the + * {@code (( } overload-marker or the {@code ()} fallback glob. + *

+ * This distinguishes {@code use overload;} (no operators) from a package + * that genuinely overloads some operators. Standard Perl silently falls + * back to native string/numeric comparison when a blessed object belongs + * to a class with {@code use overload;} but no operator methods; it only + * throws "Operation ... no method found" when at least one operator is + * defined but the requested one is missing and {@code fallback} is not + * explicitly {@code 1}. + */ + final boolean hasAnyOperatorMethod; /** * Private constructor to create an OverloadContext instance. * - * @param perlClassName The Perl class name - * @param methodOverloaded The overloaded method handler - * @param hasFallbackGlob Whether "()" glob was found - * @param fallbackValue The SCALAR slot value of "()" glob + * @param perlClassName The Perl class name + * @param methodOverloaded The overloaded method handler + * @param hasFallbackGlob Whether "()" glob was found + * @param fallbackValue The SCALAR slot value of "()" glob + * @param hasAnyOperatorMethod Whether any actual operator method is defined */ - private OverloadContext(String perlClassName, RuntimeScalar methodOverloaded, boolean hasFallbackGlob, RuntimeScalar fallbackValue) { + private OverloadContext(String perlClassName, RuntimeScalar methodOverloaded, boolean hasFallbackGlob, RuntimeScalar fallbackValue, boolean hasAnyOperatorMethod) { this.perlClassName = perlClassName; this.methodOverloaded = methodOverloaded; this.hasFallbackGlob = hasFallbackGlob; this.fallbackValue = fallbackValue; + this.hasAnyOperatorMethod = hasAnyOperatorMethod; } /** @@ -104,18 +120,35 @@ public String getPerlClassName() { *

* Perl's semantics: *

*

* Used by binary operators (eq/ne/cmp/lt/gt) to decide whether a * fallback to stringification-based comparison is safe, or whether * the operation should throw "no method found" to match Perl 5. + *

+ * The critical distinction: {@code use overload;} with no operator + * arguments does NOT cause "no method found" errors — standard Perl + * silently falls back to native comparison in that case. Only a + * package that defines at least one operator but leaves others undefined + * (and doesn't set {@code fallback => 1}) triggers the error. */ public boolean allowsFallbackAutogen() { + // If no actual operator method is defined at all (e.g. "use overload;"), + // behave as if the package were not overloaded for dispatch purposes — + // standard Perl falls through to native string/numeric comparison. + if (!hasAnyOperatorMethod) { + return true; + } + // At least one operator is defined: check fallback setting. + // fallback => 1 (true): permit native fallback → true + // fallback => 0 (false) or fallback => undef: deny fallback → false return hasFallbackGlob && fallbackValue != null && fallbackValue.getDefinedBoolean() @@ -180,10 +213,43 @@ public static OverloadContext prepare(int blessId) { System.err.flush(); } + // Check whether any actual operator overload method is defined in the MRO + // (excluding the "((" overload-marker and the "()" fallback-glob). + // Standard Perl: a class with "use overload;" and NO operator methods falls + // through to native comparison silently; a class with at least one operator + // method defined (and fallback not set to 1) throws "no method found" for + // unhandled operators. + boolean hasAnyOperatorMethod = false; + if (methodOverloaded != null) { + java.util.List mroClasses = InheritanceResolver.linearizeHierarchy(perlClassName); + outer: + for (String mroClass : mroClasses) { + String effectiveClass = GlobalVariable.resolveStashAlias(mroClass); + // Prefix used for all overload methods in this class, e.g. "Foo::(" + String keyPrefix = effectiveClass + "::("; + // Walk globalCodeRefs to find any (x method that is not (( or () + for (String key : GlobalVariable.globalCodeRefs.keySet()) { + if (key.startsWith(keyPrefix)) { + // Extract the method-name part (everything after "::") + String op = key.substring(effectiveClass.length() + 2); + if (!"((".equals(op) && !"()".equals(op)) { + hasAnyOperatorMethod = true; + break outer; + } + } + } + } + } + + if (TRACE_OVERLOAD_CONTEXT) { + System.err.println(" hasAnyOperatorMethod: " + hasAnyOperatorMethod); + System.err.flush(); + } + // Create context if overloading is enabled OverloadContext context = null; if (methodOverloaded != null || hasFallbackGlob) { - context = new OverloadContext(perlClassName, methodOverloaded, hasFallbackGlob, fallbackValue); + context = new OverloadContext(perlClassName, methodOverloaded, hasFallbackGlob, fallbackValue, hasAnyOperatorMethod); // Cache the result InheritanceResolver.cacheOverloadContext(blessId, context); } diff --git a/src/test/resources/unit/overload/fallback.t b/src/test/resources/unit/overload/fallback.t index 51e6a9aa1..b260c9459 100644 --- a/src/test/resources/unit/overload/fallback.t +++ b/src/test/resources/unit/overload/fallback.t @@ -62,4 +62,84 @@ ok(!($sum != 52), 'numeric fallback works in addition'); my $concat = "Value: " . $num_obj; ok(!($concat ne "Value: 42"), 'string fallback works in concatenation (got: \'$concat\')'); +# ----------------------------------------------------------------------- +# Tests for "use overload; with no operators" fallback behaviour +# +# Standard Perl: a class that calls "use overload;" (no arguments) is +# technically overloaded (overload::Overloaded returns true) but has no +# operator methods. String/numeric operators on such objects should +# silently fall back to native comparison rather than dying with +# "Operation '...': no method found". +# ----------------------------------------------------------------------- + +{ + package NoOperators; + # "use overload;" with no arguments: package is marked overloaded + # but no operator methods are defined. + use overload; + + sub new { bless {}, shift } +} + +{ + package OneOperator; + # Has one operator (+), but not eq/ne/cmp/etc. + # fallback is not set → default is undef → should throw "no method found" + # for unhandled operators like eq. + use overload '+' => sub { 42 }; + + sub new { bless {}, shift } +} + +{ + package OneOpFallback1; + # Has one operator, but fallback => 1 → silently fall back to native. + use overload '+' => sub { 42 }, fallback => 1; + + sub new { bless {}, shift } +} + +{ + package NoOpFallback0; + # No operators, but explicit fallback => 0. + # Perl: the fallback => 0 is still stored, so the package HAS a () + # glob, but no actual operator methods. When eq is requested, + # tryTwoArgumentNomethod throws immediately because fallback=0. + use overload fallback => 0; + + sub new { bless {}, shift } +} + +my $no_ops = NoOperators->new; +my $one_op = OneOperator->new; +my $one_fb1 = OneOpFallback1->new; +my $no_op_f0 = NoOpFallback0->new; + +# Case 1: "use overload;" with no operators → native fallback, no error +{ + my $ok = eval { ($no_ops eq "anything") ? 1 : 0 }; + ok(!$@, 'use overload; (no ops) does not throw for eq'); + ok(defined $ok, 'use overload; (no ops) eq returns a value'); +} + +# Case 2: one operator defined, fallback not set → "no method found" for eq +{ + eval { my $x = ($one_op eq "anything") }; + like($@, qr/no method found/, 'one operator, fallback undef: eq throws "no method found"'); +} + +# Case 3: one operator + fallback => 1 → native fallback, no error +{ + my $ok = eval { ($one_fb1 eq "anything") ? 1 : 0 }; + ok(!$@, 'one operator + fallback=>1 does not throw for eq'); + ok(defined $ok, 'one operator + fallback=>1 eq returns a value'); +} + +# Case 4: no operators + explicit fallback => 0 → throws for eq +{ + eval { my $x = ($no_op_f0 eq "anything") }; + like($@, qr/no method found/, 'no ops + fallback=>0 throws "no method found" for eq'); +} + done_testing(); +