From ee2295d6bf16da5563fe09099fbae6543c83e6ec Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 09:00:46 +0100 Subject: [PATCH 01/14] Fix interpreter: backslash-ampersand-{expr} should return CODE directly, not REFERENCE The interpreter was wrapping backslash-ampersand-{expr} results in CREATE_REF, causing *{$name} = backslash-ampersand-{$func} to pass REFERENCE->CODE to RuntimeGlob.set(). This broke Time::HiRes import because the glob CODE slot received a constant sub instead of the actual code reference. The JVM backend uses RuntimeCode.createCodeReference() which returns type=CODE directly. Now the interpreter matches: backslash-ampersand-{expr} compiles to just CODE_DEREF_NONSTRICT without CREATE_REF wrapping. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../org/perlonjava/backend/bytecode/BytecodeCompiler.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java index 26f9e4a5e..519c3398a 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java @@ -3942,11 +3942,11 @@ void compileVariableReference(OperatorNode node, String op) { } else if (op.equals("\\")) { // Reference operator: \$x, \@x, \%x, \*x, etc. if (node.operand != null) { - // Special case: \&name — CODE is already a reference type. - // Emit LOAD_GLOBAL_CODE directly without CREATE_REF, matching JVM compiler. + // Special case: \&name or \&{expr} — CODE is already a reference type. + // Emit LOAD_GLOBAL_CODE or CODE_DEREF_NONSTRICT without CREATE_REF, matching JVM compiler. + // The JVM uses RuntimeCode.createCodeReference() which returns type=CODE directly. if (node.operand instanceof OperatorNode operandOp - && operandOp.operator.equals("&") - && operandOp.operand instanceof IdentifierNode) { + && operandOp.operator.equals("&")) { node.operand.accept(this); // lastResultReg already holds the CODE scalar — no wrapping needed return; From 7445a736428d31121fd0af8b9ff91bba5d260650 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 09:06:52 +0100 Subject: [PATCH 02/14] Fix interpreter: hash slice assignment with block dereference @{$ref}{keys} Added BlockNode handling alongside OperatorNode for hash slice assignment. This enables patterns like @{$hash_ref}{qw(a b)} = (1, 2) which are used by Exporter/Heavy.pm and other CPAN modules. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../org/perlonjava/backend/bytecode/CompileAssignment.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/CompileAssignment.java b/src/main/java/org/perlonjava/backend/bytecode/CompileAssignment.java index d66a14a1d..10a70718f 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/CompileAssignment.java +++ b/src/main/java/org/perlonjava/backend/bytecode/CompileAssignment.java @@ -1253,7 +1253,9 @@ public static void compileAssignmentOperator(BytecodeCompiler bytecodeCompiler, bytecodeCompiler.emitReg(hashReg); bytecodeCompiler.emit(nameIdx); } - } else if (hashOp.operand instanceof OperatorNode) { + } else if (hashOp.operand instanceof OperatorNode || + hashOp.operand instanceof BlockNode) { + // Handle @{$ref}{keys} or @{expr}{keys} bytecodeCompiler.compileNode(hashOp.operand, -1, rhsContext); int scalarRefReg = bytecodeCompiler.lastResultReg; hashReg = bytecodeCompiler.allocateRegister(); From be493aea8d9a555075c79f77b29532ed767b7df5 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 09:20:22 +0100 Subject: [PATCH 03/14] Fix interpreter: hash slice with block ref + defensive SUPER:: handling 1. Hash slice assignment with block dereference @{$ref}{keys}: Added BlockNode handling alongside OperatorNode for hash slice assignment. This enables patterns like @{$hash_ref}{qw(a b)} = (1, 2) which are used by Exporter/Heavy.pm. 2. Defensive null check in superMethod for SUPER:: resolution: When currentSub is null (no __SUB__ set), fall back to InterpreterState.currentPackage. This prevents NPE when SUPER::method is called from contexts without __SUB__. Note: Full fix for Getopt::Long SUPER::import still pending - requires investigation of __SUB__ initialization for subroutines in loaded .pm modules. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../perlonjava/runtime/runtimetypes/NextMethod.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/NextMethod.java b/src/main/java/org/perlonjava/runtime/runtimetypes/NextMethod.java index 0b925cbc6..f593101f7 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/NextMethod.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/NextMethod.java @@ -299,7 +299,17 @@ public static RuntimeList maybeNextMethod(RuntimeArray args, int ctx) { static RuntimeScalar superMethod(RuntimeScalar currentSub, String methodName) { RuntimeScalar method; - String packageName = ((RuntimeCode) currentSub.value).packageName; + String packageName; + + // Get package name from currentSub if available, otherwise fall back to current package + if (currentSub != null && currentSub.value instanceof RuntimeCode code) { + packageName = code.packageName; + } else { + // Fall back to the current package from interpreter state + // This handles cases like SUPER::import called from top-level code + packageName = org.perlonjava.backend.bytecode.InterpreterState.currentPackage.get().toString(); + } + method = InheritanceResolver.findMethodInHierarchy( methodName.substring(7), // method name without SUPER:: prefix packageName, From bd558f485fb8eda7dfc1c44fe60d1355e42f8e35 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 10:03:13 +0100 Subject: [PATCH 04/14] Fix caller() stack trace ordering in interpreter mode Two issues were causing caller() to return incorrect package/line info in interpreter mode: 1. InterpreterState.getPcStack() returned PCs in oldest-first order, but getStack() returns frames in newest-first order. Fixed by reversing the iteration order in getPcStack(). 2. ExceptionFormatter was using CallerStack[interpreterFrameIndex] but CallerStack stores CALL SITES, not execution positions. For frame N, we need CallerStack[N-1] because CallerStack[M] stores the call site of the Mth call, which is the execution position of frame M+1. This fixes caller(0) and caller(1) returning incorrect line numbers when called from subroutines loaded via require/use. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../backend/bytecode/InterpreterState.java | 12 ++++++++++-- .../runtimetypes/ExceptionFormatter.java | 19 +++++++++++++------ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/InterpreterState.java b/src/main/java/org/perlonjava/backend/bytecode/InterpreterState.java index dd2faa588..9f66baa0c 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/InterpreterState.java +++ b/src/main/java/org/perlonjava/backend/bytecode/InterpreterState.java @@ -146,11 +146,19 @@ public static List getStack() { return new ArrayList<>(frameStack.get()); } + /** + * Get the PC stack in frame order (most recent first to match getStack()). + * The pcStack ArrayList stores PCs oldest-first, but getStack() returns + * frames newest-first, so we reverse the order here to match. + * + * @return A list of PCs from most recent (index 0) to oldest + */ public static List getPcStack() { ArrayList pcs = pcStack.get(); ArrayList result = new ArrayList<>(pcs.size()); - for (int[] holder : pcs) { - result.add(holder[0]); + // Iterate in reverse order to match frame order (newest first) + for (int i = pcs.size() - 1; i >= 0; i--) { + result.add(pcs.get(i)[0]); } return result; } diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java b/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java index f68d3b372..b1dc8aa80 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java @@ -96,10 +96,17 @@ private static ArrayList> formatThrowable(Throwable t) { if (!addedFrameForCurrentLevel && interpreterFrameIndex < interpreterFrames.size()) { var frame = interpreterFrames.get(interpreterFrameIndex); if (frame != null && frame.code() != null) { - // First check CallerStack for accurate call site info. - // CallerStack entries are pushed by CALL_SUB/CALL_METHOD with the exact - // call site location, which is more accurate than the current PC. - var callerInfo = CallerStack.peek(interpreterFrameIndex); + // For JVM-style stack traces: + // - Frame 0 (innermost): needs current execution position (use PC) + // - Frame N > 0: needs execution position = call site of frame N-1 + // which is CallerStack[N-1] since CallerStack[M] stores + // the call site for the Mth call (most recent = 0) + + // For frames > 0, use CallerStack[frameIndex-1] which gives the + // call site of the previous frame (= execution position of this frame) + var callerInfo = (interpreterFrameIndex > 0) + ? CallerStack.peek(interpreterFrameIndex - 1) + : null; String pkg = null; String filename = frame.code().sourceName; @@ -111,8 +118,8 @@ private static ArrayList> formatThrowable(Throwable t) { filename = callerInfo.filename(); line = String.valueOf(callerInfo.line()); if (System.getenv("DEBUG_CALLER") != null) { - System.err.println("DEBUG ExceptionFormatter: using CallerStack[" + interpreterFrameIndex + - "] pkg=" + pkg + " file=" + filename + " line=" + line); + System.err.println("DEBUG ExceptionFormatter: using CallerStack[" + (interpreterFrameIndex - 1) + + "] for frame " + interpreterFrameIndex + " pkg=" + pkg + " file=" + filename + " line=" + line); } } else { // Fallback: get tokenIndex from PC mapping From 7c0890c92c554b9e32ea64ad99f2735f5dd3a80e Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 10:22:24 +0100 Subject: [PATCH 05/14] Fix caller() for use statements with ExportLevel in interpreter mode The parseUseDeclaration CallerStack entry was being accessed with the wrong index when modules like Getopt::Long use $Exporter::ExportLevel. The issue: CallerStack contains entries from both parseUseDeclaration (pushed first) and interpreter CALL_SUB/CALL_METHOD (pushed later). When building the stack trace, parseUseDeclaration handling used callerStackIndex=0, but the actual entry was at a higher index after accounting for interpreter frames. The fix: Use Math.max(interpreterFrameIndex - 1, callerStackIndex) to find the correct CallerStack entry for parseUseDeclaration. This fixes use Getopt::Long and similar modules that rely on $Exporter::ExportLevel to determine the target package for exports. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../runtime/runtimetypes/ExceptionFormatter.java | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java b/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java index b1dc8aa80..b85b0525e 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java @@ -70,9 +70,18 @@ private static ArrayList> formatThrowable(Throwable t) { } if (element.getClassName().equals("org.perlonjava.frontend.parser.StatementParser") && element.getMethodName().equals("parseUseDeclaration")) { - // Artificial caller stack entry created at `use` statement - var callerInfo = CallerStack.peek(callerStackIndex); + // Artificial caller stack entry created at `use` statement. + // This entry represents where the `use Module` was called from (e.g., main script). + // The CallerStack entry for parseUseDeclaration was pushed BEFORE any CALL_SUB entries + // from the module's import method, so it's at index (interpreterFrameIndex - 1) + // after accounting for all interpreter frames processed so far. + int useStatementIndex = Math.max(interpreterFrameIndex - 1, callerStackIndex); + var callerInfo = CallerStack.peek(useStatementIndex); if (callerInfo != null) { + if (System.getenv("DEBUG_CALLER") != null) { + System.err.println("DEBUG ExceptionFormatter: parseUseDeclaration using CallerStack[" + useStatementIndex + + "] pkg=" + callerInfo.packageName() + " file=" + callerInfo.filename() + " line=" + callerInfo.line()); + } var entry = new ArrayList(); entry.add(callerInfo.packageName()); entry.add(callerInfo.filename()); @@ -80,7 +89,7 @@ private static ArrayList> formatThrowable(Throwable t) { entry.add(null); // No subroutine name available for use statements stackTrace.add(entry); lastFileName = callerInfo.filename() != null ? callerInfo.filename() : ""; - callerStackIndex++; + callerStackIndex = useStatementIndex + 1; } } else if (element.getClassName().equals("org.perlonjava.backend.bytecode.InterpretedCode") && element.getMethodName().equals("apply")) { From 7ff118487780cf294c0a2af8167d823935eacbd7 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 12:05:09 +0100 Subject: [PATCH 06/14] Fix caller() in signature defaults with eval package change The previous change introduced a regression where caller() in signature default values would return the wrong package when called from an eval with a package statement. Test case: eval("package T121::Z; ::t121()") where sub t121 has signature ($a = caller). Expected: T121::Z, was returning: main The issue was the change to use CallerStack[interpreterFrameIndex-1] for frames > 0. This was incorrect because: - CallerStack[N] contains the call site info for the Nth call - For interpreterFrameIndex=0, we need CallerStack[0] (the most recent call site), not null Reverted to using CallerStack[interpreterFrameIndex] directly, which correctly returns T121::Z for the innermost interpreter frame. The parseUseDeclaration fix for Exporter is retained separately. Fixes regression in op/signatures.t test 217. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../runtimetypes/ExceptionFormatter.java | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java b/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java index b85b0525e..6024d0ccb 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/ExceptionFormatter.java @@ -105,17 +105,8 @@ private static ArrayList> formatThrowable(Throwable t) { if (!addedFrameForCurrentLevel && interpreterFrameIndex < interpreterFrames.size()) { var frame = interpreterFrames.get(interpreterFrameIndex); if (frame != null && frame.code() != null) { - // For JVM-style stack traces: - // - Frame 0 (innermost): needs current execution position (use PC) - // - Frame N > 0: needs execution position = call site of frame N-1 - // which is CallerStack[N-1] since CallerStack[M] stores - // the call site for the Mth call (most recent = 0) - - // For frames > 0, use CallerStack[frameIndex-1] which gives the - // call site of the previous frame (= execution position of this frame) - var callerInfo = (interpreterFrameIndex > 0) - ? CallerStack.peek(interpreterFrameIndex - 1) - : null; + // Use CallerStack for the call site info + var callerInfo = CallerStack.peek(interpreterFrameIndex); String pkg = null; String filename = frame.code().sourceName; @@ -127,8 +118,8 @@ private static ArrayList> formatThrowable(Throwable t) { filename = callerInfo.filename(); line = String.valueOf(callerInfo.line()); if (System.getenv("DEBUG_CALLER") != null) { - System.err.println("DEBUG ExceptionFormatter: using CallerStack[" + (interpreterFrameIndex - 1) + - "] for frame " + interpreterFrameIndex + " pkg=" + pkg + " file=" + filename + " line=" + line); + System.err.println("DEBUG ExceptionFormatter: using CallerStack[" + interpreterFrameIndex + + "] pkg=" + pkg + " file=" + filename + " line=" + line); } } else { // Fallback: get tokenIndex from PC mapping From 9ceecdfcea98183d510ee1584ad67ed3ea2d55ea Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 13:05:23 +0100 Subject: [PATCH 07/14] Update skill files: fix JAR paths and improve documentation - Update all skill files to use dynamic JAR path (build/libs/perlonjava-*.jar) instead of hardcoded target/perlonjava-3.0.0.jar - Use ./jperl wrapper script in examples instead of direct java -jar - Fix profile-perlonjava paths (PerlOnJava2 -> PerlOnJava) - Add interpreter profiling documentation - Fix git stash warning consistency (never use stash) Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .cognition/skills/debug-exiftool/SKILL.md | 27 +++++----- .cognition/skills/debug-perlonjava/SKILL.md | 7 +-- .cognition/skills/interpreter-parity/SKILL.md | 2 +- .cognition/skills/profile-perlonjava/SKILL.md | 49 +++++++++++++++---- 4 files changed, 57 insertions(+), 28 deletions(-) diff --git a/.cognition/skills/debug-exiftool/SKILL.md b/.cognition/skills/debug-exiftool/SKILL.md index c26796118..3662d3f87 100644 --- a/.cognition/skills/debug-exiftool/SKILL.md +++ b/.cognition/skills/debug-exiftool/SKILL.md @@ -23,7 +23,7 @@ You are debugging failures in the Image::ExifTool test suite running under PerlO **IMPORTANT: Never push directly to master. Always use feature branches and PRs.** -**IMPORTANT: Always commit or stash changes BEFORE switching branches.** If `git stash pop` has conflicts, uncommitted changes may be lost. +**IMPORTANT: Always commit or save changes BEFORE switching branches.** Use `git diff > backup.patch` to save uncommitted work, or commit to a WIP branch. ```bash git checkout -b fix/exiftool-issue-name @@ -42,7 +42,7 @@ gh pr create --title "Fix: description" --body "Details" - **ExifTool reference output**: `Image-ExifTool-13.44/t/_N.out` (expected tag output per sub-test) - **PerlOnJava unit tests**: `src/test/resources/unit/*.t` (make suite, 154 tests) - **Perl5 core tests**: `perl5_t/t/` (Perl 5 compatibility suite, run via `make test-gradle`) -- **Fat JAR**: `target/perlonjava-3.0.0.jar` +- **Fat JAR**: `build/libs/perlonjava-*.jar` (version varies) - **Launcher script**: `./jperl` (resolves JAR path, sets `$^X`) ## Building PerlOnJava @@ -64,16 +64,13 @@ make dev # Quick build - compiles only, NO tests ### Single test ```bash cd Image-ExifTool-13.44 -java -jar ../target/perlonjava-3.0.0.jar -Ilib t/Writer.t -# Or using the launcher: -cd Image-ExifTool-13.44 ../jperl -Ilib t/Writer.t ``` ### Single test with timeout (prevents infinite loops) ```bash cd Image-ExifTool-13.44 -timeout 120 java -jar ../target/perlonjava-3.0.0.jar -Ilib t/XMP.t +timeout 120 ../jperl -Ilib t/XMP.t ``` ### All ExifTool tests in parallel with summary @@ -82,7 +79,7 @@ cd Image-ExifTool-13.44 mkdir -p /tmp/exiftool_results for t in t/*.t; do name=$(basename "$t" .t) - ( output=$(timeout 120 java -jar ../target/perlonjava-3.0.0.jar -Ilib "$t" 2>&1) + ( output=$(timeout 120 ../jperl -Ilib "$t" 2>&1) ec=$? if [ $ec -eq 124 ]; then echo "$name TIMEOUT" else @@ -133,7 +130,7 @@ cd Image-ExifTool-13.44 perl -Ilib t/Writer.t 2>&1 | grep -E '^(not )?ok ' > /tmp/perl_results.txt # Run with PerlOnJava -java -jar ../target/perlonjava-3.0.0.jar -Ilib t/Writer.t 2>&1 | grep -E '^(not )?ok ' > /tmp/jperl_results.txt +../jperl -Ilib t/Writer.t 2>&1 | grep -E '^(not )?ok ' > /tmp/jperl_results.txt # Diff diff /tmp/perl_results.txt /tmp/jperl_results.txt @@ -145,7 +142,7 @@ For individual Perl constructs: perl -e 'my @a = (1,2,3); $_ *= 2 foreach @a; print "@a\n"' # PerlOnJava -java -jar target/perlonjava-3.0.0.jar -e 'my @a = (1,2,3); $_ *= 2 foreach @a; print "@a\n"' +./jperl -e 'my @a = (1,2,3); $_ *= 2 foreach @a; print "@a\n"' ``` For comparing `.failed` output files against `.out` reference files: @@ -188,8 +185,8 @@ diff t/Writer_11.out t/Writer_11.failed # Pass JVM options via JPERL_OPTS JPERL_OPTS="-Xmx512m" ./jperl script.pl -# Combine env vars -JPERL_SHOW_FALLBACK=1 JPERL_EVAL_TRACE=1 java -jar target/perlonjava-3.0.0.jar -Ilib t/Writer.t 2>&1 +# Combine env vars (inside ExifTool dir) +JPERL_SHOW_FALLBACK=1 JPERL_EVAL_TRACE=1 ../jperl -Ilib t/Writer.t 2>&1 ``` ## Test File Anatomy @@ -235,7 +232,7 @@ The `check()` function compares extracted tags against reference files `t/&1 | \ diff --git a/.cognition/skills/interpreter-parity/SKILL.md b/.cognition/skills/interpreter-parity/SKILL.md index 95e688b19..280bb6ea5 100644 --- a/.cognition/skills/interpreter-parity/SKILL.md +++ b/.cognition/skills/interpreter-parity/SKILL.md @@ -36,7 +36,7 @@ gh pr create --title "Fix interpreter: description" --body "Details" - **PerlOnJava source**: `src/main/java/org/perlonjava/` (compiler, bytecode interpreter, runtime) - **Unit tests**: `src/test/resources/unit/*.t` (155 tests, run via `make`) -- **Fat JAR**: `target/perlonjava-3.0.0.jar` +- **Fat JAR**: `build/libs/perlonjava-*.jar` (version varies) - **Launcher script**: `./jperl` ## Building diff --git a/.cognition/skills/profile-perlonjava/SKILL.md b/.cognition/skills/profile-perlonjava/SKILL.md index 4f532afff..f72e4b01f 100644 --- a/.cognition/skills/profile-perlonjava/SKILL.md +++ b/.cognition/skills/profile-perlonjava/SKILL.md @@ -1,6 +1,6 @@ # Profile PerlOnJava -## ⚠️⚠️⚠️ CRITICAL: NEVER USE `git stash` ⚠️⚠️⚠️ +## CRITICAL: NEVER USE `git stash` **DANGER: Changes are SILENTLY LOST when using git stash/stash pop!** @@ -34,12 +34,23 @@ gh pr create --title "Perf: description" --body "Details" ### 1. Run with JFR Profiling ```bash -cd /Users/fglock/projects/PerlOnJava2 +cd /Users/fglock/projects/PerlOnJava + +# Find the jar file (version changes with releases) +JAR=$(ls build/libs/perlonjava-*.jar | head -1) # Profile a long-running script (adjust duration as needed) java -XX:+FlightRecorder \ -XX:StartFlightRecording=duration=60s,filename=profile.jfr \ - -jar target/perlonjava-3.0.0.jar [args...] + -jar $JAR [args...] + +# Or use the wrapper script with JFR options via JAVA_OPTS +JAVA_OPTS="-XX:+FlightRecorder -XX:StartFlightRecording=duration=60s,filename=profile.jfr" \ + ./jperl [args...] + +# For interpreter mode profiling +JAVA_OPTS="-XX:+FlightRecorder -XX:StartFlightRecording=duration=60s,filename=profile.jfr" \ + ./jperl --interpreter [args...] ``` ### 2. Analyze with JFR Tools @@ -65,12 +76,13 @@ $JFR print --events jdk.ExecutionSample profile.jfr 2>&1 | \ | Category | Methods to Watch | Optimization Approach | |----------|------------------|----------------------| -| **Number parsing** | `Long.parseLong`, `Double.parseDouble`, `NumberParser.parseNumber` | Cache numeric values, avoid string→number conversions | +| **Number parsing** | `Long.parseLong`, `Double.parseDouble`, `NumberParser.parseNumber` | Cache numeric values, avoid string->number conversions | | **Type checking** | `ScalarUtils.looksLikeNumber`, `RuntimeScalar.getDefinedBoolean` | Fast-path for common types (INTEGER, DOUBLE) | | **Bitwise ops** | `BitwiseOperators.*` | Ensure values stay as INTEGER type | | **Regex** | `Pattern.match`, `Matcher.matches` | Reduce unnecessary regex checks | | **Loop control** | `RuntimeControlFlowRegistry.checkLoopAndGetAction` | ThreadLocal overhead | | **Array ops** | `ArrayList.grow`, `Arrays.copyOf` | Pre-size arrays, reduce allocations | +| **Interpreter** | `BytecodeInterpreter.execute`, opcode handlers | Reduce dispatch overhead, inline hot paths | ### 4. Common Runtime Files @@ -78,9 +90,10 @@ $JFR print --events jdk.ExecutionSample profile.jfr 2>&1 | \ |------|---------| | `src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java` | Scalar value representation, getLong/getDouble/getInt | | `src/main/java/org/perlonjava/runtime/runtimetypes/ScalarUtils.java` | Utility functions like looksLikeNumber | -| `src/main/java/org/perlonjava/runtime/operators/BitwiseOperators.java` | Bitwise operations (&, |, ^, ~, <<, >>) | +| `src/main/java/org/perlonjava/runtime/operators/BitwiseOperators.java` | Bitwise operations (&, \|, ^, ~, <<, >>) | | `src/main/java/org/perlonjava/runtime/operators/Operator.java` | General operators | | `src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeArray.java` | Array operations | +| `src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java` | Interpreter main loop | ### 5. Optimization Patterns @@ -110,14 +123,20 @@ if (runtimeScalar.type == INTEGER) { ### 6. Benchmark Commands ```bash +cd /Users/fglock/projects/PerlOnJava + +# Quick benchmark with closure test +./jperl dev/bench/benchmark_closure.pl + +# Interpreter mode benchmark (slower, good for profiling interpreter) +./jperl --interpreter dev/bench/benchmark_closure.pl + # Quick benchmark with life_bitpacked.pl -java -jar target/perlonjava-3.0.0.jar examples/life_bitpacked.pl \ - -w 200 -h 200 -g 10000 -r none +./jperl examples/life_bitpacked.pl -w 200 -h 200 -g 10000 -r none # Multiple runs for consistency for i in 1 2 3; do - java -jar target/perlonjava-3.0.0.jar examples/life_bitpacked.pl \ - -w 200 -h 200 -g 10000 -r none 2>&1 | grep "per second" + ./jperl examples/life_bitpacked.pl -w 200 -h 200 -g 10000 -r none 2>&1 | grep "per second" done ``` @@ -147,3 +166,15 @@ make dev # Quick build - compiles only, NO tests 7. Profile again to verify improvement 8. Run tests to ensure correctness ``` + +## JVM vs Interpreter Performance + +The interpreter mode (`--interpreter`) is typically 20-40x slower than JVM-compiled mode. +This is expected and useful for: +- Testing interpreter-specific code paths +- Debugging interpreter behavior +- Profiling interpreter bottlenecks + +Example typical performance: +- JVM mode: ~4 seconds for benchmark_closure.pl +- Interpreter mode: ~120-130 seconds for the same benchmark From edd69f04dd6b0f6c3c7677e3aacfc4b57b369aaa Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 13:06:15 +0100 Subject: [PATCH 08/14] Fix profile-perlonjava skill: consistent git stash warning - Add emoji warning icons to match other skill files - Remove contradictory 'commit or stash' instruction Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .cognition/skills/profile-perlonjava/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cognition/skills/profile-perlonjava/SKILL.md b/.cognition/skills/profile-perlonjava/SKILL.md index f72e4b01f..e6212d2f9 100644 --- a/.cognition/skills/profile-perlonjava/SKILL.md +++ b/.cognition/skills/profile-perlonjava/SKILL.md @@ -1,6 +1,6 @@ # Profile PerlOnJava -## CRITICAL: NEVER USE `git stash` +## ⚠️⚠️⚠️ CRITICAL: NEVER USE `git stash` ⚠️⚠️⚠️ **DANGER: Changes are SILENTLY LOST when using git stash/stash pop!** @@ -14,7 +14,7 @@ Profile and optimize PerlOnJava runtime performance using Java Flight Recorder. **IMPORTANT: Never push directly to master. Always use feature branches and PRs.** -**IMPORTANT: Always commit or stash changes BEFORE switching branches.** If `git stash pop` has conflicts, uncommitted changes may be lost. +**IMPORTANT: Always commit or save changes BEFORE switching branches.** Use `git diff > backup.patch` to save uncommitted work, or commit to a WIP branch. ```bash git checkout -b perf/optimization-name From f4529b307fa6668f261f8b8a84de888ad07b87b2 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 13:12:29 +0100 Subject: [PATCH 09/14] Add interpreter performance optimization design doc Profile analysis of benchmark_closure.pl in interpreter mode: - Identified ThreadLocal lookup overhead in CALL opcode - CallerStack push/pop on every call even when caller() unused - Deep call chain indirection for subroutine dispatch - TreeMap lookup for line numbers Optimization plan with 4 phases documented. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/INTERPRETER_OPTIMIZATION.md | 107 +++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 dev/design/INTERPRETER_OPTIMIZATION.md diff --git a/dev/design/INTERPRETER_OPTIMIZATION.md b/dev/design/INTERPRETER_OPTIMIZATION.md new file mode 100644 index 000000000..fbbb8422f --- /dev/null +++ b/dev/design/INTERPRETER_OPTIMIZATION.md @@ -0,0 +1,107 @@ +# Interpreter Performance Optimization + +## Profile Analysis (2026-03-23) + +**Benchmark:** `./jperl --interpreter dev/bench/benchmark_closure.pl` +- Interpreter mode: ~127 seconds +- JVM mode: ~4 seconds +- Ratio: ~32x (expected for bytecode interpreter vs JIT-compiled code) + +## Top Hotspots by Sample Count + +| Samples | Location | Description | +|---------|----------|-------------| +| 90 | `BytecodeInterpreter.execute` | Main interpreter loop | +| 54 | `RuntimeCode.apply` | Subroutine dispatch | +| 39 | `InterpretedCode.apply` | Delegation to interpreter | +| 7 | `getCallSiteInfo` | TreeMap lookup for caller() | +| 5 | `getSourceLocationAccurate` | Line number computation | + +## Detailed Hotspot Analysis + +### CALL Opcode Handling (BytecodeInterpreter.java lines 816-838) + +``` +Line 816 (6 samples): ThreadLocal lookup - InterpreterState.currentPackage.get() +Line 834 (7 samples): getCallSiteInfo - TreeMap.floorEntry() +Line 835 (4 samples): CallerStack.push +Line 838 (10 samples): RuntimeCode.apply - actual call +``` + +### Call Chain Overhead + +The subroutine call dispatch has deep indirection: + +``` +CALL opcode (BytecodeInterpreter) + → RuntimeCode.apply (54 samples) + → InterpretedCode.apply (39 samples) + → BytecodeInterpreter.execute (90 samples) +``` + +Each call goes through multiple layers before reaching the actual interpreter execution. + +## Optimization Plan + +### Phase 1: ThreadLocal Caching (High Impact, Low Risk) + +**Problem:** `InterpreterState.currentPackage.get()` is called on every CALL opcode. + +**Solution:** Cache the package name at the start of execute() and pass it through or use a local variable. + +**Files:** `BytecodeInterpreter.java` + +### Phase 2: Lazy CallerStack (High Impact, Medium Risk) + +**Problem:** `CallerStack.push/pop` and `getCallSiteInfo` happen on EVERY subroutine call, even when `caller()` is never invoked. + +**Solution:** Defer CallerStack operations until caller() is actually called: +1. Store call site info in a lightweight structure +2. Only populate CallerStack on-demand when caller() executes +3. Use a "dirty" flag to track if stack needs updating + +**Files:** `BytecodeInterpreter.java`, `CallerStack.java` + +### Phase 3: Inline Apply Path (Medium Impact, Medium Risk) + +**Problem:** Call dispatch goes through multiple indirection layers. + +**Solution:** For InterpretedCode, bypass RuntimeCode.apply and call BytecodeInterpreter.execute directly from the CALL opcode handler. + +**Files:** `BytecodeInterpreter.java` + +### Phase 4: Cache pcToTokenIndex Lookup (Low Impact, Low Risk) + +**Problem:** `TreeMap.floorEntry()` is O(log n) for line number lookups. + +**Solution:** Cache last lookup result since sequential execution often hits nearby PCs. + +**Files:** `BytecodeInterpreter.java` + +## Implementation Status + +### Completed +- [x] Profile analysis (2026-03-23) + +### In Progress +- [ ] Phase 1: ThreadLocal Caching + +### Pending +- [ ] Phase 2: Lazy CallerStack +- [ ] Phase 3: Inline Apply Path +- [ ] Phase 4: Cache pcToTokenIndex Lookup + +## Verification + +After each optimization: +1. Run `make` to ensure no regressions +2. Re-run benchmark to measure improvement +3. Re-profile to confirm hotspot reduction + +## Related Files + +- `src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java` +- `src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java` +- `src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java` +- `src/main/java/org/perlonjava/runtime/runtimetypes/CallerStack.java` +- `src/main/java/org/perlonjava/backend/bytecode/InterpreterState.java` From b7555267796e943f1cd06886ae2a3bd5898715e6 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 13:25:58 +0100 Subject: [PATCH 10/14] Interpreter optimization Phase 1: Cache ThreadLocal RuntimeScalar reference - Cache InterpreterState.currentPackage.get() at start of execute() - Reuse cached RuntimeScalar for SET_PACKAGE opcode - Avoid repeated ThreadLocal lookups in CALL_SUB opcodes No measurable speedup on benchmark_closure.pl, but cleaner code. Profile shows ~10% of time in getCallSiteInfo + getSourceLocationAccurate for caller() support - Phase 2 target. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/INTERPRETER_OPTIMIZATION.md | 11 +++++++-- .../backend/bytecode/BytecodeInterpreter.java | 24 ++++++++++++------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/dev/design/INTERPRETER_OPTIMIZATION.md b/dev/design/INTERPRETER_OPTIMIZATION.md index fbbb8422f..b7299c2d2 100644 --- a/dev/design/INTERPRETER_OPTIMIZATION.md +++ b/dev/design/INTERPRETER_OPTIMIZATION.md @@ -82,15 +82,22 @@ Each call goes through multiple layers before reaching the actual interpreter ex ### Completed - [x] Profile analysis (2026-03-23) +- [x] Phase 1: ThreadLocal Caching (2026-03-23) - Cache RuntimeScalar reference, no measurable speedup but cleaner code ### In Progress -- [ ] Phase 1: ThreadLocal Caching +- [ ] Phase 2: Lazy CallerStack - ~10% of time spent on caller() support ### Pending -- [ ] Phase 2: Lazy CallerStack - [ ] Phase 3: Inline Apply Path - [ ] Phase 4: Cache pcToTokenIndex Lookup +## Profile Results After Phase 1 + +Second profile showed `getCallSiteInfo` (16 samples) + `getSourceLocationAccurate` (15 samples) = ~10% overhead. +This is spent computing call site info for `caller()` support on every subroutine call. + +Phase 2 (Lazy CallerStack) is the next high-impact optimization. + ## Verification After each optimization: diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java index c6f762788..9b4ce3b4a 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java @@ -111,8 +111,10 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c // Record DVM level so the finally block can clean up everything pushed // by this subroutine (local variables AND regex state snapshot). int savedLocalLevel = usesLocalization ? DynamicVariableManager.getLocalLevel() : 0; - String savedPackage = InterpreterState.currentPackage.get().toString(); - InterpreterState.currentPackage.get().set(framePackageName); + // Cache the currentPackage RuntimeScalar to avoid ThreadLocal lookups in hot loop + RuntimeScalar currentPackageScalar = InterpreterState.currentPackage.get(); + String savedPackage = currentPackageScalar.toString(); + currentPackageScalar.set(framePackageName); if (usesLocalization) { RegexState.save(); } @@ -813,9 +815,9 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c // This matches the JVM backend's call to codeDerefNonStrict() // Only call for STRING/BYTE_STRING types (symbolic references) // For CODE, REFERENCE, etc. let RuntimeCode.apply() handle errors - String currentPkg = InterpreterState.currentPackage.get().toString(); + // Use cached RuntimeScalar to avoid ThreadLocal lookup if (codeRef.type == RuntimeScalarType.STRING || codeRef.type == RuntimeScalarType.BYTE_STRING) { - codeRef = codeRef.codeDerefNonStrict(currentPkg); + codeRef = codeRef.codeDerefNonStrict(currentPackageScalar.toString()); } RuntimeBase argsBase = registers[argsReg]; @@ -831,6 +833,8 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c } // Push call site info to CallerStack for caller() to see the correct location + // Get package name only when actually needed (lazy) + String currentPkg = currentPackageScalar.toString(); CallerStack.CallerInfo callSiteInfo = getCallSiteInfo(code, callSitePc, currentPkg); CallerStack.push(callSiteInfo.packageName(), callSiteInfo.filename(), callSiteInfo.line()); RuntimeList result; @@ -915,7 +919,8 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c } // Push call site info to CallerStack for caller() to see the correct location - String currentPkg = InterpreterState.currentPackage.get().toString(); + // Use cached RuntimeScalar to avoid ThreadLocal lookup + String currentPkg = currentPackageScalar.toString(); CallerStack.CallerInfo callSiteInfo = getCallSiteInfo(code, callSitePc, currentPkg); CallerStack.push(callSiteInfo.packageName(), callSiteInfo.filename(), callSiteInfo.line()); RuntimeList result; @@ -1005,9 +1010,9 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c : codeRefBase.scalar(); // Dereference symbolic code references + // Use cached RuntimeScalar to avoid ThreadLocal lookup if (codeRef.type == RuntimeScalarType.STRING || codeRef.type == RuntimeScalarType.BYTE_STRING) { - String currentPkg = InterpreterState.currentPackage.get().toString(); - codeRef = codeRef.codeDerefNonStrict(currentPkg); + codeRef = codeRef.codeDerefNonStrict(currentPackageScalar.toString()); } // Get args @@ -1611,8 +1616,9 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c case Opcodes.SET_PACKAGE -> { // Non-scoped package declaration: package Foo; // Update the runtime current-package tracker so caller() returns the right package. + // Uses cached RuntimeScalar reference to avoid ThreadLocal lookup int nameIdx = bytecode[pc++]; - InterpreterState.currentPackage.get().set(code.stringPool[nameIdx]); + currentPackageScalar.set(code.stringPool[nameIdx]); } case Opcodes.PUSH_PACKAGE -> { @@ -1876,7 +1882,7 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c if (usesLocalization) { DynamicVariableManager.popToLocalLevel(savedLocalLevel); } - InterpreterState.currentPackage.get().set(savedPackage); + currentPackageScalar.set(savedPackage); InterpreterState.pop(); } } From c05eefcece281aa4ab2a561132583663545ac5f3 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 13:31:40 +0100 Subject: [PATCH 11/14] Interpreter optimization Phase 2: Lazy CallerStack (~19% speedup) Defer caller() info computation until actually needed: - Add CallerStack.pushLazy() with lambda-based resolution - CALL_SUB/CALL_METHOD now push lazy entries - Line number computation only happens when caller() is called - pop() skips resolution for unneeded entries Benchmark improvement: 127s -> 103s = ~19% speedup on benchmark_closure.pl Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/INTERPRETER_OPTIMIZATION.md | 13 ++-- .../backend/bytecode/BytecodeInterpreter.java | 21 +++--- .../org/perlonjava/core/Configuration.java | 4 +- .../runtime/runtimetypes/CallerStack.java | 71 +++++++++++++++++-- 4 files changed, 89 insertions(+), 20 deletions(-) diff --git a/dev/design/INTERPRETER_OPTIMIZATION.md b/dev/design/INTERPRETER_OPTIMIZATION.md index b7299c2d2..249a86f47 100644 --- a/dev/design/INTERPRETER_OPTIMIZATION.md +++ b/dev/design/INTERPRETER_OPTIMIZATION.md @@ -83,9 +83,7 @@ Each call goes through multiple layers before reaching the actual interpreter ex ### Completed - [x] Profile analysis (2026-03-23) - [x] Phase 1: ThreadLocal Caching (2026-03-23) - Cache RuntimeScalar reference, no measurable speedup but cleaner code - -### In Progress -- [ ] Phase 2: Lazy CallerStack - ~10% of time spent on caller() support +- [x] Phase 2: Lazy CallerStack (2026-03-23) - **~19% speedup** (127s → 103s) ### Pending - [ ] Phase 3: Inline Apply Path @@ -96,7 +94,14 @@ Each call goes through multiple layers before reaching the actual interpreter ex Second profile showed `getCallSiteInfo` (16 samples) + `getSourceLocationAccurate` (15 samples) = ~10% overhead. This is spent computing call site info for `caller()` support on every subroutine call. -Phase 2 (Lazy CallerStack) is the next high-impact optimization. +## Phase 2 Results + +Implemented lazy CallerStack: +- `CallerStack.pushLazy()` stores a lambda that computes CallerInfo on demand +- Line number computation deferred until `caller()` is actually called +- `pop()` doesn't resolve lazy entries (no computation needed) + +**Benchmark improvement:** 127s → 103s = **~19% speedup** ## Verification diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java index 9b4ce3b4a..5498c16be 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java @@ -832,11 +832,12 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c callArgs = new RuntimeArray((RuntimeScalar) argsBase); } - // Push call site info to CallerStack for caller() to see the correct location - // Get package name only when actually needed (lazy) - String currentPkg = currentPackageScalar.toString(); - CallerStack.CallerInfo callSiteInfo = getCallSiteInfo(code, callSitePc, currentPkg); - CallerStack.push(callSiteInfo.packageName(), callSiteInfo.filename(), callSiteInfo.line()); + // Push lazy call site info to CallerStack for caller() to see the correct location + // The actual line number computation is deferred until caller() is called + // Capture variables needed for lazy resolution + final String lazyPkg = currentPackageScalar.toString(); + final int lazyPc = callSitePc; + CallerStack.pushLazy(lazyPkg, () -> getCallSiteInfo(code, lazyPc, lazyPkg)); RuntimeList result; try { result = RuntimeCode.apply(codeRef, "", callArgs, context); @@ -918,11 +919,11 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c callArgs = new RuntimeArray((RuntimeScalar) argsBase); } - // Push call site info to CallerStack for caller() to see the correct location - // Use cached RuntimeScalar to avoid ThreadLocal lookup - String currentPkg = currentPackageScalar.toString(); - CallerStack.CallerInfo callSiteInfo = getCallSiteInfo(code, callSitePc, currentPkg); - CallerStack.push(callSiteInfo.packageName(), callSiteInfo.filename(), callSiteInfo.line()); + // Push lazy call site info to CallerStack for caller() to see the correct location + // Capture variables needed for lazy resolution + final String lazyPkg = currentPackageScalar.toString(); + final int lazyPc = callSitePc; + CallerStack.pushLazy(lazyPkg, () -> getCallSiteInfo(code, lazyPc, lazyPkg)); RuntimeList result; try { result = RuntimeCode.call(invocant, method, currentSub, callArgs, context); diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 8c9c56bce..262ca1e0a 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,14 +33,14 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "4473efe87"; + public static final String gitCommitId = "b75552677"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitDate = "2026-03-21"; + public static final String gitCommitDate = "2026-03-23"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/CallerStack.java b/src/main/java/org/perlonjava/runtime/runtimetypes/CallerStack.java index 0311aab10..36379f1d3 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/CallerStack.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/CallerStack.java @@ -10,7 +10,8 @@ * for implementing the caller() function during operations like import() and unimport(). */ public class CallerStack { - private static final List callerStack = new ArrayList<>(); + // Store either CallerInfo (resolved) or LazyCallerInfo (deferred) + private static final List callerStack = new ArrayList<>(); /** * Pushes a new CallerInfo object onto the stack, representing a new entry in the calling sequence. @@ -26,9 +27,25 @@ public static void push(String packageName, String filename, int line) { callerStack.add(new CallerInfo(packageName, filename, line)); } + /** + * Pushes a lazy CallerInfo onto the stack. The actual filename and line number + * will be computed only when peek() is called, avoiding expensive line number + * lookups for subroutine calls that never use caller(). + * + * @param packageName The name of the package where the call originated. + * @param resolver A function to compute the CallerInfo when needed. + */ + public static void pushLazy(String packageName, CallerInfoResolver resolver) { + if (System.getenv("DEBUG_CALLER") != null) { + System.err.println("DEBUG CallerStack.pushLazy: pkg=" + packageName + " (stack size now " + (callerStack.size() + 1) + ")"); + } + callerStack.add(new LazyCallerInfo(packageName, resolver)); + } + /** * Retrieves the most recent CallerInfo object from the stack without removing it. * Zero is the most recent entry. + * If the entry is lazy, it will be resolved and replaced with the actual CallerInfo. * * @return The most recent CallerInfo object, or null if the stack is empty. */ @@ -40,11 +57,21 @@ public static CallerInfo peek(int callFrame) { if (index < 0) { return null; } - return callerStack.get(index); + Object entry = callerStack.get(index); + if (entry instanceof CallerInfo ci) { + return ci; + } else if (entry instanceof LazyCallerInfo lazy) { + // Resolve the lazy entry and cache it + CallerInfo resolved = lazy.resolve(); + callerStack.set(index, resolved); + return resolved; + } + return null; } /** * Removes and returns the most recent CallerInfo object from the stack. + * If the entry is lazy, it will NOT be resolved (saves computation on pop). * * @return The most recent CallerInfo object, or null if the stack is empty. */ @@ -52,16 +79,52 @@ public static CallerInfo pop() { if (callerStack.isEmpty()) { return null; } - return callerStack.removeLast(); + Object entry = callerStack.removeLast(); + if (entry instanceof CallerInfo ci) { + return ci; + } else if (entry instanceof LazyCallerInfo lazy) { + // Don't resolve on pop - caller info not needed + return null; + } + return null; } /** * Retrieves a copy of the current calling stack. + * Lazy entries will be resolved. * * @return A list containing all CallerInfo objects in the stack. */ public static List getStack() { - return new ArrayList<>(callerStack); + List result = new ArrayList<>(); + for (int i = 0; i < callerStack.size(); i++) { + Object entry = callerStack.get(i); + if (entry instanceof CallerInfo ci) { + result.add(ci); + } else if (entry instanceof LazyCallerInfo lazy) { + CallerInfo resolved = lazy.resolve(); + callerStack.set(i, resolved); + result.add(resolved); + } + } + return result; + } + + /** + * Functional interface for lazy resolution of caller info. + */ + @FunctionalInterface + public interface CallerInfoResolver { + CallerInfo resolve(); + } + + /** + * Holds deferred caller info computation. + */ + private record LazyCallerInfo(String packageName, CallerInfoResolver resolver) { + CallerInfo resolve() { + return resolver.resolve(); + } } /** From b4fad9c7ab426a020fe34cc5a2caca9291d033a8 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 14:01:59 +0100 Subject: [PATCH 12/14] Interpreter optimization Phase 3: Inline InterpretedCode calls - Add fast path in CALL_SUB for InterpretedCode: call execute() directly - Bypass RuntimeCode.apply() indirection chain for interpreter-to-interpreter calls - Pass null for subroutineName to enable InterpreterFrame caching - Apply same optimization to TAILCALL handling Small improvement (~2%) combined with previous optimizations. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../backend/bytecode/BytecodeInterpreter.java | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java index 5498c16be..a7619fc9d 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java @@ -840,7 +840,16 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c CallerStack.pushLazy(lazyPkg, () -> getCallSiteInfo(code, lazyPc, lazyPkg)); RuntimeList result; try { - result = RuntimeCode.apply(codeRef, "", callArgs, context); + // Fast path for InterpretedCode: call execute() directly, + // bypassing RuntimeCode.apply() indirection chain + if (codeRef.type == RuntimeScalarType.CODE && codeRef.value instanceof InterpretedCode interpCode) { + // Direct call to interpreter - skip RuntimeCode.apply overhead + // Pass null for subroutineName to enable frame caching + result = BytecodeInterpreter.execute(interpCode, callArgs, context, null); + } else { + // Slow path for JVM-compiled code, symbolic references, etc. + result = RuntimeCode.apply(codeRef, "", callArgs, context); + } // Handle TAILCALL with trampoline loop (same as JVM backend) while (result.isNonLocalGoto()) { @@ -849,7 +858,12 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c // Extract codeRef and args, call target codeRef = flow.getTailCallCodeRef(); callArgs = flow.getTailCallArgs(); - result = RuntimeCode.apply(codeRef, "tailcall", callArgs, context); + // Use fast path for InterpretedCode + if (codeRef.type == RuntimeScalarType.CODE && codeRef.value instanceof InterpretedCode interpCode) { + result = BytecodeInterpreter.execute(interpCode, callArgs, context, null); + } else { + result = RuntimeCode.apply(codeRef, "tailcall", callArgs, context); + } // Loop to handle chained tail calls } else { // Not TAILCALL - check labeled blocks or propagate From 13322ef3f2f599f3d70955a3d6a794a07ac7ef6a Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 14:05:49 +0100 Subject: [PATCH 13/14] Add register array pooling for interpreter (97s from 101s) - InterpretedCode.getRegisters() caches register arrays per-code-object - Uses ThreadLocal for thread safety with recursion detection - Recursive calls fallback to fresh allocation (no contention) - BytecodeInterpreter.execute() releases registers in finally block Benchmark: 97s from 101s baseline (4% improvement from allocation reduction) Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../backend/bytecode/BytecodeInterpreter.java | 6 ++-- .../backend/bytecode/InterpretedCode.java | 31 +++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java index a7619fc9d..7e80a9fca 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java @@ -74,8 +74,8 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c // Get PC holder for direct updates (avoids ThreadLocal lookups in hot loop) int[] pcHolder = InterpreterState.push(code, framePackageName, frameSubName); - // Pure register file (NOT stack-based - matches compiler for control flow correctness) - RuntimeBase[] registers = new RuntimeBase[code.maxRegisters]; + // Get register array from cache (avoids allocation for non-recursive calls) + RuntimeBase[] registers = code.getRegisters(); // Initialize special registers (same as compiler) registers[0] = code; // $this (for closures - register 0) @@ -1899,6 +1899,8 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c } currentPackageScalar.set(savedPackage); InterpreterState.pop(); + // Release cached registers for reuse + code.releaseRegisters(); } } diff --git a/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java b/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java index 8cb64b73f..91f73a453 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java +++ b/src/main/java/org/perlonjava/backend/bytecode/InterpretedCode.java @@ -40,6 +40,37 @@ public class InterpretedCode extends RuntimeCode implements PerlSubroutine { // Created lazily on first use (after packageName/subName are set) public volatile InterpreterState.InterpreterFrame cachedFrame; + // Cached register array for non-recursive calls (avoids allocation) + // Thread-safe via ThreadLocal for multi-threaded execution + private final ThreadLocal cachedRegisters = new ThreadLocal<>(); + // Flag to track if cached registers are currently in use (for recursion detection) + private final ThreadLocal registersInUse = ThreadLocal.withInitial(() -> false); + + /** + * Get a register array for execution. Returns cached array if not in use (common case), + * otherwise allocates a new one (recursive call). + */ + public RuntimeBase[] getRegisters() { + if (registersInUse.get()) { + // Recursive call - need fresh array + return new RuntimeBase[maxRegisters]; + } + RuntimeBase[] regs = cachedRegisters.get(); + if (regs == null || regs.length != maxRegisters) { + regs = new RuntimeBase[maxRegisters]; + cachedRegisters.set(regs); + } + registersInUse.set(true); + return regs; + } + + /** + * Release the register array after execution completes. + */ + public void releaseRegisters() { + registersInUse.set(false); + } + // Lexical pragma state (for eval STRING to inherit) public final int strictOptions; // Strict flags at compile time public final int featureFlags; // Feature flags at compile time From 70ef173b6ab9de3ec297bf74df8b48bf33412025 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Mon, 23 Mar 2026 14:06:14 +0100 Subject: [PATCH 14/14] Update design doc with Phase 3 & 4 results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Total improvement: 127s → 97s (~24% speedup) - Phase 3: Inline apply path (2% speedup) - Phase 4: Register pooling (4% speedup) Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- dev/design/INTERPRETER_OPTIMIZATION.md | 33 ++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/dev/design/INTERPRETER_OPTIMIZATION.md b/dev/design/INTERPRETER_OPTIMIZATION.md index 249a86f47..522a8ab87 100644 --- a/dev/design/INTERPRETER_OPTIMIZATION.md +++ b/dev/design/INTERPRETER_OPTIMIZATION.md @@ -84,10 +84,11 @@ Each call goes through multiple layers before reaching the actual interpreter ex - [x] Profile analysis (2026-03-23) - [x] Phase 1: ThreadLocal Caching (2026-03-23) - Cache RuntimeScalar reference, no measurable speedup but cleaner code - [x] Phase 2: Lazy CallerStack (2026-03-23) - **~19% speedup** (127s → 103s) +- [x] Phase 3: Inline Apply Path (2026-03-23) - **~2% speedup** (103s → 101s) +- [x] Phase 4: Register Array Pooling (2026-03-23) - **~4% speedup** (101s → 97s) ### Pending -- [ ] Phase 3: Inline Apply Path -- [ ] Phase 4: Cache pcToTokenIndex Lookup +- [ ] Phase 5: Cache pcToTokenIndex Lookup (moved from Phase 4) ## Profile Results After Phase 1 @@ -103,6 +104,34 @@ Implemented lazy CallerStack: **Benchmark improvement:** 127s → 103s = **~19% speedup** +## Phase 3 Results + +Inline InterpretedCode apply path in CALL_SUB: +- Check if code is `InterpretedCode` and call `BytecodeInterpreter.execute()` directly +- Bypasses `RuntimeCode.apply()` → `InterpretedCode.apply()` chain + +**Benchmark improvement:** 103s → 101s = **~2% speedup** + +## Phase 4 Results + +Register array pooling in InterpretedCode: +- `InterpretedCode.getRegisters()` caches register arrays per-code-object +- Uses ThreadLocal for thread safety with recursion detection +- Recursive calls fallback to fresh allocation (no contention) +- `BytecodeInterpreter.execute()` releases registers in finally block + +**Benchmark improvement:** 101s → 97s = **~4% speedup** + +## Total Performance Improvement + +| Phase | Time (s) | Improvement | +|-------|----------|-------------| +| Baseline | 127 | - | +| Phase 2 (Lazy CallerStack) | 103 | 19% | +| Phase 3 (Inline Apply) | 101 | 2% | +| Phase 4 (Register Pooling) | 97 | 4% | +| **Total** | **97** | **~24%** | + ## Verification After each optimization: