diff --git a/dev/interpreter/SKILL.md b/dev/interpreter/SKILL.md index 95b7828b6..72735fd8e 100644 --- a/dev/interpreter/SKILL.md +++ b/dev/interpreter/SKILL.md @@ -16,6 +16,174 @@ - `InterpretedCode.java` - Bytecode container with disassembler - `SlowOpcodeHandler.java` - Handlers for rare operations (151-154) +## Code Generation Tool + +**Location:** `dev/tools/generate_opcode_handlers.pl` + +Automates creation of opcode handlers for built-in functions with simple signatures. + +### Quick Start + +```bash +# Generate handlers for all eligible operators in OperatorHandler.java +perl dev/tools/generate_opcode_handlers.pl + +# Rebuilds: +# - ScalarUnaryOpcodeHandler.java (31 ops: chr, ord, abs, sin, cos, etc.) +# - ScalarBinaryOpcodeHandler.java (12 ops: atan2, eq, ne, lt, le, gt, ge, cmp, etc.) +# - Opcodes.java (adds new opcode constants) +# - BytecodeInterpreter.java (adds dispatch cases) +# - InterpretedCode.java (adds disassembly cases) +``` + +### What Gets Generated + +**Automatically:** +1. Handler classes with zero-overhead dispatch pattern +2. Opcode constants in Opcodes.java +3. Dispatch cases in BytecodeInterpreter.java +4. Disassembly cases in InterpretedCode.java + +**Still Manual:** +- Emit cases in BytecodeCompiler.java (between `// GENERATED_OPERATORS_START/END`) + +### Eligibility Criteria + +**Included:** +- Scalar unary: `(RuntimeScalar) → RuntimeScalar` +- Scalar binary: `(RuntimeScalar, RuntimeScalar) → RuntimeScalar` +- Scalar ternary: `(RuntimeScalar, RuntimeScalar, RuntimeScalar) → RuntimeScalar` + +**Excluded:** +- Varargs signatures: `(int, RuntimeBase...)` - getc +- Array/List/Hash parameters +- Primitive parameters (except in skipped varargs) +- Already existing opcodes (rand=91, length=30, rindex=173, index=172, require=170, isa=105, bless=104, ref=103, join=88, prototype=158) + +### Adding BytecodeCompiler Cases + +Tool prints list of operators needing emit cases. 
Add between markers: + +```java +// GENERATED_OPERATORS_START +} else if (op.equals("chr")) { + // chr($x) - convert codepoint to character + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("chr requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.CHR); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; +// GENERATED_OPERATORS_END +``` + +### Critical: LASTOP Management + +Tool reads `LASTOP` from Opcodes.java to determine starting opcode: + +```java +// In Opcodes.java +public static final short REDO = 220; + +// Last manually-assigned opcode (for tool reference) +private static final short LASTOP = 220; // ← UPDATE WHEN ADDING MANUAL OPCODES +``` + +**When adding manual opcodes:** +1. Add constant BEFORE generated section +2. Update `LASTOP = ` +3. Run tool - it starts at LASTOP + 1 + +### Gotchas + +**1. Don't Edit Generated Sections** +- Between `// GENERATED_*_START` and `// GENERATED_*_END` +- Tool overwrites on regeneration +- Your changes will be lost! + +**2. LASTOP Drift** +```java +// WRONG: Forgot to update LASTOP +public static final short MY_NEW_OP = 221; +private static final short LASTOP = 220; // ← Still 220! + +// Tool starts at 221, collides with MY_NEW_OP! + +// RIGHT: Always update LASTOP +public static final short MY_NEW_OP = 221; +private static final short LASTOP = 221; // ← Updated! +``` + +**3. Import Path Conversion** +- Tool auto-converts: `org/perlonjava/operators/...` → `org.perlonjava.operators....` +- Works correctly for all Java imports + +**4. BytecodeCompiler Not Automated** +- Tool can't automatically add emit cases (too many variations) +- Must add manually between markers +- Tool prints list of operators needing implementation + +**5. 
Signature Mismatches** +- Tool skips signatures it cannot classify (varargs, array/list/hash or primitive parameters) and generates nothing for them +- The "Skipping X" messages in the tool output are printed for operators whose opcode already exists; a signature skip shows up as an operator missing from the generated handler classes +- These need manual implementation + +### Testing Generated Opcodes + +```bash +# Build +make + +# Test in interpreter mode (forces eval STRING to use interpreter) +JPERL_EVAL_USE_INTERPRETER=1 ./jperl /tmp/test.pl + +# Test script example: +cat > /tmp/test.pl << 'EOF' +print "chr(65): ", eval("chr(65)"), "\n"; +print "ord('A'): ", eval("ord('A')"), "\n"; +print "abs(-42): ", eval("abs(-42)"), "\n"; +EOF + +# Expected output (after adding BytecodeCompiler cases): +# chr(65): A +# ord('A'): 65 +# abs(-42): 42 +``` + +### Regenerating After Changes + +```bash +# After adding new operators to OperatorHandler.java +perl dev/tools/generate_opcode_handlers.pl + +# After updating LASTOP +perl dev/tools/generate_opcode_handlers.pl + +# Tool output shows: +# - Existing opcodes skipped +# - New opcodes generated +# - Next available opcode number +# - List of operators needing BytecodeCompiler cases +``` + +### Manual Implementation Still Needed For + +- **Varargs functions**: getc, printf, sprintf +- **List operators**: map, grep, sort, push, pop +- **Hash operators**: keys, values, each +- **Array operators**: splice (complex signature) +- **Special forms**: defined, wantarray (already manual) + ## Adding New Operators ### 1. Decide: Fast Opcode or SLOW_OP? 
@@ -187,6 +355,13 @@ make # Run unit tests make test-unit +# Run specific test in interpreter mode +cd perl5_t/t && JPERL_EVAL_USE_INTERPRETER=1 ../../jperl op/bop.t + +# Compare compiler vs interpreter results +./jperl op/bop.t # Compiler mode +JPERL_EVAL_USE_INTERPRETER=1 ./jperl op/bop.t # Interpreter mode + # Verify tableswitch preserved javap -c -classpath build/classes/java/main \ org.perlonjava.interpreter.BytecodeInterpreter | grep -A 5 "switch" diff --git a/dev/prompts/generated_opcodes_report.txt b/dev/prompts/generated_opcodes_report.txt new file mode 100644 index 000000000..61deb27dd --- /dev/null +++ b/dev/prompts/generated_opcodes_report.txt @@ -0,0 +1,257 @@ +Reading existing opcodes... + Found 221 existing opcodes + +Parsing OperatorHandler.java... + Skipping rand (RAND) - already exists as opcode 91 + Skipping length (LENGTH) - already exists as opcode 30 + Skipping rindex (RINDEX) - already exists as opcode 173 + Skipping index (INDEX) - already exists as opcode 172 + Skipping require (REQUIRE) - already exists as opcode 170 + Skipping isa (ISA) - already exists as opcode 105 + Skipping bless (BLESS) - already exists as opcode 104 + Skipping ref (REF) - already exists as opcode 103 + Skipping join (JOIN) - already exists as opcode 88 + Skipping prototype (PROTOTYPE) - already exists as opcode 158 + +Parsed operators by signature: + scalar_binary : 12 operators + scalar_unary : 32 operators + +Generating ScalarBinaryOpcodeHandler with 12 operators... + Generated: src/main/java/org/perlonjava/interpreter/ScalarBinaryOpcodeHandler.java +Generating ScalarUnaryOpcodeHandler with 32 operators... + Generated: src/main/java/org/perlonjava/interpreter/ScalarUnaryOpcodeHandler.java + +====================================================================== +UPDATE INSTRUCTIONS +====================================================================== + +1. 
ADD TO Opcodes.java (at marker: // GENERATED_OPCODES_START): + + // scalar binary operations (atan2, eq, ne, lt, le, gt, ge, cmp, etc.) + public static final short ATAN2 = 228; + public static final short BINARY_AND = 229; + public static final short BINARY_OR = 230; + public static final short BINARY_XOR = 231; + public static final short EQ = 239; + public static final short NE = 240; + public static final short LT = 241; + public static final short LE = 242; + public static final short GT = 243; + public static final short GE = 244; + public static final short CMP = 245; + public static final short X = 264; + + // scalar unary operations (chr, ord, abs, sin, cos, lc, uc, etc.) + public static final short INT = 221; + public static final short LOG = 222; + public static final short SQRT = 223; + public static final short COS = 224; + public static final short SIN = 225; + public static final short EXP = 226; + public static final short ABS = 227; + public static final short BINARY_NOT = 232; + public static final short INTEGER_BITWISE_NOT = 233; + public static final short ORD = 234; + public static final short ORD_BYTES = 235; + public static final short OCT = 236; + public static final short HEX = 237; + public static final short SRAND = 238; + public static final short CHR = 246; + public static final short CHR_BYTES = 247; + public static final short LENGTH_BYTES = 248; + public static final short QUOTEMETA = 249; + public static final short FC = 250; + public static final short LC = 251; + public static final short LCFIRST = 252; + public static final short UC = 253; + public static final short UCFIRST = 254; + public static final short SLEEP = 255; + public static final short TELL = 256; + public static final short GETC = 257; + public static final short RMDIR = 258; + public static final short CLOSEDIR = 259; + public static final short REWINDDIR = 260; + public static final short TELLDIR = 261; + public static final short CHDIR = 262; + public static 
final short EXIT = 263; + +2. ADD TO BytecodeInterpreter.java (at marker: // GENERATED_HANDLERS_START): + + // scalar_binary + case Opcodes.ATAN2: + case Opcodes.BINARY_AND: + case Opcodes.BINARY_OR: + case Opcodes.BINARY_XOR: + case Opcodes.EQ: + case Opcodes.NE: + case Opcodes.LT: + case Opcodes.LE: + case Opcodes.GT: + case Opcodes.GE: + case Opcodes.CMP: + case Opcodes.X: + pc = ScalarBinaryOpcodeHandler.execute(opcode, bytecode, pc, registers); + break; + + // scalar_unary + case Opcodes.INT: + case Opcodes.LOG: + case Opcodes.SQRT: + case Opcodes.COS: + case Opcodes.SIN: + case Opcodes.EXP: + case Opcodes.ABS: + case Opcodes.BINARY_NOT: + case Opcodes.INTEGER_BITWISE_NOT: + case Opcodes.ORD: + case Opcodes.ORD_BYTES: + case Opcodes.OCT: + case Opcodes.HEX: + case Opcodes.SRAND: + case Opcodes.CHR: + case Opcodes.CHR_BYTES: + case Opcodes.LENGTH_BYTES: + case Opcodes.QUOTEMETA: + case Opcodes.FC: + case Opcodes.LC: + case Opcodes.LCFIRST: + case Opcodes.UC: + case Opcodes.UCFIRST: + case Opcodes.SLEEP: + case Opcodes.TELL: + case Opcodes.GETC: + case Opcodes.RMDIR: + case Opcodes.CLOSEDIR: + case Opcodes.REWINDDIR: + case Opcodes.TELLDIR: + case Opcodes.CHDIR: + case Opcodes.EXIT: + pc = ScalarUnaryOpcodeHandler.execute(opcode, bytecode, pc, registers); + break; + +3. 
ADD TO InterpretedCode.java disassemble() (at marker: // GENERATED_DISASM_START): + + // scalar_binary + case Opcodes.ATAN2: + case Opcodes.BINARY_AND: + case Opcodes.BINARY_OR: + case Opcodes.BINARY_XOR: + case Opcodes.EQ: + case Opcodes.NE: + case Opcodes.LT: + case Opcodes.LE: + case Opcodes.GT: + case Opcodes.GE: + case Opcodes.CMP: + case Opcodes.X: + pc = ScalarBinaryOpcodeHandler.disassemble(opcode, bytecode, pc, sb); + break; + + // scalar_unary + case Opcodes.INT: + case Opcodes.LOG: + case Opcodes.SQRT: + case Opcodes.COS: + case Opcodes.SIN: + case Opcodes.EXP: + case Opcodes.ABS: + case Opcodes.BINARY_NOT: + case Opcodes.INTEGER_BITWISE_NOT: + case Opcodes.ORD: + case Opcodes.ORD_BYTES: + case Opcodes.OCT: + case Opcodes.HEX: + case Opcodes.SRAND: + case Opcodes.CHR: + case Opcodes.CHR_BYTES: + case Opcodes.LENGTH_BYTES: + case Opcodes.QUOTEMETA: + case Opcodes.FC: + case Opcodes.LC: + case Opcodes.LCFIRST: + case Opcodes.UC: + case Opcodes.UCFIRST: + case Opcodes.SLEEP: + case Opcodes.TELL: + case Opcodes.GETC: + case Opcodes.RMDIR: + case Opcodes.CLOSEDIR: + case Opcodes.REWINDDIR: + case Opcodes.TELLDIR: + case Opcodes.CHDIR: + case Opcodes.EXIT: + pc = ScalarUnaryOpcodeHandler.disassemble(opcode, bytecode, pc, sb); + break; + +4. 
ADD TO BytecodeCompiler.java visit(OperatorNode) (at marker: // GENERATED_OPERATORS_START): + +Add cases for each operator following the pattern: +} else if (op.equals("chr")) { + // chr($x) - convert codepoint to character + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("chr requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.CHR); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; +} + + +Next opcode available: 265 + +Operators to add in BytecodeCompiler: + - atan2 + - binary& + - binary| + - binary^ + - eq + - ne + - lt + - le + - gt + - ge + - cmp + - x + - int + - log + - sqrt + - cos + - sin + - exp + - abs + - binary~ + - integerBitwiseNot + - ord + - ordBytes + - oct + - hex + - srand + - chr + - chrBytes + - lengthBytes + - quotemeta + - fc + - lc + - lcfirst + - uc + - ucfirst + - sleep + - tell + - getc + - rmdir + - closedir + - rewinddir + - telldir + - chdir + - exit diff --git a/dev/prompts/opcode_conflicts.md b/dev/prompts/opcode_conflicts.md new file mode 100644 index 000000000..1c4aefd4d --- /dev/null +++ b/dev/prompts/opcode_conflicts.md @@ -0,0 +1,61 @@ +# Opcode Conflicts Report + +When attempting to add bulk operator support to the interpreter, we discovered that many operators already have opcodes defined: + +## Already Exist (with existing opcodes): +- RAND (91) - duplicate at 239 +- LENGTH (30) - duplicate at 249 +- RINDEX (173) - duplicate at 257 +- INDEX (172) - duplicate at 258 +- REQUIRE (170) - duplicate at 267 +- ISA (105) - duplicate at 268 +- BLESS (104) - duplicate at 269 +- REF (103) - duplicate at 270 +- JOIN (88) - duplicate at 273 +- PROTOTYPE (158) - duplicate at 274 + +## New opcodes that don't conflict (221-274 range): +- INT, LOG, SQRT, COS, SIN, EXP, ABS, ATAN2 +- BINARY_AND, 
BINARY_OR, BINARY_XOR, BINARY_NOT, INTEGER_BITWISE_NOT +- ORD, ORD_BYTES, OCT, HEX, SRAND +- EQ, NE, LT, LE, GT, GE, CMP +- CHR, CHR_BYTES, LENGTH_BYTES +- QUOTEMETA, FC, LC, LCFIRST, UC, UCFIRST +- SLEEP, TELL, GETC, RMDIR, CLOSEDIR, REWINDDIR, TELLDIR, CHDIR +- EXIT, X + +## Next Steps: +1. Remove duplicate opcode definitions from Opcodes.java +2. Update handler classes to remove or redirect duplicate operators +3. Remove duplicate case labels from BytecodeInterpreter.java and InterpretedCode.java +4. Fix method signature issues (getc, prototype) +5. Add BytecodeCompiler integration to emit these opcodes for function calls + +## BytecodeCompiler Integration: +Need to add cases in visit(OperatorNode) around line 5700+ where other built-ins like "length" are handled. +Example pattern: +```java +} else if (op.equals("chr")) { + // chr($x) - convert codepoint to character + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("chr requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + + int rd = allocateRegister(); + emit(Opcodes.CHR); + emitReg(rd); + emitReg(argReg); + + lastResultReg = rd; +} +``` + +This needs to be added for each new operator. diff --git a/dev/prompts/operator_generation_summary.md b/dev/prompts/operator_generation_summary.md new file mode 100644 index 000000000..6a1e6ba7e --- /dev/null +++ b/dev/prompts/operator_generation_summary.md @@ -0,0 +1,35 @@ +# Operator Generation Summary + +## Tool Created +- `dev/tools/generate_opcode_handlers.pl` - Generates opcode handlers from OperatorHandler.java +- Automatically skips operators that already have opcodes +- Generates clean handler classes with -> syntax + +## Generated Files +- `ScalarUnaryOpcodeHandler.java` - 32 unary operators (chr, ord, abs, sin, cos, lc, uc, etc.) 
+- `ScalarBinaryOpcodeHandler.java` - 12 binary operators (atan2, eq, ne, lt, le, gt, ge, cmp, binary&, binary|, binary^, x) + +## Opcodes Reserved +- 221-264 (44 opcodes for new operators) +- Next available: 265 + +## Operators Skipped (already have opcodes) +- rand (91), length (30), rindex (173), index (172) +- require (170), isa (105), bless (104), ref (103) +- join (88), prototype (158) + +## Integration Steps Needed +1. Add opcodes to Opcodes.java (see generated_opcodes_report.txt) +2. Add handler cases to BytecodeInterpreter.java +3. Add disassembly cases to InterpretedCode.java +4. Add emit cases to BytecodeCompiler.java for each operator + +## Markers Added to Files +Files now have markers like: +- `// GENERATED_OPCODES_START` in Opcodes.java +- `// GENERATED_HANDLERS_START` in BytecodeInterpreter.java +- `// GENERATED_DISASM_START` in InterpretedCode.java +- `// GENERATED_OPERATORS_START` in BytecodeCompiler.java + +## Next Enhancement +The tool could be enhanced to automatically insert/update code at these markers. 
diff --git a/dev/tools/generate_opcode_handlers.pl b/dev/tools/generate_opcode_handlers.pl new file mode 100755 index 000000000..a351b730e --- /dev/null +++ b/dev/tools/generate_opcode_handlers.pl @@ -0,0 +1,589 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use File::Path qw(make_path); + +# Configuration +my $operator_handler_file = 'src/main/java/org/perlonjava/operators/OperatorHandler.java'; +my $opcodes_file = 'src/main/java/org/perlonjava/interpreter/Opcodes.java'; +my $bytecode_interpreter_file = 'src/main/java/org/perlonjava/interpreter/BytecodeInterpreter.java'; +my $interpreted_code_file = 'src/main/java/org/perlonjava/interpreter/InterpretedCode.java'; +my $bytecode_compiler_file = 'src/main/java/org/perlonjava/interpreter/BytecodeCompiler.java'; +my $output_dir = 'src/main/java/org/perlonjava/interpreter'; + +# Read existing opcodes and LASTOP from Opcodes.java +print "Reading existing opcodes...\n"; +my %existing_opcodes = read_existing_opcodes($opcodes_file); +my $OPCODE_START = $existing_opcodes{__LASTOP__} + 1; +print " Starting new opcodes at $OPCODE_START (LASTOP + 1)\n"; + +# Read OperatorHandler.java +open my $fh, '<', $operator_handler_file or die "Cannot open $operator_handler_file: $!"; +my $content = do { local $/; <$fh> }; +close $fh; + +# Parse operators +my %operators_by_sig; + +print "\nParsing OperatorHandler.java...\n"; +while ($content =~ /put\("([^"]+)",\s*"(\w+)",\s*"([^"]+)"(?:,\s*"([^"]+)")?\)/g) { + my ($op_name, $method, $class_path, $descriptor) = ($1, $2, $3, $4); + + # Skip operators with special characters that are already handled + next if $op_name =~ /^[+\-*\/%&|^<>=!.~]+$/; + next if $op_name =~ /^(binary|unaryMinus|xor|not|\.\.)$/; + + # Skip operators with known signature issues + next if $op_name eq 'getc'; # varargs signature: (int, RuntimeBase...) 
+ + # Default descriptor for binary scalar operators + $descriptor //= "(Lorg/perlonjava/runtime/RuntimeScalar;Lorg/perlonjava/runtime/RuntimeScalar;)Lorg/perlonjava/runtime/RuntimeScalar;"; + + my $class = $class_path =~ s|.*/||r; + my $sig_type = classify_signature($descriptor); + + # Skip already implemented or complex signatures + next if $sig_type eq 'skip'; + + my $opcode_const = opcode_name($op_name); + + # Check if opcode already exists + if (exists $existing_opcodes{$opcode_const}) { + print " Skipping $op_name ($opcode_const) - already exists as opcode $existing_opcodes{$opcode_const}\n"; + next; + } + + my $op = { + name => $op_name, + opcode_name => $opcode_const, + method => $method, + class => $class, + class_path => $class_path, + descriptor => $descriptor, + }; + + push @{$operators_by_sig{$sig_type}}, $op; +} + +# Now assign contiguous opcode numbers by signature type +my $opcode_num = $OPCODE_START; + +for my $sig_type (sort keys %operators_by_sig) { + for my $op (@{$operators_by_sig{$sig_type}}) { + $op->{opcode_num} = $opcode_num++; + } +} + +print "\nParsed operators by signature:\n"; +for my $sig (sort keys %operators_by_sig) { + printf " %-20s: %d operators\n", $sig, scalar @{$operators_by_sig{$sig}}; +} +print "\n"; + +# Generate handler for each signature type +for my $sig_type (sort keys %operators_by_sig) { + generate_handler($sig_type, $operators_by_sig{$sig_type}); +} + +# Update source files with generated code +print "\nUpdating source files...\n"; +update_opcodes_file(\%operators_by_sig, $opcode_num); +update_bytecode_interpreter(\%operators_by_sig); +update_interpreted_code(\%operators_by_sig); +update_bytecode_compiler(\%operators_by_sig); + +print "\nGeneration complete!\n"; +print "Next opcode available: $opcode_num\n"; + +sub read_existing_opcodes { + my ($filename) = @_; + + open my $fh, '<', $filename or die "Cannot open $filename: $!"; + my $content = do { local $/; <$fh> }; + close $fh; + + my %opcodes; + + # Remove the 
GENERATED section to avoid reading our own generated opcodes + $content =~ s/\/\/ GENERATED_OPCODES_START.*?\/\/ GENERATED_OPCODES_END//s; + + # Match: public static final short OPCODE_NAME = value; + while ($content =~ /public\s+static\s+final\s+short\s+(\w+)\s*=\s*(\d+);/g) { + my ($name, $value) = ($1, $2); + $opcodes{$name} = $value; + } + + # Match: private static final short LASTOP = value; + if ($content =~ /private\s+static\s+final\s+short\s+LASTOP\s*=\s*(\d+);/) { + $opcodes{__LASTOP__} = $1; + } else { + die "Cannot find LASTOP in $filename\n"; + } + + print " Found " . (scalar(keys %opcodes) - 1) . " existing manual opcodes, LASTOP = $opcodes{__LASTOP__}\n"; + return %opcodes; +} + +sub opcode_name { + my ($name) = @_; + + # Handle special operator names + my %special = ( + 'binary&' => 'BINARY_AND', + 'binary|' => 'BINARY_OR', + 'binary^' => 'BINARY_XOR', + 'binary~' => 'BINARY_NOT', + ); + + return $special{$name} if exists $special{$name}; + + # Convert camelCase to UPPER_CASE + $name =~ s/([a-z])([A-Z])/$1_$2/g; # insert underscore before caps + $name = uc($name); + + return $name; +} + +sub classify_signature { + my ($desc) = @_; + + # Extract parameter types and return type + my ($params) = $desc =~ /\(([^)]*)\)/; + my ($return) = $desc =~ /\)(.+)$/; + + # Count parameter types + my @param_types = $params =~ /(L[^;]+;|I|Z)/g; + my $param_count = scalar @param_types; + + # Check for special types + my $has_list = $params =~ /RuntimeList/; + my $has_array = $params =~ /RuntimeArray/; + my $has_base = $params =~ /RuntimeBase/; + my $has_int_param = $params =~ /\bI/; + my $has_varargs = $params =~ /\[L/; + + # Classify by signature pattern + if ($has_varargs || $params =~ /\[Lorg/) { + return 'skip'; # Variable args need special handling + } + + # Scalar unary: (RuntimeScalar) -> RuntimeScalar + if ($param_count == 1 && $params =~ /RuntimeScalar/ && $return =~ /RuntimeScalar/) { + return 'scalar_unary'; + } + + # Scalar binary: (RuntimeScalar, 
RuntimeScalar) -> RuntimeScalar + if ($param_count == 2 && !$has_list && !$has_array && !$has_int_param + && $return =~ /RuntimeScalar/ && !$has_varargs) { + return 'scalar_binary'; + } + + # Scalar ternary: (RuntimeScalar, RuntimeScalar, RuntimeScalar) -> RuntimeScalar + if ($param_count == 3 && $params =~ /^(Lorg\/perlonjava\/runtime\/RuntimeScalar;){3}$/ + && $return =~ /RuntimeScalar/) { + return 'scalar_ternary'; + } + + return 'skip'; +} + +sub generate_handler { + my ($sig_type, $ops) = @_; + + return unless $ops && @$ops; + + # Generate class name + my %sig_to_class = ( + scalar_unary => 'ScalarUnaryOpcodeHandler', + scalar_binary => 'ScalarBinaryOpcodeHandler', + scalar_ternary => 'ScalarTernaryOpcodeHandler', + ); + + my $class_name = $sig_to_class{$sig_type} or return; + my $output_file = "$output_dir/$class_name.java"; + + print "Generating $class_name with " . scalar(@$ops) . " operators...\n"; + + my $java_code = generate_java_class($class_name, $sig_type, $ops); + + open my $out, '>', $output_file or die "Cannot write $output_file: $!"; + print $out $java_code; + close $out; + + print " Generated: $output_file\n"; +} + +sub generate_java_class { + my ($class_name, $sig_type, $ops) = @_; + + # Collect imports - convert Java internal path format to dotted format + my %classes; + for my $op (@$ops) { + my $import_path = $op->{class_path}; + $import_path =~ s|/|.|g; # Convert / to . 
+ $classes{$op->{class}} = $import_path; + } + + my $imports = join "\n", map { "import $_;" } sort values %classes; + + # Generate register loading code + my ($register_load, $register_list, $disasm_regs) = get_register_code($sig_type); + + # Generate switch cases + my @switch_cases; + my @disasm_cases; + + for my $op (@$ops) { + my $call = generate_method_call($op, $sig_type); + push @switch_cases, " case Opcodes.$op->{opcode_name} -> $call;"; + + my $disasm = generate_disasm_case($op, $sig_type); + push @disasm_cases, $disasm; + } + + my $switch_cases_str = join "\n", @switch_cases; + my $disasm_cases_str = join "\n", @disasm_cases; + + my $description = get_signature_description($sig_type); + + return qq{package org.perlonjava.interpreter; + +import org.perlonjava.runtime.RuntimeBase; +import org.perlonjava.runtime.RuntimeScalar; +$imports + +/** + * Handler for $description + * Generated by dev/tools/generate_opcode_handlers.pl + * DO NOT EDIT MANUALLY - regenerate using the tool + */ +public class $class_name { + + /** + * Execute $description operation. + */ + public static int execute(int opcode, short[] bytecode, int pc, + RuntimeBase[] registers) { + // Read registers (shared by all opcodes in this group) +$register_load + + // Dispatch based on specific opcode + registers[rd] = switch (opcode) { +$switch_cases_str + default -> throw new IllegalStateException("Unknown opcode in $class_name: " + opcode); + }; + + return pc; + } + + /** + * Disassemble $description operation. 
+ */ + public static int disassemble(int opcode, short[] bytecode, int pc, + StringBuilder sb) { +$disasm_regs + + switch (opcode) { +$disasm_cases_str + default -> sb.append("UNKNOWN_").append(opcode).append("\\n"); + } + + return pc; + } +} +}; +} + +sub get_register_code { + my ($sig_type) = @_; + + if ($sig_type eq 'scalar_unary') { + return ( + " int rd = bytecode[pc++];\n int rs = bytecode[pc++];", + "registers[rs]", + " int rd = bytecode[pc++];\n int rs = bytecode[pc++];" + ); + } elsif ($sig_type eq 'scalar_binary') { + return ( + " int rd = bytecode[pc++];\n int rs1 = bytecode[pc++];\n int rs2 = bytecode[pc++];", + "registers[rs1], registers[rs2]", + " int rd = bytecode[pc++];\n int rs1 = bytecode[pc++];\n int rs2 = bytecode[pc++];" + ); + } elsif ($sig_type eq 'scalar_ternary') { + return ( + " int rd = bytecode[pc++];\n int rs1 = bytecode[pc++];\n int rs2 = bytecode[pc++];\n int rs3 = bytecode[pc++];", + "registers[rs1], registers[rs2], registers[rs3]", + " int rd = bytecode[pc++];\n int rs1 = bytecode[pc++];\n int rs2 = bytecode[pc++];\n int rs3 = bytecode[pc++];" + ); + } +} + +sub generate_method_call { + my ($op, $sig_type) = @_; + + if ($sig_type eq 'scalar_unary') { + return "$op->{class}.$op->{method}((RuntimeScalar) registers[rs])"; + } elsif ($sig_type eq 'scalar_binary') { + return "$op->{class}.$op->{method}((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2])"; + } elsif ($sig_type eq 'scalar_ternary') { + return "$op->{class}.$op->{method}((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2], (RuntimeScalar) registers[rs3])"; + } +} + +sub generate_disasm_case { + my ($op, $sig_type) = @_; + + my $name = uc($op->{name}); + + if ($sig_type eq 'scalar_unary') { + return qq{ case Opcodes.$op->{opcode_name} -> sb.append("$op->{opcode_name} r").append(rd).append(" = $op->{name}(r").append(rs).append(")\\n");}; + } elsif ($sig_type eq 'scalar_binary') { + return qq{ case Opcodes.$op->{opcode_name} -> 
sb.append("$op->{opcode_name} r").append(rd).append(" = $op->{name}(r").append(rs1).append(", r").append(rs2).append(")\\n");}; + } elsif ($sig_type eq 'scalar_ternary') { + return qq{ case Opcodes.$op->{opcode_name} -> sb.append("$op->{opcode_name} r").append(rd).append(" = $op->{name}(r").append(rs1).append(", r").append(rs2).append(", r").append(rs3).append(")\\n");}; + } +} + +sub get_signature_description { + my ($sig_type) = @_; + + my %descriptions = ( + scalar_unary => 'scalar unary operations (chr, ord, abs, sin, cos, lc, uc, etc.)', + scalar_binary => 'scalar binary operations (atan2, eq, ne, lt, le, gt, ge, cmp, etc.)', + scalar_ternary => 'scalar ternary operations (substr with position)', + ); + + return $descriptions{$sig_type} || $sig_type; +} + +sub generate_update_instructions { + my ($operators_by_sig) = @_; + + print "\n" . "="x70 . "\n"; + print "UPDATE INSTRUCTIONS\n"; + print "="x70 . "\n\n"; + + # 1. Opcodes.java additions + print "1. ADD TO Opcodes.java (at marker: // GENERATED_OPCODES_START):\n\n"; + + for my $sig_type (sort keys %$operators_by_sig) { + my $desc = get_signature_description($sig_type); + print " // $desc\n"; + + for my $op (@{$operators_by_sig->{$sig_type}}) { + printf " public static final short %s = %d;\n", + $op->{opcode_name}, $op->{opcode_num}; + } + print "\n"; + } + + # 2. BytecodeInterpreter.java additions + print "2. ADD TO BytecodeInterpreter.java (at marker: // GENERATED_HANDLERS_START):\n\n"; + + for my $sig_type (sort keys %$operators_by_sig) { + my %sig_to_class = ( + scalar_unary => 'ScalarUnaryOpcodeHandler', + scalar_binary => 'ScalarBinaryOpcodeHandler', + scalar_ternary => 'ScalarTernaryOpcodeHandler', + ); + my $handler = $sig_to_class{$sig_type}; + + print " // $sig_type\n"; + for my $op (@{$operators_by_sig->{$sig_type}}) { + print " case Opcodes.$op->{opcode_name}:\n"; + } + print " pc = $handler.execute(opcode, bytecode, pc, registers);\n"; + print " break;\n\n"; + } + + # 3. 
InterpretedCode.java disassembly + print "3. ADD TO InterpretedCode.java disassemble() (at marker: // GENERATED_DISASM_START):\n\n"; + + for my $sig_type (sort keys %$operators_by_sig) { + my %sig_to_class = ( + scalar_unary => 'ScalarUnaryOpcodeHandler', + scalar_binary => 'ScalarBinaryOpcodeHandler', + scalar_ternary => 'ScalarTernaryOpcodeHandler', + ); + my $handler = $sig_to_class{$sig_type}; + + print " // $sig_type\n"; + for my $op (@{$operators_by_sig->{$sig_type}}) { + print " case Opcodes.$op->{opcode_name}:\n"; + } + print " pc = $handler.disassemble(opcode, bytecode, pc, sb);\n"; + print " break;\n\n"; + } + + # 4. BytecodeCompiler.java additions + print "4. ADD TO BytecodeCompiler.java visit(OperatorNode) (at marker: // GENERATED_OPERATORS_START):\n\n"; + print "Add cases for each operator following the pattern:\n"; + print "} else if (op.equals(\"chr\")) {\n"; + print " // chr(\$x) - convert codepoint to character\n"; + print " if (node.operand instanceof ListNode) {\n"; + print " ListNode list = (ListNode) node.operand;\n"; + print " if (!list.elements.isEmpty()) {\n"; + print " list.elements.get(0).accept(this);\n"; + print " } else {\n"; + print " throwCompilerException(\"chr requires an argument\");\n"; + print " }\n"; + print " } else {\n"; + print " node.operand.accept(this);\n"; + print " }\n"; + print " int argReg = lastResultReg;\n"; + print " int rd = allocateRegister();\n"; + print " emit(Opcodes.CHR);\n"; + print " emitReg(rd);\n"; + print " emitReg(argReg);\n"; + print " lastResultReg = rd;\n"; + print "}\n\n"; + + print "\nNext opcode available: $opcode_num\n"; + print "\nOperators to add in BytecodeCompiler:\n"; + for my $sig_type (sort keys %$operators_by_sig) { + for my $op (@{$operators_by_sig->{$sig_type}}) { + print " - $op->{name}\n"; + } + } +} + +sub update_file_at_markers { + my ($filename, $start_marker, $end_marker, $new_content) = @_; + + # Read file + open my $fh, '<', $filename or die "Cannot open $filename: $!"; + my 
@lines = <$fh>; + close $fh; + + # Find markers + my ($start_idx, $end_idx); + for my $i (0 .. $#lines) { + if ($lines[$i] =~ /\Q$start_marker\E/) { + $start_idx = $i; + } + if ($lines[$i] =~ /\Q$end_marker\E/) { + $end_idx = $i; + last if defined $start_idx; + } + } + + unless (defined $start_idx && defined $end_idx) { + die "Cannot find markers $start_marker and $end_marker in $filename\n"; + } + + # Replace content between markers + splice @lines, $start_idx + 1, $end_idx - $start_idx - 1, $new_content; + + # Write file + open my $out, '>', $filename or die "Cannot write $filename: $!"; + print $out @lines; + close $out; + + print " Updated $filename\n"; +} + +sub update_opcodes_file { + my ($operators_by_sig, $next_opcode) = @_; + + my @content; + + for my $sig_type (sort keys %$operators_by_sig) { + my $desc = get_signature_description($sig_type); + push @content, "\n // $desc\n"; + + for my $op (@{$operators_by_sig->{$sig_type}}) { + my $offset = $op->{opcode_num} - $existing_opcodes{__LASTOP__}; + push @content, sprintf(" public static final short %s = LASTOP + %d;\n", + $op->{opcode_name}, $offset); + } + } + + update_file_at_markers($opcodes_file, '// GENERATED_OPCODES_START', '// GENERATED_OPCODES_END', + join('', @content)); +} + +sub update_bytecode_interpreter { + my ($operators_by_sig) = @_; + + my @content; + + for my $sig_type (sort keys %$operators_by_sig) { + my %sig_to_class = ( + scalar_unary => 'ScalarUnaryOpcodeHandler', + scalar_binary => 'ScalarBinaryOpcodeHandler', + scalar_ternary => 'ScalarTernaryOpcodeHandler', + ); + my $handler = $sig_to_class{$sig_type}; + + push @content, "\n // $sig_type\n"; + for my $op (@{$operators_by_sig->{$sig_type}}) { + push @content, " case Opcodes.$op->{opcode_name}:\n"; + } + push @content, " pc = $handler.execute(opcode, bytecode, pc, registers);\n"; + push @content, " break;\n"; + } + + update_file_at_markers($bytecode_interpreter_file, '// GENERATED_HANDLERS_START', '// GENERATED_HANDLERS_END', + 
join('', @content)); +} + +sub update_interpreted_code { + my ($operators_by_sig) = @_; + + my @content; + + for my $sig_type (sort keys %$operators_by_sig) { + my %sig_to_class = ( + scalar_unary => 'ScalarUnaryOpcodeHandler', + scalar_binary => 'ScalarBinaryOpcodeHandler', + scalar_ternary => 'ScalarTernaryOpcodeHandler', + ); + my $handler = $sig_to_class{$sig_type}; + + push @content, "\n // $sig_type\n"; + for my $op (@{$operators_by_sig->{$sig_type}}) { + push @content, " case Opcodes.$op->{opcode_name}:\n"; + } + push @content, " pc = $handler.disassemble(opcode, bytecode, pc, sb);\n"; + push @content, " break;\n"; + } + + update_file_at_markers($interpreted_code_file, '// GENERATED_DISASM_START', '// GENERATED_DISASM_END', + join('', @content)); +} + +sub update_bytecode_compiler { + my ($operators_by_sig) = @_; + + my @content; + + # Only generate unary operators for now (binary/ternary need different patterns) + if (exists $operators_by_sig->{scalar_unary}) { + for my $op (@{$operators_by_sig->{scalar_unary}}) { + my $op_name = $op->{name}; + my $opcode_name = $op->{opcode_name}; + + push @content, " } else if (op.equals(\"$op_name\")) {\n"; + push @content, " // $op_name(\$x) - $op->{class}.$op->{method}\n"; + push @content, " if (node.operand instanceof ListNode) {\n"; + push @content, " ListNode list = (ListNode) node.operand;\n"; + push @content, " if (!list.elements.isEmpty()) {\n"; + push @content, " list.elements.get(0).accept(this);\n"; + push @content, " } else {\n"; + push @content, " throwCompilerException(\"$op_name requires an argument\");\n"; + push @content, " }\n"; + push @content, " } else {\n"; + push @content, " node.operand.accept(this);\n"; + push @content, " }\n"; + push @content, " int argReg = lastResultReg;\n"; + push @content, " int rd = allocateRegister();\n"; + push @content, " emit(Opcodes.$opcode_name);\n"; + push @content, " emitReg(rd);\n"; + push @content, " emitReg(argReg);\n"; + push @content, " lastResultReg = rd;\n"; + 
} + } + + update_file_at_markers($bytecode_compiler_file, '// GENERATED_OPERATORS_START', '// GENERATED_OPERATORS_END', + join('', @content)); +} diff --git a/src/main/java/org/perlonjava/codegen/EmitLogicalOperator.java b/src/main/java/org/perlonjava/codegen/EmitLogicalOperator.java index ed6bd473c..b80192f60 100644 --- a/src/main/java/org/perlonjava/codegen/EmitLogicalOperator.java +++ b/src/main/java/org/perlonjava/codegen/EmitLogicalOperator.java @@ -311,15 +311,10 @@ private static void emitLogicalOperatorSimple(EmitterVisitor emitterVisitor, Bin rewritten = true; } - // For RUNTIME context, preserve it; otherwise use SCALAR for boolean evaluation - int operandContext = emitterVisitor.ctx.contextType == RuntimeContextType.RUNTIME - ? RuntimeContextType.RUNTIME - : RuntimeContextType.SCALAR; - resultRef = emitterVisitor.ctx.javaClassInfo.acquireSpillRefOrAllocate(emitterVisitor.ctx.symbolTable); - // Evaluate LHS and store it. - node.left.accept(emitterVisitor.with(operandContext)); + // Evaluate LHS in SCALAR context (for boolean test) and store it. + node.left.accept(emitterVisitor.with(RuntimeContextType.SCALAR)); emitterVisitor.ctx.javaClassInfo.storeSpillRef(mv, resultRef); // Boolean test on the stored LHS. @@ -327,8 +322,12 @@ private static void emitLogicalOperatorSimple(EmitterVisitor emitterVisitor, Bin mv.visitMethodInsn(Opcodes.INVOKEVIRTUAL, "org/perlonjava/runtime/RuntimeBase", getBoolean, "()Z", false); mv.visitJumpInsn(compareOpcode, endLabel); - // LHS didn't short-circuit: evaluate RHS, overwrite result. - node.right.accept(emitterVisitor.with(operandContext)); + // LHS didn't short-circuit: evaluate RHS in current context (may be RUNTIME at sub exit). + // For RUNTIME context, preserve it; otherwise use SCALAR for boolean evaluation. + int rhsContext = emitterVisitor.ctx.contextType == RuntimeContextType.RUNTIME + ? 
RuntimeContextType.RUNTIME + : RuntimeContextType.SCALAR; + node.right.accept(emitterVisitor.with(rhsContext)); emitterVisitor.ctx.javaClassInfo.storeSpillRef(mv, resultRef); // Return whichever side won the short-circuit. diff --git a/src/main/java/org/perlonjava/interpreter/BytecodeCompiler.java b/src/main/java/org/perlonjava/interpreter/BytecodeCompiler.java index 0049c76d7..db4ba2156 100644 --- a/src/main/java/org/perlonjava/interpreter/BytecodeCompiler.java +++ b/src/main/java/org/perlonjava/interpreter/BytecodeCompiler.java @@ -3060,43 +3060,45 @@ private int compileBinaryOperatorSwitch(String operator, int rs1, int rs2, int t emit(currentCallContext); } case "&" -> { - // String bitwise AND (default): rs1 & rs2 - // Note: binary& (with use integer) is handled separately - emit(Opcodes.STRING_BITWISE_AND); + // Numeric bitwise AND (default): rs1 & rs2 + emit(Opcodes.BITWISE_AND_BINARY); emitReg(rd); emitReg(rs1); emitReg(rs2); } case "binary&" -> { // Numeric bitwise AND (use integer): rs1 binary& rs2 + // Same as & but explicitly numeric emit(Opcodes.BITWISE_AND_BINARY); emitReg(rd); emitReg(rs1); emitReg(rs2); } case "|" -> { - // String bitwise OR (default): rs1 | rs2 - emit(Opcodes.STRING_BITWISE_OR); + // Numeric bitwise OR (default): rs1 | rs2 + emit(Opcodes.BITWISE_OR_BINARY); emitReg(rd); emitReg(rs1); emitReg(rs2); } case "binary|" -> { // Numeric bitwise OR (use integer): rs1 binary| rs2 + // Same as | but explicitly numeric emit(Opcodes.BITWISE_OR_BINARY); emitReg(rd); emitReg(rs1); emitReg(rs2); } case "^" -> { - // String bitwise XOR (default): rs1 ^ rs2 - emit(Opcodes.STRING_BITWISE_XOR); + // Numeric bitwise XOR (default): rs1 ^ rs2 + emit(Opcodes.BITWISE_XOR_BINARY); emitReg(rd); emitReg(rs1); emitReg(rs2); } case "binary^" -> { // Numeric bitwise XOR (use integer): rs1 binary^ rs2 + // Same as ^ but explicitly numeric emit(Opcodes.BITWISE_XOR_BINARY); emitReg(rd); emitReg(rs1); @@ -5853,6 +5855,566 @@ public void visit(OperatorNode node) { 
emitReg(2); // Register 2 contains the calling context lastResultReg = rd; + // GENERATED_OPERATORS_START + } else if (op.equals("int")) { + // int($x) - MathOperators.integer + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("int requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.INT); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("log")) { + // log($x) - MathOperators.log + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("log requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.LOG); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("sqrt")) { + // sqrt($x) - MathOperators.sqrt + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("sqrt requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.SQRT); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("cos")) { + // cos($x) - MathOperators.cos + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("cos requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.COS); + emitReg(rd); + emitReg(argReg); + 
lastResultReg = rd; + } else if (op.equals("sin")) { + // sin($x) - MathOperators.sin + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("sin requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.SIN); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("exp")) { + // exp($x) - MathOperators.exp + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("exp requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.EXP); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("abs")) { + // abs($x) - MathOperators.abs + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("abs requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.ABS); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("binary~")) { + // binary~($x) - BitwiseOperators.bitwiseNotBinary + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("binary~ requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.BINARY_NOT); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("integerBitwiseNot")) { + // 
integerBitwiseNot($x) - BitwiseOperators.integerBitwiseNot + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("integerBitwiseNot requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.INTEGER_BITWISE_NOT); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("ord")) { + // ord($x) - ScalarOperators.ord + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("ord requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.ORD); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("ordBytes")) { + // ordBytes($x) - ScalarOperators.ordBytes + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("ordBytes requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.ORD_BYTES); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("oct")) { + // oct($x) - ScalarOperators.oct + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("oct requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.OCT); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("hex")) { + // hex($x) - 
ScalarOperators.hex + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("hex requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.HEX); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("srand")) { + // srand($x) - Random.srand + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("srand requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.SRAND); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("chr")) { + // chr($x) - StringOperators.chr + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("chr requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.CHR); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("chrBytes")) { + // chrBytes($x) - StringOperators.chrBytes + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("chrBytes requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.CHR_BYTES); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("lengthBytes")) { + // lengthBytes($x) - StringOperators.lengthBytes + if (node.operand 
instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("lengthBytes requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.LENGTH_BYTES); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("quotemeta")) { + // quotemeta($x) - StringOperators.quotemeta + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("quotemeta requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.QUOTEMETA); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("fc")) { + // fc($x) - StringOperators.fc + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("fc requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.FC); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("lc")) { + // lc($x) - StringOperators.lc + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("lc requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.LC); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("lcfirst")) { + // lcfirst($x) - StringOperators.lcfirst + if (node.operand instanceof ListNode) { + ListNode list = 
(ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("lcfirst requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.LCFIRST); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("uc")) { + // uc($x) - StringOperators.uc + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("uc requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.UC); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("ucfirst")) { + // ucfirst($x) - StringOperators.ucfirst + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("ucfirst requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.UCFIRST); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("sleep")) { + // sleep($x) - Time.sleep + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("sleep requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.SLEEP); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("tell")) { + // tell($x) - IOOperator.tell + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + 
list.elements.get(0).accept(this); + } else { + throwCompilerException("tell requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.TELL); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("rmdir")) { + // rmdir($x) - Directory.rmdir + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("rmdir requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.RMDIR); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("closedir")) { + // closedir($x) - Directory.closedir + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("closedir requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.CLOSEDIR); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("rewinddir")) { + // rewinddir($x) - Directory.rewinddir + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("rewinddir requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.REWINDDIR); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("telldir")) { + // telldir($x) - Directory.telldir + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else 
{ + throwCompilerException("telldir requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.TELLDIR); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("chdir")) { + // chdir($x) - Directory.chdir + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("chdir requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.CHDIR); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + } else if (op.equals("exit")) { + // exit($x) - WarnDie.exit + if (node.operand instanceof ListNode) { + ListNode list = (ListNode) node.operand; + if (!list.elements.isEmpty()) { + list.elements.get(0).accept(this); + } else { + throwCompilerException("exit requires an argument"); + } + } else { + node.operand.accept(this); + } + int argReg = lastResultReg; + int rd = allocateRegister(); + emit(Opcodes.EXIT); + emitReg(rd); + emitReg(argReg); + lastResultReg = rd; + // GENERATED_OPERATORS_END } else { throwCompilerException("Unsupported operator: " + op); } diff --git a/src/main/java/org/perlonjava/interpreter/BytecodeInterpreter.java b/src/main/java/org/perlonjava/interpreter/BytecodeInterpreter.java index c49879143..b34f79e4e 100644 --- a/src/main/java/org/perlonjava/interpreter/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/interpreter/BytecodeInterpreter.java @@ -2100,6 +2100,66 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c // DEPRECATED: SLOW_OP removed - all operations now use direct opcodes (114-154) + // ================================================================= + // GENERATED BUILT-IN FUNCTION HANDLERS + // ================================================================= + // 
Generated by dev/tools/generate_opcode_handlers.pl + // DO NOT EDIT MANUALLY - regenerate using the tool + + // GENERATED_HANDLERS_START + + // scalar_binary + case Opcodes.ATAN2: + case Opcodes.BINARY_AND: + case Opcodes.BINARY_OR: + case Opcodes.BINARY_XOR: + case Opcodes.EQ: + case Opcodes.NE: + case Opcodes.LT: + case Opcodes.LE: + case Opcodes.GT: + case Opcodes.GE: + case Opcodes.CMP: + case Opcodes.X: + pc = ScalarBinaryOpcodeHandler.execute(opcode, bytecode, pc, registers); + break; + + // scalar_unary + case Opcodes.INT: + case Opcodes.LOG: + case Opcodes.SQRT: + case Opcodes.COS: + case Opcodes.SIN: + case Opcodes.EXP: + case Opcodes.ABS: + case Opcodes.BINARY_NOT: + case Opcodes.INTEGER_BITWISE_NOT: + case Opcodes.ORD: + case Opcodes.ORD_BYTES: + case Opcodes.OCT: + case Opcodes.HEX: + case Opcodes.SRAND: + case Opcodes.CHR: + case Opcodes.CHR_BYTES: + case Opcodes.LENGTH_BYTES: + case Opcodes.QUOTEMETA: + case Opcodes.FC: + case Opcodes.LC: + case Opcodes.LCFIRST: + case Opcodes.UC: + case Opcodes.UCFIRST: + case Opcodes.SLEEP: + case Opcodes.TELL: + case Opcodes.RMDIR: + case Opcodes.CLOSEDIR: + case Opcodes.REWINDDIR: + case Opcodes.TELLDIR: + case Opcodes.CHDIR: + case Opcodes.EXIT: + pc = ScalarUnaryOpcodeHandler.execute(opcode, bytecode, pc, registers); + break; + // GENERATED_HANDLERS_END + default: // Unknown opcode int opcodeInt = opcode & 0xFF; diff --git a/src/main/java/org/perlonjava/interpreter/InterpretedCode.java b/src/main/java/org/perlonjava/interpreter/InterpretedCode.java index 56b0da026..460fbe200 100644 --- a/src/main/java/org/perlonjava/interpreter/InterpretedCode.java +++ b/src/main/java/org/perlonjava/interpreter/InterpretedCode.java @@ -1172,6 +1172,67 @@ public String disassemble() { break; // DEPRECATED: SLOW_OP case removed - opcode 87 is no longer emitted // All operations now use direct opcodes (114-154) + + // ================================================================= + // GENERATED BUILT-IN FUNCTION DISASSEMBLY + // 
================================================================= + // Generated by dev/tools/generate_opcode_handlers.pl + // DO NOT EDIT MANUALLY - regenerate using the tool + + // GENERATED_DISASM_START + + // scalar_binary + case Opcodes.ATAN2: + case Opcodes.BINARY_AND: + case Opcodes.BINARY_OR: + case Opcodes.BINARY_XOR: + case Opcodes.EQ: + case Opcodes.NE: + case Opcodes.LT: + case Opcodes.LE: + case Opcodes.GT: + case Opcodes.GE: + case Opcodes.CMP: + case Opcodes.X: + pc = ScalarBinaryOpcodeHandler.disassemble(opcode, bytecode, pc, sb); + break; + + // scalar_unary + case Opcodes.INT: + case Opcodes.LOG: + case Opcodes.SQRT: + case Opcodes.COS: + case Opcodes.SIN: + case Opcodes.EXP: + case Opcodes.ABS: + case Opcodes.BINARY_NOT: + case Opcodes.INTEGER_BITWISE_NOT: + case Opcodes.ORD: + case Opcodes.ORD_BYTES: + case Opcodes.OCT: + case Opcodes.HEX: + case Opcodes.SRAND: + case Opcodes.CHR: + case Opcodes.CHR_BYTES: + case Opcodes.LENGTH_BYTES: + case Opcodes.QUOTEMETA: + case Opcodes.FC: + case Opcodes.LC: + case Opcodes.LCFIRST: + case Opcodes.UC: + case Opcodes.UCFIRST: + case Opcodes.SLEEP: + case Opcodes.TELL: + case Opcodes.RMDIR: + case Opcodes.CLOSEDIR: + case Opcodes.REWINDDIR: + case Opcodes.TELLDIR: + case Opcodes.CHDIR: + case Opcodes.EXIT: + pc = ScalarUnaryOpcodeHandler.disassemble(opcode, bytecode, pc, sb); + break; + // GENERATED_DISASM_END + default: sb.append("UNKNOWN(").append(opcode & 0xFF).append(")\n"); break; diff --git a/src/main/java/org/perlonjava/interpreter/Opcodes.java b/src/main/java/org/perlonjava/interpreter/Opcodes.java index 449ddcb47..b75bbc08f 100644 --- a/src/main/java/org/perlonjava/interpreter/Opcodes.java +++ b/src/main/java/org/perlonjava/interpreter/Opcodes.java @@ -464,7 +464,7 @@ public class Opcodes { public static final short ISA = 105; // ================================================================= - // ITERATOR OPERATIONS (106-108) - For efficient foreach loops + // ITERATOR OPERATIONS - For efficient 
foreach loops // ================================================================= /** Create iterator: rd = rs.iterator() - get Iterator from Iterable */ @@ -595,21 +595,9 @@ public class Opcodes { /** rd = Time.sleep(seconds) - sleep for specified seconds */ public static final short SLEEP_OP = 154; - // ================================================================= - // OPCODES 155-32767: RESERVED FOR FUTURE OPERATIONS - // ================================================================= - // See TODO_SHORT_OPCODES.md for allocation plan: - // - 200-299: Reserved for core expansion - // - 300-399: Comparison operators (CONTIGUOUS blocks!) - // - 400-549: Arithmetic and bitwise operators (CONTIGUOUS blocks!) - // - 550-749: String and array operations (CONTIGUOUS blocks!) - // - 750-949: Hash operations (CONTIGUOUS blocks!) - // - 1000+: OperatorHandler promotions (200+ operators) - // ================================================================= // PHASE 3: OPERATORHANDLER PROMOTIONS (400-499) - Math Operators // ================================================================= - // Promoted from OperatorHandler for 10-100x performance improvement. // IMPORTANT: Keep CONTIGUOUS for JVM tableswitch optimization! 
// Math Operators (400-409) - CONTIGUOUS @@ -739,7 +727,7 @@ public class Opcodes { public static final short BITWISE_NOT_STRING = 187; // ================================================================= - // FILE TEST AND STAT OPERATIONS (188-218) + // FILE TEST AND STAT OPERATIONS // ================================================================= /** stat operator: rd = stat(rs) [context] @@ -811,7 +799,7 @@ public class Opcodes { public static final short MATCH_REGEX_NOT = 217; // ================================================================= - // LOOP CONTROL OPERATIONS (218-220) - last/next/redo + // LOOP CONTROL OPERATIONS - last/next/redo // ================================================================= /** Loop last: Jump to end of loop or return RuntimeControlFlowList for non-local @@ -830,10 +818,64 @@ public class Opcodes { public static final short REDO = 220; // ================================================================= - // OPCODES 221-32767: RESERVED FOR FUTURE OPERATIONS - // ================================================================= - // See PHASE3_OPERATOR_PROMOTIONS.md for promotion strategy. - // All SLOWOP_* constants have been removed - use direct opcodes 114-154 instead. + // BUILT-IN FUNCTION OPCODES - after LASTOP + // Last manually-assigned opcode (for tool reference) + private static final short LASTOP = 220; + + // ================================================================= + // Generated by dev/tools/generate_opcode_handlers.pl + // DO NOT EDIT MANUALLY - regenerate using the tool + + // GENERATED_OPCODES_START + + // scalar binary operations (atan2, eq, ne, lt, le, gt, ge, cmp, etc.) 
+ public static final short ATAN2 = LASTOP + 1; + public static final short BINARY_AND = LASTOP + 2; + public static final short BINARY_OR = LASTOP + 3; + public static final short BINARY_XOR = LASTOP + 4; + public static final short EQ = LASTOP + 5; + public static final short NE = LASTOP + 6; + public static final short LT = LASTOP + 7; + public static final short LE = LASTOP + 8; + public static final short GT = LASTOP + 9; + public static final short GE = LASTOP + 10; + public static final short CMP = LASTOP + 11; + public static final short X = LASTOP + 12; + + // scalar unary operations (chr, ord, abs, sin, cos, lc, uc, etc.) + public static final short INT = LASTOP + 13; + public static final short LOG = LASTOP + 14; + public static final short SQRT = LASTOP + 15; + public static final short COS = LASTOP + 16; + public static final short SIN = LASTOP + 17; + public static final short EXP = LASTOP + 18; + public static final short ABS = LASTOP + 19; + public static final short BINARY_NOT = LASTOP + 20; + public static final short INTEGER_BITWISE_NOT = LASTOP + 21; + public static final short ORD = LASTOP + 22; + public static final short ORD_BYTES = LASTOP + 23; + public static final short OCT = LASTOP + 24; + public static final short HEX = LASTOP + 25; + public static final short SRAND = LASTOP + 26; + public static final short CHR = LASTOP + 27; + public static final short CHR_BYTES = LASTOP + 28; + public static final short LENGTH_BYTES = LASTOP + 29; + public static final short QUOTEMETA = LASTOP + 30; + public static final short FC = LASTOP + 31; + public static final short LC = LASTOP + 32; + public static final short LCFIRST = LASTOP + 33; + public static final short UC = LASTOP + 34; + public static final short UCFIRST = LASTOP + 35; + public static final short SLEEP = LASTOP + 36; + public static final short TELL = LASTOP + 37; + public static final short RMDIR = LASTOP + 38; + public static final short CLOSEDIR = LASTOP + 39; + public static final 
short REWINDDIR = LASTOP + 40; + public static final short TELLDIR = LASTOP + 41; + public static final short CHDIR = LASTOP + 42; + public static final short EXIT = LASTOP + 43; + // GENERATED_OPCODES_END + private Opcodes() {} // Utility class - no instantiation } diff --git a/src/main/java/org/perlonjava/interpreter/ScalarBinaryOpcodeHandler.java b/src/main/java/org/perlonjava/interpreter/ScalarBinaryOpcodeHandler.java new file mode 100644 index 000000000..6ef0d3c14 --- /dev/null +++ b/src/main/java/org/perlonjava/interpreter/ScalarBinaryOpcodeHandler.java @@ -0,0 +1,74 @@ +package org.perlonjava.interpreter; + +import org.perlonjava.runtime.RuntimeBase; +import org.perlonjava.runtime.RuntimeScalar; +import org.perlonjava.operators.BitwiseOperators; +import org.perlonjava.operators.CompareOperators; +import org.perlonjava.operators.MathOperators; +import org.perlonjava.operators.Operator; + +/** + * Handler for scalar binary operations (atan2, eq, ne, lt, le, gt, ge, cmp, etc.) + * Generated by dev/tools/generate_opcode_handlers.pl + * DO NOT EDIT MANUALLY - regenerate using the tool + */ +public class ScalarBinaryOpcodeHandler { + + /** + * Execute scalar binary operations (atan2, eq, ne, lt, le, gt, ge, cmp, etc.) operation. 
+ */ + public static int execute(int opcode, short[] bytecode, int pc, + RuntimeBase[] registers) { + // Read registers (shared by all opcodes in this group) + int rd = bytecode[pc++]; + int rs1 = bytecode[pc++]; + int rs2 = bytecode[pc++]; + + // Dispatch based on specific opcode + registers[rd] = switch (opcode) { + case Opcodes.ATAN2 -> MathOperators.atan2((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.BINARY_AND -> BitwiseOperators.bitwiseAndBinary((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.BINARY_OR -> BitwiseOperators.bitwiseOrBinary((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.BINARY_XOR -> BitwiseOperators.bitwiseXorBinary((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.EQ -> CompareOperators.eq((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.NE -> CompareOperators.ne((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.LT -> CompareOperators.lt((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.LE -> CompareOperators.le((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.GT -> CompareOperators.gt((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.GE -> CompareOperators.ge((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.CMP -> CompareOperators.cmp((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + case Opcodes.X -> Operator.repeat((RuntimeScalar) registers[rs1], (RuntimeScalar) registers[rs2]); + default -> throw new IllegalStateException("Unknown opcode in ScalarBinaryOpcodeHandler: " + opcode); + }; + + return pc; + } + + /** + * Disassemble scalar binary operations (atan2, eq, ne, lt, le, gt, ge, cmp, etc.) operation. 
+ */ + public static int disassemble(int opcode, short[] bytecode, int pc, + StringBuilder sb) { + int rd = bytecode[pc++]; + int rs1 = bytecode[pc++]; + int rs2 = bytecode[pc++]; + + switch (opcode) { + case Opcodes.ATAN2 -> sb.append("ATAN2 r").append(rd).append(" = atan2(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.BINARY_AND -> sb.append("BINARY_AND r").append(rd).append(" = binary&(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.BINARY_OR -> sb.append("BINARY_OR r").append(rd).append(" = binary|(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.BINARY_XOR -> sb.append("BINARY_XOR r").append(rd).append(" = binary^(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.EQ -> sb.append("EQ r").append(rd).append(" = eq(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.NE -> sb.append("NE r").append(rd).append(" = ne(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.LT -> sb.append("LT r").append(rd).append(" = lt(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.LE -> sb.append("LE r").append(rd).append(" = le(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.GT -> sb.append("GT r").append(rd).append(" = gt(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.GE -> sb.append("GE r").append(rd).append(" = ge(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.CMP -> sb.append("CMP r").append(rd).append(" = cmp(r").append(rs1).append(", r").append(rs2).append(")\n"); + case Opcodes.X -> sb.append("X r").append(rd).append(" = x(r").append(rs1).append(", r").append(rs2).append(")\n"); + default -> sb.append("UNKNOWN_").append(opcode).append("\n"); + } + + return pc; + } +} diff --git a/src/main/java/org/perlonjava/interpreter/ScalarUnaryOpcodeHandler.java b/src/main/java/org/perlonjava/interpreter/ScalarUnaryOpcodeHandler.java new file mode 100644 index 
000000000..b8098d30f --- /dev/null +++ b/src/main/java/org/perlonjava/interpreter/ScalarUnaryOpcodeHandler.java @@ -0,0 +1,115 @@ +package org.perlonjava.interpreter; + +import org.perlonjava.runtime.RuntimeBase; +import org.perlonjava.runtime.RuntimeScalar; +import org.perlonjava.operators.BitwiseOperators; +import org.perlonjava.operators.Directory; +import org.perlonjava.operators.IOOperator; +import org.perlonjava.operators.MathOperators; +import org.perlonjava.operators.Random; +import org.perlonjava.operators.ScalarOperators; +import org.perlonjava.operators.StringOperators; +import org.perlonjava.operators.Time; +import org.perlonjava.operators.WarnDie; + +/** + * Handler for scalar unary operations (chr, ord, abs, sin, cos, lc, uc, etc.) + * Generated by dev/tools/generate_opcode_handlers.pl + * DO NOT EDIT MANUALLY - regenerate using the tool + */ +public class ScalarUnaryOpcodeHandler { + + /** + * Execute scalar unary operations (chr, ord, abs, sin, cos, lc, uc, etc.) operation. 
+ */ + public static int execute(int opcode, short[] bytecode, int pc, + RuntimeBase[] registers) { + // Read registers (shared by all opcodes in this group) + int rd = bytecode[pc++]; + int rs = bytecode[pc++]; + + // Dispatch based on specific opcode + registers[rd] = switch (opcode) { + case Opcodes.INT -> MathOperators.integer((RuntimeScalar) registers[rs]); + case Opcodes.LOG -> MathOperators.log((RuntimeScalar) registers[rs]); + case Opcodes.SQRT -> MathOperators.sqrt((RuntimeScalar) registers[rs]); + case Opcodes.COS -> MathOperators.cos((RuntimeScalar) registers[rs]); + case Opcodes.SIN -> MathOperators.sin((RuntimeScalar) registers[rs]); + case Opcodes.EXP -> MathOperators.exp((RuntimeScalar) registers[rs]); + case Opcodes.ABS -> MathOperators.abs((RuntimeScalar) registers[rs]); + case Opcodes.BINARY_NOT -> BitwiseOperators.bitwiseNotBinary((RuntimeScalar) registers[rs]); + case Opcodes.INTEGER_BITWISE_NOT -> BitwiseOperators.integerBitwiseNot((RuntimeScalar) registers[rs]); + case Opcodes.ORD -> ScalarOperators.ord((RuntimeScalar) registers[rs]); + case Opcodes.ORD_BYTES -> ScalarOperators.ordBytes((RuntimeScalar) registers[rs]); + case Opcodes.OCT -> ScalarOperators.oct((RuntimeScalar) registers[rs]); + case Opcodes.HEX -> ScalarOperators.hex((RuntimeScalar) registers[rs]); + case Opcodes.SRAND -> Random.srand((RuntimeScalar) registers[rs]); + case Opcodes.CHR -> StringOperators.chr((RuntimeScalar) registers[rs]); + case Opcodes.CHR_BYTES -> StringOperators.chrBytes((RuntimeScalar) registers[rs]); + case Opcodes.LENGTH_BYTES -> StringOperators.lengthBytes((RuntimeScalar) registers[rs]); + case Opcodes.QUOTEMETA -> StringOperators.quotemeta((RuntimeScalar) registers[rs]); + case Opcodes.FC -> StringOperators.fc((RuntimeScalar) registers[rs]); + case Opcodes.LC -> StringOperators.lc((RuntimeScalar) registers[rs]); + case Opcodes.LCFIRST -> StringOperators.lcfirst((RuntimeScalar) registers[rs]); + case Opcodes.UC -> StringOperators.uc((RuntimeScalar) 
registers[rs]); + case Opcodes.UCFIRST -> StringOperators.ucfirst((RuntimeScalar) registers[rs]); + case Opcodes.SLEEP -> Time.sleep((RuntimeScalar) registers[rs]); + case Opcodes.TELL -> IOOperator.tell((RuntimeScalar) registers[rs]); + case Opcodes.RMDIR -> Directory.rmdir((RuntimeScalar) registers[rs]); + case Opcodes.CLOSEDIR -> Directory.closedir((RuntimeScalar) registers[rs]); + case Opcodes.REWINDDIR -> Directory.rewinddir((RuntimeScalar) registers[rs]); + case Opcodes.TELLDIR -> Directory.telldir((RuntimeScalar) registers[rs]); + case Opcodes.CHDIR -> Directory.chdir((RuntimeScalar) registers[rs]); + case Opcodes.EXIT -> WarnDie.exit((RuntimeScalar) registers[rs]); + default -> throw new IllegalStateException("Unknown opcode in ScalarUnaryOpcodeHandler: " + opcode); + }; + + return pc; + } + + /** + * Disassemble scalar unary operations (chr, ord, abs, sin, cos, lc, uc, etc.) operation. + */ + public static int disassemble(int opcode, short[] bytecode, int pc, + StringBuilder sb) { + int rd = bytecode[pc++]; + int rs = bytecode[pc++]; + + switch (opcode) { + case Opcodes.INT -> sb.append("INT r").append(rd).append(" = int(r").append(rs).append(")\n"); + case Opcodes.LOG -> sb.append("LOG r").append(rd).append(" = log(r").append(rs).append(")\n"); + case Opcodes.SQRT -> sb.append("SQRT r").append(rd).append(" = sqrt(r").append(rs).append(")\n"); + case Opcodes.COS -> sb.append("COS r").append(rd).append(" = cos(r").append(rs).append(")\n"); + case Opcodes.SIN -> sb.append("SIN r").append(rd).append(" = sin(r").append(rs).append(")\n"); + case Opcodes.EXP -> sb.append("EXP r").append(rd).append(" = exp(r").append(rs).append(")\n"); + case Opcodes.ABS -> sb.append("ABS r").append(rd).append(" = abs(r").append(rs).append(")\n"); + case Opcodes.BINARY_NOT -> sb.append("BINARY_NOT r").append(rd).append(" = binary~(r").append(rs).append(")\n"); + case Opcodes.INTEGER_BITWISE_NOT -> sb.append("INTEGER_BITWISE_NOT r").append(rd).append(" = 
integerBitwiseNot(r").append(rs).append(")\n"); + case Opcodes.ORD -> sb.append("ORD r").append(rd).append(" = ord(r").append(rs).append(")\n"); + case Opcodes.ORD_BYTES -> sb.append("ORD_BYTES r").append(rd).append(" = ordBytes(r").append(rs).append(")\n"); + case Opcodes.OCT -> sb.append("OCT r").append(rd).append(" = oct(r").append(rs).append(")\n"); + case Opcodes.HEX -> sb.append("HEX r").append(rd).append(" = hex(r").append(rs).append(")\n"); + case Opcodes.SRAND -> sb.append("SRAND r").append(rd).append(" = srand(r").append(rs).append(")\n"); + case Opcodes.CHR -> sb.append("CHR r").append(rd).append(" = chr(r").append(rs).append(")\n"); + case Opcodes.CHR_BYTES -> sb.append("CHR_BYTES r").append(rd).append(" = chrBytes(r").append(rs).append(")\n"); + case Opcodes.LENGTH_BYTES -> sb.append("LENGTH_BYTES r").append(rd).append(" = lengthBytes(r").append(rs).append(")\n"); + case Opcodes.QUOTEMETA -> sb.append("QUOTEMETA r").append(rd).append(" = quotemeta(r").append(rs).append(")\n"); + case Opcodes.FC -> sb.append("FC r").append(rd).append(" = fc(r").append(rs).append(")\n"); + case Opcodes.LC -> sb.append("LC r").append(rd).append(" = lc(r").append(rs).append(")\n"); + case Opcodes.LCFIRST -> sb.append("LCFIRST r").append(rd).append(" = lcfirst(r").append(rs).append(")\n"); + case Opcodes.UC -> sb.append("UC r").append(rd).append(" = uc(r").append(rs).append(")\n"); + case Opcodes.UCFIRST -> sb.append("UCFIRST r").append(rd).append(" = ucfirst(r").append(rs).append(")\n"); + case Opcodes.SLEEP -> sb.append("SLEEP r").append(rd).append(" = sleep(r").append(rs).append(")\n"); + case Opcodes.TELL -> sb.append("TELL r").append(rd).append(" = tell(r").append(rs).append(")\n"); + case Opcodes.RMDIR -> sb.append("RMDIR r").append(rd).append(" = rmdir(r").append(rs).append(")\n"); + case Opcodes.CLOSEDIR -> sb.append("CLOSEDIR r").append(rd).append(" = closedir(r").append(rs).append(")\n"); + case Opcodes.REWINDDIR -> sb.append("REWINDDIR r").append(rd).append(" = 
rewinddir(r").append(rs).append(")\n"); + case Opcodes.TELLDIR -> sb.append("TELLDIR r").append(rd).append(" = telldir(r").append(rs).append(")\n"); + case Opcodes.CHDIR -> sb.append("CHDIR r").append(rd).append(" = chdir(r").append(rs).append(")\n"); + case Opcodes.EXIT -> sb.append("EXIT r").append(rd).append(" = exit(r").append(rs).append(")\n"); + default -> sb.append("UNKNOWN_").append(opcode).append("\n"); + } + + return pc; + } +} diff --git a/src/main/java/org/perlonjava/regex/RegexPreprocessor.java b/src/main/java/org/perlonjava/regex/RegexPreprocessor.java index 9dd956897..78d4d8871 100644 --- a/src/main/java/org/perlonjava/regex/RegexPreprocessor.java +++ b/src/main/java/org/perlonjava/regex/RegexPreprocessor.java @@ -77,6 +77,10 @@ static String preProcessRegex(String s, RegexFlags regexFlags) { captureGroupCount = 0; deferredUnicodePropertyEncountered = false; + // First, escape invalid quantifier braces (Perl compatibility) + // DISABLED: Causes test regressions - needs more work + // s = escapeInvalidQuantifierBraces(s); + s = convertPythonStyleGroups(s); s = transformSimpleConditionals(s); s = removeUnderscoresFromEscapes(s); @@ -93,6 +97,214 @@ static String preProcessRegex(String s, RegexFlags regexFlags) { return result; } + /** + * Escape unescaped braces that don't form valid quantifiers. + * Perl allows invalid quantifier braces and treats them as literals. + * Java Pattern.compile() rejects them, so we must escape them. + * + * Valid quantifiers: {n}, {n,}, {n,m} where n and m are non-negative integers + * Invalid quantifiers: {(.*?)}, {abc}, {}, {,5}, etc. + * + * IMPORTANT: This is a high-risk preprocessing step that modifies brace characters. + * Known edge cases that must be handled correctly: + * + * 1. 
ESCAPE SEQUENCES WITH BRACES (must NOT be escaped): + * - \N{name} - Named Unicode character (e.g., \N{LATIN SMALL LETTER A}) + * - \x{...} - Hexadecimal character code (e.g., \x{1F600}) + * - \o{...} - Octal character code (e.g., \o{777}) + * - \p{...} - Unicode property (e.g., \p{Letter}) + * - \P{...} - Negated Unicode property (e.g., \P{Number}) + * - \g{...} - Named or relative backreference (e.g., \g{name}, \g{-1}) + * Currently handled: N, x, o, p, P, g + * + * 2. CHARACTER CLASSES (braces inside [...] are always literal): + * - [a{3}] means "match 'a', '{', '3', or '}'" not "match 'aaa'" + * - Nested classes like [a-z[0-9]{3}] must track nesting depth + * + * 3. VALID QUANTIFIERS (must NOT be escaped): + * - {n} - exactly n times (e.g., a{3}) + * - {n,} - n or more times (e.g., a{2,}) + * - {n,m} - between n and m times (e.g., a{2,5}) + * + * 4. ALREADY ESCAPED BRACES (must NOT be double-escaped): + * - \{ and \} should remain as-is + * - Track backslash escaping carefully to avoid double-escaping + * + * 5. POSSESSIVE AND LAZY QUANTIFIERS: + * - {n}+ (possessive) and {n}? (lazy) should work with valid quantifiers + * + * POTENTIAL ISSUES NOT YET HANDLED: + * - Extended bracketed character classes: (?[...]) may contain braces + * - Conditional patterns: (?(condition){yes}{no}) uses braces for branches + * - Subroutine definitions: (?(DEFINE)(?<name>...)) may have complex nesting + * - Code blocks: (?{...}) and (??{...}) use braces but are handled elsewhere + * - Named capture definitions: (?<name>...) - are braces allowed in names? + * - Unicode named sequences: \N{...} may contain nested braces in some contexts + * + * If new regex features are added that use braces, this function MUST be updated. + * Test changes thoroughly with unit/regex/unescaped_braces.t and regex test suite.
+ */ + private static String escapeInvalidQuantifierBraces(String pattern) { + StringBuilder result = new StringBuilder(); + boolean inCharClass = false; + boolean escaped = false; + + for (int i = 0; i < pattern.length(); i++) { + char c = pattern.charAt(i); + + // Handle escape sequences + if (escaped) { + result.append(c); + + // Check if this is an escape sequence that uses braces: \N{...}, \x{...}, \o{...}, \p{...}, \P{...}, \g{...} + if ((c == 'N' || c == 'x' || c == 'o' || c == 'p' || c == 'P' || c == 'g') && + i + 1 < pattern.length() && pattern.charAt(i + 1) == '{') { + // Skip the entire escape sequence with braces + result.append('{'); + i++; // Move past '{' + int braceDepth = 1; + i++; // Move to first character inside braces + + while (i < pattern.length() && braceDepth > 0) { + char ch = pattern.charAt(i); + result.append(ch); + if (ch == '\\' && i + 1 < pattern.length()) { + // Skip escaped character inside the escape sequence + i++; + if (i < pattern.length()) { + result.append(pattern.charAt(i)); + } + } else if (ch == '{') { + braceDepth++; + } else if (ch == '}') { + braceDepth--; + } + i++; + } + i--; // Back up one since the loop will increment + } + + escaped = false; + continue; + } + + if (c == '\\') { + result.append(c); + escaped = true; + continue; + } + + // Track character class boundaries (braces inside [...] 
are always literal) + if (c == '[') { + inCharClass = true; + result.append(c); + continue; + } + if (c == ']') { + inCharClass = false; + result.append(c); + continue; + } + + // Only process braces outside character classes + if (!inCharClass && c == '{') { + // Look ahead to check if this is a valid quantifier + int closePos = findMatchingCloseBraceForEscape(pattern, i); + if (closePos > 0 && isValidQuantifierContent(pattern, i + 1, closePos)) { + result.append(c); // Keep valid quantifier as-is + } else { + result.append("\\{"); // Escape invalid quantifier + } + } else if (!inCharClass && c == '}') { + // Check if this closes a quantifier that we kept unescaped + if (!closesValidQuantifier(result, pattern, i)) { + result.append("\\}"); // Escape unmatched closing brace + } else { + result.append(c); + } + } else { + result.append(c); + } + } + + return result.toString(); + } + + /** + * Find the position of closing brace that matches opening brace at pos. + * Returns -1 if no matching brace found. + */ + private static int findMatchingCloseBraceForEscape(String pattern, int openPos) { + for (int i = openPos + 1; i < pattern.length(); i++) { + char c = pattern.charAt(i); + if (c == '\\') { + i++; // Skip escaped character + continue; + } + if (c == '}') { + return i; + } + } + return -1; // No closing brace found + } + + /** + * Check if content between braces forms a valid quantifier. + * Valid: {n}, {n,}, {n,m} where n and m are non-negative integers + * Invalid: {(.*?)}, {abc}, {}, {,5}, etc. 
+ */ + private static boolean isValidQuantifierContent(String pattern, int start, int end) { + if (start >= end) { + return false; // Empty braces {} + } + + String content = pattern.substring(start, end); + + // Check for {n}, {n,}, or {n,m} pattern + if (content.matches("\\d+")) { + return true; // {n} + } + if (content.matches("\\d+,")) { + return true; // {n,} + } + if (content.matches("\\d+,\\d+")) { + return true; // {n,m} + } + + return false; + } + + /** + * Check if closing brace at position closePos closes a valid quantifier + * that we kept unescaped in the result buffer. + */ + private static boolean closesValidQuantifier(StringBuilder result, String pattern, int closePos) { + // Find the most recent unescaped opening brace in result + int openPos = -1; + for (int i = result.length() - 1; i >= 0; i--) { + if (result.charAt(i) == '{') { + // Check if it's escaped + int backslashCount = 0; + for (int j = i - 1; j >= 0 && result.charAt(j) == '\\'; j--) { + backslashCount++; + } + if (backslashCount % 2 == 0) { + // Even number of backslashes (or zero) means { is not escaped + openPos = i; + break; + } + } + } + + if (openPos < 0) { + return false; // No unescaped opening brace found + } + + // Extract content and validate + String content = result.substring(openPos + 1); + return content.matches("\\d+") || content.matches("\\d+,") || content.matches("\\d+,\\d+"); + } + /** * Expand characters with multi-character case folds into alternations. 
* For example: ß → (?:ß|ss|SS|Ss|sS) diff --git a/src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java b/src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java index 71b6b2915..2cc6db9be 100644 --- a/src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java +++ b/src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java @@ -19,49 +19,53 @@ static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset) char nextChar = s.charAt(offset); // Check for numeric backreferences vs octal escapes - // In Perl: \400, \600, \777 are octals (> 255), not backreferences - // But \1-\9 followed by non-octal digits are backreferences + // In Perl: + // - \1 through \9 are backreferences (when groups exist) + // - \10, \11, etc. are also backreferences (when groups exist) + // - \0 through \377 (up to 3 digits) are octal escapes (values 0-255) + // - \400 and above are octal escapes (values > 255) + // - If no groups exist, \1-\9 are treated as octals, not errors + // + // Key insight: A sequence like \337 is a 3-digit octal (decimal 223 = ß) + // It should NOT be treated as backreference \3 followed by literal "37" + // + // Strategy: + // 1. Check if we have a valid 3-digit octal sequence -> always treat as octal + // 2. If we have 1-2 digits starting with \1-\9: + // - If capture groups exist -> treat as backreference + // - If no capture groups exist -> treat as octal boolean isOctalNotBackref = false; - if (nextChar >= '1' && nextChar <= '9') { - // Check if this might be a 3-digit octal > 255 - if (nextChar >= '1' && nextChar <= '7' && offset + 2 < length) { - int d1 = nextChar - '0'; + if (nextChar >= '0' && nextChar <= '7') { + // Potential octal - check if we have 2 more octal digits + if (offset + 2 < length) { char c2 = s.charAt(offset + 1); - char c3 = offset + 2 < length ? 
s.charAt(offset + 2) : '\0'; + char c3 = s.charAt(offset + 2); if (c2 >= '0' && c2 <= '7' && c3 >= '0' && c3 <= '7') { - int octalValue = d1 * 64 + (c2 - '0') * 8 + (c3 - '0'); - if (octalValue > 255) { - // This is an octal escape, not a backreference - // Fall through to octal handling below at line ~320 - // Leave the backslash in sb for the octal handler to manage - // offset stays pointing to the first octal digit ('4' in \400) - isOctalNotBackref = true; - } - // else: It's a 3-digit octal <= 255, treat as backreference - // (Perl's behavior: \1-\377 are backreferences if groups exist) + // We have 3 octal digits - this is ALWAYS an octal escape + // Example: \337, \123, \400, etc. + isOctalNotBackref = true; } } + // Note: If we have fewer than 3 octal digits, we'll check for backreferences below + // Example: \1, \12 could be backreferences if groups exist, octals if not } if (!isOctalNotBackref && nextChar >= '1' && nextChar <= '9') { - // This is a backreference like \1, \2, etc. - int refNum = nextChar - '0'; - - // Check if we have ANY capture groups at all - // If there are no groups, this is always an error - // But if there are groups, allow forward references + // Check if we have capture groups if (RegexPreprocessor.captureGroupCount == 0) { - sb.setLength(sb.length() - 1); // Remove the backslash - RegexPreprocessor.regexError(s, offset + 1, "Reference to nonexistent group"); + // No capture groups - treat as octal + // Fall through to octal handling below + isOctalNotBackref = true; + } else { + // This is a backreference like \1, \2, etc. + // Forward references are allowed when there are capture groups + // Perl allows forward references like (\3|b)\2(a) where \3 refers to group 3 + // which hasn't been captured yet. This is valid and the reference just won't match + // until group 3 is actually captured. 
+ sb.append(nextChar); + return offset; } - // Forward references are allowed when there are capture groups - // Perl allows forward references like (\3|b)\2(a) where \3 refers to group 3 - // which hasn't been captured yet. This is valid and the reference just won't match - // until group 3 is actually captured. - - sb.append(nextChar); - return offset; } if (nextChar == 'k' && offset + 1 < length && s.charAt(offset + 1) == '\'') { // Handle \k'name' backreference (Perl syntax) @@ -374,21 +378,22 @@ static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset) sb.setLength(sb.length() - 1); // Remove the backslash sb.append(String.format("\\x{%X}", octalValue)); offset += octalLength - 1; // -1 because caller will increment - } else if (octalValue <= 255 && octalLength == 3) { - // Standard 3-digit octal, prepend 0 for Java + } else if (octalLength == 3) { + // 3-digit octal, prepend 0 for Java + // Java requires \0nnn format sb.append('0'); sb.append(Character.toChars(c2)); - } else if (c2 == '0' && octalLength == 1) { - // Single \0 becomes \00 - sb.append('0'); - sb.append('0'); - } else if (c2 >= '1' && c2 <= '3' && octalLength == 3) { - // 3-digit octal starting with 1-3, prepend 0 - sb.append('0'); - sb.append(Character.toChars(c2)); - } else { - // Short octal or single digit, pass through - sb.append(Character.toChars(c2)); + // The remaining 2 digits will be added by caller's loop + } else if (octalLength == 2) { + // 2-digit octal like \12 becomes \012 + sb.setLength(sb.length() - 1); // Remove the backslash + sb.append(String.format("\\0%o", octalValue)); + offset += octalLength - 1; // Skip the second digit + } else if (octalLength == 1) { + // Single digit octal: \0 through \7 + // Convert to 2-digit format for Java: \00 through \07 + sb.setLength(sb.length() - 1); // Remove the backslash + sb.append(String.format("\\0%o", octalValue)); } } else if (c2 == '8' || c2 == '9') { // \8 and \9 are not valid octals - treat as literal digits 
@@ -610,25 +615,23 @@ static int handleRegexCharacterClassEscape(int offset, String s, StringBuilder s sb.append(String.format("x{%X}", octalValue)); offset += octalLength - 1; // -1 because outer loop will increment lastChar = octalValue; - } else if (octalValue <= 255 && octalLength == 3) { - // Standard 3-digit octal, prepend 0 for Java + } else if (octalLength == 3) { + // 3-digit octal, prepend 0 for Java sb.append('0'); sb.append(Character.toChars(c2)); lastChar = octalValue; - } else if (c2 == '0' && octalLength == 1) { - // Single \0 becomes \00 - sb.append('0'); - sb.append('0'); - lastChar = 0; - } else if (c2 >= '1' && c2 <= '3' && octalLength == 3) { - // 3-digit octal starting with 1-3, prepend 0 - sb.append('0'); - sb.append(Character.toChars(c2)); + } else if (octalLength == 2) { + // 2-digit octal like \12 becomes \012 + sb.setLength(sb.length() - 1); // Remove the backslash + sb.append(String.format("\\0%o", octalValue)); + offset += octalLength - 1; // Skip the second digit + lastChar = octalValue; + } else if (octalLength == 1) { + // Single digit octal: \0 through \7 + // Convert to 2-digit format for Java: \00 through \07 + sb.setLength(sb.length() - 1); // Remove the backslash + sb.append(String.format("\\0%o", octalValue)); lastChar = octalValue; - } else { - // Short octal or single digit, pass through - sb.append(Character.toChars(c2)); - lastChar = c2; } } else if (c2 == '8' || c2 == '9') { // \8 and \9 are not valid octals - treat as literal digits diff --git a/src/main/java/org/perlonjava/regex/RuntimeRegex.java b/src/main/java/org/perlonjava/regex/RuntimeRegex.java index a65296840..6e29f039e 100644 --- a/src/main/java/org/perlonjava/regex/RuntimeRegex.java +++ b/src/main/java/org/perlonjava/regex/RuntimeRegex.java @@ -25,6 +25,9 @@ */ public class RuntimeRegex extends RuntimeBase implements RuntimeScalarReference { + // Debug flag for regex compilation (set at class load time) + private static final boolean DEBUG_REGEX = 
System.getenv("DEBUG_REGEX") != null; + // Constants for regex pattern flags private static final int CASE_INSENSITIVE = Pattern.CASE_INSENSITIVE; private static final int MULTILINE = Pattern.MULTILINE; @@ -80,11 +83,20 @@ public RuntimeRegex() { * @throws IllegalStateException if regex compilation fails. */ public static RuntimeRegex compile(String patternString, String modifiers) { + // Debug logging + if (DEBUG_REGEX) { + System.err.println("RuntimeRegex.compile: pattern=" + patternString + " modifiers=" + modifiers); + System.err.println(" caller stack: " + Thread.currentThread().getStackTrace()[2]); + } + String cacheKey = patternString + "/" + modifiers; // Check if the regex is already cached RuntimeRegex regex = regexCache.get(cacheKey); if (regex == null) { + if (DEBUG_REGEX) { + System.err.println(" cache miss, compiling new regex"); + } regex = new RuntimeRegex(); if (patternString != null && patternString.contains("\\Q")) { @@ -102,6 +114,11 @@ public static RuntimeRegex compile(String patternString, String modifiers) { try { javaPattern = preProcessRegex(patternString, regex.regexFlags); + // Debug logging + if (DEBUG_REGEX) { + System.err.println(" preprocessed pattern=" + javaPattern); + } + // Track if preprocessing deferred user-defined Unicode properties. // These need to be resolved later, once the corresponding Perl subs are defined. 
regex.deferredUserDefinedUnicodeProperties = RegexPreprocessor.hadDeferredUnicodePropertyEncountered(); @@ -149,6 +166,11 @@ public static RuntimeRegex compile(String patternString, String modifiers) { if (regexCache.size() < MAX_REGEX_CACHE_SIZE) { regexCache.put(cacheKey, regex); } + } else { + // Debug logging for cache hit + if (DEBUG_REGEX) { + System.err.println(" cache hit, reusing cached regex"); + } } return regex; } @@ -357,7 +379,8 @@ public static RuntimeBase matchRegex(RuntimeScalar quotedRegex, RuntimeScalar st } // Fast path: no alarm active, use direct matching - return matchRegexDirect(quotedRegex, string, ctx); + RuntimeBase result = matchRegexDirect(quotedRegex, string, ctx); + return result; } /** @@ -367,6 +390,12 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc RuntimeRegex regex = resolveRegex(quotedRegex); regex = ensureCompiledForRuntime(regex); + // Debug logging + if (DEBUG_REGEX) { + System.err.println("matchRegexDirect: pattern=" + regex.pattern.pattern() + + " input=" + string.toString() + " ctx=" + ctx); + } + if (regex.regexFlags.isMatchExactlyOnce() && regex.matched) { // m?PAT? already matched once; now return false if (ctx == RuntimeContextType.LIST) { @@ -503,6 +532,11 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc posScalar.set(scalarUndef); } + // Debug logging + if (DEBUG_REGEX) { + System.err.println(" match result: found=" + found); + } + if (!found) { // No match: scalar match vars ($`, $&, $') should become undef. 
// Keep lastSuccessful* and the previous globalMatcher intact so @-/@+ do not get clobbered @@ -540,6 +574,11 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc } if (ctx == RuntimeContextType.LIST) { + // In LIST context: return captured groups, or (1) for success with no captures (non-global) + if (found && result.elements.isEmpty() && !regex.regexFlags.isGlobalMatch()) { + // Non-global match with no captures in LIST context returns (1) + result.elements.add(RuntimeScalarCache.getScalarInt(1)); + } return result; } else if (ctx == RuntimeContextType.SCALAR) { return RuntimeScalarCache.getScalarBoolean(found);