diff --git a/dev/design/cpan_client.md b/dev/design/cpan_client.md new file mode 100644 index 000000000..302871193 --- /dev/null +++ b/dev/design/cpan_client.md @@ -0,0 +1,267 @@ +# CPAN Client Support for PerlOnJava + +## Overview + +This document analyzes what's needed to run CPAN.pm (or alternatives) on PerlOnJava. + +## Current Status + +CPAN.pm has deep dependencies that make it challenging to port. The main blocker is `Safe`/`Opcode` which requires access to Perl's internal opcode system. + +--- + +## CPAN.pm Dependency Analysis + +### Available (Already Working) + +| Module | Status | +|--------|--------| +| File::Spec, File::Basename, File::Copy, File::Find, File::Path, File::Temp | ✅ | +| Text::ParseWords, Text::Wrap | ✅ | +| Config, Carp, Cwd, Exporter, Fcntl | ✅ | +| FileHandle, IO::File, IO::Handle | ✅ | +| HTTP::Tiny, Compress::Zlib | ✅ | +| Digest::MD5, Digest::SHA, MIME::Base64 | ✅ | +| YAML, JSON, Term::ReadLine | ✅ | + +### Critical Missing Modules + +| Module | Status | Complexity | Notes | +|--------|--------|------------|-------| +| **Safe** | ❌ Missing | High | Sandbox/compartment module - requires Opcode | +| **Opcode** | ❌ Missing | Very High | Core opcodes restriction - deeply tied to Perl internals | +| **DirHandle** | ✅ Done | Low | OO interface to opendir/readdir - imported via sync.pl | +| **Sys::Hostname** | ✅ Done | Low | `hostname()` function - SysHostname.java XS module | +| **ExtUtils::MakeMaker** | ❌ Missing | Very High | Build system - huge module with many dependencies | +| **LWP::UserAgent** | ❌ Missing | Medium | Web client (HTTP::Tiny exists as alternative) | +| **Archive::Tar** | ✅ Done | Medium | Imported via sync.pl | +| **Archive::Zip** | ❌ Missing | Medium | Zip handling - Java has built-in support | +| **Net::FTP** | ✅ Done | Medium | Imported via sync.pl | +| **IPC::Open3** | ❌ Missing | Medium | Process I/O - needs Java ProcessBuilder | +| **IO::Socket** | ✅ Done | Medium | Imported via sync.pl | +| **Dumpvalue** | ✅ Done | Low | Imported via sync.pl | + +### Built-in Functions Missing + +| Function | Status | Notes | +|----------|--------|-------| +| `flock()` | ✅ Implemented | File locking - using java.nio.channels.FileLock | + +--- + +## Import Strategy via sync.pl + +The `dev/import-perl5/sync.pl` script can import pure Perl modules from the perl5 source tree. + +### Quick Wins - Add to config.yaml + +These modules can be imported directly: + +```yaml +# DirHandle - OO directory handle interface +- source: perl5/lib/DirHandle.pm + target: src/main/perl/lib/DirHandle.pm + +# Dumpvalue - Debug dump utility +- source: perl5/dist/Dumpvalue/lib/Dumpvalue.pm + target: src/main/perl/lib/Dumpvalue.pm + +# Sys::Hostname - Get system hostname +- source: perl5/ext/Sys-Hostname/Hostname.pm + target: src/main/perl/lib/Sys/Hostname.pm + +# IPC::Open3 - Open process with 3 filehandles +- source: perl5/ext/IPC-Open3/lib/IPC/Open3.pm + target: src/main/perl/lib/IPC/Open3.pm + +# Archive::Tar (if IO::Zlib is available) +- source: perl5/cpan/Archive-Tar/lib/Archive/Tar.pm + target: src/main/perl/lib/Archive/Tar.pm +- source: perl5/cpan/Archive-Tar/lib/Archive/Tar + target: src/main/perl/lib/Archive/Tar + type: directory + +# Net::FTP and libnet modules +- source: perl5/cpan/libnet/lib/Net + target: src/main/perl/lib/Net + type: directory +``` + +### Modules Requiring Java Implementation + +| Module | Java Implementation Needed | +|--------|---------------------------| +| **flock()** | `java.nio.channels.FileLock` in RuntimeIO.java | +| **IO::Socket** | Wrap `java.net.Socket` / `java.net.ServerSocket` | +| **Sys::Hostname** (XS part) | `java.net.InetAddress.getLocalHost().getHostName()` | + +--- + +## The Safe/Opcode Blocker + +**Safe.pm** is used by CPAN.pm to safely evaluate CPAN metadata (like `META.yml` code). It depends on **Opcode.pm** which: + +1. Uses XSLoader (has C code) +2. Manipulates Perl's internal opcode tree +3. Restricts which operations can run in a compartment + +### Why This Is Hard + +Opcode works by: +- Enumerating all Perl opcodes (300+) +- Creating bitmasks to allow/deny specific operations +- Hooking into Perl's internal compilation + +PerlOnJava compiles to JVM bytecode, not Perl opcodes. Implementing Opcode would require: +- Mapping Perl opcodes to JVM operations +- Implementing compartmentalization at the JVM level +- Possibly using Java SecurityManager (deprecated in newer Java) + +**Verdict**: Opcode/Safe would require significant architectural work. + +--- + +## Alternative Approaches + +### Option 1: Use cpanm (App::cpanminus) + +cpanm is a lighter CPAN client. Need to analyze its dependencies. + +```bash +# Check cpanm dependencies +curl -s https://cpanmin.us | head -200 +``` + +### Option 2: Minimal CPAN Client + +Create a simple CPAN client using modules that already work: + +```perl +# Pseudo-code for minimal CPAN client +use HTTP::Tiny; +use Archive::Tar; # needs import +use File::Temp; + +sub install_module { + my ($module) = @_; + + # 1. Query MetaCPAN API + my $http = HTTP::Tiny->new; + my $resp = $http->get("https://fastapi.metacpan.org/v1/download_url/$module"); + + # 2. Download tarball + my $tarball = download($resp->{download_url}); + + # 3. Extract + Archive::Tar->extract_archive($tarball); + + # 4. Run Makefile.PL or Build.PL (this is the hard part) +} +``` + +### Option 3: Pre-bundle Modules + +Instead of a CPAN client, import pure-Perl modules directly: + +1. Identify commonly needed CPAN modules +2. Add them to `dev/import-perl5/config.yaml` +3. Run `perl dev/import-perl5/sync.pl` + +This is already working for many modules (Pod::*, Test::*, Getopt::Long, etc.) + +--- + +## Implementation Priority + +### Phase 1: Low-hanging fruit (Easy) + +1. **DirHandle** - Add to config.yaml, pure Perl +2. **Dumpvalue** - Add to config.yaml, pure Perl +3. **Sys::Hostname** - Import + Java fallback +4. **flock()** - Implement in Java using FileLock + +### Phase 2: Archive/Network (Medium) + +5. **Archive::Tar** - Import from perl5 tree (needs IO::Zlib check) +6. **Archive::Zip** - Java implementation using `java.util.zip` +7. **IO::Socket** - Java implementation wrapping sockets +8. **Net::FTP** - Import if IO::Socket works + +### Phase 3: Process Control (Medium) + +9. **IPC::Open3** - Import + verify pipe support +10. **IPC::Cmd** - Import if Open3 works + +### Phase 4: Consider Alternatives + +11. Evaluate cpanm dependencies +12. Consider minimal custom CPAN client +13. Document "how to add a CPAN module" for users + +--- + +## Testing Commands + +```bash +# Test module availability +./jperl -e 'use DirHandle; print "OK\n"' +./jperl -e 'use Sys::Hostname; print hostname(), "\n"' +./jperl -e 'use Archive::Tar; print "OK\n"' + +# Test flock (currently fails) +./jperl -e 'use Fcntl qw(:flock); open my $fh, "<", $0; flock($fh, LOCK_SH); print "OK\n"' +``` + +--- + +## Related Documents + +- `dev/design/xsloader.md` - How XSLoader/Java integration works +- `dev/design/http_server.md` - HTTP capabilities +- `.cognition/skills/port-cpan-module/` - Skill for porting CPAN modules + +--- + +## Progress Tracking + +### Current Status: Phase 2 complete + +### Completed +- [x] Analyze CPAN.pm dependencies (2024-03-13) +- [x] Identify modules available in perl5 tree +- [x] Document sync.pl import strategy +- [x] Identify Safe/Opcode blocker +- [x] **Phase 1: Low-hanging fruit** (2024-03-13) + - DirHandle - imported via sync.pl, fixed Symbol::gensym() to return GLOB reference + - Dumpvalue - imported via sync.pl, fixed parser bug with `%package:: and` syntax + - Sys::Hostname - imported via sync.pl, implemented syscall() operator + - flock() - implemented in CustomFileChannel.java using java.nio.channels.FileLock + - syscall() - implemented in SyscallOperator.java with gethostname support +- [x] **Phase 2: Archive/Network modules** (2024-03-13) + - IO::Socket, IO::Socket::INET, IO::Socket::UNIX - imported via sync.pl + - IO::Zlib - imported via sync.pl + - Archive::Tar - imported via sync.pl, patched GZIP_MAGIC_NUM regex (octal to hex) + - Net::FTP, Net::Cmd, Net::* - imported via sync.pl + - Tie::StdHandle - added for IO::Zlib dependency + - File::Spec platform modules - added for Archive::Tar dependency + - Socket.pm - added $VERSION and additional constants (INADDR_*, IPPROTO_*, SHUT_*, etc.) + - Parser fix: `@{${...}}` nested dereference now works in push/unshift + - SysHostname.java XS module - provides ghname() via InetAddress.getLocalHost() + - XSLoader caller() support - load() now uses caller() when no argument provided + +### Files Changed (Phase 2) +- `dev/import-perl5/config.yaml` - Added IO::Socket, IO::Zlib, Archive::Tar, Net::*, Tie::StdHandle, File::Spec imports +- `src/main/java/org/perlonjava/runtime/perlmodule/Socket.java` - Added 20+ socket constants +- `src/main/perl/lib/Socket.pm` - Added $VERSION and expanded exports +- `src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java` - Fixed `$` followed by `{` in braced variable parsing +- `src/main/java/org/perlonjava/runtime/perlmodule/SysHostname.java` - New XS module for Sys::Hostname +- `src/main/java/org/perlonjava/runtime/perlmodule/XSLoader.java` - Added caller() support for no-argument load() + +### Next Steps +1. Phase 3: Process control (IPC::Open3) +2. Evaluate cpanm as alternative to CPAN.pm + +### Open Questions +- Is cpanm lighter on dependencies than CPAN.pm? +- Should we create a PerlOnJava-specific minimal CPAN client? +- How important is Safe compartmentalization for users? diff --git a/dev/import-perl5/config.yaml b/dev/import-perl5/config.yaml index e8df5dd7e..7e215339d 100644 --- a/dev/import-perl5/config.yaml +++ b/dev/import-perl5/config.yaml @@ -400,6 +400,57 @@ imports: target: perl5_t/Term-Table type: directory + # DirHandle - OO interface to directory handles (pure Perl) + - source: perl5/lib/DirHandle.pm + target: src/main/perl/lib/DirHandle.pm + + # Dumpvalue - Debug dump utility (pure Perl) + - source: perl5/dist/Dumpvalue/lib/Dumpvalue.pm + target: src/main/perl/lib/Dumpvalue.pm + + # Sys::Hostname - Get system hostname + - source: perl5/ext/Sys-Hostname/Hostname.pm + target: src/main/perl/lib/Sys/Hostname.pm + + # Phase 2: IO::Socket - OO socket interface + - source: perl5/dist/IO/lib/IO/Socket.pm + target: src/main/perl/lib/IO/Socket.pm + + - source: perl5/dist/IO/lib/IO/Socket + target: src/main/perl/lib/IO/Socket + type: directory + + # Phase 2: IO::Zlib - Compressed I/O (for Archive::Tar) + - source: perl5/cpan/IO-Zlib/Zlib.pm + target: src/main/perl/lib/IO/Zlib.pm + + # Tie::StdHandle - Required by IO::Zlib + - source: perl5/lib/Tie/StdHandle.pm + target: src/main/perl/lib/Tie/StdHandle.pm + + # File::Spec platform modules - Required by Archive::Tar + - source: perl5/dist/PathTools/lib/File/Spec + target: src/main/perl/lib/File/Spec + type: directory + + # Phase 2: Archive::Tar - Tar archive handling + - source: perl5/cpan/Archive-Tar/lib/Archive/Tar.pm + target: src/main/perl/lib/Archive/Tar.pm + + - source: perl5/cpan/Archive-Tar/lib/Archive/Tar + target: src/main/perl/lib/Archive/Tar + type: directory + + # Phase 2: Net::FTP and libnet modules + - source: perl5/cpan/libnet/lib/Net + target: src/main/perl/lib/Net + type: directory + + # Symbol - manipulate Perl symbols and their names (pure Perl) + # Required by constant.pm which is used by File::Spec::Unix + - source: perl5/lib/Symbol.pm + target: src/main/perl/lib/Symbol.pm + # Add more imports below as needed # Example with minimal fields: # - source: perl5/lib/SomeModule.pm diff --git a/dev/prompts/fix-interpreter-array-test.md b/dev/prompts/fix-interpreter-array-test.md new file mode 100644 index 000000000..c85e2f48e --- /dev/null +++ b/dev/prompts/fix-interpreter-array-test.md @@ -0,0 +1,84 @@ +# Plan: Fix `./jperl --interpreter src/test/resources/unit/array.t` + +## Problem Summary + +The test produces no output in interpreter mode but works in JVM mode. The issue is that Test::More/Test2 output handles are broken in interpreter mode. + +## Root Cause Analysis + +1. **Test2::Formatter::TAP** gets output handles via `clone_io(Test2::API::test2_stdout())` +2. The cloned GLOB has a **null RuntimeIO** - `fileno()` throws NPE, `print` silently fails +3. The GLOB looks valid (stringifies to `GLOB(0x...)`) but isn't connected to any actual I/O + +### Reproduction + +```bash +# Works - JVM mode +./jperl src/test/resources/unit/array.t + +# Fails - no output in interpreter mode +./jperl --interpreter src/test/resources/unit/array.t + +# Minimal reproduction showing broken handle +./jperl --interpreter -e ' +use Test2::Formatter::TAP; +my $fmt = Test2::Formatter::TAP->new(); +my $h = $fmt->handles->[0]; +print $h "TEST\n"; # Silently fails - no output +' +``` + +## Investigation Findings + +| Test | Result | +|------|--------| +| `clone_io(\*STDOUT)` directly in -e code | Works | +| `clone_io()` from Test2::Util module | Works | +| `Test2::API::test2_stdout()` | Returns GLOB but printing fails | +| `Test2::Formatter::TAP->new()->handles` | Returns GLOBs with null RuntimeIO | + +The bug appears when the handle passes through Test2::API's storage/retrieval mechanism. + +## Hypothesis + +The interpreter's handling of GLOB values in hash/object storage loses the RuntimeIO connection. When a GLOB is stored in a hash (like `$self->{handles}`) and later retrieved, the RuntimeIO becomes null. + +## Implementation Plan + +### Phase 1: Create Failing Unit Test +- [x] Create `dev/sandbox/closure_capture_package_level.t` (done, but passes) +- [ ] Create minimal test that reproduces the GLOB storage bug + +### Phase 2: Debug GLOB Storage +1. Add debug logging to track when RuntimeIO becomes null +2. Trace the GLOB through: + - Creation in `clone_io()` + - Storage in Test2::API package variable + - Retrieval via `test2_stdout()` + - Storage in formatter's `handles` array + - Retrieval for printing + +### Phase 3: Fix the Bug +Location candidates: +- `src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeIO.java` - IO handle management +- `src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeGlob.java` - GLOB storage +- `src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java` - interpreter operations + +### Phase 4: Verify +```bash +./jperl --interpreter src/test/resources/unit/array.t +# Should output: ok 1 - Array has correct length ... 1..52 +``` + +## Files to Investigate + +- `RuntimeGlob.java` - How GLOBs store/retrieve RuntimeIO +- `RuntimeIO.java` - IO handle lifecycle +- `BytecodeInterpreter.java` - GLOB operations in interpreter +- `InlineOpcodeHandler.java` / `OpcodeHandlerExtended.java` - GLOB-related opcodes + +## Next Steps + +1. Add `print STDERR` debugging to trace where RuntimeIO becomes null +2. Compare JVM vs interpreter execution paths for GLOB handling +3. Identify the specific operation that loses the IO connection diff --git a/dev/sandbox/closure_capture_package_level.t b/dev/sandbox/closure_capture_package_level.t new file mode 100644 index 000000000..08a65f19d --- /dev/null +++ b/dev/sandbox/closure_capture_package_level.t @@ -0,0 +1,108 @@ +use strict; +use warnings; +use Test::More; + +# Test: Closures should capture lexical variables assigned at package level +# +# This tests the basic pattern used by Test2::API: +# my $STDOUT = clone_io(\*STDOUT); +# sub test2_stdout { $STDOUT ||= clone_io(\*STDOUT) } + +# Test 1: Simple string assignment +{ + package Test1; + + my $VAR = "test value"; + sub get_var { return $VAR; } + + package main; + is(Test1::get_var(), "test value", "Closure captures string at package level"); +} + +# Test 2: Assignment from function call +{ + package Test2; + + sub make_value { return "from function"; } + my $VAR = make_value(); + sub get_var { return $VAR; } + + package main; + is(Test2::get_var(), "from function", "Closure captures function return value"); +} + +# Test 3: Filehandle reference +{ + package Test3; + + my $FH = \*STDOUT; + sub get_fh { return $FH; } + sub is_fh_defined { return defined($FH) ? "yes" : "no"; } + + package main; + is(Test3::is_fh_defined(), "yes", "Filehandle ref is defined in closure"); + ok(defined(Test3::get_fh()), "Filehandle ref can be retrieved"); +} + +# Test 4: Cloned filehandle (simulates Test2::Util::clone_io) +{ + package Test4; + + sub clone_fh { + my ($fh) = @_; + my $fileno = fileno($fh); + open(my $out, ">&$fileno") or die "Can't dup: $!"; + return $out; + } + + my $CLONED = clone_fh(\*STDOUT); + sub get_cloned { return $CLONED; } + sub is_cloned_defined { return defined($CLONED) ? "yes" : "no"; } + + package main; + is(Test4::is_cloned_defined(), "yes", "Cloned filehandle is defined in closure"); + ok(defined(Test4::get_cloned()), "Cloned filehandle can be retrieved"); +} + +# Test 5: The ||= pattern (exactly like Test2::API::test2_stdout) +{ + package Test5; + + sub make_value { return "initial"; } + my $VAR = make_value(); + + # This pattern fails if closure doesn't capture initial value + sub get_with_fallback { $VAR ||= "fallback" } + + package main; + my $result = Test5::get_with_fallback(); + is($result, "initial", "Closure with ||= sees initial value (not fallback)"); +} + +# Test 6: Multiple lexicals +{ + package Test6; + + my $VAR1 = "first"; + my $VAR2 = "second"; + + sub get_var1 { return $VAR1; } + sub get_var2 { return $VAR2; } + + package main; + is(Test6::get_var1(), "first", "First variable captured"); + is(Test6::get_var2(), "second", "Second variable captured"); +} + +# Test 7: Variable is defined check +{ + package Test7; + + my $VAR = "value"; + sub is_defined { return defined($VAR) ? "yes" : "no"; } + + package main; + is(Test7::is_defined(), "yes", "Variable is defined inside closure"); +} + +done_testing(); diff --git a/docs/about/changelog.md b/docs/about/changelog.md index dab2d051e..b3c6f908c 100644 --- a/docs/about/changelog.md +++ b/docs/about/changelog.md @@ -9,7 +9,10 @@ Release history of PerlOnJava. See [Roadmap](roadmap.md) for future plans. - Add `defer` feature - Non-local control flow: `last`/`next`/`redo`/`goto LABEL` - Tail call with trampoline for `goto &NAME` and `goto __SUB__` -- Add modules: `Time::Piece`, `TOML`. +- Add modules: `Time::Piece`, `TOML`, `DirHandle`, `Dumpvalue`, `Sys::Hostname`, `IO::Socket`, `IO::Socket::INET`, `IO::Socket::UNIX`, `IO::Zlib`, `Archive::Tar`, `Net::FTP`, `Net::Cmd`. +- Add operators: `flock`, `syscall`. +- Bugfix: parser now handles `@{${...}}` nested dereference in push/unshift. +- Bugfix: regex octal escapes `\10`-`\377` now work correctly. - Bugfix: operator override in Time::Hires now works. - Bugfix: internal temp variables are now pre-initialized. - Optimization: faster list assignment. diff --git a/docs/reference/feature-matrix.md b/docs/reference/feature-matrix.md index b992f7408..569cea84a 100644 --- a/docs/reference/feature-matrix.md +++ b/docs/reference/feature-matrix.md @@ -522,6 +522,8 @@ my @copy = @{$z}; # ERROR - ✅ **Tied Handles**: Tied file handles are implemented. See also [Tied Scalars](#scalars), [Tied Arrays](#arrays-hashes-and-lists), [Tied Hashes](#arrays-hashes-and-lists). - ✅ **`DATA`**: `DATA` file handle is implemented. - ✅ **`truncate`**: File truncation +- ✅ **`flock`**: File locking with LOCK_SH, LOCK_EX, LOCK_UN, LOCK_NB +- ✅ **`syscall`**: System calls (SYS_gethostname) ### Socket Operations - ✅ **`socket`**: Socket creation with domain, type, and protocol support @@ -669,6 +671,8 @@ The `:encoding()` layer supports all encodings provided by Java's `Charset.forNa - ✅ **Config** module. - ✅ **Cwd** module - ✅ **Data::Dumper**: use the same version as Perl. +- ✅ **DirHandle** module. +- ✅ **Dumpvalue** module. - ✅ **Digest** module - ✅ **Digest::MD5** module - ✅ **Digest::SHA** module @@ -688,6 +692,10 @@ The `:encoding()` layer supports all encodings provided by Java's `Charset.forNa - ✅ **Internals**: `Internals::SvREADONLY` is implemented as a no-op. - ✅ **IO::File** module. - ✅ **IO::Seekable** module. +- ✅ **IO::Socket** module. +- ✅ **IO::Socket::INET** module. +- ✅ **IO::Socket::UNIX** module. +- ✅ **IO::Zlib** module. - ✅ **List::Util**: module. - ✅ **MIME::Base64** module - ✅ **MIME::QuotedPrint** module @@ -695,6 +703,7 @@ The `:encoding()` layer supports all encodings provided by Java's `Charset.forNa - ✅ **Scalar::Util**: `blessed`, `reftype`, `set_prototype`, `dualvar` are implemented. - ✅ **SelectSaver**: module. - ✅ **Storable**: module. +- ✅ **Sys::Hostname** module. - ✅ **Symbol**: `gensym`, `qualify` and `qualify_to_ref` are implemented. - ✅ **Term::ANSIColor** module. - ✅ **Test** module. @@ -715,7 +724,9 @@ The `:encoding()` layer supports all encodings provided by Java's `Charset.forNa - 🚧 **HTTP::Tiny** some features untested: proxy settings. - 🚧 **POSIX** module. - 🚧 **Unicode::Normalize** `normalize`, `NFC`, `NFD`, `NFKC`, `NFKD`. -- ❌ **IO::Socket** module, and related modules or asynchronous I/O operations. +- ✅ **Archive::Tar** module. +- ✅ **Net::FTP** module. +- ✅ **Net::Cmd** module. - ❌ **Safe** module. ### Non-core modules diff --git a/src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java b/src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java index bf48365c5..3fb60e353 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java +++ b/src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java @@ -678,6 +678,8 @@ public static void visitOperator(BytecodeCompiler bytecodeCompiler, OperatorNode case "accept" -> visitGenericListOpCase(bytecodeCompiler, node, Opcodes.ACCEPT); case "sysseek" -> visitGenericListOpCase(bytecodeCompiler, node, Opcodes.SYSSEEK); case "truncate" -> visitGenericListOpCase(bytecodeCompiler, node, Opcodes.TRUNCATE); + case "flock" -> visitGenericListOpCase(bytecodeCompiler, node, Opcodes.FLOCK); + case "syscall" -> visitGenericListOpCase(bytecodeCompiler, node, Opcodes.SYSCALL); case "read" -> visitGenericListOpCase(bytecodeCompiler, node, Opcodes.READ); case "chown" -> visitGenericListOpCase(bytecodeCompiler, node, Opcodes.CHOWN); case "waitpid" -> visitGenericListOpCase(bytecodeCompiler, node, Opcodes.WAITPID); diff --git a/src/main/java/org/perlonjava/backend/bytecode/Disassemble.java b/src/main/java/org/perlonjava/backend/bytecode/Disassemble.java index d90173235..0f2bba039 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/Disassemble.java +++ b/src/main/java/org/perlonjava/backend/bytecode/Disassemble.java @@ -1993,6 +1993,7 @@ public static String disassemble(InterpretedCode interpretedCode) { case Opcodes.ACCEPT: case Opcodes.SYSSEEK: case Opcodes.TRUNCATE: + case Opcodes.FLOCK: case Opcodes.READ: case Opcodes.OPENDIR: case Opcodes.READDIR: @@ -2019,6 +2020,7 @@ public static String disassemble(InterpretedCode interpretedCode) { case Opcodes.ACCEPT -> "accept"; case Opcodes.SYSSEEK -> "sysseek"; case Opcodes.TRUNCATE -> "truncate"; + case Opcodes.FLOCK -> "flock"; case Opcodes.READ -> "read"; case Opcodes.OPENDIR -> "opendir"; case Opcodes.READDIR -> "readdir"; diff --git a/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java b/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java index 944e50080..089853705 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java +++ b/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java @@ -1597,6 +1597,10 @@ public class Opcodes { * truncate FILEHANDLE,LENGTH: Format: TRUNCATE rd argsReg ctx */ public static final short TRUNCATE = 326; + /** + * flock FILEHANDLE,OPERATION: Format: FLOCK rd argsReg ctx + */ + public static final short FLOCK = 385; /** * read FILEHANDLE,SCALAR,LENGTH: Format: READ rd argsReg ctx */ diff --git a/src/main/java/org/perlonjava/backend/bytecode/SlowOpcodeHandler.java b/src/main/java/org/perlonjava/backend/bytecode/SlowOpcodeHandler.java index a66983229..547de6b37 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/SlowOpcodeHandler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/SlowOpcodeHandler.java @@ -250,13 +250,16 @@ public static int executeSyscall(int[] bytecode, int pc, RuntimeBase[] registers int numberReg = bytecode[pc++]; int argCount = bytecode[pc++]; - // Skip argument registers + // Collect arguments + RuntimeBase[] args = new RuntimeBase[argCount + 1]; + args[0] = registers[numberReg]; // syscall number for (int i = 0; i < argCount; i++) { - pc++; // Skip each argument register + args[i + 1] = registers[bytecode[pc++]]; } - // TODO: Implement via JNI or Panama FFM API - throw new UnsupportedOperationException("syscall() not yet implemented"); + // Call SyscallOperator + registers[rd] = SyscallOperator.syscall(RuntimeContextType.SCALAR, args); + return pc; } /** diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index 063b5626c..3b339d312 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,14 +33,14 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "1cd764d82"; + public static final String gitCommitId = "dfdf6d3bd"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitDate = "2026-03-12"; + public static final String gitCommitDate = "2026-03-13"; // Prevent instantiation private Configuration() { diff --git a/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java b/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java index 6286cec7a..c8b7627d3 100644 --- a/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java +++ b/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java @@ -331,10 +331,12 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr return prefix; } if (token.text.equals("$") && (nextToken.text.equals("$") + || nextToken.text.equals("{") || nextToken.type == LexerTokenType.IDENTIFIER || nextToken.type == LexerTokenType.NUMBER) || nextToken.text.equals("::")) { - // `@$` can't be followed by `$`, `::`, name or number + // `@$` can't be followed by `$`, `{`, `::`, name or number + // `@{${...}` should fall back to block parsing return null; } if (token.text.equals("^") && nextToken.type == LexerTokenType.IDENTIFIER && Character.isUpperCase(nextToken.text.charAt(0))) { @@ -393,6 +395,7 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr nextToken = parser.tokens.get(parser.tokenIndex + 1); // After ::, only identifiers or another :: are allowed (or ' as package separator) + // Note: Keywords CAN be valid identifier parts after :: (e.g., $Foo::and, &UNIVERSAL::isa) if (token.type != LexerTokenType.IDENTIFIER && !token.text.equals("::") && !token.text.equals("'")) { // Nothing valid follows ::, so return what we have return variableName.toString(); diff --git a/src/main/java/org/perlonjava/runtime/io/CustomFileChannel.java b/src/main/java/org/perlonjava/runtime/io/CustomFileChannel.java index 01dc2212d..235d894e4 100644 --- a/src/main/java/org/perlonjava/runtime/io/CustomFileChannel.java +++ b/src/main/java/org/perlonjava/runtime/io/CustomFileChannel.java @@ -9,6 +9,8 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.channels.FileLock; +import java.nio.channels.OverlappingFileLockException; import java.nio.charset.Charset; import java.nio.file.Path; import java.nio.file.StandardOpenOption; @@ -56,6 +58,12 @@ */ public class CustomFileChannel implements IOHandle { + // Perl flock constants + private static final int LOCK_SH = 1; // Shared lock + private static final int LOCK_EX = 2; // Exclusive lock + private static final int LOCK_NB = 4; // Non-blocking + private static final int LOCK_UN = 8; // Unlock + /** * The underlying Java NIO FileChannel for actual I/O operations */ @@ -68,6 +76,11 @@ public class CustomFileChannel implements IOHandle { // When true, writes should always occur at end-of-file (Perl's append semantics). private boolean appendMode; + /** + * Current file lock, if any + */ + private FileLock currentLock; + /** * Helper for handling multi-byte character decoding across read boundaries */ @@ -366,6 +379,76 @@ public RuntimeScalar truncate(long length) { } } + /** + * Applies or removes an advisory lock on the file. + * + *

This implements Perl's flock() function using Java's FileLock API. + * The operation is a bitmask of: + *

+ * + * @param operation the lock operation bitmask + * @return RuntimeScalar with true on success, false on failure + */ + @Override + public RuntimeScalar flock(int operation) { + try { + boolean nonBlocking = (operation & LOCK_NB) != 0; + boolean unlock = (operation & LOCK_UN) != 0; + boolean shared = (operation & LOCK_SH) != 0; + boolean exclusive = (operation & LOCK_EX) != 0; + + if (unlock) { + // Release any existing lock + if (currentLock != null) { + currentLock.release(); + currentLock = null; + } + return scalarTrue; + } + + // Release any existing lock before acquiring a new one + if (currentLock != null) { + currentLock.release(); + currentLock = null; + } + + if (exclusive || shared) { + // shared=true for LOCK_SH, shared=false for LOCK_EX + boolean isShared = shared && !exclusive; + + if (nonBlocking) { + // Non-blocking: use tryLock + currentLock = fileChannel.tryLock(0, Long.MAX_VALUE, isShared); + if (currentLock == null) { + // Would block - return false + getGlobalVariable("main::!").set(11); // EAGAIN/EWOULDBLOCK + return RuntimeScalarCache.scalarFalse; + } + } else { + // Blocking: use lock (will wait until lock is available) + currentLock = fileChannel.lock(0, Long.MAX_VALUE, isShared); + } + return scalarTrue; + } + + // Invalid operation (neither lock nor unlock specified) + getGlobalVariable("main::!").set(22); // EINVAL + return RuntimeScalarCache.scalarFalse; + + } catch (OverlappingFileLockException e) { + // This happens when trying to lock a region already locked by this JVM + getGlobalVariable("main::!").set(11); // EAGAIN + return RuntimeScalarCache.scalarFalse; + } catch (IOException e) { + return handleIOException(e, "flock failed"); + } + } + @Override public RuntimeScalar sysread(int length) { try { diff --git a/src/main/java/org/perlonjava/runtime/io/IOHandle.java b/src/main/java/org/perlonjava/runtime/io/IOHandle.java index 6926d1525..ca219eeab 100644 --- a/src/main/java/org/perlonjava/runtime/io/IOHandle.java +++ b/src/main/java/org/perlonjava/runtime/io/IOHandle.java @@ -224,6 +224,25 @@ default RuntimeScalar truncate(long length) { return RuntimeIO.handleIOError("Truncate operation is not supported."); } + /** + * Applies or removes an advisory lock on a file. + * + *

This is equivalent to Perl's {@code flock(FILEHANDLE, OPERATION)}. + * The operation is a bitmask of:

+ * + * + * @param operation the lock operation bitmask + * @return RuntimeScalar with true on success, false on failure + */ + default RuntimeScalar flock(int operation) { + return RuntimeIO.handleIOError("flock operation is not supported on this handle type."); + } + // System-level I/O operations default RuntimeScalar sysread(int length) { return RuntimeIO.handleIOError("sysread operation is not supported."); diff --git a/src/main/java/org/perlonjava/runtime/operators/IOOperator.java b/src/main/java/org/perlonjava/runtime/operators/IOOperator.java index 813729ac6..3dd012dfa 100644 --- a/src/main/java/org/perlonjava/runtime/operators/IOOperator.java +++ b/src/main/java/org/perlonjava/runtime/operators/IOOperator.java @@ -1635,6 +1635,47 @@ public static RuntimeScalar truncate(int ctx, RuntimeBase... args) { } } + /** + * flock(FILEHANDLE, OPERATION) + * Applies or removes an advisory lock on a file. + * + * OPERATION is a bitmask: + * LOCK_SH (1) - Shared lock (for reading) + * LOCK_EX (2) - Exclusive lock (for writing) + * LOCK_UN (8) - Unlock + * LOCK_NB (4) - Non-blocking (can be OR'd with SH or EX) + * + * Returns true on success, false on failure. + */ + public static RuntimeScalar flock(int ctx, RuntimeBase... args) { + if (args.length < 2) { + getGlobalVariable("main::!").set("Not enough arguments for flock"); + return scalarFalse; + } + + try { + RuntimeScalar fileHandle = args[0].scalar(); + int operation = args[1].scalar().getInt(); + + RuntimeIO fh = fileHandle.getRuntimeIO(); + if (fh == null) { + getGlobalVariable("main::!").set(9); // EBADF - Bad file descriptor + return scalarFalse; + } + + if (fh.ioHandle == null) { + getGlobalVariable("main::!").set(9); // EBADF + return scalarFalse; + } + + return fh.ioHandle.flock(operation); + + } catch (Exception e) { + getGlobalVariable("main::!").set("flock failed: " + e.getMessage()); + return scalarFalse; + } + } + /** * getsockname(SOCKET) * Returns the packed sockaddr structure for the local end of the socket. diff --git a/src/main/java/org/perlonjava/runtime/operators/OperatorHandler.java b/src/main/java/org/perlonjava/runtime/operators/OperatorHandler.java index 5d63913ad..ff5353638 100644 --- a/src/main/java/org/perlonjava/runtime/operators/OperatorHandler.java +++ b/src/main/java/org/perlonjava/runtime/operators/OperatorHandler.java @@ -126,6 +126,8 @@ public record OperatorHandler(String className, String methodName, int methodTyp put("seek", "seek", "org/perlonjava/runtime/operators/IOOperator", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;Lorg/perlonjava/runtime/runtimetypes/RuntimeList;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;"); put("select", "select", "org/perlonjava/runtime/operators/IOOperator", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeList;I)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;"); put("truncate", "truncate", "org/perlonjava/runtime/operators/IOOperator", "(I[Lorg/perlonjava/runtime/runtimetypes/RuntimeBase;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;"); + put("flock", "flock", "org/perlonjava/runtime/operators/IOOperator", "(I[Lorg/perlonjava/runtime/runtimetypes/RuntimeBase;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;"); + put("syscall", "syscall", "org/perlonjava/runtime/operators/SyscallOperator", "(I[Lorg/perlonjava/runtime/runtimetypes/RuntimeBase;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;"); put("sysread", "sysread", "org/perlonjava/runtime/operators/IOOperator", "(I[Lorg/perlonjava/runtime/runtimetypes/RuntimeBase;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;"); put("syswrite", "syswrite", "org/perlonjava/runtime/operators/IOOperator", "(I[Lorg/perlonjava/runtime/runtimetypes/RuntimeBase;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;"); put("write", "write", "org/perlonjava/runtime/operators/IOOperator", "(I[Lorg/perlonjava/runtime/runtimetypes/RuntimeBase;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;"); diff --git a/src/main/java/org/perlonjava/runtime/operators/SyscallOperator.java b/src/main/java/org/perlonjava/runtime/operators/SyscallOperator.java new file mode 100644 index 000000000..6440b6b73 --- /dev/null +++ b/src/main/java/org/perlonjava/runtime/operators/SyscallOperator.java @@ -0,0 +1,109 @@ +package org.perlonjava.runtime.operators; + +import org.perlonjava.runtime.runtimetypes.RuntimeBase; +import org.perlonjava.runtime.runtimetypes.RuntimeScalar; + +import java.net.InetAddress; + +import static org.perlonjava.runtime.runtimetypes.GlobalVariable.getGlobalVariable; +import static org.perlonjava.runtime.runtimetypes.RuntimeScalarCache.scalarFalse; + +/** + * Implementation of Perl's syscall operator for PerlOnJava. + * + * Note: syscall numbers are platform-specific. This implementation provides + * Java-based emulation for commonly used syscalls. + */ +public class SyscallOperator { + + // Common syscall numbers (platform-dependent, these are Linux x86_64 values) + // On macOS, gethostname is a library function, not a syscall + private static final int SYS_GETHOSTNAME_LINUX = 74; // Linux + + /** + * Implements Perl's syscall() function. + * + * syscall NUMBER, LIST + * + * Calls the system call specified by NUMBER with the arguments in LIST. + * Returns the result of the syscall, or -1 on error with $! set. + * + * @param ctx Context + * @param args Arguments: syscall number followed by arguments + * @return Result of the syscall + */ + public static RuntimeScalar syscall(int ctx, RuntimeBase... args) { + if (args.length < 1) { + getGlobalVariable("main::!").set(22); // EINVAL + return new RuntimeScalar(-1); + } + + int syscallNum = args[0].scalar().getInt(); + + // Handle known syscalls with Java implementations + switch (syscallNum) { + case SYS_GETHOSTNAME_LINUX: + return sysGethostname(args); + default: + // For unknown syscalls, check if it might be gethostname on another platform + // by looking at the argument pattern (buffer, length) + if (args.length >= 3) { + // Heuristic: if we have 3 args and arg[2] is a reasonable buffer size, + // assume it might be gethostname + int possibleLen = args[2].scalar().getInt(); + if (possibleLen >= 64 && possibleLen <= 256) { + return sysGethostname(args); + } + } + + // Unsupported syscall + getGlobalVariable("main::!").set(38); // ENOSYS - Function not implemented + return new RuntimeScalar(-1); + } + } + + /** + * Emulates gethostname syscall. + * + * gethostname(char *name, size_t len) + * + * Perl usage: syscall(&SYS_gethostname, $host, 65) + * where $host is pre-allocated with "\0" x 65 + */ + private static RuntimeScalar sysGethostname(RuntimeBase... args) { + if (args.length < 3) { + getGlobalVariable("main::!").set(22); // EINVAL + return new RuntimeScalar(-1); + } + + try { + String hostname = InetAddress.getLocalHost().getHostName(); + int maxLen = args[2].scalar().getInt(); + + // Truncate hostname if necessary (like real gethostname) + if (hostname.length() >= maxLen) { + hostname = hostname.substring(0, maxLen - 1); + } + + // Pad with nulls to expected length (Perl pre-allocates the buffer) + StringBuilder result = new StringBuilder(hostname); + while (result.length() < maxLen) { + result.append('\0'); + } + + // Modify the buffer argument in place + // args[1] should be the scalar that receives the hostname + if (args[1] instanceof RuntimeScalar) { + ((RuntimeScalar) args[1]).set(result.toString()); + } else { + args[1].scalar().set(result.toString()); + } + + return new RuntimeScalar(0); // Success + + } catch (Exception e) { + getGlobalVariable("main::!").set(1); // EPERM or general error + return new RuntimeScalar(-1); + } + } +} diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Socket.java b/src/main/java/org/perlonjava/runtime/perlmodule/Socket.java index 786ef6050..ce677d14e 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Socket.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Socket.java @@ -20,11 +20,30 @@ public class Socket extends PerlModuleBase { // Socket constants public static final int AF_INET = 2; + public static final int AF_INET6 = 10; + public static final int AF_UNIX = 1; public static final int PF_INET = 2; // Protocol family same as address family + public static final int PF_INET6 = 10; + public static final int PF_UNIX = 1; public static final int SOCK_STREAM = 1; public static final int SOCK_DGRAM = 2; + public static final int SOCK_RAW = 3; public static final int SOL_SOCKET = 1; public static final int SO_REUSEADDR = 2; + public static final int SO_KEEPALIVE = 9; + public static final int SO_BROADCAST = 6; + public static final int SO_LINGER = 13; + public static final int TCP_NODELAY = 1; + public static final int IPPROTO_TCP = 6; + public static final int IPPROTO_UDP = 17; + public static final int IPPROTO_ICMP = 1; + public static final int SHUT_RD = 0; + public static final int SHUT_WR = 1; + public static final int SHUT_RDWR = 2; + // INADDR constants as 4-byte packed binary strings + public static final String INADDR_ANY = "\0\0\0\0"; // 0.0.0.0 + public static final String INADDR_LOOPBACK = "\177\0\0\1"; // 127.0.0.1 + public static final String INADDR_BROADCAST = "\377\377\377\377"; // 255.255.255.255 public Socket() { super("Socket", false); @@ -43,11 +62,29 @@ public static void initialize() { // Register constants as subroutines socket.registerMethod("AF_INET", null); + socket.registerMethod("AF_INET6", null); + socket.registerMethod("AF_UNIX", null); socket.registerMethod("PF_INET", null); + socket.registerMethod("PF_INET6", null); + socket.registerMethod("PF_UNIX", null); socket.registerMethod("SOCK_STREAM", null); socket.registerMethod("SOCK_DGRAM", null); + socket.registerMethod("SOCK_RAW", null); socket.registerMethod("SOL_SOCKET", null); socket.registerMethod("SO_REUSEADDR", null); + socket.registerMethod("SO_KEEPALIVE", null); + socket.registerMethod("SO_BROADCAST", null); + socket.registerMethod("SO_LINGER", null); + socket.registerMethod("TCP_NODELAY", null); + socket.registerMethod("IPPROTO_TCP", null); + socket.registerMethod("IPPROTO_UDP", null); + socket.registerMethod("IPPROTO_ICMP", null); + socket.registerMethod("SHUT_RD", null); + socket.registerMethod("SHUT_WR", null); + socket.registerMethod("SHUT_RDWR", null); + socket.registerMethod("INADDR_ANY", null); + socket.registerMethod("INADDR_LOOPBACK", null); + socket.registerMethod("INADDR_BROADCAST", null); } catch (NoSuchMethodException e) { System.err.println("Warning: Missing Socket method: " + e.getMessage()); @@ -242,4 +279,76 @@ public static RuntimeList SOL_SOCKET(RuntimeArray args, int ctx) { public static RuntimeList SO_REUSEADDR(RuntimeArray args, int ctx) { return new RuntimeScalar(SO_REUSEADDR).getList(); } + + public static RuntimeList AF_INET6(RuntimeArray args, int ctx) { + return new RuntimeScalar(AF_INET6).getList(); + } + + public static RuntimeList AF_UNIX(RuntimeArray args, int ctx) { + return new RuntimeScalar(AF_UNIX).getList(); + } + + public static RuntimeList PF_INET6(RuntimeArray args, int ctx) { + return new RuntimeScalar(PF_INET6).getList(); + } + + public static RuntimeList PF_UNIX(RuntimeArray args, int ctx) { + return new RuntimeScalar(PF_UNIX).getList(); + } + + public static RuntimeList SOCK_RAW(RuntimeArray args, int ctx) { + return new RuntimeScalar(SOCK_RAW).getList(); + } + + public static RuntimeList SO_KEEPALIVE(RuntimeArray args, int ctx) { + return new RuntimeScalar(SO_KEEPALIVE).getList(); + } + + public static RuntimeList SO_BROADCAST(RuntimeArray args, int ctx) { + return new RuntimeScalar(SO_BROADCAST).getList(); + } + + public static RuntimeList SO_LINGER(RuntimeArray args, int ctx) { + return new RuntimeScalar(SO_LINGER).getList(); + } + + public static RuntimeList TCP_NODELAY(RuntimeArray args, int ctx) { + return new RuntimeScalar(TCP_NODELAY).getList(); + } + + public static RuntimeList IPPROTO_TCP(RuntimeArray args, int ctx) { + return new RuntimeScalar(IPPROTO_TCP).getList(); + } + + public static RuntimeList IPPROTO_UDP(RuntimeArray args, int ctx) { + return new RuntimeScalar(IPPROTO_UDP).getList(); + } + + public static RuntimeList IPPROTO_ICMP(RuntimeArray args, int ctx) { + return new RuntimeScalar(IPPROTO_ICMP).getList(); + } + + public static RuntimeList SHUT_RD(RuntimeArray args, int ctx) { + return new RuntimeScalar(SHUT_RD).getList(); + } + + public static RuntimeList SHUT_WR(RuntimeArray args, int ctx) { + return new RuntimeScalar(SHUT_WR).getList(); + } + + public static RuntimeList SHUT_RDWR(RuntimeArray args, int ctx) { + return new RuntimeScalar(SHUT_RDWR).getList(); + } + + public static RuntimeList INADDR_ANY(RuntimeArray args, int ctx) { + return new RuntimeScalar(INADDR_ANY).getList(); + } + + public static RuntimeList INADDR_LOOPBACK(RuntimeArray args, int ctx) { + return new RuntimeScalar(INADDR_LOOPBACK).getList(); + } + + public static RuntimeList INADDR_BROADCAST(RuntimeArray args, int ctx) { + return new RuntimeScalar(INADDR_BROADCAST).getList(); + } } diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Symbol.java b/src/main/java/org/perlonjava/runtime/perlmodule/Symbol.java index c748b782b..004860cb3 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Symbol.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Symbol.java @@ -42,16 +42,24 @@ public static void initialize() { } /** - * Placeholder for the gensym functionality. + * Creates a new anonymous glob and returns a reference to it. + * This is equivalent to Perl's Symbol::gensym(). * * @param args The arguments passed to the method. * @param ctx The context in which the method is called. - * @return A RuntimeList. - * @throws PerlCompilerException if the method is not implemented. + * @return A RuntimeList containing a reference to a new anonymous glob. */ public static RuntimeList gensym(RuntimeArray args, int ctx) { - return qualify_to_ref( - new RuntimeArray(new RuntimeScalar("PerlOnJava::__symbol" + EmitterMethodCreator.classCounter++)), SCALAR); + // Create a unique anonymous glob + String globName = "Symbol::GEN" + EmitterMethodCreator.classCounter++; + RuntimeGlob glob = new RuntimeGlob(globName); + + // Return a reference to the glob (not the glob itself) + RuntimeScalar globRef = new RuntimeScalar(); + globRef.type = RuntimeScalarType.GLOBREFERENCE; + globRef.value = glob; + + return globRef.getList(); } /** diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/SysHostname.java b/src/main/java/org/perlonjava/runtime/perlmodule/SysHostname.java new file mode 100644 index 000000000..ea92a404e --- /dev/null +++ b/src/main/java/org/perlonjava/runtime/perlmodule/SysHostname.java @@ -0,0 +1,57 @@ +package org.perlonjava.runtime.perlmodule; + +import org.perlonjava.runtime.runtimetypes.RuntimeArray; +import org.perlonjava.runtime.runtimetypes.RuntimeList; +import org.perlonjava.runtime.runtimetypes.RuntimeScalar; + +import java.net.InetAddress; + +/** + * Sys::Hostname - Try every conceivable way to get hostname + *

+ * This class provides the XS portion of Sys::Hostname, specifically the ghname() + * function that returns the local hostname using Java's networking APIs. + */ +public class SysHostname extends PerlModuleBase { + + /** + * Constructor for SysHostname. + * Initializes the module with the name "Sys::Hostname". + */ + public SysHostname() { + super("Sys::Hostname"); + } + + /** + * Static initializer to set up the Sys::Hostname module. + */ + public static void initialize() { + SysHostname sysHostname = new SysHostname(); + try { + sysHostname.registerMethod("ghname", ""); + } catch (NoSuchMethodException e) { + System.err.println("Warning: Missing Sys::Hostname method: " + e.getMessage()); + } + } + + /** + * Returns the local hostname. + *

+ * This is the XS implementation that Sys::Hostname.pm tries to load first. + * If successful, the hostname() function will use this instead of falling + * back to syscall or external commands. + * + * @param args The arguments passed to the method (none expected). + * @param ctx The context in which the method is called. + * @return A RuntimeList containing the hostname. + */ + public static RuntimeList ghname(RuntimeArray args, int ctx) { + try { + String hostname = InetAddress.getLocalHost().getHostName(); + return new RuntimeScalar(hostname).getList(); + } catch (Exception e) { + // Return undef on failure - Sys::Hostname.pm will try other methods + return new RuntimeScalar().getList(); + } + } +} diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/XSLoader.java b/src/main/java/org/perlonjava/runtime/perlmodule/XSLoader.java index e25ab0774..1a71c8ae9 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/XSLoader.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/XSLoader.java @@ -2,11 +2,13 @@ import org.perlonjava.runtime.operators.WarnDie; import org.perlonjava.runtime.runtimetypes.RuntimeArray; +import org.perlonjava.runtime.runtimetypes.RuntimeCode; import org.perlonjava.runtime.runtimetypes.RuntimeList; import org.perlonjava.runtime.runtimetypes.RuntimeScalar; import java.lang.reflect.Method; +import static org.perlonjava.runtime.runtimetypes.RuntimeContextType.SCALAR; import static org.perlonjava.runtime.runtimetypes.RuntimeScalarCache.scalarTrue; public class XSLoader extends PerlModuleBase { @@ -33,13 +35,36 @@ public static void initialize() { /** * Loads a PerlOnJava module. + *

+ * If no module name is provided as an argument, uses caller() to determine + * the calling package name, matching standard Perl XSLoader behavior. * * @param args The arguments passed to the method. * @param ctx The context in which the method is called. - * @return A RuntimeList. + * @return A RuntimeList containing true on success, false on failure. */ public static RuntimeList load(RuntimeArray args, int ctx) { - String moduleName = args.getFirst().toString(); + String moduleName; + + if (args.isEmpty() || args.getFirst().toString().isEmpty()) { + // No module name provided - use caller() to get the calling package + RuntimeList callerInfo = RuntimeCode.caller(new RuntimeList(), SCALAR); + if (callerInfo.isEmpty()) { + return WarnDie.die( + new RuntimeScalar("Can't determine module name for XSLoader::load"), + new RuntimeScalar("\n") + ).getList(); + } + moduleName = callerInfo.scalar().toString(); + if (moduleName.isEmpty()) { + return WarnDie.die( + new RuntimeScalar("Can't determine module name for XSLoader::load"), + new RuntimeScalar("\n") + ).getList(); + } + } else { + moduleName = args.getFirst().toString(); + } // Convert Perl::Module::Name to org.perlonjava.runtime.perlmodule.PerlModuleName String[] parts = moduleName.split("::"); diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java index 4924879b9..2db73d45e 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java @@ -19,27 +19,41 @@ static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset) char nextChar = s.charAt(offset); // Check for numeric backreferences vs octal escapes - // In Perl: \400, \600, \777 are octals (> 255), not backreferences - // But \1-\9 followed by non-octal digits are backreferences + // Perl disambiguation rules (from perlrebackslash): + // 1. If the backslash is followed by a single digit, it's a backreference. + // 2. If the first digit following the backslash is a 0, it's an octal escape. + // 3. If the number N (in decimal) and Perl has already seen N capture groups, + // it's a backreference. Otherwise, it's an octal escape. + // + // Examples: + // \100 with 100 capture groups -> backreference to group 100 + // \100 with 0 capture groups -> octal 100 = '@' + // \037 with 0 capture groups -> octal 037 = unit separator (used in Archive::Tar) + // \9 is always a backreference (single digit rule) boolean isOctalNotBackref = false; if (nextChar >= '1' && nextChar <= '9') { - // Check if this might be a 3-digit octal > 255 - if (nextChar >= '1' && nextChar <= '7' && offset + 2 < length) { - int d1 = nextChar - '0'; - char c2 = s.charAt(offset + 1); - char c3 = offset + 2 < length ? s.charAt(offset + 2) : '\0'; - - if (c2 >= '0' && c2 <= '7' && c3 >= '0' && c3 <= '7') { - int octalValue = d1 * 64 + (c2 - '0') * 8 + (c3 - '0'); - if (octalValue > 255) { - // This is an octal escape, not a backreference - // Fall through to octal handling below at line ~320 - // Leave the backslash in sb for the octal handler to manage - // offset stays pointing to the first octal digit ('4' in \400) - isOctalNotBackref = true; + // Parse all consecutive digits to get the full number + int endDigits = offset; + while (endDigits < length && s.charAt(endDigits) >= '0' && s.charAt(endDigits) <= '9') { + endDigits++; + } + String digitStr = s.substring(offset, endDigits); + int refNum = Integer.parseInt(digitStr); + + // Rule 3: If refNum > captureGroupCount, it's an octal escape (if valid octal) + if (refNum > RegexPreprocessor.captureGroupCount) { + // Check if all digits are octal (0-7) + boolean allOctal = true; + for (int i = 0; i < digitStr.length(); i++) { + char d = digitStr.charAt(i); + if (d > '7') { + allOctal = false; + break; } - // else: It's a 3-digit octal <= 255, treat as backreference - // (Perl's behavior: \1-\377 are backreferences if groups exist) + } + if (allOctal && digitStr.length() >= 2) { + // This is an octal escape, not a backreference + isOctalNotBackref = true; } } } diff --git a/src/main/perl/lib/Archive/Tar.pm b/src/main/perl/lib/Archive/Tar.pm new file mode 100644 index 000000000..2df0931e8 --- /dev/null +++ b/src/main/perl/lib/Archive/Tar.pm @@ -0,0 +1,2450 @@ +### the gnu tar specification: +### https://www.gnu.org/software/tar/manual/tar.html +### +### and the pax format spec, which tar derives from: +### https://www.opengroup.org/onlinepubs/007904975/utilities/pax.html + +package Archive::Tar; +require 5.005_03; + +use Cwd; +use IO::Zlib; +use IO::File; +use Carp qw(carp croak); +use File::Spec (); +use File::Spec::Unix (); +use File::Path (); + +use Archive::Tar::File; +use Archive::Tar::Constant; + +require Exporter; + +use strict; +use vars qw[$DEBUG $error $VERSION $WARN $FOLLOW_SYMLINK $CHOWN $CHMOD + $DO_NOT_USE_PREFIX $HAS_PERLIO $HAS_IO_STRING $SAME_PERMISSIONS + $INSECURE_EXTRACT_MODE $ZERO_PAD_NUMBERS @ISA @EXPORT $RESOLVE_SYMLINK + $EXTRACT_BLOCK_SIZE + ]; + +@ISA = qw[Exporter]; +@EXPORT = qw[ COMPRESS_GZIP COMPRESS_BZIP COMPRESS_XZ ]; +$DEBUG = 0; +$WARN = 1; +$FOLLOW_SYMLINK = 0; +$VERSION = "3.04"; +$CHOWN = 1; +$CHMOD = 1; +$SAME_PERMISSIONS = $> == 0 ? 1 : 0; +$DO_NOT_USE_PREFIX = 0; +$INSECURE_EXTRACT_MODE = 0; +$ZERO_PAD_NUMBERS = 0; +$RESOLVE_SYMLINK = $ENV{'PERL5_AT_RESOLVE_SYMLINK'} || 'speed'; +$EXTRACT_BLOCK_SIZE = 1024 * 1024 * 1024; + +BEGIN { + use Config; + $HAS_PERLIO = $Config::Config{useperlio}; + + ### try and load IO::String anyway, so you can dynamically + ### switch between perlio and IO::String + $HAS_IO_STRING = eval { + require IO::String; + IO::String->import; + 1; + } || 0; +} + +=head1 NAME + +Archive::Tar - module for manipulations of tar archives + +=head1 SYNOPSIS + + use Archive::Tar; + my $tar = Archive::Tar->new; + + $tar->read('origin.tgz'); + $tar->extract(); + + $tar->add_files('file/foo.pl', 'docs/README'); + $tar->add_data('file/baz.txt', 'This is the contents now'); + + $tar->rename('oldname', 'new/file/name'); + $tar->chown('/', 'root'); + $tar->chown('/', 'root:root'); + $tar->chmod('/tmp', '1777'); + + $tar->write('files.tar'); # plain tar + $tar->write('files.tgz', COMPRESS_GZIP); # gzip compressed + $tar->write('files.tbz', COMPRESS_BZIP); # bzip2 compressed + $tar->write('files.txz', COMPRESS_XZ); # xz compressed + +=head1 DESCRIPTION + +Archive::Tar provides an object oriented mechanism for handling tar +files. It provides class methods for quick and easy files handling +while also allowing for the creation of tar file objects for custom +manipulation. If you have the IO::Zlib module installed, +Archive::Tar will also support compressed or gzipped tar files. + +An object of class Archive::Tar represents a .tar(.gz) archive full +of files and things. + +=head1 Object Methods + +=head2 Archive::Tar->new( [$file, $compressed] ) + +Returns a new Tar object. If given any arguments, C calls the +C method automatically, passing on the arguments provided to +the C method. + +If C is invoked with arguments and the C method fails +for any reason, C returns undef. + +=cut + +my $tmpl = { + _data => [ ], + _file => 'Unknown', +}; + +### install get/set accessors for this object. +for my $key ( keys %$tmpl ) { + no strict 'refs'; + *{__PACKAGE__."::$key"} = sub { + my $self = shift; + $self->{$key} = $_[0] if @_; + return $self->{$key}; + } +} + +sub new { + my $class = shift; + $class = ref $class if ref $class; + + ### copying $tmpl here since a shallow copy makes it use the + ### same aref, causing for files to remain in memory always. + my $obj = bless { _data => [ ], _file => 'Unknown', _error => '' }, $class; + + if (@_) { + unless ( $obj->read( @_ ) ) { + $obj->_error(qq[No data could be read from file]); + return; + } + } + + return $obj; +} + +=head2 $tar->read ( $filename|$handle, [$compressed, {opt => 'val'}] ) + +Read the given tar file into memory. +The first argument can either be the name of a file or a reference to +an already open filehandle (or an IO::Zlib object if it's compressed) + +The C will I any previous content in C<$tar>! + +The second argument may be considered optional, but remains for +backwards compatibility. Archive::Tar now looks at the file +magic to determine what class should be used to open the file +and will transparently Do The Right Thing. + +Archive::Tar will warn if you try to pass a bzip2 / xz compressed file and the +IO::Uncompress::Bunzip2 / IO::Uncompress::UnXz are not available and simply return. + +Note that you can currently B pass a C compressed +filehandle, which is not opened with C, a C compressed +filehandle, which is not opened with C, a C compressed +filehandle, which is not opened with C, nor a string +containing the full archive information (either compressed or +uncompressed). These are worth while features, but not currently +implemented. See the C section. + +The third argument can be a hash reference with options. Note that +all options are case-sensitive. + +=over 4 + +=item limit + +Do not read more than C files. This is useful if you have +very big archives, and are only interested in the first few files. + +=item filter + +Can be set to a regular expression. Only files with names that match +the expression will be read. + +=item md5 + +Set to 1 and the md5sum of files will be returned (instead of file data) + my $iter = Archive::Tar->iter( $file, 1, {md5 => 1} ); + while( my $f = $iter->() ) { + print $f->data . "\t" . $f->full_path . $/; + } + +=item extract + +If set to true, immediately extract entries when reading them. This +gives you the same memory break as the C function. +Note however that entries will not be read into memory, but written +straight to disk. This means no C objects are +created for you to inspect. + +=back + +All files are stored internally as C objects. +Please consult the L documentation for details. + +Returns the number of files read in scalar context, and a list of +C objects in list context. + +=cut + +sub read { + my $self = shift; + my $file = shift; + my $gzip = shift || 0; + my $opts = shift || {}; + + unless( defined $file ) { + $self->_error( qq[No file to read from!] ); + return; + } else { + $self->_file( $file ); + } + + my $handle = $self->_get_handle($file, $gzip, READ_ONLY->( ZLIB ) ) + or return; + + my $data = $self->_read_tar( $handle, $opts ) or return; + + $self->_data( $data ); + + return wantarray ? @$data : scalar @$data; +} + +sub _get_handle { + my $self = shift; + my $file = shift; return unless defined $file; + my $compress = shift || 0; + my $mode = shift || READ_ONLY->( ZLIB ); # default to read only + + ### Check if file is a file handle or IO glob + if ( ref $file ) { + return $file if eval{ *$file{IO} }; + return $file if eval{ $file->isa(q{IO::Handle}) }; + $file = q{}.$file; + } + + ### get a FH opened to the right class, so we can use it transparently + ### throughout the program + my $fh; + { ### reading magic only makes sense if we're opening a file for + ### reading. otherwise, just use what the user requested. + my $magic = ''; + if( MODE_READ->($mode) ) { + open my $tmp, $file or do { + $self->_error( qq[Could not open '$file' for reading: $!] ); + return; + }; + + ### read the first 6 bytes of the file to figure out which class to + ### use to open the file. + sysread( $tmp, $magic, 6 ); + close $tmp; + } + + ### is it xz? + ### if you asked specifically for xz compression, or if we're in + ### read mode and the magic numbers add up, use xz + if( XZ and ( + ($compress eq COMPRESS_XZ) or + ( MODE_READ->($mode) and $magic =~ XZ_MAGIC_NUM ) + ) + ) { + if( MODE_READ->($mode) ) { + $fh = IO::Uncompress::UnXz->new( $file ) or do { + $self->_error( qq[Could not read '$file': ] . + $IO::Uncompress::UnXz::UnXzError + ); + return; + }; + } else { + $fh = IO::Compress::Xz->new( $file ) or do { + $self->_error( qq[Could not write to '$file': ] . + $IO::Compress::Xz::XzError + ); + return; + }; + } + + ### is it bzip? + ### if you asked specifically for bzip compression, or if we're in + ### read mode and the magic numbers add up, use bzip + } elsif( BZIP and ( + ($compress eq COMPRESS_BZIP) or + ( MODE_READ->($mode) and $magic =~ BZIP_MAGIC_NUM ) + ) + ) { + + ### different reader/writer modules, different error vars... sigh + if( MODE_READ->($mode) ) { + $fh = IO::Uncompress::Bunzip2->new( $file, MultiStream => 1 ) or do { + $self->_error( qq[Could not read '$file': ] . + $IO::Uncompress::Bunzip2::Bunzip2Error + ); + return; + }; + + } else { + $fh = IO::Compress::Bzip2->new( $file ) or do { + $self->_error( qq[Could not write to '$file': ] . + $IO::Compress::Bzip2::Bzip2Error + ); + return; + }; + } + + ### is it gzip? + ### if you asked for compression, if you wanted to read or the gzip + ### magic number is present (redundant with read) + } elsif( ZLIB and ( + $compress or MODE_READ->($mode) or $magic =~ GZIP_MAGIC_NUM + ) + ) { + $fh = IO::Zlib->new; + + unless( $fh->open( $file, $mode ) ) { + $self->_error(qq[Could not create filehandle for '$file': $!]); + return; + } + + ### is it plain tar? + } else { + $fh = IO::File->new; + + unless( $fh->open( $file, $mode ) ) { + $self->_error(qq[Could not create filehandle for '$file': $!]); + return; + } + + ### enable bin mode on tar archives + binmode $fh; + } + } + + return $fh; +} + + +sub _read_tar { + my $self = shift; + my $handle = shift or return; + my $opts = shift || {}; + + my $count = $opts->{limit} || 0; + my $filter = $opts->{filter}; + my $md5 = $opts->{md5} || 0; # cdrake + my $filter_cb = $opts->{filter_cb}; + my $extract = $opts->{extract} || 0; + + ### set a cap on the amount of files to extract ### + my $limit = 0; + $limit = 1 if $count > 0; + + my $tarfile = [ ]; + my $chunk; + my $read = 0; + my $real_name; # to set the name of a file when + # we're encountering @longlink + my $data; + + LOOP: + while( $handle->read( $chunk, HEAD ) ) { + ### IO::Zlib doesn't support this yet + my $offset; + if ( ref($handle) ne 'IO::Zlib' ) { + local $@; + $offset = eval { tell $handle } || 'unknown'; + $@ = ''; + } + else { + $offset = 'unknown'; + } + + unless( $read++ ) { + my $gzip = GZIP_MAGIC_NUM; + if( $chunk =~ /$gzip/ ) { + $self->_error( qq[Cannot read compressed format in tar-mode] ); + return; + } + + ### size is < HEAD, which means a corrupted file, as the minimum + ### length is _at least_ HEAD + if (length $chunk != HEAD) { + $self->_error( qq[Cannot read enough bytes from the tarfile] ); + return; + } + } + + ### if we can't read in all bytes... ### + last if length $chunk != HEAD; + + ### Apparently this should really be two blocks of 512 zeroes, + ### but GNU tar sometimes gets it wrong. See comment in the + ### source code (tar.c) to GNU cpio. + next if $chunk eq TAR_END; + + ### according to the posix spec, the last 12 bytes of the header are + ### null bytes, to pad it to a 512 byte block. That means if these + ### bytes are NOT null bytes, it's a corrupt header. See: + ### www.koders.com/c/fidCE473AD3D9F835D690259D60AD5654591D91D5BA.aspx + ### line 111 + { my $nulls = join '', "\0" x 12; + unless( $nulls eq substr( $chunk, 500, 12 ) ) { + $self->_error( qq[Invalid header block at offset $offset] ); + next LOOP; + } + } + + ### pass the realname, so we can set it 'proper' right away + ### some of the heuristics are done on the name, so important + ### to set it ASAP + my $entry; + { my %extra_args = (); + $extra_args{'name'} = $$real_name if defined $real_name; + + unless( $entry = Archive::Tar::File->new( chunk => $chunk, + %extra_args ) + ) { + $self->_error( qq[Couldn't read chunk at offset $offset] ); + next LOOP; + } + } + + ### ignore labels: + ### https://www.gnu.org/software/tar/manual/html_chapter/Media.html#SEC159 + next if $entry->is_label; + + if( length $entry->type and ($entry->is_file || $entry->is_longlink) ) { + + if ( $entry->is_file && !$entry->validate ) { + ### sometimes the chunk is rather fux0r3d and a whole 512 + ### bytes ends up in the ->name area. + ### clean it up, if need be + my $name = $entry->name; + $name = substr($name, 0, 100) if length $name > 100; + $name =~ s/\n/ /g; + + $self->_error( $name . qq[: checksum error] ); + next LOOP; + } + + my $block = BLOCK_SIZE->( $entry->size ); + + $data = $entry->get_content_by_ref; + + my $skip = 0; + my $ctx; # cdrake + ### skip this entry if we're filtering + + if($md5) { # cdrake + $ctx = Digest::MD5->new; # cdrake + $skip=5; # cdrake + + } elsif ($filter && $entry->name !~ $filter) { + $skip = 1; + + } elsif ($filter_cb && ! $filter_cb->($entry)) { + $skip = 2; + + ### skip this entry if it's a pax header. This is a special file added + ### by, among others, git-generated tarballs. It holds comments and is + ### not meant for extracting. See #38932: pax_global_header extracted + } elsif ( $entry->name eq PAX_HEADER or $entry->type =~ /^(x|g)$/ ) { + $skip = 3; + } + + if ($skip) { + # + # Since we're skipping, do not allocate memory for the + # whole file. Read it 64 BLOCKS at a time. Do not + # complete the skip yet because maybe what we read is a + # longlink and it won't get skipped after all + # + my $amt = $block; + my $fsz=$entry->size; # cdrake + while ($amt > 0) { + $$data = ''; + my $this = 64 * BLOCK; + $this = $amt if $this > $amt; + if( $handle->read( $$data, $this ) < $this ) { + $self->_error( qq[Read error on tarfile (missing data) ']. + $entry->full_path ."' at offset $offset" ); + next LOOP; + } + $amt -= $this; + $fsz -= $this; # cdrake + substr ($$data, $fsz) = "" if ($fsz<0); # remove external junk prior to md5 # cdrake + $ctx->add($$data) if($skip==5); # cdrake + } + $$data = $ctx->hexdigest if($skip==5 && !$entry->is_longlink && !$entry->is_unknown && !$entry->is_label ) ; # cdrake + } else { + + ### just read everything into memory + ### can't do lazy loading since IO::Zlib doesn't support 'seek' + ### this is because Compress::Zlib doesn't support it =/ + ### this reads in the whole data in one read() call. + if ( $handle->read( $$data, $block ) < $block ) { + $self->_error( qq[Read error on tarfile (missing data) ']. + $entry->full_path ."' at offset $offset" ); + next LOOP; + } + ### throw away trailing garbage ### + substr ($$data, $entry->size) = "" if defined $$data; + } + + ### part II of the @LongLink munging -- need to do /after/ + ### the checksum check. + if( $entry->is_longlink ) { + ### weird thing in tarfiles -- if the file is actually a + ### @LongLink, the data part seems to have a trailing ^@ + ### (unprintable) char. to display, pipe output through less. + ### but that doesn't *always* happen.. so check if the last + ### character is a control character, and if so remove it + ### at any rate, we better remove that character here, or tests + ### like 'eq' and hash lookups based on names will SO not work + ### remove it by calculating the proper size, and then + ### tossing out everything that's longer than that size. + + ### count number of nulls + my $nulls = $$data =~ tr/\0/\0/; + + ### cut data + size by that many bytes + $entry->size( $entry->size - $nulls ); + substr ($$data, $entry->size) = ""; + } + } + + ### clean up of the entries.. posix tar /apparently/ has some + ### weird 'feature' that allows for filenames > 255 characters + ### they'll put a header in with as name '././@LongLink' and the + ### contents will be the name of the /next/ file in the archive + ### pretty crappy and kludgy if you ask me + + ### set the name for the next entry if this is a @LongLink; + ### this is one ugly hack =/ but needed for direct extraction + if( $entry->is_longlink ) { + $real_name = $data; + next LOOP; + } elsif ( defined $real_name ) { + $entry->name( $$real_name ); + $entry->prefix(''); + undef $real_name; + } + + if ($filter && $entry->name !~ $filter) { + next LOOP; + + } elsif ($filter_cb && ! $filter_cb->($entry)) { + next LOOP; + + ### skip this entry if it's a pax header. This is a special file added + ### by, among others, git-generated tarballs. It holds comments and is + ### not meant for extracting. See #38932: pax_global_header extracted + } elsif ( $entry->name eq PAX_HEADER or $entry->type =~ /^(x|g)$/ ) { + next LOOP; + } + + if ( $extract && !$entry->is_longlink + && !$entry->is_unknown + && !$entry->is_label ) { + $self->_extract_file( $entry ) or return; + } + + ### Guard against tarfiles with garbage at the end + last LOOP if $entry->name eq ''; + + ### push only the name on the rv if we're extracting + ### -- for extract_archive + push @$tarfile, ($extract ? $entry->name : $entry); + + if( $limit ) { + $count-- unless $entry->is_longlink || $entry->is_dir; + last LOOP unless $count; + } + } continue { + undef $data; + } + + return $tarfile; +} + +=head2 $tar->contains_file( $filename ) + +Check if the archive contains a certain file. +It will return true if the file is in the archive, false otherwise. + +Note however, that this function does an exact match using C +on the full path. So it cannot compensate for case-insensitive file- +systems or compare 2 paths to see if they would point to the same +underlying file. + +=cut + +sub contains_file { + my $self = shift; + my $full = shift; + + return unless defined $full; + + ### don't warn if the entry isn't there.. that's what this function + ### is for after all. + local $WARN = 0; + return 1 if $self->_find_entry($full); + return; +} + +=head2 $tar->extract( [@filenames] ) + +Write files whose names are equivalent to any of the names in +C<@filenames> to disk, creating subdirectories as necessary. This +might not work too well under VMS. +Under MacPerl, the file's modification time will be converted to the +MacOS zero of time, and appropriate conversions will be done to the +path. However, the length of each element of the path is not +inspected to see whether it's longer than MacOS currently allows (32 +characters). + +If C is called without a list of file names, the entire +contents of the archive are extracted. + +Returns a list of filenames extracted. + +=cut + +sub extract { + my $self = shift; + my @args = @_; + my @files; + my $hashmap; + + # use the speed optimization for all extracted files + local($self->{cwd}) = cwd() unless $self->{cwd}; + + ### you requested the extraction of only certain files + if( @args ) { + for my $file ( @args ) { + + ### it's already an object? + if( UNIVERSAL::isa( $file, 'Archive::Tar::File' ) ) { + push @files, $file; + next; + + ### go find it then + } else { + + # create hash-map once to speed up lookup + $hashmap = $hashmap || { + map { $_->full_path, $_ } @{$self->_data} + }; + + if (exists $hashmap->{$file}) { + ### we found the file you're looking for + push @files, $hashmap->{$file}; + } else { + return $self->_error( + qq[Could not find '$file' in archive] ); + } + } + } + + ### just grab all the file items + } else { + @files = $self->get_files; + } + + ### nothing found? that's an error + unless( scalar @files ) { + $self->_error( qq[No files found for ] . $self->_file ); + return; + } + + ### now extract them + for my $entry ( @files ) { + unless( $self->_extract_file( $entry ) ) { + $self->_error(q[Could not extract ']. $entry->full_path .q['] ); + return; + } + } + + return @files; +} + +=head2 $tar->extract_file( $file, [$extract_path] ) + +Write an entry, whose name is equivalent to the file name provided to +disk. Optionally takes a second parameter, which is the full native +path (including filename) the entry will be written to. + +For example: + + $tar->extract_file( 'name/in/archive', 'name/i/want/to/give/it' ); + + $tar->extract_file( $at_file_object, 'name/i/want/to/give/it' ); + +Returns true on success, false on failure. + +=cut + +sub extract_file { + my $self = shift; + my $file = shift; return unless defined $file; + my $alt = shift; + + my $entry = $self->_find_entry( $file ) + or $self->_error( qq[Could not find an entry for '$file'] ), return; + + return $self->_extract_file( $entry, $alt ); +} + +sub _extract_file { + my $self = shift; + my $entry = shift or return; + my $alt = shift; + + ### you wanted an alternate extraction location ### + my $name = defined $alt ? $alt : $entry->full_path; + + ### splitpath takes a bool at the end to indicate + ### that it's splitting a dir + my ($vol,$dirs,$file); + if ( defined $alt ) { # It's a local-OS path + ($vol,$dirs,$file) = File::Spec->splitpath( $alt, + $entry->is_dir ); + } else { + ($vol,$dirs,$file) = File::Spec::Unix->splitpath( $name, + $entry->is_dir ); + } + + my $dir; + ### is $name an absolute path? ### + if( $vol || File::Spec->file_name_is_absolute( $dirs ) ) { + + ### absolute names are not allowed to be in tarballs under + ### strict mode, so only allow it if a user tells us to do it + if( not defined $alt and not $INSECURE_EXTRACT_MODE ) { + $self->_error( + q[Entry ']. $entry->full_path .q[' is an absolute path. ]. + q[Not extracting absolute paths under SECURE EXTRACT MODE] + ); + return; + } + + ### user asked us to, it's fine. + $dir = File::Spec->catpath( $vol, $dirs, "" ); + + ### it's a relative path ### + } else { + my $cwd = (ref $self and defined $self->{cwd}) + ? $self->{cwd} + : cwd(); + + my @dirs = defined $alt + ? File::Spec->splitdir( $dirs ) # It's a local-OS path + : File::Spec::Unix->splitdir( $dirs ); # it's UNIX-style, likely + # straight from the tarball + + if( not defined $alt and + not $INSECURE_EXTRACT_MODE + ) { + + ### paths that leave the current directory are not allowed under + ### strict mode, so only allow it if a user tells us to do this. + if( grep { $_ eq '..' } @dirs ) { + + $self->_error( + q[Entry ']. $entry->full_path .q[' is attempting to leave ]. + q[the current working directory. Not extracting under ]. + q[SECURE EXTRACT MODE] + ); + return; + } + + ### the archive may be asking us to extract into a symlink. This + ### is not sane and a possible security issue, as outlined here: + ### https://rt.cpan.org/Ticket/Display.html?id=30380 + ### https://bugzilla.redhat.com/show_bug.cgi?id=295021 + ### https://issues.rpath.com/browse/RPL-1716 + my $full_path = $cwd; + for my $d ( @dirs ) { + $full_path = File::Spec->catdir( $full_path, $d ); + + ### we've already checked this one, and it's safe. Move on. + next if ref $self and $self->{_link_cache}->{$full_path}; + + if( -l $full_path ) { + my $to = readlink $full_path; + my $diag = "symlinked directory ($full_path => $to)"; + + $self->_error( + q[Entry ']. $entry->full_path .q[' is attempting to ]. + qq[extract to a $diag. This is considered a security ]. + q[vulnerability and not allowed under SECURE EXTRACT ]. + q[MODE] + ); + return; + } + + ### XXX keep a cache if possible, so the stats become cheaper: + $self->{_link_cache}->{$full_path} = 1 if ref $self; + } + } + + ### '.' is the directory delimiter on VMS, which has to be escaped + ### or changed to '_' on vms. vmsify is used, because older versions + ### of vmspath do not handle this properly. + ### Must not add a '/' to an empty directory though. + map { length() ? VMS::Filespec::vmsify($_.'/') : $_ } @dirs if ON_VMS; + + my ($cwd_vol,$cwd_dir,$cwd_file) + = File::Spec->splitpath( $cwd ); + my @cwd = File::Spec->splitdir( $cwd_dir ); + push @cwd, $cwd_file if length $cwd_file; + + ### We need to pass '' as the last element to catpath. Craig Berry + ### explains why (msgid ): + ### The root problem is that splitpath on UNIX always returns the + ### final path element as a file even if it is a directory, and of + ### course there is no way it can know the difference without checking + ### against the filesystem, which it is documented as not doing. When + ### you turn around and call catpath, on VMS you have to know which bits + ### are directory bits and which bits are file bits. In this case we + ### know the result should be a directory. I had thought you could omit + ### the file argument to catpath in such a case, but apparently on UNIX + ### you can't. + $dir = File::Spec->catpath( + $cwd_vol, File::Spec->catdir( @cwd, @dirs ), '' + ); + + ### catdir() returns undef if the path is longer than 255 chars on + ### older VMS systems. + unless ( defined $dir ) { + $^W && $self->_error( qq[Could not compose a path for '$dirs'\n] ); + return; + } + + } + + if( -e $dir && !-d _ ) { + $^W && $self->_error( qq['$dir' exists, but it's not a directory!\n] ); + return; + } + + unless ( -d _ ) { + eval { File::Path::mkpath( $dir, 0, 0777 ) }; + if( $@ ) { + my $fp = $entry->full_path; + $self->_error(qq[Could not create directory '$dir' for '$fp': $@]); + return; + } + + ### XXX chown here? that might not be the same as in the archive + ### as we're only chown'ing to the owner of the file we're extracting + ### not to the owner of the directory itself, which may or may not + ### be another entry in the archive + ### Answer: no, gnu tar doesn't do it either, it'd be the wrong + ### way to go. + #if( $CHOWN && CAN_CHOWN ) { + # chown $entry->uid, $entry->gid, $dir or + # $self->_error( qq[Could not set uid/gid on '$dir'] ); + #} + } + + ### we're done if we just needed to create a dir ### + return 1 if $entry->is_dir; + + my $full = File::Spec->catfile( $dir, $file ); + + if( $entry->is_unknown ) { + $self->_error( qq[Unknown file type for file '$full'] ); + return; + } + + ### If a file system already contains a block device with the same name as + ### the being extracted regular file, we would write the file's content + ### to the block device. So remove the existing file (block device) now. + ### If an archive contains multiple same-named entries, the last one + ### should replace the previous ones. So remove the old file now. + ### If the old entry is a symlink to a file outside of the CWD, the new + ### entry would create a file there. This is CVE-2018-12015 + ### . + if (-l $full || -e _) { + if (!unlink $full) { + $self->_error( qq[Could not remove old file '$full': $!] ); + return; + } + } + if( length $entry->type && $entry->is_file ) { + my $fh = IO::File->new; + $fh->open( $full, '>' ) or ( + $self->_error( qq[Could not open file '$full': $!] ), + return + ); + + if( $entry->size ) { + binmode $fh; + my $offset = 0; + my $content = $entry->get_content_by_ref(); + while ($offset < $entry->size) { + my $written + = syswrite $fh, $$content, $EXTRACT_BLOCK_SIZE, $offset; + if (defined $written) { + $offset += $written; + } else { + $self->_error( qq[Could not write data to '$full': $!] ); + return; + } + } + } + + close $fh or ( + $self->_error( qq[Could not close file '$full'] ), + return + ); + + } else { + $self->_make_special_file( $entry, $full ) or return; + } + + ### only update the timestamp if it's not a symlink; that will change the + ### timestamp of the original. This addresses bug #33669: Could not update + ### timestamp warning on symlinks + if( not -l $full ) { + utime time, $entry->mtime - TIME_OFFSET, $full or + $self->_error( qq[Could not update timestamp] ); + } + + if( $CHOWN && CAN_CHOWN->() and not -l $full ) { + CORE::chown( $entry->uid, $entry->gid, $full ) or + $self->_error( qq[Could not set uid/gid on '$full'] ); + } + + ### only chmod if we're allowed to, but never chmod symlinks, since they'll + ### change the perms on the file they're linking too... + if( $CHMOD and not -l $full ) { + my $mode = $entry->mode; + unless ($SAME_PERMISSIONS) { + $mode &= ~(oct(7000) | umask); + } + CORE::chmod( $mode, $full ) or + $self->_error( qq[Could not chown '$full' to ] . $entry->mode ); + } + + return 1; +} + +sub _make_special_file { + my $self = shift; + my $entry = shift or return; + my $file = shift; return unless defined $file; + + my $err; + + if( $entry->is_symlink ) { + my $fail; + if( ON_UNIX ) { + symlink( $entry->linkname, $file ) or $fail++; + + } else { + $self->_extract_special_file_as_plain_file( $entry, $file ) + or $fail++; + } + + $err = qq[Making symbolic link '$file' to '] . + $entry->linkname .q[' failed] if $fail; + + } elsif ( $entry->is_hardlink ) { + my $fail; + if( ON_UNIX ) { + link( $entry->linkname, $file ) or $fail++; + + } else { + $self->_extract_special_file_as_plain_file( $entry, $file ) + or $fail++; + } + + $err = qq[Making hard link from '] . $entry->linkname . + qq[' to '$file' failed] if $fail; + + } elsif ( $entry->is_fifo ) { + ON_UNIX && !system('mknod', $file, 'p') or + $err = qq[Making fifo ']. $entry->name .qq[' failed]; + + } elsif ( $entry->is_blockdev or $entry->is_chardev ) { + my $mode = $entry->is_blockdev ? 'b' : 'c'; + + ON_UNIX && !system('mknod', $file, $mode, + $entry->devmajor, $entry->devminor) or + $err = qq[Making block device ']. $entry->name .qq[' (maj=] . + $entry->devmajor . qq[ min=] . $entry->devminor . + qq[) failed.]; + + } elsif ( $entry->is_socket ) { + ### the original doesn't do anything special for sockets.... ### + 1; + } + + return $err ? $self->_error( $err ) : 1; +} + +### don't know how to make symlinks, let's just extract the file as +### a plain file +sub _extract_special_file_as_plain_file { + my $self = shift; + my $entry = shift or return; + my $file = shift; return unless defined $file; + + my $err; + TRY: { + my $orig = $self->_find_entry( $entry->linkname, $entry ); + + unless( $orig ) { + $err = qq[Could not find file '] . $entry->linkname . + qq[' in memory.]; + last TRY; + } + + ### clone the entry, make it appear as a normal file ### + my $clone = $orig->clone; + $clone->_downgrade_to_plainfile; + $self->_extract_file( $clone, $file ) or last TRY; + + return 1; + } + + return $self->_error($err); +} + +=head2 $tar->list_files( [\@properties] ) + +Returns a list of the names of all the files in the archive. + +If C is passed an array reference as its first argument +it returns a list of hash references containing the requested +properties of each file. The following list of properties is +supported: name, size, mtime (last modified date), mode, uid, gid, +linkname, uname, gname, devmajor, devminor, prefix. + +Passing an array reference containing only one element, 'name', is +special cased to return a list of names rather than a list of hash +references, making it equivalent to calling C without +arguments. + +=cut + +sub list_files { + my $self = shift; + my $aref = shift || [ ]; + + unless( $self->_data ) { + $self->read() or return; + } + + if( @$aref == 0 or ( @$aref == 1 and $aref->[0] eq 'name' ) ) { + return map { $_->full_path } @{$self->_data}; + } else { + + #my @rv; + #for my $obj ( @{$self->_data} ) { + # push @rv, { map { $_ => $obj->$_() } @$aref }; + #} + #return @rv; + + ### this does the same as the above.. just needs a +{ } + ### to make sure perl doesn't confuse it for a block + return map { my $o=$_; + +{ map { $_ => $o->$_() } @$aref } + } @{$self->_data}; + } +} + +sub _find_entry { + my $self = shift; + my $file = shift; + + unless( defined $file ) { + $self->_error( qq[No file specified] ); + return; + } + + ### it's an object already + return $file if UNIVERSAL::isa( $file, 'Archive::Tar::File' ); + +seach_entry: + if($self->_data){ + for my $entry ( @{$self->_data} ) { + my $path = $entry->full_path; + return $entry if $path eq $file; + } + } + + if($Archive::Tar::RESOLVE_SYMLINK!~/none/){ + if(my $link_entry = shift()){#fallback mode when symlinks are using relative notations ( ../a/./b/text.bin ) + $file = _symlinks_resolver( $link_entry->name, $file ); + goto seach_entry if $self->_data; + + #this will be slower than never, but won't failed! + + my $iterargs = $link_entry->{'_archive'}; + if($Archive::Tar::RESOLVE_SYMLINK=~/speed/ && @$iterargs==3){ + #faster but whole archive will be read in memory + #read whole archive and share data + my $archive = Archive::Tar->new; + $archive->read( @$iterargs ); + push @$iterargs, $archive; #take a trace for destruction + if($archive->_data){ + $self->_data( $archive->_data ); + goto seach_entry; + } + }#faster + + {#slower but lower memory usage + # $iterargs = [$filename, $compressed, $opts]; + my $next = Archive::Tar->iter( @$iterargs ); + while(my $e = $next->()){ + if($e->full_path eq $file){ + undef $next; + return $e; + } + } + }#slower + } + } + + $self->_error( qq[No such file in archive: '$file'] ); + return; +} + +=head2 $tar->get_files( [@filenames] ) + +Returns the C objects matching the filenames +provided. If no filename list was passed, all C +objects in the current Tar object are returned. + +Please refer to the C documentation on how to +handle these objects. + +=cut + +sub get_files { + my $self = shift; + + return @{ $self->_data } unless @_; + + my @list; + for my $file ( @_ ) { + push @list, grep { defined } $self->_find_entry( $file ); + } + + return @list; +} + +=head2 $tar->get_content( $file ) + +Return the content of the named file. + +=cut + +sub get_content { + my $self = shift; + my $entry = $self->_find_entry( shift ) or return; + + return $entry->data; +} + +=head2 $tar->replace_content( $file, $content ) + +Make the string $content be the content for the file named $file. + +=cut + +sub replace_content { + my $self = shift; + my $entry = $self->_find_entry( shift ) or return; + + return $entry->replace_content( shift ); +} + +=head2 $tar->rename( $file, $new_name ) + +Rename the file of the in-memory archive to $new_name. + +Note that you must specify a Unix path for $new_name, since per tar +standard, all files in the archive must be Unix paths. + +Returns true on success and false on failure. + +=cut + +sub rename { + my $self = shift; + my $file = shift; return unless defined $file; + my $new = shift; return unless defined $new; + + my $entry = $self->_find_entry( $file ) or return; + + return $entry->rename( $new ); +} + +=head2 $tar->chmod( $file, $mode ) + +Change mode of $file to $mode. + +Returns true on success and false on failure. + +=cut + +sub chmod { + my $self = shift; + my $file = shift; return unless defined $file; + my $mode = shift; return unless defined $mode && $mode =~ /^[0-7]{1,4}$/; + my @args = ("$mode"); + + my $entry = $self->_find_entry( $file ) or return; + my $x = $entry->chmod( @args ); + return $x; +} + +=head2 $tar->chown( $file, $uname [, $gname] ) + +Change owner $file to $uname and $gname. + +Returns true on success and false on failure. + +=cut + +sub chown { + my $self = shift; + my $file = shift; return unless defined $file; + my $uname = shift; return unless defined $uname; + my @args = ($uname); + push(@args, shift); + + my $entry = $self->_find_entry( $file ) or return; + my $x = $entry->chown( @args ); + return $x; +} + +=head2 $tar->remove (@filenamelist) + +Removes any entries with names matching any of the given filenames +from the in-memory archive. Returns a list of C +objects that remain. + +=cut + +sub remove { + my $self = shift; + my @list = @_; + + my %seen = map { $_->full_path => $_ } @{$self->_data}; + delete $seen{ $_ } for @list; + + $self->_data( [values %seen] ); + + return values %seen; +} + +=head2 $tar->clear + +C clears the current in-memory archive. This effectively gives +you a 'blank' object, ready to be filled again. Note that C +only has effect on the object, not the underlying tarfile. + +=cut + +sub clear { + my $self = shift or return; + + $self->_data( [] ); + $self->_file( '' ); + + return 1; +} + + +=head2 $tar->write ( [$file, $compressed, $prefix] ) + +Write the in-memory archive to disk. The first argument can either +be the name of a file or a reference to an already open filehandle (a +GLOB reference). + +The second argument is used to indicate compression. You can +compress using C, C or C. If you pass a digit, it's assumed +to be the C compression level (between 1 and 9), but the use of +constants is preferred: + + # write a gzip compressed file + $tar->write( 'out.tgz', COMPRESS_GZIP ); + + # write a bzip compressed file + $tar->write( 'out.tbz', COMPRESS_BZIP ); + + # write a xz compressed file + $tar->write( 'out.txz', COMPRESS_XZ ); + +Note that when you pass in a filehandle, the compression argument +is ignored, as all files are printed verbatim to your filehandle. +If you wish to enable compression with filehandles, use an +C, C or C filehandle instead. + +The third argument is an optional prefix. All files will be tucked +away in the directory you specify as prefix. So if you have files +'a' and 'b' in your archive, and you specify 'foo' as prefix, they +will be written to the archive as 'foo/a' and 'foo/b'. + +If no arguments are given, C returns the entire formatted +archive as a string, which could be useful if you'd like to stuff the +archive into a socket or a pipe to gzip or something. + + +=cut + +sub write { + my $self = shift; + my $file = shift; $file = '' unless defined $file; + my $gzip = shift || 0; + my $ext_prefix = shift; $ext_prefix = '' unless defined $ext_prefix; + my $dummy = ''; + + ### only need a handle if we have a file to print to ### + my $handle = length($file) + ? ( $self->_get_handle($file, $gzip, WRITE_ONLY->($gzip) ) + or return ) + : $HAS_PERLIO ? do { open my $h, '>', \$dummy; $h } + : $HAS_IO_STRING ? IO::String->new + : __PACKAGE__->no_string_support(); + + ### Addresses: #41798: Nonempty $\ when writing a TAR file produces a + ### corrupt TAR file. Must clear out $\ to make sure no garbage is + ### printed to the archive + local $\; + + for my $entry ( @{$self->_data} ) { + ### entries to be written to the tarfile ### + my @write_me; + + ### only now will we change the object to reflect the current state + ### of the name and prefix fields -- this needs to be limited to + ### write() only! + my $clone = $entry->clone; + + + ### so, if you don't want use to use the prefix, we'll stuff + ### everything in the name field instead + if( $DO_NOT_USE_PREFIX ) { + + ### you might have an extended prefix, if so, set it in the clone + ### XXX is ::Unix right? + $clone->name( length $ext_prefix + ? File::Spec::Unix->catdir( $ext_prefix, + $clone->full_path) + : $clone->full_path ); + $clone->prefix( '' ); + + ### otherwise, we'll have to set it properly -- prefix part in the + ### prefix and name part in the name field. + } else { + + ### split them here, not before! + my ($prefix,$name) = $clone->_prefix_and_file( $clone->full_path ); + + ### you might have an extended prefix, if so, set it in the clone + ### XXX is ::Unix right? + $prefix = File::Spec::Unix->catdir( $ext_prefix, $prefix ) + if length $ext_prefix; + + $clone->prefix( $prefix ); + $clone->name( $name ); + } + + ### names are too long, and will get truncated if we don't add a + ### '@LongLink' file... + my $make_longlink = ( length($clone->name) > NAME_LENGTH or + length($clone->prefix) > PREFIX_LENGTH + ) || 0; + + ### perhaps we need to make a longlink file? + if( $make_longlink ) { + my $longlink = Archive::Tar::File->new( + data => LONGLINK_NAME, + $clone->full_path, + { type => LONGLINK } + ); + + unless( $longlink ) { + $self->_error( qq[Could not create 'LongLink' entry for ] . + qq[oversize file '] . $clone->full_path ."'" ); + return; + }; + + push @write_me, $longlink; + } + + push @write_me, $clone; + + ### write the one, optionally 2 a::t::file objects to the handle + for my $clone (@write_me) { + + ### if the file is a symlink, there are 2 options: + ### either we leave the symlink intact, but then we don't write any + ### data OR we follow the symlink, which means we actually make a + ### copy. if we do the latter, we have to change the TYPE of the + ### clone to 'FILE' + my $link_ok = $clone->is_symlink && $Archive::Tar::FOLLOW_SYMLINK; + my $data_ok = !$clone->is_symlink && $clone->has_content; + + ### downgrade to a 'normal' file if it's a symlink we're going to + ### treat as a regular file + $clone->_downgrade_to_plainfile if $link_ok; + + ### get the header for this block + my $header = $self->_format_tar_entry( $clone ); + unless( $header ) { + $self->_error(q[Could not format header for: ] . + $clone->full_path ); + return; + } + + unless( print $handle $header ) { + $self->_error(q[Could not write header for: ] . + $clone->full_path); + return; + } + + if( $link_ok or $data_ok ) { + unless( print $handle $clone->data ) { + $self->_error(q[Could not write data for: ] . + $clone->full_path); + return; + } + + ### pad the end of the clone if required ### + print $handle TAR_PAD->( $clone->size ) if $clone->size % BLOCK + } + + } ### done writing these entries + } + + ### write the end markers ### + print $handle TAR_END x 2 or + return $self->_error( qq[Could not write tar end markers] ); + + ### did you want it written to a file, or returned as a string? ### + my $rv = length($file) ? 1 + : $HAS_PERLIO ? $dummy + : do { seek $handle, 0, 0; local $/; <$handle> }; + + ### make sure to close the handle if we created it + if ( $file ne $handle ) { + unless( close $handle ) { + $self->_error( qq[Could not write tar] ); + return; + } + } + + return $rv; +} + +sub _format_tar_entry { + my $self = shift; + my $entry = shift or return; + my $ext_prefix = shift; $ext_prefix = '' unless defined $ext_prefix; + my $no_prefix = shift || 0; + + my $file = $entry->name; + my $prefix = $entry->prefix; $prefix = '' unless defined $prefix; + + ### remove the prefix from the file name + ### not sure if this is still needed --kane + ### no it's not -- Archive::Tar::File->_new_from_file will take care of + ### this for us. Even worse, this would break if we tried to add a file + ### like x/x. + #if( length $prefix ) { + # $file =~ s/^$match//; + #} + + $prefix = File::Spec::Unix->catdir($ext_prefix, $prefix) + if length $ext_prefix; + + ### not sure why this is... ### + my $l = PREFIX_LENGTH; # is ambiguous otherwise... + substr ($prefix, 0, -$l) = "" if length $prefix >= PREFIX_LENGTH; + + my $f1 = "%06o"; my $f2 = $ZERO_PAD_NUMBERS ? "%011o" : "%11o"; + + ### this might be optimizable with a 'changed' flag in the file objects ### + my $tar = pack ( + PACK, + $file, + + (map { sprintf( $f1, $entry->$_() ) } qw[mode uid gid]), + (map { sprintf( $f2, $entry->$_() ) } qw[size mtime]), + + "", # checksum field - space padded a bit down + + (map { $entry->$_() } qw[type linkname magic]), + + $entry->version || TAR_VERSION, + + (map { $entry->$_() } qw[uname gname]), + (map { sprintf( $f1, $entry->$_() ) } qw[devmajor devminor]), + + ($no_prefix ? '' : $prefix) + ); + + ### add the checksum ### + my $checksum_fmt = $ZERO_PAD_NUMBERS ? "%06o\0" : "%06o\0"; + substr($tar,148,7) = sprintf("%6o\0", unpack("%16C*",$tar)); + + return $tar; +} + +=head2 $tar->add_files( @filenamelist ) + +Takes a list of filenames and adds them to the in-memory archive. + +The path to the file is automatically converted to a Unix like +equivalent for use in the archive, and, if on MacOS, the file's +modification time is converted from the MacOS epoch to the Unix epoch. +So tar archives created on MacOS with B can be read +both with I on Unix and applications like I or +I on MacOS. + +Be aware that the file's type/creator and resource fork will be lost, +which is usually what you want in cross-platform archives. + +Instead of a filename, you can also pass it an existing C +object from, for example, another archive. The object will be clone, and +effectively be a copy of the original, not an alias. + +Returns a list of C objects that were just added. + +=cut + +sub add_files { + my $self = shift; + my @files = @_ or return; + + my @rv; + for my $file ( @files ) { + + ### you passed an Archive::Tar::File object + ### clone it so we don't accidentally have a reference to + ### an object from another archive + if( UNIVERSAL::isa( $file,'Archive::Tar::File' ) ) { + push @rv, $file->clone; + next; + } + + eval { + if( utf8::is_utf8( $file )) { + utf8::encode( $file ); + } + }; + + unless( -e $file || -l $file ) { + $self->_error( qq[No such file: '$file'] ); + next; + } + + my $obj = Archive::Tar::File->new( file => $file ); + unless( $obj ) { + $self->_error( qq[Unable to add file: '$file'] ); + next; + } + + push @rv, $obj; + } + + push @{$self->{_data}}, @rv; + + return @rv; +} + +=head2 $tar->add_data ( $filename, $data, [$opthashref] ) + +Takes a filename, a scalar full of data and optionally a reference to +a hash with specific options. + +Will add a file to the in-memory archive, with name C<$filename> and +content C<$data>. Specific properties can be set using C<$opthashref>. +The following list of properties is supported: name, size, mtime +(last modified date), mode, uid, gid, linkname, uname, gname, +devmajor, devminor, prefix, type. (On MacOS, the file's path and +modification times are converted to Unix equivalents.) + +Valid values for the file type are the following constants defined by +Archive::Tar::Constant: + +=over 4 + +=item FILE + +Regular file. + +=item HARDLINK + +=item SYMLINK + +Hard and symbolic ("soft") links; linkname should specify target. + +=item CHARDEV + +=item BLOCKDEV + +Character and block devices. devmajor and devminor should specify the major +and minor device numbers. + +=item DIR + +Directory. + +=item FIFO + +FIFO (named pipe). + +=item SOCKET + +Socket. + +=back + +Returns the C object that was just added, or +C on failure. + +=cut + +sub add_data { + my $self = shift; + my ($file, $data, $opt) = @_; + + my $obj = Archive::Tar::File->new( data => $file, $data, $opt ); + unless( $obj ) { + $self->_error( qq[Unable to add file: '$file'] ); + return; + } + + push @{$self->{_data}}, $obj; + + return $obj; +} + +=head2 $tar->error( [$BOOL] ) + +Returns the current error string (usually, the last error reported). +If a true value was specified, it will give the C +equivalent of the error, in effect giving you a stacktrace. + +For backwards compatibility, this error is also available as +C<$Archive::Tar::error> although it is much recommended you use the +method call instead. + +=cut + +{ + $error = ''; + my $longmess; + + sub _error { + my $self = shift; + my $msg = $error = shift; + $longmess = Carp::longmess($error); + if (ref $self) { + $self->{_error} = $error; + $self->{_longmess} = $longmess; + } + + ### set Archive::Tar::WARN to 0 to disable printing + ### of errors + if( $WARN ) { + carp $DEBUG ? $longmess : $msg; + } + + return; + } + + sub error { + my $self = shift; + if (ref $self) { + return shift() ? $self->{_longmess} : $self->{_error}; + } else { + return shift() ? $longmess : $error; + } + } +} + +=head2 $tar->setcwd( $cwd ); + +C needs to know the current directory, and it will run +C I time it extracts a I entry from the +tarfile and saves it in the file system. (As of version 1.30, however, +C will use the speed optimization described below +automatically, so it's only relevant if you're using C). + +Since C doesn't change the current directory internally +while it is extracting the items in a tarball, all calls to C +can be avoided if we can guarantee that the current directory doesn't +get changed externally. + +To use this performance boost, set the current directory via + + use Cwd; + $tar->setcwd( cwd() ); + +once before calling a function like C and +C will use the current directory setting from then on +and won't call C internally. + +To switch back to the default behaviour, use + + $tar->setcwd( undef ); + +and C will call C internally again. + +If you're using C's C method, C will +be called for you. + +=cut + +sub setcwd { + my $self = shift; + my $cwd = shift; + + $self->{cwd} = $cwd; +} + +=head1 Class Methods + +=head2 Archive::Tar->create_archive($file, $compressed, @filelist) + +Creates a tar file from the list of files provided. The first +argument can either be the name of the tar file to create or a +reference to an open file handle (e.g. a GLOB reference). + +The second argument is used to indicate compression. You can +compress using C, C or C. If you pass a digit, it's assumed +to be the C compression level (between 1 and 9), but the use of +constants is preferred: + + # write a gzip compressed file + Archive::Tar->create_archive( 'out.tgz', COMPRESS_GZIP, @filelist ); + + # write a bzip compressed file + Archive::Tar->create_archive( 'out.tbz', COMPRESS_BZIP, @filelist ); + + # write a xz compressed file + Archive::Tar->create_archive( 'out.txz', COMPRESS_XZ, @filelist ); + +Note that when you pass in a filehandle, the compression argument +is ignored, as all files are printed verbatim to your filehandle. +If you wish to enable compression with filehandles, use an +C, C or C filehandle instead. + +The remaining arguments list the files to be included in the tar file. +These files must all exist. Any files which don't exist or can't be +read are silently ignored. + +If the archive creation fails for any reason, C will +return false. Please use the C method to find the cause of the +failure. + +Note that this method does not write C as it were; it +still reads all the files into memory before writing out the archive. +Consult the FAQ below if this is a problem. + +=cut + +sub create_archive { + my $class = shift; + + my $file = shift; return unless defined $file; + my $gzip = shift || 0; + my @files = @_; + + unless( @files ) { + return $class->_error( qq[Cowardly refusing to create empty archive!] ); + } + + my $tar = $class->new; + $tar->add_files( @files ); + return $tar->write( $file, $gzip ); +} + +=head2 Archive::Tar->iter( $filename, [ $compressed, {opt => $val} ] ) + +Returns an iterator function that reads the tar file without loading +it all in memory. Each time the function is called it will return the +next file in the tarball. The files are returned as +C objects. The iterator function returns the +empty list once it has exhausted the files contained. + +The second argument can be a hash reference with options, which are +identical to the arguments passed to C. + +Example usage: + + my $next = Archive::Tar->iter( "example.tar.gz", 1, {filter => qr/\.pm$/} ); + + while( my $f = $next->() ) { + print $f->name, "\n"; + + $f->extract or warn "Extraction failed"; + + # .... + } + +=cut + + +sub iter { + my $class = shift; + my $filename = shift; + return unless defined $filename; + my $compressed = shift || 0; + my $opts = shift || {}; + + ### get a handle to read from. + my $handle = $class->_get_handle( + $filename, + $compressed, + READ_ONLY->( ZLIB ) + ) or return; + + my @data; + my $CONSTRUCT_ARGS = [ $filename, $compressed, $opts ]; + return sub { + return shift(@data) if @data; # more than one file returned? + return unless $handle; # handle exhausted? + + ### read data, should only return file + my $tarfile = $class->_read_tar($handle, { %$opts, limit => 1 }); + @data = @$tarfile if ref $tarfile && ref $tarfile eq 'ARRAY'; + if($Archive::Tar::RESOLVE_SYMLINK!~/none/){ + foreach(@data){ + #may refine this heuristic for ON_UNIX? + if($_->linkname){ + #is there a better slot to store/share it ? + $_->{'_archive'} = $CONSTRUCT_ARGS; + } + } + } + + ### return one piece of data + return shift(@data) if @data; + + ### data is exhausted, free the filehandle + undef $handle; + if(@$CONSTRUCT_ARGS == 4){ + #free archive in memory + undef $CONSTRUCT_ARGS->[-1]; + } + return; + }; +} + +=head2 Archive::Tar->list_archive($file, $compressed, [\@properties]) + +Returns a list of the names of all the files in the archive. The +first argument can either be the name of the tar file to list or a +reference to an open file handle (e.g. a GLOB reference). + +If C is passed an array reference as its third +argument it returns a list of hash references containing the requested +properties of each file. The following list of properties is +supported: full_path, name, size, mtime (last modified date), mode, +uid, gid, linkname, uname, gname, devmajor, devminor, prefix, type. + +See C for details about supported properties. + +Passing an array reference containing only one element, 'name', is +special cased to return a list of names rather than a list of hash +references. + +=cut + +sub list_archive { + my $class = shift; + my $file = shift; return unless defined $file; + my $gzip = shift || 0; + + my $tar = $class->new($file, $gzip); + return unless $tar; + + return $tar->list_files( @_ ); +} + +=head2 Archive::Tar->extract_archive($file, $compressed) + +Extracts the contents of the tar file. The first argument can either +be the name of the tar file to create or a reference to an open file +handle (e.g. a GLOB reference). All relative paths in the tar file will +be created underneath the current working directory. + +C will return a list of files it extracted. +If the archive extraction fails for any reason, C +will return false. Please use the C method to find the cause +of the failure. + +=cut + +sub extract_archive { + my $class = shift; + my $file = shift; return unless defined $file; + my $gzip = shift || 0; + + my $tar = $class->new( ) or return; + + return $tar->read( $file, $gzip, { extract => 1 } ); +} + +=head2 $bool = Archive::Tar->has_io_string + +Returns true if we currently have C support loaded. + +Either C or C support is needed to support writing +stringified archives. Currently, C is the preferred method, if +available. + +See the C section to see how to change this preference. + +=cut + +sub has_io_string { return $HAS_IO_STRING; } + +=head2 $bool = Archive::Tar->has_perlio + +Returns true if we currently have C support loaded. + +This requires C or higher, compiled with C + +Either C or C support is needed to support writing +stringified archives. Currently, C is the preferred method, if +available. + +See the C section to see how to change this preference. + +=cut + +sub has_perlio { return $HAS_PERLIO; } + +=head2 $bool = Archive::Tar->has_zlib_support + +Returns true if C can extract C compressed archives + +=cut + +sub has_zlib_support { return ZLIB } + +=head2 $bool = Archive::Tar->has_bzip2_support + +Returns true if C can extract C compressed archives + +=cut + +sub has_bzip2_support { return BZIP } + +=head2 $bool = Archive::Tar->has_xz_support + +Returns true if C can extract C compressed archives + +=cut + +sub has_xz_support { return XZ } + +=head2 Archive::Tar->can_handle_compressed_files + +A simple checking routine, which will return true if C +is able to uncompress compressed archives on the fly with C, +C and C or false if not both are installed. + +You can use this as a shortcut to determine whether C +will do what you think before passing compressed archives to its +C method. + +=cut + +sub can_handle_compressed_files { return ZLIB && BZIP ? 1 : 0 } + +sub no_string_support { + croak("You have to install IO::String to support writing archives to strings"); +} + +sub _symlinks_resolver{ + my ($src, $trg) = @_; + my @src = split /[\/\\]/, $src; + my @trg = split /[\/\\]/, $trg; + pop @src; #strip out current object name + if(@trg and $trg[0] eq ''){ + shift @trg; + #restart path from scratch + @src = ( ); + } + foreach my $part ( @trg ){ + next if $part eq '.'; #ignore current + if($part eq '..'){ + #got to parent + pop @src; + } + else{ + #append it + push @src, $part; + } + } + my $path = join('/', @src); + warn "_symlinks_resolver('$src','$trg') = $path" if $DEBUG; + return $path; +} + +1; + +__END__ + +=head1 GLOBAL VARIABLES + +=head2 $Archive::Tar::FOLLOW_SYMLINK + +Set this variable to C<1> to make C effectively make a +copy of the file when extracting. Default is C<0>, which +means the symlink stays intact. Of course, you will have to pack the +file linked to as well. + +This option is checked when you write out the tarfile using C +or C. + +This works just like C's C<-h> option. + +=head2 $Archive::Tar::CHOWN + +By default, C will try to C your files if it is +able to. In some cases, this may not be desired. In that case, set +this variable to C<0> to disable C-ing, even if it were +possible. + +The default is C<1>. + +=head2 $Archive::Tar::CHMOD + +By default, C will try to C your files to +whatever mode was specified for the particular file in the archive. +In some cases, this may not be desired. In that case, set this +variable to C<0> to disable C-ing. + +The default is C<1>. + +=head2 $Archive::Tar::SAME_PERMISSIONS + +When, C<$Archive::Tar::CHMOD> is enabled, this setting controls whether +the permissions on files from the archive are used without modification +of if they are filtered by removing any setid bits and applying the +current umask. + +The default is C<1> for the root user and C<0> for normal users. + +=head2 $Archive::Tar::DO_NOT_USE_PREFIX + +By default, C will try to put paths that are over +100 characters in the C field of your tar header, as +defined per POSIX-standard. However, some (older) tar programs +do not implement this spec. To retain compatibility with these older +or non-POSIX compliant versions, you can set the C<$DO_NOT_USE_PREFIX> +variable to a true value, and C will use an alternate +way of dealing with paths over 100 characters by using the +C feature. + +Note that clients who do not support the C +feature will not be able to read these archives. Such clients include +tars on C, C and C. + +The default is C<0>. + +=head2 $Archive::Tar::DEBUG + +Set this variable to C<1> to always get the C output +of the warnings, instead of the regular C. This is the same +message you would get by doing: + + $tar->error(1); + +Defaults to C<0>. + +=head2 $Archive::Tar::WARN + +Set this variable to C<0> if you do not want any warnings printed. +Personally I recommend against doing this, but people asked for the +option. Also, be advised that this is of course not threadsafe. + +Defaults to C<1>. + +=head2 $Archive::Tar::error + +Holds the last reported error. Kept for historical reasons, but its +use is very much discouraged. Use the C method instead: + + warn $tar->error unless $tar->extract; + +Note that in older versions of this module, the C method +would return an effectively global value even when called an instance +method as above. This has since been fixed, and multiple instances of +C now have separate error strings. + +=head2 $Archive::Tar::INSECURE_EXTRACT_MODE + +This variable indicates whether C should allow +files to be extracted outside their current working directory. + +Allowing this could have security implications, as a malicious +tar archive could alter or replace any file the extracting user +has permissions to. Therefor, the default is to not allow +insecure extractions. + +If you trust the archive, or have other reasons to allow the +archive to write files outside your current working directory, +set this variable to C. + +Note that this is a backwards incompatible change from version +C<1.36> and before. + +=head2 $Archive::Tar::HAS_PERLIO + +This variable holds a boolean indicating if we currently have +C support loaded. This will be enabled for any perl +greater than C<5.8> compiled with C. + +If you feel strongly about disabling it, set this variable to +C. Note that you will then need C installed +to support writing stringified archives. + +Don't change this variable unless you B know what you're +doing. + +=head2 $Archive::Tar::HAS_IO_STRING + +This variable holds a boolean indicating if we currently have +C support loaded. This will be enabled for any perl +that has a loadable C module. + +If you feel strongly about disabling it, set this variable to +C. Note that you will then need C support from +your perl to be able to write stringified archives. + +Don't change this variable unless you B know what you're +doing. + +=head2 $Archive::Tar::ZERO_PAD_NUMBERS + +This variable holds a boolean indicating if we will create +zero padded numbers for C, C and C. +The default is C<0>, indicating that we will create space padded +numbers. Added for compatibility with C implementations. + +=head2 Tuning the way RESOLVE_SYMLINK will works + +You can tune the behaviour by setting the $Archive::Tar::RESOLVE_SYMLINK variable, +or $ENV{PERL5_AT_RESOLVE_SYMLINK} before loading the module Archive::Tar. + +Values can be one of the following: + +=over 4 + +=item none + +Disable this mechanism and failed as it was in previous version (<1.88) + +=item speed (default) + +If you prefer speed +this will read again the whole archive using read() so all entries +will be available + +=item memory + +If you prefer memory + +=back + +Limitation: It won't work for terminal, pipe or sockets or every non seekable +source. + +=head2 $Archive::Tar::EXTRACT_BLOCK_SIZE + +This variable holds an integer with the block size that should be used when +writing files during extraction. It defaults to 1 GiB. Please note that this +cannot be arbitrarily large since some operating systems limit the number of +bytes that can be written in one call to C, so if this is too large, +extraction may fail with an error. + +=cut + +=head1 FAQ + +=over 4 + +=item What's the minimum perl version required to run Archive::Tar? + +You will need perl version 5.005_03 or newer. + +=item Isn't Archive::Tar slow? + +Yes it is. It's pure perl, so it's a lot slower then your C +However, it's very portable. If speed is an issue, consider using +C instead. + +=item Isn't Archive::Tar heavier on memory than /bin/tar? + +Yes it is, see previous answer. Since C and therefore +C doesn't support C on their filehandles, there is little +choice but to read the archive into memory. +This is ok if you want to do in-memory manipulation of the archive. + +If you just want to extract, use the C class method +instead. It will optimize and write to disk immediately. + +Another option is to use the C class method to iterate over +the files in the tarball without reading them all in memory at once. + +=item Can you lazy-load data instead? + +In some cases, yes. You can use the C class method to iterate +over the files in the tarball without reading them all in memory at once. + +=item How much memory will an X kb tar file need? + +Probably more than X kb, since it will all be read into memory. If +this is a problem, and you don't need to do in memory manipulation +of the archive, consider using the C class method, or C +instead. + +=item What do you do with unsupported filetypes in an archive? + +C has a few filetypes that aren't supported on other platforms, +like C. If we encounter a C or C we'll just +try to make a copy of the original file, rather than throwing an error. + +This does require you to read the entire archive in to memory first, +since otherwise we wouldn't know what data to fill the copy with. +(This means that you cannot use the class methods, including C +on archives that have incompatible filetypes and still expect things +to work). + +For other filetypes, like C and C we'll warn that +the extraction of this particular item didn't work. + +=item I'm using WinZip, or some other non-POSIX client, and files are not being extracted properly! + +By default, C is in a completely POSIX-compatible +mode, which uses the POSIX-specification of C to store files. +For paths greater than 100 characters, this is done using the +C. Non-POSIX-compatible clients may not support +this part of the specification, and may only support the C functionality. To facilitate those clients, you can set the +C<$Archive::Tar::DO_NOT_USE_PREFIX> variable to C. See the +C section for details on this variable. + +Note that GNU tar earlier than version 1.14 does not cope well with +the C. If you use such a version, consider setting +the C<$Archive::Tar::DO_NOT_USE_PREFIX> variable to C. + +=item How do I extract only files that have property X from an archive? + +Sometimes, you might not wish to extract a complete archive, just +the files that are relevant to you, based on some criteria. + +You can do this by filtering a list of C objects +based on your criteria. For example, to extract only files that have +the string C in their title, you would use: + + $tar->extract( + grep { $_->full_path =~ /foo/ } $tar->get_files + ); + +This way, you can filter on any attribute of the files in the archive. +Consult the C documentation on how to use these +objects. + +=item How do I access .tar.Z files? + +The C module can optionally use C (via +the C module) to access tar files that have been compressed +with C. Unfortunately tar files compressed with the Unix C +utility cannot be read by C and so cannot be directly +accesses by C. + +If the C or C programs are available, you can use +one of these workarounds to read C<.tar.Z> files from C + +Firstly with C + + use Archive::Tar; + + open F, "uncompress -c $filename |"; + my $tar = Archive::Tar->new(*F); + ... + +and this with C + + use Archive::Tar; + + open F, "gunzip -c $filename |"; + my $tar = Archive::Tar->new(*F); + ... + +Similarly, if the C program is available, you can use this to +write a C<.tar.Z> file + + use Archive::Tar; + use IO::File; + + my $fh = IO::File->new( "| compress -c >$filename" ); + my $tar = Archive::Tar->new(); + ... + $tar->write($fh); + $fh->close ; + +=item How do I handle Unicode strings? + +C uses byte semantics for any files it reads from or writes +to disk. This is not a problem if you only deal with files and never +look at their content or work solely with byte strings. But if you use +Unicode strings with character semantics, some additional steps need +to be taken. + +For example, if you add a Unicode string like + + # Problem + $tar->add_data('file.txt', "Euro: \x{20AC}"); + +then there will be a problem later when the tarfile gets written out +to disk via C<< $tar->write() >>: + + Wide character in print at .../Archive/Tar.pm line 1014. + +The data was added as a Unicode string and when writing it out to disk, +the C<:utf8> line discipline wasn't set by C, so Perl +tried to convert the string to ISO-8859 and failed. The written file +now contains garbage. + +For this reason, Unicode strings need to be converted to UTF-8-encoded +bytestrings before they are handed off to C: + + use Encode; + my $data = "Accented character: \x{20AC}"; + $data = encode('utf8', $data); + + $tar->add_data('file.txt', $data); + +A opposite problem occurs if you extract a UTF8-encoded file from a +tarball. Using C on the C object +will return its content as a bytestring, not as a Unicode string. + +If you want it to be a Unicode string (because you want character +semantics with operations like regular expression matching), you need +to decode the UTF8-encoded content and have Perl convert it into +a Unicode string: + + use Encode; + my $data = $tar->get_content(); + + # Make it a Unicode string + $data = decode('utf8', $data); + +There is no easy way to provide this functionality in C, +because a tarball can contain many files, and each of which could be +encoded in a different way. + +=back + +=head1 CAVEATS + +The AIX tar does not fill all unused space in the tar archive with 0x00. +This sometimes leads to warning messages from C. + + Invalid header block at offset nnn + +A fix for that problem is scheduled to be released in the following levels +of AIX, all of which should be coming out in the 4th quarter of 2009: + + AIX 5.3 TL7 SP10 + AIX 5.3 TL8 SP8 + AIX 5.3 TL9 SP5 + AIX 5.3 TL10 SP2 + + AIX 6.1 TL0 SP11 + AIX 6.1 TL1 SP7 + AIX 6.1 TL2 SP6 + AIX 6.1 TL3 SP3 + +The IBM APAR number for this problem is IZ50240 (Reported component ID: +5765G0300 / AIX 5.3). It is possible to get an ifix for that problem. +If you need an ifix please contact your local IBM AIX support. + +=head1 TODO + +=over 4 + +=item Check if passed in handles are open for read/write + +Currently I don't know of any portable pure perl way to do this. +Suggestions welcome. + +=item Allow archives to be passed in as string + +Currently, we only allow opened filehandles or filenames, but +not strings. The internals would need some reworking to facilitate +stringified archives. + +=item Facilitate processing an opened filehandle of a compressed archive + +Currently, we only support this if the filehandle is an IO::Zlib object. +Environments, like apache, will present you with an opened filehandle +to an uploaded file, which might be a compressed archive. + +=back + +=head1 SEE ALSO + +=over 4 + +=item The GNU tar specification + +L + +=item The PAX format specification + +The specification which tar derives from; L + +=back + +=head1 AUTHOR + +This module by Jos Boumans Ekane@cpan.orgE. + +Please reports bugs to Ebug-archive-tar@rt.cpan.orgE. + +=head1 ACKNOWLEDGEMENTS + +Thanks to Sean Burke, Chris Nandor, Chip Salzenberg, Tim Heaney, Gisle Aas, +Rainer Tammer and especially Andrew Savige for their help and suggestions. + +=head1 COPYRIGHT + +This module is copyright (c) 2002 - 2009 Jos Boumans +Ekane@cpan.orgE. All rights reserved. + +This library is free software; you may redistribute and/or modify +it under the same terms as Perl itself. + +=cut diff --git a/src/main/perl/lib/Archive/Tar/Constant.pm b/src/main/perl/lib/Archive/Tar/Constant.pm new file mode 100644 index 000000000..4612092ba --- /dev/null +++ b/src/main/perl/lib/Archive/Tar/Constant.pm @@ -0,0 +1,127 @@ +package Archive::Tar::Constant; + +use strict; +use warnings; + +use vars qw[$VERSION @ISA @EXPORT]; + +BEGIN { + require Exporter; + + $VERSION = '3.04'; + @ISA = qw[Exporter]; + + require Time::Local if $^O eq "MacOS"; +} + +@EXPORT = Archive::Tar::Constant->_list_consts( __PACKAGE__ ); + +use constant FILE => 0; +use constant HARDLINK => 1; +use constant SYMLINK => 2; +use constant CHARDEV => 3; +use constant BLOCKDEV => 4; +use constant DIR => 5; +use constant FIFO => 6; +use constant SOCKET => 8; +use constant UNKNOWN => 9; +use constant LONGLINK => 'L'; +use constant LABEL => 'V'; + +use constant BUFFER => 4096; +use constant HEAD => 512; +use constant BLOCK => 512; + +use constant COMPRESS_GZIP => 9; +use constant COMPRESS_BZIP => 'bzip2'; +use constant COMPRESS_XZ => 'xz'; + +use constant BLOCK_SIZE => sub { my $n = int($_[0]/BLOCK); $n++ if $_[0] % BLOCK; $n * BLOCK }; +use constant TAR_PAD => sub { my $x = shift || return; return "\0" x (BLOCK - ($x % BLOCK) ) }; +use constant TAR_END => "\0" x BLOCK; + +use constant READ_ONLY => sub { shift() ? 'rb' : 'r' }; +use constant WRITE_ONLY => sub { $_[0] ? 'wb' . shift : 'w' }; +use constant MODE_READ => sub { $_[0] =~ /^r/ ? 1 : 0 }; + +# Pointless assignment to make -w shut up +my $getpwuid; $getpwuid = 'unknown' unless eval { my $f = getpwuid (0); }; +my $getgrgid; $getgrgid = 'unknown' unless eval { my $f = getgrgid (0); }; +use constant UNAME => sub { $getpwuid || scalar getpwuid( shift() ) || '' }; +use constant GNAME => sub { $getgrgid || scalar getgrgid( shift() ) || '' }; +use constant UID => $>; +use constant GID => (split ' ', $) )[0]; + +use constant MODE => do { 0666 & (0777 & ~umask) }; +use constant STRIP_MODE => sub { shift() & 0777 }; +use constant CHECK_SUM => " "; + +use constant UNPACK => 'a100 a8 a8 a8 a12 a12 a8 a1 a100 A6 a2 a32 a32 a8 a8 a155 x12'; # cdrake - size must be a12 - not A12 - or else screws up huge file sizes (>8gb) +use constant PACK => 'a100 a8 a8 a8 a12 a12 A8 a1 a100 a6 a2 a32 a32 a8 a8 a155 x12'; +use constant NAME_LENGTH => 100; +use constant PREFIX_LENGTH => 155; + +use constant TIME_OFFSET => ($^O eq "MacOS") ? Time::Local::timelocal(0,0,0,1,0,1970) : 0; +use constant MAGIC => "ustar"; +use constant TAR_VERSION => "00"; +use constant LONGLINK_NAME => '././@LongLink'; +use constant PAX_HEADER => 'pax_global_header'; + + ### allow ZLIB to be turned off using ENV: DEBUG only +use constant ZLIB => do { !$ENV{'PERL5_AT_NO_ZLIB'} and + eval { require IO::Zlib }; + $ENV{'PERL5_AT_NO_ZLIB'} || $@ ? 0 : 1 + }; + + ### allow BZIP to be turned off using ENV: DEBUG only +use constant BZIP => do { !$ENV{'PERL5_AT_NO_BZIP'} and + eval { require IO::Uncompress::Bunzip2; + require IO::Compress::Bzip2; }; + $ENV{'PERL5_AT_NO_BZIP'} || $@ ? 0 : 1 + }; + + ### allow XZ to be turned off using ENV: DEBUG only +use constant XZ => do { !$ENV{'PERL5_AT_NO_XZ'} and + eval { require IO::Compress::Xz; + require IO::Uncompress::UnXz; }; + $ENV{'PERL5_AT_NO_XZ'} || $@ ? 0 : 1 + }; + +use constant GZIP_MAGIC_NUM => qr/^(?:\037\213|\037\235)/; + + # ASCII: B Z h 0 9 +use constant BZIP_MAGIC_NUM => qr/^\x42\x5A\x68[\x30-\x39]/; + +use constant XZ_MAGIC_NUM => qr/^\xFD\x37\x7A\x58\x5A\x00/; + +use constant CAN_CHOWN => sub { ($> == 0 and $^O ne "MacOS" and $^O ne "MSWin32") }; +use constant CAN_READLINK => ($^O ne 'MSWin32' and $^O !~ /RISC(?:[ _])?OS/i and $^O ne 'VMS'); +use constant ON_UNIX => ($^O ne 'MSWin32' and $^O ne 'MacOS' and $^O ne 'VMS'); +use constant ON_VMS => $^O eq 'VMS'; + +sub _list_consts { + my $class = shift; + my $pkg = shift; + return unless defined $pkg; # some joker might use '0' as a pkg... + + my @rv; + { no strict 'refs'; + my $stash = $pkg . '::'; + + for my $name (sort keys %$stash ) { + + ### is it a subentry? + my $sub = $pkg->can( $name ); + next unless defined $sub; + + next unless defined prototype($sub) and + not length prototype($sub); + + push @rv, $name; + } + } + + return sort @rv; +} + +1; diff --git a/src/main/perl/lib/Archive/Tar/File.pm b/src/main/perl/lib/Archive/Tar/File.pm new file mode 100644 index 000000000..1661a6a03 --- /dev/null +++ b/src/main/perl/lib/Archive/Tar/File.pm @@ -0,0 +1,718 @@ +package Archive::Tar::File; +use strict; + +use Carp (); +use IO::File; +use File::Spec::Unix (); +use File::Spec (); +use File::Basename (); + +use Archive::Tar::Constant; + +use vars qw[@ISA $VERSION]; +#@ISA = qw[Archive::Tar]; +$VERSION = '3.04'; + +### set value to 1 to oct() it during the unpack ### + +my $tmpl = [ + name => 0, # string A100 + mode => 1, # octal A8 + uid => 1, # octal A8 + gid => 1, # octal A8 + size => 0, # octal # cdrake - not *always* octal.. A12 + mtime => 1, # octal A12 + chksum => 1, # octal A8 + type => 0, # character A1 + linkname => 0, # string A100 + magic => 0, # string A6 + version => 0, # 2 bytes A2 + uname => 0, # string A32 + gname => 0, # string A32 + devmajor => 1, # octal A8 + devminor => 1, # octal A8 + prefix => 0, # A155 x 12 + +### end UNPACK items ### + raw => 0, # the raw data chunk + data => 0, # the data associated with the file -- + # This might be very memory intensive +]; + +### install get/set accessors for this object. +for ( my $i=0; $i[$i]; + no strict 'refs'; + *{__PACKAGE__."::$key"} = sub { + my $self = shift; + $self->{$key} = $_[0] if @_; + + ### just in case the key is not there or undef or something ### + { local $^W = 0; + return $self->{$key}; + } + } +} + +=head1 NAME + +Archive::Tar::File - a subclass for in-memory extracted file from Archive::Tar + +=head1 SYNOPSIS + + my @items = $tar->get_files; + + print $_->name, ' ', $_->size, "\n" for @items; + + print $object->get_content; + $object->replace_content('new content'); + + $object->rename( 'new/full/path/to/file.c' ); + +=head1 DESCRIPTION + +Archive::Tar::File provides a neat little object layer for in-memory +extracted files. It's mostly used internally in Archive::Tar to tidy +up the code, but there's no reason users shouldn't use this API as +well. + +=head2 Accessors + +A lot of the methods in this package are accessors to the various +fields in the tar header: + +=over 4 + +=item name + +The file's name + +=item mode + +The file's mode + +=item uid + +The user id owning the file + +=item gid + +The group id owning the file + +=item size + +File size in bytes + +=item mtime + +Modification time. Adjusted to mac-time on MacOS if required + +=item chksum + +Checksum field for the tar header + +=item type + +File type -- numeric, but comparable to exported constants -- see +Archive::Tar's documentation + +=item linkname + +If the file is a symlink, the file it's pointing to + +=item magic + +Tar magic string -- not useful for most users + +=item version + +Tar version string -- not useful for most users + +=item uname + +The user name that owns the file + +=item gname + +The group name that owns the file + +=item devmajor + +Device major number in case of a special file + +=item devminor + +Device minor number in case of a special file + +=item prefix + +Any directory to prefix to the extraction path, if any + +=item raw + +Raw tar header -- not useful for most users + +=back + +=head1 Methods + +=head2 Archive::Tar::File->new( file => $path ) + +Returns a new Archive::Tar::File object from an existing file. + +Returns undef on failure. + +=head2 Archive::Tar::File->new( data => $path, $data, $opt ) + +Returns a new Archive::Tar::File object from data. + +C<$path> defines the file name (which need not exist), C<$data> the +file contents, and C<$opt> is a reference to a hash of attributes +which may be used to override the default attributes (fields in the +tar header), which are described above in the Accessors section. + +Returns undef on failure. + +=head2 Archive::Tar::File->new( chunk => $chunk ) + +Returns a new Archive::Tar::File object from a raw 512-byte tar +archive chunk. + +Returns undef on failure. + +=cut + +sub new { + my $class = shift; + my $what = shift; + + my $obj = ($what eq 'chunk') ? __PACKAGE__->_new_from_chunk( @_ ) : + ($what eq 'file' ) ? __PACKAGE__->_new_from_file( @_ ) : + ($what eq 'data' ) ? __PACKAGE__->_new_from_data( @_ ) : + undef; + + return $obj; +} + +### copies the data, creates a clone ### +sub clone { + my $self = shift; + return bless { %$self }, ref $self; +} + +sub _new_from_chunk { + my $class = shift; + my $chunk = shift or return; # 512 bytes of tar header + my %hash = @_; + + ### filter any arguments on defined-ness of values. + ### this allows overriding from what the tar-header is saying + ### about this tar-entry. Particularly useful for @LongLink files + my %args = map { $_ => $hash{$_} } grep { defined $hash{$_} } keys %hash; + + ### makes it start at 0 actually... :) ### + my $i = -1; + my %entry = map { + my ($s,$v)=($tmpl->[++$i],$tmpl->[++$i]); # cdrake + ($_)=($_=~/^([^\0]*)/) unless($s eq 'size'); # cdrake + $s=> $v ? oct $_ : $_ # cdrake + # $tmpl->[++$i] => $tmpl->[++$i] ? oct $_ : $_ # removed by cdrake - mucks up binary sizes >8gb + } unpack( UNPACK, $chunk ); # cdrake + # } map { /^([^\0]*)/ } unpack( UNPACK, $chunk ); # old - replaced now by cdrake + + + if(substr($entry{'size'}, 0, 1) eq "\x80") { # binary size extension for files >8gigs (> octal 77777777777777) # cdrake + my @sz=unpack("aCSNN",$entry{'size'}); $entry{'size'}=$sz[4]+(2**32)*$sz[3]+$sz[2]*(2**64); # Use the low 80 bits (should use the upper 15 as well, but as at year 2011, that seems unlikely to ever be needed - the numbers are just too big...) # cdrake + } else { # cdrake + ($entry{'size'})=($entry{'size'}=~/^([^\0]*)/); $entry{'size'}=oct $entry{'size'}; # cdrake + } # cdrake + + + my $obj = bless { %entry, %args }, $class; + + ### magic is a filetype string.. it should have something like 'ustar' or + ### something similar... if the chunk is garbage, skip it + return unless $obj->magic !~ /\W/; + + ### store the original chunk ### + $obj->raw( $chunk ); + + $obj->type(FILE) if ( (!length $obj->type) or ($obj->type =~ /\W/) ); + $obj->type(DIR) if ( ($obj->is_file) && ($obj->name =~ m|/$|) ); + + + return $obj; + +} + +sub _new_from_file { + my $class = shift; + my $path = shift; + + ### path has to at least exist + return unless defined $path; + + my $type = __PACKAGE__->_filetype($path); + my $data = ''; + + READ: { + unless ($type == DIR ) { + my $fh = IO::File->new; + + unless( $fh->open($path, 'r') ) { + ### dangling symlinks are fine, stop reading but continue + ### creating the object + last READ if $type == SYMLINK; + + ### otherwise, return from this function -- + ### anything that's *not* a symlink should be + ### resolvable + return; + } + + ### binmode needed to read files properly on win32 ### + binmode $fh; + $data = do { local $/; <$fh> }; + close $fh; + } + } + + my @items = qw[mode uid gid size mtime]; + my %hash = map { shift(@items), $_ } (lstat $path)[2,4,5,7,9]; + + if (ON_VMS) { + ### VMS has two UID modes, traditional and POSIX. Normally POSIX is + ### not used. We currently do not have an easy way to see if we are in + ### POSIX mode. In traditional mode, the UID is actually the VMS UIC. + ### The VMS UIC has the upper 16 bits is the GID, which in many cases + ### the VMS UIC will be larger than 209715, the largest that TAR can + ### handle. So for now, assume it is traditional if the UID is larger + ### than 0x10000. + + if ($hash{uid} > 0x10000) { + $hash{uid} = $hash{uid} & 0xFFFF; + } + + ### The file length from stat() is the physical length of the file + ### However the amount of data read in may be more for some file types. + ### Fixed length files are read past the logical EOF to end of the block + ### containing. Other file types get expanded on read because record + ### delimiters are added. + + my $data_len = length $data; + $hash{size} = $data_len if $hash{size} < $data_len; + + } + ### you *must* set size == 0 on symlinks, or the next entry will be + ### though of as the contents of the symlink, which is wrong. + ### this fixes bug #7937 + $hash{size} = 0 if ($type == DIR or $type == SYMLINK); + $hash{mtime} -= TIME_OFFSET; + + ### strip the high bits off the mode, which we don't need to store + $hash{mode} = STRIP_MODE->( $hash{mode} ); + + + ### probably requires some file path munging here ... ### + ### name and prefix are set later + my $obj = { + %hash, + name => '', + chksum => CHECK_SUM, + type => $type, + linkname => ($type == SYMLINK and CAN_READLINK) + ? readlink $path + : '', + magic => MAGIC, + version => TAR_VERSION, + uname => UNAME->( $hash{uid} ), + gname => GNAME->( $hash{gid} ), + devmajor => 0, # not handled + devminor => 0, # not handled + prefix => '', + data => $data, + }; + + bless $obj, $class; + + ### fix up the prefix and file from the path + my($prefix,$file) = $obj->_prefix_and_file( $path ); + $obj->prefix( $prefix ); + $obj->name( $file ); + + return $obj; +} + +sub _new_from_data { + my $class = shift; + my $path = shift; return unless defined $path; + my $data = shift; return unless defined $data; + my $opt = shift; + + my $obj = { + data => $data, + name => '', + mode => MODE, + uid => UID, + gid => GID, + size => length $data, + mtime => time - TIME_OFFSET, + chksum => CHECK_SUM, + type => FILE, + linkname => '', + magic => MAGIC, + version => TAR_VERSION, + uname => UNAME->( UID ), + gname => GNAME->( GID ), + devminor => 0, + devmajor => 0, + prefix => '', + }; + + ### overwrite with user options, if provided ### + if( $opt and ref $opt eq 'HASH' ) { + for my $key ( keys %$opt ) { + + ### don't write bogus options ### + next unless exists $obj->{$key}; + $obj->{$key} = $opt->{$key}; + } + } + + bless $obj, $class; + + ### fix up the prefix and file from the path + my($prefix,$file) = $obj->_prefix_and_file( $path ); + $obj->prefix( $prefix ); + $obj->name( $file ); + + return $obj; +} + +sub _prefix_and_file { + my $self = shift; + my $path = shift; + + my ($vol, $dirs, $file) = File::Spec->splitpath( $path, $self->is_dir ); + my @dirs = File::Spec->splitdir( File::Spec->canonpath($dirs) ); + + ### if it's a directory, then $file might be empty + $file = pop @dirs if $self->is_dir and not length $file; + + ### splitting ../ gives you the relative path in native syntax + ### Remove the root (000000) directory + ### The volume from splitpath will also be in native syntax + if (ON_VMS) { + map { $_ = '..' if $_ eq '-'; $_ = '' if $_ eq '000000' } @dirs; + if (length($vol)) { + $vol = VMS::Filespec::unixify($vol); + unshift @dirs, $vol; + } + } + + my $prefix = File::Spec::Unix->catdir(@dirs); + return( $prefix, $file ); +} + +sub _filetype { + my $self = shift; + my $file = shift; + + return unless defined $file; + + return SYMLINK if (-l $file); # Symlink + + return FILE if (-f _); # Plain file + + return DIR if (-d _); # Directory + + return FIFO if (-p _); # Named pipe + + return SOCKET if (-S _); # Socket + + return BLOCKDEV if (-b _); # Block special + + return CHARDEV if (-c _); # Character special + + ### shouldn't happen, this is when making archives, not reading ### + return LONGLINK if ( $file eq LONGLINK_NAME ); + + return UNKNOWN; # Something else (like what?) + +} + +### this method 'downgrades' a file to plain file -- this is used for +### symlinks when FOLLOW_SYMLINKS is true. +sub _downgrade_to_plainfile { + my $entry = shift; + $entry->type( FILE ); + $entry->mode( MODE ); + $entry->linkname(''); + + return 1; +} + +=head2 $bool = $file->extract( [ $alternative_name ] ) + +Extract this object, optionally to an alternative name. + +See C<< Archive::Tar->extract_file >> for details. + +Returns true on success and false on failure. + +=cut + +sub extract { + my $self = shift; + + local $Carp::CarpLevel += 1; + + ### avoid circular use, so only require; + require Archive::Tar; + return Archive::Tar->_extract_file( $self, @_ ); +} + +=head2 $path = $file->full_path + +Returns the full path from the tar header; this is basically a +concatenation of the C and C fields. + +=cut + +sub full_path { + my $self = shift; + + ### if prefix field is empty + return $self->name unless defined $self->prefix and length $self->prefix; + + ### or otherwise, catfile'd + my $path = File::Spec::Unix->catfile( $self->prefix, $self->name ); + $path .= "/" if $self->name =~ m{/$}; # Re-add trailing slash if necessary, as catfile() strips them off. + return $path; +} + + +=head2 $bool = $file->validate + +Done by Archive::Tar internally when reading the tar file: +validate the header against the checksum to ensure integer tar file. + +Returns true on success, false on failure + +=cut + +sub validate { + my $self = shift; + + my $raw = $self->raw; + + ### don't know why this one is different from the one we /write/ ### + substr ($raw, 148, 8) = " "; + + ### bug #43513: [PATCH] Accept wrong checksums from SunOS and HP-UX tar + ### like GNU tar does. See here for details: + ### http://www.gnu.org/software/tar/manual/tar.html#SEC139 + ### so we do both a signed AND unsigned validate. if one succeeds, that's + ### good enough + return ( (unpack ("%16C*", $raw) == $self->chksum) + or (unpack ("%16c*", $raw) == $self->chksum)) ? 1 : 0; +} + +=head2 $bool = $file->has_content + +Returns a boolean to indicate whether the current object has content. +Some special files like directories and so on never will have any +content. This method is mainly to make sure you don't get warnings +for using uninitialized values when looking at an object's content. + +=cut + +sub has_content { + my $self = shift; + return defined $self->data() && length $self->data() ? 1 : 0; +} + +=head2 $content = $file->get_content + +Returns the current content for the in-memory file + +=cut + +sub get_content { + my $self = shift; + $self->data( ); +} + +=head2 $cref = $file->get_content_by_ref + +Returns the current content for the in-memory file as a scalar +reference. Normal users won't need this, but it will save memory if +you are dealing with very large data files in your tar archive, since +it will pass the contents by reference, rather than make a copy of it +first. + +=cut + +sub get_content_by_ref { + my $self = shift; + + return \$self->{data}; +} + +=head2 $bool = $file->replace_content( $content ) + +Replace the current content of the file with the new content. This +only affects the in-memory archive, not the on-disk version until +you write it. + +Returns true on success, false on failure. + +=cut + +sub replace_content { + my $self = shift; + my $data = shift || ''; + + $self->data( $data ); + $self->size( length $data ); + return 1; +} + +=head2 $bool = $file->rename( $new_name ) + +Rename the current file to $new_name. + +Note that you must specify a Unix path for $new_name, since per tar +standard, all files in the archive must be Unix paths. + +Returns true on success and false on failure. + +=cut + +sub rename { + my $self = shift; + my $path = shift; + + return unless defined $path; + + my ($prefix,$file) = $self->_prefix_and_file( $path ); + + $self->name( $file ); + $self->prefix( $prefix ); + + return 1; +} + +=head2 $bool = $file->chmod( $mode ) + +Change mode of $file to $mode. The mode can be a string or a number +which is interpreted as octal whether or not a leading 0 is given. + +Returns true on success and false on failure. + +=cut + +sub chmod { + my $self = shift; + my $mode = shift; return unless defined $mode && $mode =~ /^[0-7]{1,4}$/; + $self->{mode} = oct($mode); + return 1; +} + +=head2 $bool = $file->chown( $user [, $group]) + +Change owner of $file to $user. If a $group is given that is changed +as well. You can also pass a single parameter with a colon separating the +use and group as in 'root:wheel'. + +Returns true on success and false on failure. + +=cut + +sub chown { + my $self = shift; + my $uname = shift; + return unless defined $uname; + my $gname; + if (-1 != index($uname, ':')) { + ($uname, $gname) = split(/:/, $uname); + } else { + $gname = shift if @_ > 0; + } + + $self->uname( $uname ); + $self->gname( $gname ) if $gname; + return 1; +} + +=head1 Convenience methods + +To quickly check the type of a C object, you can +use the following methods: + +=over 4 + +=item $file->is_file + +Returns true if the file is of type C + +=item $file->is_dir + +Returns true if the file is of type C

+ +=item $file->is_hardlink + +Returns true if the file is of type C + +=item $file->is_symlink + +Returns true if the file is of type C + +=item $file->is_chardev + +Returns true if the file is of type C + +=item $file->is_blockdev + +Returns true if the file is of type C + +=item $file->is_fifo + +Returns true if the file is of type C + +=item $file->is_socket + +Returns true if the file is of type C + +=item $file->is_longlink + +Returns true if the file is of type C. +Should not happen after a successful C. + +=item $file->is_label + +Returns true if the file is of type C