-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathDataSection.java
More file actions
257 lines (219 loc) · 11 KB
/
DataSection.java
File metadata and controls
257 lines (219 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
package org.perlonjava.frontend.parser;
import org.perlonjava.app.cli.CompilerOptions;
import org.perlonjava.frontend.lexer.LexerToken;
import org.perlonjava.frontend.lexer.LexerTokenType;
import org.perlonjava.runtime.io.ScalarBackedIO;
import org.perlonjava.runtime.runtimetypes.GlobalVariable;
import org.perlonjava.runtime.runtimetypes.RuntimeIO;
import org.perlonjava.runtime.runtimetypes.RuntimeScalar;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.perlonjava.runtime.perlmodule.Strict.HINT_UTF8;
/**
 * Parser support for the Perl {@code __DATA__} / {@code __END__} markers.
 *
 * <p>The class creates an empty placeholder {@code PACKAGE::DATA} handle early in parsing and
 * later swaps the real content into that same handle once the marker is reached.
 *
 * <p>Bookkeeping is kept in two static {@link HashSet}s; this class adds no synchronization
 * around them.
 */
public class DataSection {
    /** Suffix appended to the current package name to form the DATA handle name. */
    private static final String DATA_HANDLE_SUFFIX = "::DATA";
    /** Marker whose trailing content always populates the DATA handle. */
    private static final String DATA_MARKER = "__DATA__";
    /** Marker that always stops parsing but only populates DATA for top-level scripts. */
    private static final String END_MARKER = "__END__";
    /** Text of a token holding a single ^D (EOT, code 4) character. */
    private static final String CTRL_D = String.valueOf((char) 4);
    /** Text of a token holding a single ^Z (SUB, code 26) character. */
    private static final String CTRL_Z = String.valueOf((char) 26);

    /**
     * Set of DATA handle names whose DATA section has already been populated.
     */
    private static final Set<String> processedPackages = new HashSet<>();

    /**
     * Set of DATA handle names for which a placeholder handle has already been created.
     */
    private static final Set<String> placeholderCreated = new HashSet<>();

    /**
     * Resets all static state for DataSection.
     * Called between test runs to prevent stale state from interfering.
     */
    public static void reset() {
        processedPackages.clear();
        placeholderCreated.clear();
    }

    /**
     * Creates a placeholder DATA filehandle for the current package early in parsing.
     * This ensures the DATA filehandle exists during BEGIN block execution.
     * Does nothing if a placeholder was already created for the package.
     *
     * @param parser the parser instance
     */
    public static void createPlaceholderDataHandle(Parser parser) {
        String handleName = dataHandleName(parser);
        // Set.add returns false when the name was already present, so a single call
        // both tests for and records the placeholder.
        if (!placeholderCreated.add(handleName)) {
            return;
        }
        if (CompilerOptions.DEBUG_ENABLED) {
            parser.ctx.logDebug("Creating placeholder DATA handle for package: " + handleName);
        }
        // Empty read-only handle; createDataHandle swaps in the real content later.
        RuntimeScalar emptyContent = new RuntimeScalar("");
        var fileHandle = RuntimeIO.open(emptyContent.createReference(), "<");
        GlobalVariable.getGlobalIO(handleName).setIO(fileHandle);
    }

    /**
     * Creates or updates the DATA filehandle of the current package.
     *
     * <p>If the handle already has a {@link RuntimeIO} (normally the placeholder), only its
     * backing {@code ioHandle} is replaced so the {@link RuntimeIO} object itself stays the
     * same; otherwise a new handle is opened and installed.
     *
     * @param parser  the parser instance
     * @param content the content after __DATA__ or __END__
     */
    public static void createDataHandle(Parser parser, String content) {
        String handleName = dataHandleName(parser);
        if (CompilerOptions.DEBUG_ENABLED) {
            parser.ctx.logDebug("Populating DATA handle for package: " + handleName + " with content: " + content);
        }

        RuntimeScalar contentScalar = new RuntimeScalar(content);
        RuntimeIO existingIO = GlobalVariable.getGlobalIO(handleName).getRuntimeIO();
        if (existingIO != null) {
            // Update in place instead of replacing the RuntimeIO, so that aliased handles
            // (like *ARGV = *DATA) continue to work.
            existingIO.ioHandle = new ScalarBackedIO(contentScalar);
            if (CompilerOptions.DEBUG_ENABLED) {
                parser.ctx.logDebug("Updated existing DATA handle with new content");
            }
            return;
        }

        // Fallback: no placeholder exists, so open and install a fresh handle.
        var fileHandle = RuntimeIO.open(contentScalar.createReference(), "<");
        GlobalVariable.getGlobalIO(handleName).setIO(fileHandle);
        if (CompilerOptions.DEBUG_ENABLED) {
            parser.ctx.logDebug("Created new DATA handle");
        }
    }

    /**
     * Builds the DATA handle name for the parser's current package.
     *
     * @param parser the parser instance
     * @return the current package name followed by {@code ::DATA}
     */
    private static String dataHandleName(Parser parser) {
        return parser.ctx.symbolTable.getCurrentPackage() + DATA_HANDLE_SUFFIX;
    }

    /**
     * Checks if a token represents an end-of-file marker.
     * This includes EOF tokens and STRING tokens consisting of ^D (EOT) or ^Z (SUB).
     *
     * @param token the token to check
     * @return true if the token is an end marker, false otherwise
     */
    private static boolean isEndMarker(LexerToken token) {
        if (token.type == LexerTokenType.EOF) {
            return true;
        }
        // Constant-first comparison so a token without text cannot throw.
        return token.type == LexerTokenType.STRING
                && (CTRL_D.equals(token.text) || CTRL_Z.equals(token.text));
    }

    /**
     * Extracts DATA section content from raw file bytes.
     *
     * <p>The marker must start a line (offset 0 or directly after a line feed) and must be
     * followed by a space, tab, CR, LF, or the end of the input. The content begins after
     * the marker, any spaces/tabs that follow it, and one line terminator (LF or CR LF).
     *
     * <p>The result maps each byte to one character (ISO-8859-1), so the raw bytes are
     * preserved; per the caller, UTF-8 decoding is left to the handle's IO layer.
     *
     * @param rawBytes   the raw file bytes (after BOM removal)
     * @param markerText the marker to search for ("__DATA__" or "__END__")
     * @return the DATA content as a string (Latin-1 encoded), or null if marker not found
     */
    private static String extractDataFromRawBytes(byte[] rawBytes, String markerText) {
        byte[] marker = markerText.getBytes(StandardCharsets.US_ASCII);
        int lastCandidate = rawBytes.length - marker.length;

        for (int lineStart = 0; lineStart <= lastCandidate; lineStart++) {
            boolean atLineStart = lineStart == 0 || rawBytes[lineStart - 1] == '\n';
            if (!atLineStart || !startsWithAt(rawBytes, lineStart, marker)) {
                continue;
            }

            // Reject matches that are only the prefix of a longer identifier.
            int afterMarker = lineStart + marker.length;
            if (afterMarker < rawBytes.length && !isMarkerTerminator(rawBytes[afterMarker])) {
                continue;
            }

            // Skip trailing spaces/tabs on the marker line.
            int dataStart = afterMarker;
            while (dataStart < rawBytes.length
                    && (rawBytes[dataStart] == ' ' || rawBytes[dataStart] == '\t')) {
                dataStart++;
            }
            // Skip the line terminator: an optional CR followed by an optional LF.
            if (dataStart < rawBytes.length && rawBytes[dataStart] == '\r') {
                dataStart++;
            }
            if (dataStart < rawBytes.length && rawBytes[dataStart] == '\n') {
                dataStart++;
            }

            // Latin-1 keeps a one-to-one byte/char mapping, preserving the raw bytes.
            return new String(rawBytes, dataStart, rawBytes.length - dataStart, StandardCharsets.ISO_8859_1);
        }
        return null; // Marker not found
    }

    /**
     * Tells whether {@code prefix} occurs in {@code bytes} starting at {@code offset}.
     * The caller guarantees that {@code offset + prefix.length <= bytes.length}.
     */
    private static boolean startsWithAt(byte[] bytes, int offset, byte[] prefix) {
        for (int k = 0; k < prefix.length; k++) {
            if (bytes[offset + k] != prefix[k]) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether a byte may directly follow a marker: LF, CR, space or tab. */
    private static boolean isMarkerTerminator(byte b) {
        return b == '\n' || b == '\r' || b == ' ' || b == '\t';
    }

    /**
     * Handles a {@code __DATA__} / {@code __END__} token: populates the DATA handle of the
     * current package when appropriate and tells the parser that all input is consumed.
     *
     * <p>{@code __DATA__} always populates the handle; {@code __END__} only does so when
     * {@code parser.isTopLevelScript} is set. Content is taken from the raw source bytes
     * when they are available and contain the marker, otherwise from the remaining tokens.
     *
     * <p>NOTE(review): perldata describes a top-level {@code __END__} as exposing its content
     * through {@code main::DATA}, whereas this method uses the current package for both
     * markers — confirm whether that difference is intended.
     *
     * @param parser     the parser instance
     * @param tokenIndex index of the marker token in {@code tokens}
     * @param tokens     the full token list being parsed
     * @param token      the marker token itself
     * @return always {@code tokens.size()}, i.e. everything has been consumed
     */
    static int parseDataSection(Parser parser, int tokenIndex, List<LexerToken> tokens, LexerToken token) {
        String handleName = dataHandleName(parser);

        // A package is normally populated once. Re-processing is allowed when the DATA handle
        // was closed (e.g. module re-required after delete $INC{...}), because modules like
        // ConfigData.pm close DATA after reading and expect a fresh handle on reload.
        if (processedPackages.contains(handleName)) {
            RuntimeIO existingIO = GlobalVariable.getGlobalIO(handleName).getRuntimeIO();
            if (existingIO != null && !(existingIO.ioHandle instanceof org.perlonjava.runtime.io.ClosedIOHandle)) {
                return tokens.size();
            }
            processedPackages.remove(handleName);
            placeholderCreated.remove(handleName);
        }

        // Constant-first comparisons: a token without text is simply "not a marker".
        boolean isDataToken = DATA_MARKER.equals(token.text);
        boolean isEndToken = END_MARKER.equals(token.text);
        if (!isDataToken && !isEndToken) {
            return tokens.size();
        }

        // __END__ always stops parsing, but only top-level scripts (and __DATA__) populate
        // the DATA handle content.
        boolean populateData = isDataToken || parser.isTopLevelScript;
        if (!populateData) {
            // Deliberately NOT recorded in processedPackages: nothing was populated, and
            // recording it would make a later __DATA__ of the same package be skipped.
            return tokens.size();
        }
        processedPackages.add(handleName);

        // Prefer the raw file bytes: they preserve non-UTF-8 bytes (e.g. Latin-1) that the
        // UTF-8 decoding of source files would corrupt.
        byte[] rawBytes = parser.ctx.compilerOptions.rawCodeBytes;
        boolean useUtf8 = parser.ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8);
        String content = (rawBytes == null) ? null : extractDataFromRawBytes(rawBytes, token.text);
        if (content == null) {
            // Fallback for eval/string-based code where raw bytes are not available.
            int contentStart = skipRestOfMarkerLine(tokens, tokenIndex + 1);
            content = joinTokensUntilEndMarker(tokens, contentStart);
        }
        createDataHandle(parser, content);

        // When `use utf8` is active, apply the :utf8 layer so bytes are decoded at read time.
        // NOTE(review): in the token fallback the content comes from lexer token text rather
        // than raw bytes — confirm that applying :utf8 there does not decode it twice.
        if (useUtf8) {
            RuntimeIO dataIO = GlobalVariable.getGlobalIO(handleName).getRuntimeIO();
            if (dataIO != null) {
                dataIO.binmode(":utf8");
            }
        }

        // Return tokens.size() to indicate we've consumed everything
        return tokens.size();
    }

    /**
     * Skips the WHITESPACE tokens and then at most one NEWLINE token that follow a marker.
     *
     * @param tokens the full token list
     * @param index  index of the first token after the marker
     * @return index of the first token that belongs to the DATA content
     */
    private static int skipRestOfMarkerLine(List<LexerToken> tokens, int index) {
        while (index < tokens.size() && tokens.get(index).type == LexerTokenType.WHITESPACE) {
            index++;
        }
        if (index < tokens.size() && tokens.get(index).type == LexerTokenType.NEWLINE) {
            index++;
        }
        return index;
    }

    /**
     * Concatenates token texts from {@code start} up to, but not including, the first end
     * marker (see {@link #isEndMarker}) or the end of the list.
     *
     * @param tokens the full token list
     * @param start  index of the first token to include
     * @return the concatenated text
     */
    private static String joinTokensUntilEndMarker(List<LexerToken> tokens, int start) {
        StringBuilder dataContent = new StringBuilder();
        for (int i = start; i < tokens.size(); i++) {
            LexerToken current = tokens.get(i);
            if (isEndMarker(current)) {
                break;
            }
            dataContent.append(current.text);
        }
        return dataContent.toString();
    }
}