-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_completion_datasets.py
More file actions
392 lines (339 loc) · 17.4 KB
/
create_completion_datasets.py
File metadata and controls
392 lines (339 loc) · 17.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
#!/usr/bin/env python3
"""
Create code completion datasets from tokenized files.
Supports token-level and line-level completion tasks.
"""
import os
import argparse
import json
from typing import List, Tuple
# tqdm is an optional dependency: when it is missing, fall back to a no-op
# wrapper so the rest of the script can call ``tqdm(...)`` unconditionally.
try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

    def tqdm(iterable, **kwargs):
        """Return ``iterable`` unchanged; keyword arguments are ignored."""
        return iterable
def load_tokenized_file(file_path: str, limit: int = None) -> List[List[str]]:
    """Load a tokenized file and return its token sequences.

    Each non-blank line is split on whitespace into one sequence. A leading
    ``<s>`` and a trailing ``</s>`` token are removed, and lines that end up
    with no tokens are skipped.

    Args:
        file_path: Path to a UTF-8 text file.
        limit: Maximum number of sequences to return. A falsy value
            (``None`` or ``0``) means no limit.

    Returns:
        A list of token lists, in file order.
    """
    sequences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Only kept sequences count towards the limit.
            if limit and len(sequences) >= limit:
                break

            # split() with no arguments also discards surrounding whitespace,
            # so a blank line simply yields an empty token list.
            tokens = line.split()

            # Remove <s> and </s> markers for processing
            if tokens and tokens[0] == "<s>":
                tokens = tokens[1:]
            if tokens and tokens[-1] == "</s>":
                tokens = tokens[:-1]

            if tokens:  # Only add non-empty sequences
                sequences.append(tokens)
    return sequences
def create_token_level_dataset(sequences: List[List[str]], max_length: int = 256) -> List[Tuple[str, str]]:
    """Create a token-level dataset for next-token prediction.

    For every sequence, each token after the first becomes a target whose
    context is the tokens preceding it, joined with single spaces.

    Args:
        sequences: Token sequences.
        max_length: Maximum example length counting the target, so at most
            ``max_length - 1`` context tokens are kept (the ones closest to
            the target). Values of 1 or less produce an empty context.

    Returns:
        A list of ``(context, target_token)`` tuples.
    """
    # Number of context tokens to keep. Clamp at 0: a negative or zero window
    # used as ``context[-window:]`` would silently return the wrong slice.
    window = max(max_length - 1, 0)

    examples = []
    for tokens in sequences:
        for i in range(1, len(tokens)):
            # Slice the window directly instead of copying the whole prefix
            # and truncating it afterwards.
            context = tokens[max(0, i - window):i]
            examples.append((" ".join(context), tokens[i]))
    return examples
def split_into_lines(tokens: List[str]) -> List[List[str]]:
    """Split a token sequence into lines at ``<EOL>`` markers.

    The ``<EOL>`` tokens themselves are dropped, and empty lines (consecutive
    or leading/trailing markers) are not returned.

    Args:
        tokens: Flat token sequence.

    Returns:
        A list of non-empty token lists, one per line.
    """
    lines = []
    current_line = []
    for token in tokens:
        if token != "<EOL>":
            current_line.append(token)
            continue
        if current_line:  # Only add non-empty lines
            lines.append(current_line)
            current_line = []

    # The last line has no terminating <EOL>; add it if it has tokens.
    if current_line:
        lines.append(current_line)
    return lines
def create_line_level_dataset(sequences: List[List[str]],
                              min_prefix_length: int = 3,
                              max_prefix_ratio: float = 0.8,
                              examples_per_line: int = 1) -> List[Tuple[str, str, str]]:
    """Create a line-level dataset for line completion.

    Each sequence is split into lines with ``split_into_lines``. Every line
    with more than ``min_prefix_length`` tokens is cut at one or more randomly
    chosen split points into a prefix and a non-empty suffix.

    Args:
        sequences: Token sequences containing ``<EOL>`` line markers.
        min_prefix_length: Minimum number of tokens in the prefix. Negative
            values are treated as 0.
        max_prefix_ratio: Upper bound on the prefix length as a fraction of
            the line length (never lower than ``min_prefix_length``).
        examples_per_line: Number of distinct split points to sample per
            line; capped at the number of valid split points.

    Returns:
        A list of ``(previous_lines, prefix, suffix)`` tuples where
        ``previous_lines`` is all earlier lines of the sequence joined with
        ``" <EOL> "``, ``prefix`` is the start of the current line and
        ``suffix`` is the rest of it. Tokens are joined with single spaces.
    """
    import random

    # A negative minimum would produce negative split points, which slice
    # from the end of the line instead of marking a prefix length.
    min_prefix = max(min_prefix_length, 0)

    examples = []
    for tokens in sequences:
        lines = split_into_lines(tokens)
        # Join each line once, rather than re-joining every previous line
        # for each line of the sequence.
        line_strs = [" ".join(line) for line in lines]

        for line_idx, line in enumerate(lines):
            if len(line) < min_prefix + 1:
                continue  # Skip lines that are too short

            # Prefix length: between min_prefix and max_prefix_ratio of the
            # line length, and always short enough to leave a suffix token.
            max_prefix = max(min_prefix, int(len(line) * max_prefix_ratio))
            valid_split_points = list(range(min_prefix, min(max_prefix + 1, len(line))))

            # Use random split points for diversity
            if examples_per_line == 1:
                split_points = [random.choice(valid_split_points)]
            else:
                split_points = random.sample(
                    valid_split_points,
                    min(examples_per_line, len(valid_split_points)),
                )

            previous_context = " <EOL> ".join(line_strs[:line_idx])
            for split_point in split_points:
                prefix_str = " ".join(line[:split_point])
                suffix_str = " ".join(line[split_point:])
                examples.append((previous_context, prefix_str, suffix_str))
    return examples
def save_token_level_dataset(examples: List[Tuple[str, str]], output_file: str):
    """Save a token-level dataset in JSONL format.

    Args:
        examples: ``(context, target)`` pairs.
        output_file: Path of the file to write (overwritten if it exists).
            One JSON object with keys ``"context"`` and ``"target"`` is
            written per line, UTF-8 encoded without ASCII escaping.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps({"context": context, "target": target}, ensure_ascii=False) + "\n"
            for context, target in examples
        )
def save_line_level_dataset(examples: List[Tuple[str, str, str]], output_file: str):
    """Save a line-level dataset in JSONL format.

    Args:
        examples: ``(previous_lines, prefix, suffix)`` triples.
        output_file: Path of the file to write (overwritten if it exists).
            One JSON object with keys ``"previous_lines"``, ``"prefix"`` and
            ``"suffix"`` is written per line, UTF-8 encoded without ASCII
            escaping.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(
                {
                    "previous_lines": previous_lines,
                    "prefix": prefix,
                    "suffix": suffix,
                },
                ensure_ascii=False,
            ) + "\n"
            for previous_lines, prefix, suffix in examples
        )
def _parse_token_line(line: str) -> List[str]:
    """Split one input line into tokens, dropping a leading <s> and trailing </s>."""
    tokens = line.split()
    if tokens and tokens[0] == "<s>":
        tokens = tokens[1:]
    if tokens and tokens[-1] == "</s>":
        tokens = tokens[:-1]
    return tokens


def _count_lines(path: str):
    """Return the number of lines in ``path``, or None if it cannot be read."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return sum(1 for _ in f)
    except (OSError, UnicodeError):
        # The count only feeds the progress bar total, so a failure here is
        # not fatal; the bar is shown without a total instead.
        return None


def _token_level_record(example: Tuple[str, str]) -> dict:
    """Convert a (context, target) example into its JSONL record."""
    context, target = example
    return {"context": context, "target": target}


def _line_level_record(example: Tuple[str, str, str]) -> dict:
    """Convert a (previous_lines, prefix, suffix) example into its JSONL record."""
    previous_lines, prefix, suffix = example
    return {
        "previous_lines": previous_lines,
        "prefix": prefix,
        "suffix": suffix
    }


def _write_examples(out, examples, to_record, write_batch_size: int) -> None:
    """Serialize ``examples`` as JSONL to ``out``, writing in batches of ``write_batch_size`` lines."""
    write_batch = []
    for example in examples:
        write_batch.append(json.dumps(to_record(example), ensure_ascii=False) + "\n")
        if len(write_batch) >= write_batch_size:
            out.writelines(write_batch)
            write_batch = []
    if write_batch:
        out.writelines(write_batch)


def _build_split_dataset(input_file: str, output_file: str, build_examples, to_record, *,
                         limit, chunk_size: int, write_batch_size: int,
                         total_lines, show_progress: bool, desc: str) -> Tuple[int, int]:
    """Stream ``input_file`` in chunks of sequences and write the derived examples to ``output_file``.

    ``build_examples`` maps a list of token sequences to a list of examples and
    ``to_record`` maps one example to a JSON-serializable dict.
    Returns ``(total_examples, sequence_count)``.
    """
    total_examples = 0
    sequence_count = 0
    chunk = []

    with open(output_file, 'w', encoding='utf-8') as out, \
            open(input_file, 'r', encoding='utf-8') as infile:
        iterator = tqdm(infile, total=total_lines, desc=desc,
                        disable=not show_progress, unit="lines")

        for line in iterator:
            # A falsy limit (None or 0) means "process everything".
            if limit and sequence_count >= limit:
                break

            tokens = _parse_token_line(line)
            if not tokens:
                continue

            chunk.append(tokens)
            sequence_count += 1

            # Process chunk when it reaches chunk_size
            if len(chunk) >= chunk_size:
                examples = build_examples(chunk)
                _write_examples(out, examples, to_record, write_batch_size)
                total_examples += len(examples)
                chunk = []  # Clear chunk

                # show_progress is only True when the real tqdm is available,
                # so set_postfix exists on the iterator here.
                if show_progress:
                    iterator.set_postfix({"examples": f"{total_examples:,}"})

        # Process remaining chunk
        if chunk:
            examples = build_examples(chunk)
            _write_examples(out, examples, to_record, write_batch_size)
            total_examples += len(examples)

    return total_examples, sequence_count


def main():
    """Parse command-line arguments and build the requested completion datasets.

    For each of the ``train``, ``dev`` and ``test`` splits found in
    ``--input_dir``, writes ``token_level/<split>.jsonl`` and/or
    ``line_level/<split>.jsonl`` under ``--output_dir``. Missing input files
    are reported and skipped.
    """
    parser = argparse.ArgumentParser(description="Create code completion datasets from tokenized files")
    parser.add_argument("--input_dir", type=str, default="token_completion",
                        help="Directory containing tokenized files (train.txt, dev.txt, test.txt)")
    parser.add_argument("--output_dir", type=str, default="completion_datasets",
                        help="Output directory for completion datasets")
    parser.add_argument("--max_length", type=int, default=256,
                        help="Maximum sequence length for token-level dataset")
    parser.add_argument("--min_prefix_length", type=int, default=3,
                        help="Minimum prefix length for line-level dataset")
    parser.add_argument("--max_prefix_ratio", type=float, default=0.8,
                        help="Maximum prefix ratio (0-1) for line-level dataset")
    parser.add_argument("--examples_per_line", type=int, default=1,
                        help="Number of examples to create per line (default: 1)")
    # If neither flag is provided, default to creating BOTH datasets (backwards compatible).
    parser.add_argument("--token_level", action="store_true", default=False,
                        help="Create token-level dataset")
    parser.add_argument("--line_level", action="store_true", default=False,
                        help="Create line-level dataset")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of sequences to process (for testing)")
    parser.add_argument("--chunk_size", type=int, default=5000,
                        help="Process sequences in chunks to save memory (default: 5000, increase for faster processing on Colab)")
    parser.add_argument("--write_batch_size", type=int, default=10000,
                        help="Batch size for writing JSONL lines (default: 10000, larger = faster but more memory)")
    parser.add_argument("--disable_progress", action="store_true",
                        help="Disable progress bars (useful if redirecting output)")

    args = parser.parse_args()

    # Backwards-compatible default: if user didn't choose, create both.
    if not args.token_level and not args.line_level:
        args.token_level = True
        args.line_level = True

    # Create output directories
    os.makedirs(args.output_dir, exist_ok=True)
    if args.token_level:
        os.makedirs(os.path.join(args.output_dir, "token_level"), exist_ok=True)
    if args.line_level:
        os.makedirs(os.path.join(args.output_dir, "line_level"), exist_ok=True)

    # Check which input files exist before processing
    print(f"\nChecking for input files in: {args.input_dir}")
    expected_files = ["train.txt", "dev.txt", "test.txt"]
    missing_files = []
    for split_file in expected_files:
        full_path = os.path.join(args.input_dir, split_file)
        if os.path.exists(full_path):
            print(f" ✓ Found: {split_file}")
        else:
            print(f" ✗ Missing: {split_file}")
            missing_files.append(split_file)

    if missing_files:
        print(f"\nWarning: {len(missing_files)} file(s) missing. These will be skipped.")
        print("Make sure preprocess.py completed successfully and created all files.")

    # Progress bars need both the user's consent and the real tqdm package.
    show_progress = not args.disable_progress and HAS_TQDM

    # Process each split
    for split in ["train", "dev", "test"]:
        input_file = os.path.join(args.input_dir, f"{split}.txt")

        if not os.path.exists(input_file):
            print(f"\nSkipping {split} (file not found: {input_file})")
            continue

        print(f"\nProcessing {split}...")

        # Count lines once per split; the total is only used by the progress
        # bars, so skip the extra pass over the file when they are disabled.
        total_lines = _count_lines(input_file) if show_progress else None

        # Token-level dataset
        if args.token_level:
            print(" Creating token-level dataset...")
            output_file = os.path.join(args.output_dir, "token_level", f"{split}.jsonl")
            total_examples, sequence_count = _build_split_dataset(
                input_file,
                output_file,
                lambda chunk: create_token_level_dataset(chunk, args.max_length),
                _token_level_record,
                limit=args.limit,
                chunk_size=args.chunk_size,
                write_batch_size=args.write_batch_size,
                total_lines=total_lines,
                show_progress=show_progress,
                desc=f" Token-level {split}",
            )
            print(f" Created {total_examples:,} token-level examples from {sequence_count} sequences -> {output_file}")

        # Line-level dataset - the input file is streamed a second time
        if args.line_level:
            print(" Creating line-level dataset...")
            output_file = os.path.join(args.output_dir, "line_level", f"{split}.jsonl")
            total_examples, sequence_count = _build_split_dataset(
                input_file,
                output_file,
                lambda chunk: create_line_level_dataset(
                    chunk,
                    args.min_prefix_length,
                    args.max_prefix_ratio,
                    args.examples_per_line
                ),
                _line_level_record,
                limit=args.limit,
                chunk_size=args.chunk_size,
                write_batch_size=args.write_batch_size,
                total_lines=total_lines,
                show_progress=show_progress,
                desc=f" Line-level {split}",
            )
            print(f" Created {total_examples:,} line-level examples from {sequence_count} sequences -> {output_file}")

    print("\nDone!")


if __name__ == "__main__":
    main()