-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCreate_long_dataset.py
More file actions
407 lines (327 loc) · 15 KB
/
Create_long_dataset.py
File metadata and controls
407 lines (327 loc) · 15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
from datasets import load_dataset, Dataset, IterableDataset, DatasetDict, IterableDatasetDict
from transformers import AutoTokenizer
from typing import List, Dict, Any, Optional, Union
import pandas as pd
class DatasetCreator:
    """
    Dataset creator for LongBench v2 and Super-NaturalInstructions datasets.

    Loads one of the two supported Hugging Face datasets (optionally in
    streaming mode), converts every example into a two-turn user/assistant
    conversation, applies the tokenizer's chat template, and returns a
    shuffled ``Dataset`` with a single ``text`` column.
    """

    # Dataset identifiers accepted by the ``dataset_name`` constructor argument.
    SUPPORTED_DATASETS = ("longbench", "supernatural")

    def __init__(self, tokenizer, dataset_name: str = "longbench", use_streaming: bool = True, seed: int = 42):
        """
        Initialize the DatasetCreator.

        Args:
            tokenizer: The tokenizer to use for applying chat templates.
            dataset_name: Either "longbench" or "supernatural" for dataset selection.
            use_streaming: Whether to use streaming for the dataset.
            seed: Random seed for shuffling the final dataset.

        Raises:
            ValueError: If ``dataset_name`` is not one of the supported datasets.
        """
        self.dataset_name = dataset_name.lower()
        # Validate up front so the per-dataset dispatch in later methods can
        # never fall through with an unexpected name.
        if self.dataset_name not in self.SUPPORTED_DATASETS:
            raise ValueError(f"Unsupported dataset_name: {dataset_name}. Must be 'longbench' or 'supernatural'")
        self.tokenizer = tokenizer
        self.use_streaming = use_streaming
        self.seed = seed
        # Annotation kept as a string so this assignment does not evaluate the
        # datasets types at runtime.
        self.dataset: "Optional[Union[Dataset, IterableDataset, DatasetDict, IterableDatasetDict]]" = None

    def load_dataset(self):
        """Load the configured dataset split, cache it on ``self.dataset``, and return it."""
        if self.dataset_name == "longbench":
            print("Loading LongBench v2 dataset...")
            self.dataset = load_dataset(
                "THUDM/LongBench-v2",
                split="train",
                streaming=self.use_streaming,
            )
        elif self.dataset_name == "supernatural":
            print("Loading Super-NaturalInstructions dataset...")
            self.dataset = load_dataset(
                "Muennighoff/natural-instructions",
                split="train",
                streaming=self.use_streaming,
            )
        print(f"Dataset loaded with streaming={'enabled' if self.use_streaming else 'disabled'}")
        return self.dataset

    def format_single_choice_question(self, example: Dict[str, Any]) -> str:
        """
        Format a LongBench v2 example into a single choice question string.

        Args:
            example: Single example from LongBench v2 dataset (must contain "question").

        Returns:
            Formatted question string.
        """
        question = example["question"]
        return f"""Question: {question}
Please provide your answer."""

    def format_supernatural_task(self, example: Dict[str, Any]) -> str:
        """
        Format a Super-NaturalInstructions example into an instruction string.

        Args:
            example: Single example from the dataset (must contain "definition" and "inputs").

        Returns:
            Formatted instruction string.
        """
        definition = example["definition"]
        inputs = example["inputs"]
        return f"""{definition}
Input: {inputs}
Please provide your answer."""

    def create_conversation_from_example(self, example: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Convert an example into conversation format.

        Args:
            example: Single example from the dataset.

        Returns:
            Conversation in the format expected by chat templates.

        Raises:
            ValueError: If ``self.dataset_name`` is unsupported (defensive;
                the constructor already validates it).
        """
        if self.dataset_name == "longbench":
            return self._create_longbench_conversation(example)
        elif self.dataset_name == "supernatural":
            return self._create_supernatural_conversation(example)
        # BUG FIX: the original fell through and implicitly returned None here.
        raise ValueError(f"Unsupported dataset_name: {self.dataset_name}")

    def _create_longbench_conversation(self, example: Dict[str, Any]) -> List[Dict[str, str]]:
        """Create conversation format for LongBench v2 examples."""
        context = example["context"]
        question = self.format_single_choice_question(example)
        answer = example["answer"]
        # Map the answer letter (A-D) to the actual answer text; fall back to
        # the raw letter if it is not one of the four choices.
        answer_mapping = {
            'A': example["choice_A"],
            'B': example["choice_B"],
            'C': example["choice_C"],
            'D': example["choice_D"],
        }
        answer_text = answer_mapping.get(answer, answer)
        # User turn carries the long context plus the question; assistant turn
        # is the gold answer text.
        user_content = f"""Context:
{context}
{question}"""
        return [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": answer_text},
        ]

    def _create_supernatural_conversation(self, example: Dict[str, Any]) -> List[Dict[str, str]]:
        """Create conversation format for Super-NaturalInstructions examples."""
        # User turn is the formatted instruction; assistant turn is the target output.
        instruction = self.format_supernatural_task(example)
        targets = example["targets"]
        return [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": targets},
        ]

    def process_examples_batch(self, examples: Dict[str, List[Any]]) -> Dict[str, List[str]]:
        """
        Process a batch of examples (column-oriented dict) into conversation format.

        Args:
            examples: Batch of examples from the dataset, keyed by column name.

        Returns:
            Dictionary with a "conversations" list, one conversation per row.
        """
        # Each dataset exposes a differently-named id column; use it to learn
        # the batch length. Constructor validation guarantees one of the two.
        id_key = "_id" if self.dataset_name == "longbench" else "id"
        num_examples = len(examples[id_key])
        # Re-assemble each row from the column-oriented batch and convert it.
        conversations = [
            self.create_conversation_from_example(
                {key: values[i] for key, values in examples.items()}
            )
            for i in range(num_examples)
        ]
        return {"conversations": conversations}

    def apply_chat_template_to_conversations(self, conversations: List[List[Dict[str, str]]]) -> List[str]:
        """
        Apply chat template to a list of conversations.

        Args:
            conversations: List of conversations.

        Returns:
            List of formatted text strings (one per conversation).
        """
        try:
            # Fast path: let the tokenizer template the whole batch at once.
            formatted_texts = self.tokenizer.apply_chat_template(
                conversations,
                tokenize=False,
                add_generation_prompt=False,
            )
            return formatted_texts
        except Exception as e:
            print(f"Error applying chat template: {e}")
            # Fallback: process conversations individually so one bad
            # conversation does not discard the whole batch.
            formatted_texts = []
            for conversation in conversations:
                try:
                    formatted_text = self.tokenizer.apply_chat_template(
                        conversation,
                        tokenize=False,
                        add_generation_prompt=False,
                    )
                    formatted_texts.append(formatted_text)
                except Exception as inner_e:
                    print(f"Error processing individual conversation: {inner_e}")
                    # Last resort: plain concatenation of the two turns.
                    formatted_text = f"{conversation[0]['content']}\n\n{conversation[1]['content']}"
                    formatted_texts.append(formatted_text)
            return formatted_texts

    def create_dataset(self, max_examples: Optional[int] = None) -> "Dataset":
        """
        Create the complete dataset with chat templates applied.

        Args:
            max_examples: Maximum number of examples to process (for testing/debugging).
                ``None`` means process everything.

        Returns:
            Processed, shuffled dataset with a single "text" column.

        Raises:
            ValueError: If the underlying dataset could not be loaded.
        """
        if self.dataset is None:
            self.load_dataset()
        if self.dataset is None:
            raise ValueError("Failed to load dataset")
        print(f"Processing {self.dataset_name} dataset...")
        if self.use_streaming:
            # Streaming datasets must be materialized before batched .map().
            examples_list = []
            count = 0
            for example in self.dataset:
                examples_list.append(example)
                count += 1
                # BUG FIX: the original tested truthiness, so max_examples=0
                # was silently ignored; compare against None explicitly.
                if max_examples is not None and count >= max_examples:
                    break
                if count % 1000 == 0:
                    print(f"Processed {count} examples...")
            print(f"Collected {len(examples_list)} examples from streaming dataset")
            regular_dataset = Dataset.from_list(examples_list)
        else:
            if isinstance(self.dataset, Dataset):
                regular_dataset = self.dataset
                if max_examples is not None:
                    regular_dataset = regular_dataset.select(range(min(max_examples, len(regular_dataset))))
            else:
                # Any other dataset type: materialize, truncate, rebuild.
                examples_list = list(self.dataset)
                if max_examples is not None:
                    examples_list = examples_list[:max_examples]
                regular_dataset = Dataset.from_list(examples_list)
        print("Converting examples to conversations...")
        # Drop every original column so only "conversations" remains.
        columns_to_remove = regular_dataset.column_names
        conversation_dataset = regular_dataset.map(
            self.process_examples_batch,
            batched=True,
            batch_size=50,  # Process in smaller batches to manage memory
            remove_columns=columns_to_remove,
        )
        print("Applying chat templates...")
        conversations = conversation_dataset["conversations"]
        # Template in chunks to bound peak memory.
        chunk_size = 100
        all_formatted_texts = []
        for i in range(0, len(conversations), chunk_size):
            chunk = conversations[i:i + chunk_size]
            all_formatted_texts.extend(self.apply_chat_template_to_conversations(chunk))
            if (i // chunk_size + 1) % 10 == 0:
                print(f"Processed {i + len(chunk)} conversations...")
        print(f"Generated {len(all_formatted_texts)} formatted texts")
        final_dataset = Dataset.from_dict({"text": all_formatted_texts})
        final_dataset = final_dataset.shuffle(seed=self.seed)
        print(f"Created final dataset with {len(final_dataset)} examples")
        return final_dataset

    def get_dataset_info(self) -> Dict[str, Any]:
        """
        Get information about the selected dataset.

        Returns:
            Dictionary with static dataset information; includes
            "total_examples" when a non-streaming dataset supports len().
        """
        if self.dataset is None:
            self.load_dataset()
        if self.dataset_name == "longbench":
            info = {
                "name": "LongBench v2",
                "description": "Challenging long-context benchmark with 503 multiple-choice questions",
                "streaming": self.use_streaming,
                "tasks": [
                    "single-document QA",
                    "multi-document QA",
                    "long in-context learning",
                    "long-dialogue history understanding",
                    "code repo understanding",
                    "long structured data understanding",
                ],
                "context_length_range": "8k to 2M words",
                "format": "multiple-choice questions",
            }
        else:
            # Constructor validation guarantees "supernatural" here.
            info = {
                "name": "Super-NaturalInstructions",
                "description": "Large collection of diverse NLP tasks with natural language instructions",
                "streaming": self.use_streaming,
                "tasks": [
                    "text classification",
                    "question answering",
                    "text generation",
                    "reading comprehension",
                    "sentiment analysis",
                    "named entity recognition",
                    "and many more",
                ],
                "format": "instruction-following tasks",
            }
        if not self.use_streaming and self.dataset is not None:
            # Only Dataset/DatasetDict support len(); iterable variants do not.
            if isinstance(self.dataset, (Dataset, DatasetDict)):
                info["total_examples"] = len(self.dataset)
            else:
                info["total_examples"] = "Unknown (streaming or iterable dataset type)"
        return info
# Example usage:
if __name__ == "__main__":
    # Load tokenizer; fall back to EOS as pad token when none is defined.
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # BUG FIX: the original instantiated the undefined name
    # ``LongBenchV2DatasetCreator`` (NameError); the class defined in this
    # file is ``DatasetCreator``.
    creator_longbench = DatasetCreator(
        tokenizer=tokenizer,
        dataset_name="longbench",  # Use LongBench v2
        use_streaming=True,
        seed=42,
    )
    creator_supernatural = DatasetCreator(
        tokenizer=tokenizer,
        dataset_name="supernatural",  # Use Super-NaturalInstructions
        use_streaming=True,
        seed=42,
    )

    # Get dataset info for both
    print("LongBench v2 Dataset Info:")
    info_longbench = creator_longbench.get_dataset_info()
    for key, value in info_longbench.items():
        print(f"  {key}: {value}")
    print("\nSuper-NaturalInstructions Dataset Info:")
    info_supernatural = creator_supernatural.get_dataset_info()
    for key, value in info_supernatural.items():
        print(f"  {key}: {value}")

    # Create datasets (use max_examples for testing)
    print("\nCreating LongBench v2 dataset:")
    dataset_longbench = creator_longbench.create_dataset(max_examples=5)
    print(f"Sample from LongBench v2 dataset:")
    print(dataset_longbench[0]["text"][:500] + "...")
    print("\nCreating Super-NaturalInstructions dataset:")
    dataset_supernatural = creator_supernatural.create_dataset(max_examples=5)
    print(f"Sample from Super-NaturalInstructions dataset:")
    print(dataset_supernatural[0]["text"][:500] + "...")