-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCreate_long_dataset.py
More file actions
407 lines (327 loc) · 15 KB
/
Create_long_dataset.py
File metadata and controls
407 lines (327 loc) · 15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
from datasets import load_dataset, Dataset, IterableDataset, DatasetDict, IterableDatasetDict
from transformers import AutoTokenizer
from typing import List, Dict, Any, Optional, Union
import pandas as pd
class DatasetCreator:
    """
    Dataset creator for LongBench v2 and Super-NaturalInstructions datasets.

    Loads one of the two supported Hugging Face datasets (optionally in
    streaming mode), converts every example into a two-turn user/assistant
    conversation, applies the tokenizer's chat template, and returns a
    shuffled ``Dataset`` with a single ``text`` column.
    """

    # Dataset identifiers accepted by the ``dataset_name`` constructor argument.
    SUPPORTED_DATASETS = ("longbench", "supernatural")

    def __init__(self, tokenizer, dataset_name: str = "longbench", use_streaming: bool = True, seed: int = 42):
        """
        Initialize the DatasetCreator.

        Args:
            tokenizer: The tokenizer to use for applying chat templates.
            dataset_name: Either "longbench" or "supernatural" for dataset selection.
            use_streaming: Whether to use streaming for the dataset.
            seed: Random seed for shuffling the final dataset.

        Raises:
            ValueError: If ``dataset_name`` is not one of the supported datasets.
        """
        self.dataset_name = dataset_name.lower()
        # Validate up front so the per-dataset dispatch in later methods can
        # never fall through with an unexpected name.
        if self.dataset_name not in self.SUPPORTED_DATASETS:
            raise ValueError(f"Unsupported dataset_name: {dataset_name}. Must be 'longbench' or 'supernatural'")
        self.tokenizer = tokenizer
        self.use_streaming = use_streaming
        self.seed = seed
        # Annotation kept as a string so this assignment does not evaluate the
        # datasets types at runtime.
        self.dataset: "Optional[Union[Dataset, IterableDataset, DatasetDict, IterableDatasetDict]]" = None

    def load_dataset(self):
        """Load the configured dataset split, cache it on ``self.dataset``, and return it."""
        if self.dataset_name == "longbench":
            print("Loading LongBench v2 dataset...")
            self.dataset = load_dataset(
                "THUDM/LongBench-v2",
                split="train",
                streaming=self.use_streaming,
            )
        elif self.dataset_name == "supernatural":
            print("Loading Super-NaturalInstructions dataset...")
            self.dataset = load_dataset(
                "Muennighoff/natural-instructions",
                split="train",
                streaming=self.use_streaming,
            )
        print(f"Dataset loaded with streaming={'enabled' if self.use_streaming else 'disabled'}")
        return self.dataset

    def format_single_choice_question(self, example: Dict[str, Any]) -> str:
        """
        Format a LongBench v2 example into a single choice question string.

        Args:
            example: Single example from LongBench v2 dataset (must contain "question").

        Returns:
            Formatted question string.
        """
        question = example["question"]
        return f"""Question: {question}
Please provide your answer."""

    def format_supernatural_task(self, example: Dict[str, Any]) -> str:
        """
        Format a Super-NaturalInstructions example into an instruction string.

        Args:
            example: Single example from the dataset (must contain "definition" and "inputs").

        Returns:
            Formatted instruction string.
        """
        definition = example["definition"]
        inputs = example["inputs"]
        return f"""{definition}
Input: {inputs}
Please provide your answer."""

    def create_conversation_from_example(self, example: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Convert an example into conversation format.

        Args:
            example: Single example from the dataset.

        Returns:
            Conversation in the format expected by chat templates.

        Raises:
            ValueError: If ``self.dataset_name`` is unsupported (defensive;
                the constructor already validates it).
        """
        if self.dataset_name == "longbench":
            return self._create_longbench_conversation(example)
        elif self.dataset_name == "supernatural":
            return self._create_supernatural_conversation(example)
        # BUG FIX: the original fell through and implicitly returned None here.
        raise ValueError(f"Unsupported dataset_name: {self.dataset_name}")

    def _create_longbench_conversation(self, example: Dict[str, Any]) -> List[Dict[str, str]]:
        """Create conversation format for LongBench v2 examples."""
        context = example["context"]
        question = self.format_single_choice_question(example)
        answer = example["answer"]
        # Map the answer letter (A-D) to the actual answer text; fall back to
        # the raw letter if it is not one of the four choices.
        answer_mapping = {
            'A': example["choice_A"],
            'B': example["choice_B"],
            'C': example["choice_C"],
            'D': example["choice_D"],
        }
        answer_text = answer_mapping.get(answer, answer)
        # User turn carries the long context plus the question; assistant turn
        # is the gold answer text.
        user_content = f"""Context:
{context}
{question}"""
        return [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": answer_text},
        ]

    def _create_supernatural_conversation(self, example: Dict[str, Any]) -> List[Dict[str, str]]:
        """Create conversation format for Super-NaturalInstructions examples."""
        # User turn is the formatted instruction; assistant turn is the target output.
        instruction = self.format_supernatural_task(example)
        targets = example["targets"]
        return [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": targets},
        ]

    def process_examples_batch(self, examples: Dict[str, List[Any]]) -> Dict[str, List[str]]:
        """
        Process a batch of examples (column-oriented dict) into conversation format.

        Args:
            examples: Batch of examples from the dataset, keyed by column name.

        Returns:
            Dictionary with a "conversations" list, one conversation per row.
        """
        # Each dataset exposes a differently-named id column; use it to learn
        # the batch length. Constructor validation guarantees one of the two.
        id_key = "_id" if self.dataset_name == "longbench" else "id"
        num_examples = len(examples[id_key])
        # Re-assemble each row from the column-oriented batch and convert it.
        conversations = [
            self.create_conversation_from_example(
                {key: values[i] for key, values in examples.items()}
            )
            for i in range(num_examples)
        ]
        return {"conversations": conversations}

    def apply_chat_template_to_conversations(self, conversations: List[List[Dict[str, str]]]) -> List[str]:
        """
        Apply chat template to a list of conversations.

        Args:
            conversations: List of conversations.

        Returns:
            List of formatted text strings (one per conversation).
        """
        try:
            # Fast path: let the tokenizer template the whole batch at once.
            formatted_texts = self.tokenizer.apply_chat_template(
                conversations,
                tokenize=False,
                add_generation_prompt=False,
            )
            return formatted_texts
        except Exception as e:
            print(f"Error applying chat template: {e}")
            # Fallback: process conversations individually so one bad
            # conversation does not discard the whole batch.
            formatted_texts = []
            for conversation in conversations:
                try:
                    formatted_text = self.tokenizer.apply_chat_template(
                        conversation,
                        tokenize=False,
                        add_generation_prompt=False,
                    )
                    formatted_texts.append(formatted_text)
                except Exception as inner_e:
                    print(f"Error processing individual conversation: {inner_e}")
                    # Last resort: plain concatenation of the two turns.
                    formatted_text = f"{conversation[0]['content']}\n\n{conversation[1]['content']}"
                    formatted_texts.append(formatted_text)
            return formatted_texts

    def create_dataset(self, max_examples: Optional[int] = None) -> "Dataset":
        """
        Create the complete dataset with chat templates applied.

        Args:
            max_examples: Maximum number of examples to process (for testing/debugging).
                ``None`` means process everything.

        Returns:
            Processed, shuffled dataset with a single "text" column.

        Raises:
            ValueError: If the underlying dataset could not be loaded.
        """
        if self.dataset is None:
            self.load_dataset()
        if self.dataset is None:
            raise ValueError("Failed to load dataset")
        print(f"Processing {self.dataset_name} dataset...")
        if self.use_streaming:
            # Streaming datasets must be materialized before batched .map().
            examples_list = []
            count = 0
            for example in self.dataset:
                examples_list.append(example)
                count += 1
                # BUG FIX: the original tested truthiness, so max_examples=0
                # was silently ignored; compare against None explicitly.
                if max_examples is not None and count >= max_examples:
                    break
                if count % 1000 == 0:
                    print(f"Processed {count} examples...")
            print(f"Collected {len(examples_list)} examples from streaming dataset")
            regular_dataset = Dataset.from_list(examples_list)
        else:
            if isinstance(self.dataset, Dataset):
                regular_dataset = self.dataset
                if max_examples is not None:
                    regular_dataset = regular_dataset.select(range(min(max_examples, len(regular_dataset))))
            else:
                # Any other dataset type: materialize, truncate, rebuild.
                examples_list = list(self.dataset)
                if max_examples is not None:
                    examples_list = examples_list[:max_examples]
                regular_dataset = Dataset.from_list(examples_list)
        print("Converting examples to conversations...")
        # Drop every original column so only "conversations" remains.
        columns_to_remove = regular_dataset.column_names
        conversation_dataset = regular_dataset.map(
            self.process_examples_batch,
            batched=True,
            batch_size=50,  # Process in smaller batches to manage memory
            remove_columns=columns_to_remove,
        )
        print("Applying chat templates...")
        conversations = conversation_dataset["conversations"]
        # Template in chunks to bound peak memory.
        chunk_size = 100
        all_formatted_texts = []
        for i in range(0, len(conversations), chunk_size):
            chunk = conversations[i:i + chunk_size]
            all_formatted_texts.extend(self.apply_chat_template_to_conversations(chunk))
            if (i // chunk_size + 1) % 10 == 0:
                print(f"Processed {i + len(chunk)} conversations...")
        print(f"Generated {len(all_formatted_texts)} formatted texts")
        final_dataset = Dataset.from_dict({"text": all_formatted_texts})
        final_dataset = final_dataset.shuffle(seed=self.seed)
        print(f"Created final dataset with {len(final_dataset)} examples")
        return final_dataset

    def get_dataset_info(self) -> Dict[str, Any]:
        """
        Get information about the selected dataset.

        Returns:
            Dictionary with static dataset information; includes
            "total_examples" when a non-streaming dataset supports len().
        """
        if self.dataset is None:
            self.load_dataset()
        if self.dataset_name == "longbench":
            info = {
                "name": "LongBench v2",
                "description": "Challenging long-context benchmark with 503 multiple-choice questions",
                "streaming": self.use_streaming,
                "tasks": [
                    "single-document QA",
                    "multi-document QA",
                    "long in-context learning",
                    "long-dialogue history understanding",
                    "code repo understanding",
                    "long structured data understanding",
                ],
                "context_length_range": "8k to 2M words",
                "format": "multiple-choice questions",
            }
        else:
            # Constructor validation guarantees "supernatural" here.
            info = {
                "name": "Super-NaturalInstructions",
                "description": "Large collection of diverse NLP tasks with natural language instructions",
                "streaming": self.use_streaming,
                "tasks": [
                    "text classification",
                    "question answering",
                    "text generation",
                    "reading comprehension",
                    "sentiment analysis",
                    "named entity recognition",
                    "and many more",
                ],
                "format": "instruction-following tasks",
            }
        if not self.use_streaming and self.dataset is not None:
            # Only Dataset/DatasetDict support len(); iterable variants do not.
            if isinstance(self.dataset, (Dataset, DatasetDict)):
                info["total_examples"] = len(self.dataset)
            else:
                info["total_examples"] = "Unknown (streaming or iterable dataset type)"
        return info
# Example usage:
if __name__ == "__main__":
    # Load tokenizer; fall back to EOS as pad token when none is defined.
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # BUG FIX: the original instantiated the undefined name
    # ``LongBenchV2DatasetCreator`` (NameError); the class defined in this
    # file is ``DatasetCreator``.
    creator_longbench = DatasetCreator(
        tokenizer=tokenizer,
        dataset_name="longbench",  # Use LongBench v2
        use_streaming=True,
        seed=42,
    )
    creator_supernatural = DatasetCreator(
        tokenizer=tokenizer,
        dataset_name="supernatural",  # Use Super-NaturalInstructions
        use_streaming=True,
        seed=42,
    )

    # Get dataset info for both
    print("LongBench v2 Dataset Info:")
    info_longbench = creator_longbench.get_dataset_info()
    for key, value in info_longbench.items():
        print(f"  {key}: {value}")
    print("\nSuper-NaturalInstructions Dataset Info:")
    info_supernatural = creator_supernatural.get_dataset_info()
    for key, value in info_supernatural.items():
        print(f"  {key}: {value}")

    # Create datasets (use max_examples for testing)
    print("\nCreating LongBench v2 dataset:")
    dataset_longbench = creator_longbench.create_dataset(max_examples=5)
    print(f"Sample from LongBench v2 dataset:")
    print(dataset_longbench[0]["text"][:500] + "...")
    print("\nCreating Super-NaturalInstructions dataset:")
    dataset_supernatural = creator_supernatural.create_dataset(max_examples=5)
    print(f"Sample from Super-NaturalInstructions dataset:")
    print(dataset_supernatural[0]["text"][:500] + "...")