diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group0_test.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group0_test.parquet
new file mode 100644
index 0000000..5eccf6a
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group0_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group0_train.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group0_train.parquet
new file mode 100644
index 0000000..8caf04f
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group0_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group1_test.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group1_test.parquet
new file mode 100644
index 0000000..925ca62
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group1_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group1_train.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group1_train.parquet
new file mode 100644
index 0000000..b5c4bb8
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group1_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group2_test.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group2_test.parquet
new file mode 100644
index 0000000..5badff8
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group2_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group2_train.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group2_train.parquet
new file mode 100644
index 0000000..782219e
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group2_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group3_test.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group3_test.parquet
new file mode 100644
index 0000000..7f69a0b
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group3_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group3_train.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group3_train.parquet
new file mode 100644
index 0000000..cc179a9
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group3_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group4_test.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group4_test.parquet
new file mode 100644
index 0000000..52c40a6
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group4_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group4_train.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group4_train.parquet
new file mode 100644
index 0000000..894874b
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group4_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group5_test.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group5_test.parquet
new file mode 100644
index 0000000..c26cc6f
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group5_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/llama_3b/group5_train.parquet b/data/out/splits/single_token_entropy/mmlu/llama_3b/group5_train.parquet
new file mode 100644
index 0000000..979355c
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/llama_3b/group5_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group0_test.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group0_test.parquet
new file mode 100644
index 0000000..b1ccde4
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group0_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group0_train.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group0_train.parquet
new file mode 100644
index 0000000..8d8543b
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group0_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group1_test.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group1_test.parquet
new file mode 100644
index 0000000..a94ac78
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group1_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group1_train.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group1_train.parquet
new file mode 100644
index 0000000..34b0f1a
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group1_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group2_test.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group2_test.parquet
new file mode 100644
index 0000000..4c76fb1
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group2_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group2_train.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group2_train.parquet
new file mode 100644
index 0000000..04c4430
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group2_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group3_test.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group3_test.parquet
new file mode 100644
index 0000000..0ad9237
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group3_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group3_train.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group3_train.parquet
new file mode 100644
index 0000000..a33e72f
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group3_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group4_test.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group4_test.parquet
new file mode 100644
index 0000000..2a32bfa
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group4_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group4_train.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group4_train.parquet
new file mode 100644
index 0000000..2d1fd42
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group4_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group5_test.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group5_test.parquet
new file mode 100644
index 0000000..245071d
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group5_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/phi4mini/group5_train.parquet b/data/out/splits/single_token_entropy/mmlu/phi4mini/group5_train.parquet
new file mode 100644
index 0000000..f50907d
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/phi4mini/group5_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group0_test.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group0_test.parquet
new file mode 100644
index 0000000..0a0a217
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group0_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group0_train.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group0_train.parquet
new file mode 100644
index 0000000..b08b416
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group0_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group1_test.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group1_test.parquet
new file mode 100644
index 0000000..3b1fb4e
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group1_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group1_train.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group1_train.parquet
new file mode 100644
index 0000000..cd187dc
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group1_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group2_test.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group2_test.parquet
new file mode 100644
index 0000000..e47e920
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group2_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group2_train.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group2_train.parquet
new file mode 100644
index 0000000..4044ad4
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group2_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group3_test.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group3_test.parquet
new file mode 100644
index 0000000..14bcc05
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group3_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group3_train.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group3_train.parquet
new file mode 100644
index 0000000..6b5291b
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group3_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group4_test.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group4_test.parquet
new file mode 100644
index 0000000..6178147
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group4_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group4_train.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group4_train.parquet
new file mode 100644
index 0000000..fb20b80
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group4_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group5_test.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group5_test.parquet
new file mode 100644
index 0000000..203ff85
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group5_test.parquet differ
diff --git a/data/out/splits/single_token_entropy/mmlu/qwen_3b/group5_train.parquet b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group5_train.parquet
new file mode 100644
index 0000000..4101433
Binary files /dev/null and b/data/out/splits/single_token_entropy/mmlu/qwen_3b/group5_train.parquet differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group0_test.parquet b/data/out/splits/single_token_entropy/phi4mini/group0_test.parquet
deleted file mode 100644
index d480cad..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group0_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group0_train.parquet b/data/out/splits/single_token_entropy/phi4mini/group0_train.parquet
deleted file mode 100644
index 4739070..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group0_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group1_test.parquet b/data/out/splits/single_token_entropy/phi4mini/group1_test.parquet
deleted file mode 100644
index 2f9a43f..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group1_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group1_train.parquet b/data/out/splits/single_token_entropy/phi4mini/group1_train.parquet
deleted file mode 100644
index d1884a7..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group1_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group2_test.parquet b/data/out/splits/single_token_entropy/phi4mini/group2_test.parquet
deleted file mode 100644
index e8f0416..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group2_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group2_train.parquet b/data/out/splits/single_token_entropy/phi4mini/group2_train.parquet
deleted file mode 100644
index 40d9015..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group2_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group3_test.parquet b/data/out/splits/single_token_entropy/phi4mini/group3_test.parquet
deleted file mode 100644
index 10e212d..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group3_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group3_train.parquet b/data/out/splits/single_token_entropy/phi4mini/group3_train.parquet
deleted file mode 100644
index 3c1a598..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group3_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group4_test.parquet b/data/out/splits/single_token_entropy/phi4mini/group4_test.parquet
deleted file mode 100644
index 162b9c8..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group4_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group4_train.parquet b/data/out/splits/single_token_entropy/phi4mini/group4_train.parquet
deleted file mode 100644
index 55872eb..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group4_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group5_test.parquet b/data/out/splits/single_token_entropy/phi4mini/group5_test.parquet
deleted file mode 100644
index 48b9068..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group5_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/phi4mini/group5_train.parquet b/data/out/splits/single_token_entropy/phi4mini/group5_train.parquet
deleted file mode 100644
index aa13d86..0000000
Binary files a/data/out/splits/single_token_entropy/phi4mini/group5_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group0_test.parquet b/data/out/splits/single_token_entropy/qwen_3b/group0_test.parquet
deleted file mode 100644
index fe4ba79..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group0_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group0_train.parquet b/data/out/splits/single_token_entropy/qwen_3b/group0_train.parquet
deleted file mode 100644
index 444e450..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group0_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group1_test.parquet b/data/out/splits/single_token_entropy/qwen_3b/group1_test.parquet
deleted file mode 100644
index 2efe92b..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group1_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group1_train.parquet b/data/out/splits/single_token_entropy/qwen_3b/group1_train.parquet
deleted file mode 100644
index f4324f3..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group1_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group2_test.parquet b/data/out/splits/single_token_entropy/qwen_3b/group2_test.parquet
deleted file mode 100644
index 644de3f..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group2_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group2_train.parquet b/data/out/splits/single_token_entropy/qwen_3b/group2_train.parquet
deleted file mode 100644
index 6d82618..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group2_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group3_test.parquet b/data/out/splits/single_token_entropy/qwen_3b/group3_test.parquet
deleted file mode 100644
index 6928f8b..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group3_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group3_train.parquet b/data/out/splits/single_token_entropy/qwen_3b/group3_train.parquet
deleted file mode 100644
index 5e89dcf..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group3_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group4_test.parquet b/data/out/splits/single_token_entropy/qwen_3b/group4_test.parquet
deleted file mode 100644
index 16322b8..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group4_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group4_train.parquet b/data/out/splits/single_token_entropy/qwen_3b/group4_train.parquet
deleted file mode 100644
index 0e6ee23..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group4_train.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group5_test.parquet b/data/out/splits/single_token_entropy/qwen_3b/group5_test.parquet
deleted file mode 100644
index 9ac9a81..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group5_test.parquet and /dev/null differ
diff --git a/data/out/splits/single_token_entropy/qwen_3b/group5_train.parquet b/data/out/splits/single_token_entropy/qwen_3b/group5_train.parquet
deleted file mode 100644
index 26465a4..0000000
Binary files a/data/out/splits/single_token_entropy/qwen_3b/group5_train.parquet and /dev/null differ
diff --git a/src/core/training/base_trainer.py b/src/core/training/base_trainer.py
index 1a32f29..8a62fed 100644
--- a/src/core/training/base_trainer.py
+++ b/src/core/training/base_trainer.py
@@ -56,6 +56,7 @@ class BaseTrainer[TConfig: BaseTrainerConfig[Any] = BaseTrainerConfig]:
     def __init__(self, config: TConfig, tokenizer: PreTrainedTokenizer | None = None):
         self.config = config
         self._tokenizer: PreTrainedTokenizer | None = tokenizer
+        self._model: AutoModelForCausalLM | None = None

     def train(self):
         if not self._directory_is_empty(self.config.out_path, self.config.training_args.num_train_epochs):
@@ -76,9 +77,7 @@ def tokenizer(self):
         if not self._tokenizer:
             self._tokenizer = AutoTokenizer.from_pretrained(self.config.model_id)

-        assert isinstance(self._tokenizer, PreTrainedTokenizer), (
-            "Tokenizer must be a PreTrainedTokenizer, but got {}".format(type(self._tokenizer))
-        )
+        assert self._tokenizer is not None, "Tokenizer should be initialized"

         if self._tokenizer.pad_token is None:
             logger.warning("Tokenizer has no pad token, setting it to eos token")
@@ -103,7 +102,9 @@ def data_collator(self):
     @property
     def training_args(self):
         return Seq2SeqTrainingArguments(
-            **self.config.training_args.model_dump(),
+            **self.config.training_args.model_dump(
+                exclude={"effective_train_batch_size", "per_device_train_batch_size", "gradient_accumulation_steps"}
+            ),
             **self._batch_size_config(
                 self.config.training_args.effective_train_batch_size,
                 self.config.training_args.per_device_train_batch_size,
@@ -116,7 +117,8 @@ def _prepare_data(self):
         logger.info("Dataset samples")
         logger.info("Train")
         logger.info(f"Input: {self.tokenizer.decode(train_ds[0]['input_ids'])}")
-        logger.info(f"Labels: {self.tokenizer.decode(train_ds[0]['labels'])}")
+        labels = [tok for tok in train_ds[0]["labels"] if tok != -100]
+        logger.info(f"Labels: {self.tokenizer.decode(labels)}")

         return train_ds

@@ -132,7 +134,9 @@ def _run_training(self, train_ds):
         if self.config.save_schedule is not None:
             trainer.add_callback(SaveByScheduleCallback(schedule=self.config.save_schedule))

-        trainer.train(resume_from_checkpoint=True)
+        has_checkpoint = get_last_checkpoint_dir(self.config.out_path) is not None
+        logger.info(f"Has checkpoint: {has_checkpoint}")
+        trainer.train(resume_from_checkpoint=has_checkpoint)

     def _directory_is_empty(self, directory: str, expected_epochs: int) -> bool:
         p = Path(directory)
diff --git a/src/experiments/sft_by_complexity_splits/mmlu/llama_3b.py b/src/experiments/sft_by_complexity_splits/mmlu/llama_3b.py index 8619dd3..9c77b0b 100644 --- a/src/experiments/sft_by_complexity_splits/mmlu/llama_3b.py +++ b/src/experiments/sft_by_complexity_splits/mmlu/llama_3b.py @@ -2,8 +2,8 @@ from transformers import AutoTokenizer +from core.datasets.causal_dataset_adapter import CausalDatasetAdapter from core.datasets.mmlu.mmlu_single_token_response_dataset import MMLUSingleTokenResponseDataset, QADatasetConfig -from core.datasets.qa_dataset_adapter
import QADatasetAdapter from core.training.lora_trainer import LoRATrainer, LoRATrainerConfig, LoRATrainingArgs MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct" @@ -13,13 +13,17 @@ trainer = LoRATrainer( config=LoRATrainerConfig( - out_path=Path(__file__).parent.joinpath("../../../../artifacts/sft_by_complexity_splits/llama_3b").as_posix(), + out_path=Path(__file__) + .parent.joinpath("../../../../artifacts/sft_by_complexity_splits/mmlu/llama_3b/group0") + .as_posix(), model_id=MODEL_NAME, - train_dataset=QADatasetAdapter( + train_dataset=CausalDatasetAdapter( dataset=MMLUSingleTokenResponseDataset( config=QADatasetConfig( path=Path(__file__) - .parent.joinpath("../../../../data/out/splits/single_token_entropy/qwen_3b/group0_train.parquet") + .parent.joinpath( + "../../../../data/out/splits/single_token_entropy/mmlu/llama_3b/group0_train.parquet" + ) .as_posix() ), tokenizer=tokenizer, diff --git a/src/postprocessing/split_by_entropy_mmlu.ipynb b/src/postprocessing/split_by_entropy_mmlu.ipynb new file mode 100644 index 0000000..7214b85 --- /dev/null +++ b/src/postprocessing/split_by_entropy_mmlu.ipynb @@ -0,0 +1,202 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "e698e053", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "\n", + "from core.utils.seed import set_seed\n", + "\n", + "set_seed()\n", + "\n", + "TEST_ALLOCATION = 0.2\n", + "SPLIT_OUT_PATH = Path(\"../../data/out/splits/single_token_entropy/mmlu/qwen_3b/\")\n", + "CHUNK_CNT = 6\n", + "\n", + "entropy_col = \"entropy_value\"\n", + "\n", + "df = pd.read_parquet(\"../../data/out/single_token_entropy/mmlu_qwen_3b.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a198d340", + "metadata": {}, + "outputs": [], + "source": [ + "from pandas import DataFrame\n", + "\n", + "filtered_df = df[df[entropy_col].notna()]\n", + "\n", + "sorted_df = filtered_df.sort_values(entropy_col, ascending=True)\n", + "\n", + "chunk_len = len(sorted_df) // CHUNK_CNT\n", + "\n", + "chunks: list[DataFrame] = []\n", + "for i in range(CHUNK_CNT):\n", + " start_idx = i * chunk_len\n", + " # Python (and pandas for that matter) is OK with end index to be out of bounds\n", + " end_idx = start_idx + chunk_len\n", + " chunk = sorted_df.iloc[start_idx:end_idx]\n", + " chunk.reset_index(drop=True, inplace=True)\n", + " chunks.append(chunk)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c9cd9eb3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Chunks: 6\n", + "Chunk len: 2005\n", + "Chunk sample\n", + "0 2.042132e-09\n", + "1 2.780812e-09\n", + "2 2.787101e-09\n", + "3 4.184975e-09\n", + "4 4.319797e-09\n", + "Name: entropy_value, dtype: float64\n", + "NA count: 0\n", + "Chunk len: 2005\n", + "Chunk sample\n", + "0 0.000062\n", + "1 0.000063\n", + "2 0.000063\n", + "3 0.000063\n", + "4 0.000063\n", + "Name: entropy_value, dtype: float64\n", + "NA count: 0\n", + "Chunk len: 2005\n", + "Chunk sample\n", + "0 0.004533\n", + "1 0.004534\n", + "2 0.004538\n", + "3 0.004544\n", + "4 0.004553\n", + "Name: entropy_value, dtype: float64\n", + "NA count: 0\n", + "Chunk len: 2005\n", + "Chunk sample\n", + "0 0.082552\n", + "1 0.082575\n", + "2 0.082623\n", + "3 0.082634\n", + "4 0.082653\n", + "Name: entropy_value, dtype: float64\n", + "NA count: 0\n", + "Chunk len: 2005\n", + "Chunk sample\n", + "0 0.392897\n", + "1 0.393025\n", + "2 0.393167\n", + "3 0.393354\n", + "4 
0.393358\n", + "Name: entropy_value, dtype: float64\n", + "NA count: 0\n", + "Chunk len: 2005\n", + "Chunk sample\n", + "0 0.793556\n", + "1 0.793746\n", + "2 0.793916\n", + "3 0.794349\n", + "4 0.794569\n", + "Name: entropy_value, dtype: float64\n", + "NA count: 0\n" + ] + } + ], + "source": [ + "print(\"Chunks: \", len(chunks))\n", + "\n", + "for chunk in chunks:\n", + " print(\"Chunk len: \", len(chunk))\n", + " print(\"Chunk sample\")\n", + " print(chunk.head()[entropy_col])\n", + " # check for missing values in the entropy column\n", + " na_count = chunk[entropy_col].isna().sum()\n", + " print(\"NA count:\", na_count)\n", + " assert na_count == 0, \"Found NA values in chunk\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "de03e738", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Chunk : 0\n", + "Train len: 1604\n", + "Test len: 401\n", + "Chunk : 1\n", + "Train len: 1604\n", + "Test len: 401\n", + "Chunk : 2\n", + "Train len: 1604\n", + "Test len: 401\n", + "Chunk : 3\n", + "Train len: 1604\n", + "Test len: 401\n", + "Chunk : 4\n", + "Train len: 1604\n", + "Test len: 401\n", + "Chunk : 5\n", + "Train len: 1604\n", + "Test len: 401\n" + ] + } + ], + "source": [ + "from core.utils.splitter import split_chunk_into_train_test\n", + "\n", + "for chunk_i, chunk in enumerate(chunks):\n", + " print(\"Chunk : \", chunk_i)\n", + "\n", + " train_df, test_df = split_chunk_into_train_test(chunk, TEST_ALLOCATION)\n", + " print(\"Train len: \", len(train_df))\n", + " print(\"Test len: \", len(test_df))\n", + "\n", + " SPLIT_OUT_PATH.mkdir(parents=True, exist_ok=True)\n", + "\n", + " test_df.to_parquet(str(SPLIT_OUT_PATH.joinpath(f\"group{chunk_i}_test.parquet\").resolve()), index=False)\n", + " train_df.to_parquet(str(SPLIT_OUT_PATH.joinpath(f\"group{chunk_i}_train.parquet\").resolve()), index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "complexity-aware-fine-tuning", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/postprocessing/split_by_entropy_phi4mini.ipynb b/src/postprocessing/split_by_entropy_phi4mini.ipynb deleted file mode 100644 index a01f0d8..0000000 --- a/src/postprocessing/split_by_entropy_phi4mini.ipynb +++ /dev/null @@ -1,182 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "e698e053", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "import pandas as pd\n", - "\n", - "from core.utils.seed import set_seed\n", - "\n", - "set_seed()\n", - "\n", - "TEST_ALLOCATION = 0.2\n", - "SPLIT_OUT_PATH = Path(\"../../data/out/splits/single_token_entropy/phi4mini/\")\n", - "\n", - "ans_col = \"entropy_ans_phi3\"\n", - "entropy_col = \"entropy_value_phi3\"\n", - "\n", - "df = pd.read_csv(\n", - " \"../../data/out/single_token_entropy/mmlu_phi4mini.tsv\",\n", - " sep=\"\\t\",\n", - " header=0,\n", - " dtype={ans_col: \"str\"},\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a198d340", - "metadata": {}, - "outputs": [], - "source": [ - "from core.utils.splitter import split_into_even_chunks\n", - "\n", - "chunks = split_into_even_chunks(df, entropy_col, ans_col, chunk_cnt=6)" 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c9cd9eb3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Chunks: 6\n", - "Chunk len: 1935\n", - "Chunk sample\n", - "0 0.004531\n", - "1 0.004555\n", - "2 0.004800\n", - "3 0.005088\n", - "4 0.005106\n", - "Name: entropy_value_phi3, dtype: float64\n", - "Chunk len: 1935\n", - "Chunk sample\n", - "0 0.241959\n", - "1 0.242102\n", - "2 0.243058\n", - "3 0.243299\n", - "4 0.243874\n", - "Name: entropy_value_phi3, dtype: float64\n", - "Chunk len: 1935\n", - "Chunk sample\n", - "0 0.763710\n", - "1 0.764015\n", - "2 0.764074\n", - "3 0.764553\n", - "4 0.764896\n", - "Name: entropy_value_phi3, dtype: float64\n", - "Chunk len: 1935\n", - "Chunk sample\n", - "0 1.266042\n", - "1 1.266153\n", - "2 1.266666\n", - "3 1.267027\n", - "4 1.267364\n", - "Name: entropy_value_phi3, dtype: float64\n", - "Chunk len: 1935\n", - "Chunk sample\n", - "0 1.762308\n", - "1 1.762762\n", - "2 1.762959\n", - "3 1.763277\n", - "4 1.763723\n", - "Name: entropy_value_phi3, dtype: float64\n", - "Chunk len: 1935\n", - "Chunk sample\n", - "0 2.283264\n", - "1 2.283761\n", - "2 2.284007\n", - "3 2.284326\n", - "4 2.284616\n", - "Name: entropy_value_phi3, dtype: float64\n" - ] - } - ], - "source": [ - "print(\"Chunks: \", len(chunks))\n", - "\n", - "for chunk in chunks:\n", - " print(\"Chunk len: \", len(chunk))\n", - " print(\"Chunk sample\")\n", - " print(chunk.head()[entropy_col])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de03e738", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Chunk : 0\n", - "Train len: 1548\n", - "Test len: 387\n", - "Chunk : 1\n", - "Train len: 1548\n", - "Test len: 387\n", - "Chunk : 2\n", - "Train len: 1548\n", - "Test len: 387\n", - "Chunk : 3\n", - "Train len: 1548\n", - "Test len: 387\n", - "Chunk : 4\n", - "Train len: 1548\n", - "Test len: 387\n", - "Chunk : 5\n", - "Train len: 1548\n", - "Test len: 387\n" - ] - } - ], - "source": [ - "from core.utils.splitter import split_chunk_into_train_test\n", - "\n", - "for chunk_i, chunk in enumerate(chunks):\n", - " print(\"Chunk : \", chunk_i)\n", - "\n", - " train_df, test_df = split_chunk_into_train_test(chunk, TEST_ALLOCATION)\n", - " print(\"Train len: \", len(train_df))\n", - " print(\"Test len: \", len(test_df))\n", - "\n", - " test_df.to_csv(str(SPLIT_OUT_PATH.joinpath(f\"group{chunk_i}_test.tsv\").resolve()), sep=\"\\t\", index=False)\n", - " train_df.to_csv(str(SPLIT_OUT_PATH.joinpath(f\"group{chunk_i}_train.tsv\").resolve()), sep=\"\\t\", index=False)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "reasoning-fine-tune", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/postprocessing/split_by_entropy_qwen_3b.ipynb b/src/postprocessing/split_by_entropy_qwen_3b.ipynb deleted file mode 100644 index 0470943..0000000 --- a/src/postprocessing/split_by_entropy_qwen_3b.ipynb +++ /dev/null @@ -1,182 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "e698e053", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "import pandas as pd\n", - "\n", - 
"from core.utils.seed import set_seed\n", - "\n", - "set_seed()\n", - "\n", - "TEST_ALLOCATION = 0.2\n", - "SPLIT_OUT_PATH = Path(\"../../data/out/splits/single_token_entropy/qwen_3b/\")\n", - "\n", - "ans_col = \"entropy_ans_qwen2\"\n", - "entropy_col = \"entropy_value_qwen2\"\n", - "\n", - "df = pd.read_csv(\n", - " \"../../data/out/single_token_entropy/mmlu_qwen_3b.tsv\",\n", - " sep=\"\\t\",\n", - " header=0,\n", - " dtype={ans_col: \"str\"},\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a198d340", - "metadata": {}, - "outputs": [], - "source": [ - "from core.utils.splitter import split_into_even_chunks\n", - "\n", - "chunks = split_into_even_chunks(df, entropy_col, ans_col, chunk_cnt=6)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c9cd9eb3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Chunks: 6\n", - "Chunk len: 2005\n", - "Chunk sample\n", - "0 1.667101e-09\n", - "1 2.326040e-09\n", - "2 3.429733e-09\n", - "3 4.099281e-09\n", - "4 4.317271e-09\n", - "Name: entropy_value_qwen2, dtype: float64\n", - "Chunk len: 2005\n", - "Chunk sample\n", - "0 0.000061\n", - "1 0.000061\n", - "2 0.000061\n", - "3 0.000061\n", - "4 0.000061\n", - "Name: entropy_value_qwen2, dtype: float64\n", - "Chunk len: 2005\n", - "Chunk sample\n", - "0 0.004460\n", - "1 0.004475\n", - "2 0.004492\n", - "3 0.004492\n", - "4 0.004515\n", - "Name: entropy_value_qwen2, dtype: float64\n", - "Chunk len: 2005\n", - "Chunk sample\n", - "0 0.082473\n", - "1 0.082576\n", - "2 0.082576\n", - "3 0.082781\n", - "4 0.083025\n", - "Name: entropy_value_qwen2, dtype: float64\n", - "Chunk len: 2005\n", - "Chunk sample\n", - "0 0.386922\n", - "1 0.386953\n", - "2 0.386957\n", - "3 0.386975\n", - "4 0.387138\n", - "Name: entropy_value_qwen2, dtype: float64\n", - "Chunk len: 2005\n", - "Chunk sample\n", - "0 0.789808\n", - "1 0.790229\n", - "2 0.790564\n", - "3 0.790684\n", - "4 0.790753\n", - "Name: entropy_value_qwen2, dtype: float64\n" - ] - } - ], - "source": [ - "print(\"Chunks: \", len(chunks))\n", - "\n", - "for chunk in chunks:\n", - " print(\"Chunk len: \", len(chunk))\n", - " print(\"Chunk sample\")\n", - " print(chunk.head()[entropy_col])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "de03e738", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Chunk : 0\n", - "Train len: 1604\n", - "Test len: 401\n", - "Chunk : 1\n", - "Train len: 1604\n", - "Test len: 401\n", - "Chunk : 2\n", - "Train len: 1604\n", - "Test len: 401\n", - "Chunk : 3\n", - "Train len: 1604\n", - "Test len: 401\n", - "Chunk : 4\n", - "Train len: 1604\n", - "Test len: 401\n", - "Chunk : 5\n", - "Train len: 1604\n", - "Test len: 401\n" - ] - } - ], - "source": [ - "from core.utils.splitter import split_chunk_into_train_test\n", - "\n", - "for chunk_i, chunk in enumerate(chunks):\n", - " print(\"Chunk : \", chunk_i)\n", - "\n", - " train_df, test_df = split_chunk_into_train_test(chunk, TEST_ALLOCATION)\n", - " print(\"Train len: \", len(train_df))\n", - " print(\"Test len: \", len(test_df))\n", - "\n", - " test_df.to_csv(str(SPLIT_OUT_PATH.joinpath(f\"group{chunk_i}_test.tsv\").resolve()), sep=\"\\t\", index=False)\n", - " train_df.to_csv(str(SPLIT_OUT_PATH.joinpath(f\"group{chunk_i}_train.tsv\").resolve()), sep=\"\\t\", index=False)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "reasoning-fine-tune", - "language": "python", - "name": "python3" - }, - 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}