Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
16 changes: 10 additions & 6 deletions src/core/training/base_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class BaseTrainer[TConfig: BaseTrainerConfig[Any] = BaseTrainerConfig]:
def __init__(self, config: TConfig, tokenizer: PreTrainedTokenizer | None = None):
self.config = config
self._tokenizer: PreTrainedTokenizer | None = tokenizer
self._model: AutoModelForCausalLM | None = None

def train(self):
if not self._directory_is_empty(self.config.out_path, self.config.training_args.num_train_epochs):
Expand All @@ -76,9 +77,7 @@ def tokenizer(self):
if not self._tokenizer:
self._tokenizer = AutoTokenizer.from_pretrained(self.config.model_id)

assert isinstance(self._tokenizer, PreTrainedTokenizer), (
"Tokenizer must be a PreTrainedTokenizer, but got {}".format(type(self._tokenizer))
)
assert self._tokenizer is not None, "Tokenizer should be initialized"

if self._tokenizer.pad_token is None:
logger.warning("Tokenizer has no pad token, setting it to eos token")
Expand All @@ -103,7 +102,9 @@ def data_collator(self):
@property
def training_args(self):
return Seq2SeqTrainingArguments(
**self.config.training_args.model_dump(),
**self.config.training_args.model_dump(
exclude={"effective_train_batch_size", "per_device_train_batch_size", "gradient_accumulation_steps"}
),
**self._batch_size_config(
self.config.training_args.effective_train_batch_size,
self.config.training_args.per_device_train_batch_size,
Expand All @@ -116,7 +117,8 @@ def _prepare_data(self):
logger.info("Dataset samples")
logger.info("Train")
logger.info(f"Input: {self.tokenizer.decode(train_ds[0]['input_ids'])}")
logger.info(f"Labels: {self.tokenizer.decode(train_ds[0]['labels'])}")
labels = [tok for tok in train_ds[0]["labels"] if tok != -100]
logger.info(f"Labels: {self.tokenizer.decode(labels)}")

return train_ds

Expand All @@ -132,7 +134,9 @@ def _run_training(self, train_ds):
if self.config.save_schedule is not None:
trainer.add_callback(SaveByScheduleCallback(schedule=self.config.save_schedule))

trainer.train(resume_from_checkpoint=True)
has_checkpoint = get_last_checkpoint_dir(self.config.out_path) is not None
logger.info(f"Has checkpoint: {has_checkpoint}")
trainer.train(resume_from_checkpoint=has_checkpoint)

def _directory_is_empty(self, directory: str, expected_epochs: int) -> bool:
p = Path(directory)
Expand Down
12 changes: 8 additions & 4 deletions src/experiments/sft_by_complexity_splits/mmlu/llama_3b.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from transformers import AutoTokenizer

from core.datasets.causal_dataset_adapter import CausalDatasetAdapter
from core.datasets.mmlu.mmlu_single_token_response_dataset import MMLUSingleTokenResponseDataset, QADatasetConfig
from core.datasets.qa_dataset_adapter import QADatasetAdapter
from core.training.lora_trainer import LoRATrainer, LoRATrainerConfig, LoRATrainingArgs

MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
Expand All @@ -13,13 +13,17 @@

trainer = LoRATrainer(
config=LoRATrainerConfig(
out_path=Path(__file__).parent.joinpath("../../../../artifacts/sft_by_complexity_splits/llama_3b").as_posix(),
out_path=Path(__file__)
.parent.joinpath("../../../../artifacts/sft_by_complexity_splits/mmlu/llama_3b/group0")
.as_posix(),
model_id=MODEL_NAME,
train_dataset=QADatasetAdapter(
train_dataset=CausalDatasetAdapter(
dataset=MMLUSingleTokenResponseDataset(
config=QADatasetConfig(
path=Path(__file__)
.parent.joinpath("../../../../data/out/splits/single_token_entropy/qwen_3b/group0_train.parquet")
.parent.joinpath(
"../../../../data/out/splits/single_token_entropy/mmlu/llama_3b/group0_train.parquet"
)
.as_posix()
),
tokenizer=tokenizer,
Expand Down
202 changes: 202 additions & 0 deletions src/postprocessing/split_by_entropy_mmlu.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "e698e053",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"from core.utils.seed import set_seed\n",
"\n",
"set_seed()\n",
"\n",
"TEST_ALLOCATION = 0.2\n",
"SPLIT_OUT_PATH = Path(\"../../data/out/splits/single_token_entropy/mmlu/qwen_3b/\")\n",
"CHUNK_CNT = 6\n",
"\n",
"entropy_col = \"entropy_value\"\n",
"\n",
"df = pd.read_parquet(\"../../data/out/single_token_entropy/mmlu_qwen_3b.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a198d340",
"metadata": {},
"outputs": [],
"source": [
"from pandas import DataFrame\n",
"\n",
"filtered_df = df[df[entropy_col].notna()]\n",
"\n",
"sorted_df = filtered_df.sort_values(entropy_col, ascending=True)\n",
"\n",
"chunk_len = len(sorted_df) // CHUNK_CNT\n",
"\n",
"chunks: list[DataFrame] = []\n",
"for i in range(CHUNK_CNT):\n",
" start_idx = i * chunk_len\n",
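" # chunk_len uses floor division, so end_idx never exceeds len(sorted_df) and all chunks are equal-sized;\n",
" # the up to CHUNK_CNT - 1 leftover rows with the highest entropy are not assigned to any chunk.\n",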
" end_idx = start_idx + chunk_len\n",
" chunk = sorted_df.iloc[start_idx:end_idx]\n",
" chunk.reset_index(drop=True, inplace=True)\n",
" chunks.append(chunk)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c9cd9eb3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chunks: 6\n",
"Chunk len: 2005\n",
"Chunk sample\n",
"0 2.042132e-09\n",
"1 2.780812e-09\n",
"2 2.787101e-09\n",
"3 4.184975e-09\n",
"4 4.319797e-09\n",
"Name: entropy_value, dtype: float64\n",
"NA count: 0\n",
"Chunk len: 2005\n",
"Chunk sample\n",
"0 0.000062\n",
"1 0.000063\n",
"2 0.000063\n",
"3 0.000063\n",
"4 0.000063\n",
"Name: entropy_value, dtype: float64\n",
"NA count: 0\n",
"Chunk len: 2005\n",
"Chunk sample\n",
"0 0.004533\n",
"1 0.004534\n",
"2 0.004538\n",
"3 0.004544\n",
"4 0.004553\n",
"Name: entropy_value, dtype: float64\n",
"NA count: 0\n",
"Chunk len: 2005\n",
"Chunk sample\n",
"0 0.082552\n",
"1 0.082575\n",
"2 0.082623\n",
"3 0.082634\n",
"4 0.082653\n",
"Name: entropy_value, dtype: float64\n",
"NA count: 0\n",
"Chunk len: 2005\n",
"Chunk sample\n",
"0 0.392897\n",
"1 0.393025\n",
"2 0.393167\n",
"3 0.393354\n",
"4 0.393358\n",
"Name: entropy_value, dtype: float64\n",
"NA count: 0\n",
"Chunk len: 2005\n",
"Chunk sample\n",
"0 0.793556\n",
"1 0.793746\n",
"2 0.793916\n",
"3 0.794349\n",
"4 0.794569\n",
"Name: entropy_value, dtype: float64\n",
"NA count: 0\n"
]
}
],
"source": [
"print(\"Chunks: \", len(chunks))\n",
"\n",
"for chunk in chunks:\n",
" print(\"Chunk len: \", len(chunk))\n",
" print(\"Chunk sample\")\n",
" print(chunk.head()[entropy_col])\n",
" # check for missing values in the entropy column\n",
" na_count = chunk[entropy_col].isna().sum()\n",
" print(\"NA count:\", na_count)\n",
" assert na_count == 0, \"Found NA values in chunk\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "de03e738",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chunk : 0\n",
"Train len: 1604\n",
"Test len: 401\n",
"Chunk : 1\n",
"Train len: 1604\n",
"Test len: 401\n",
"Chunk : 2\n",
"Train len: 1604\n",
"Test len: 401\n",
"Chunk : 3\n",
"Train len: 1604\n",
"Test len: 401\n",
"Chunk : 4\n",
"Train len: 1604\n",
"Test len: 401\n",
"Chunk : 5\n",
"Train len: 1604\n",
"Test len: 401\n"
]
}
],
"source": [
"from core.utils.splitter import split_chunk_into_train_test\n",
"\n",
"for chunk_i, chunk in enumerate(chunks):\n",
" print(\"Chunk : \", chunk_i)\n",
"\n",
" train_df, test_df = split_chunk_into_train_test(chunk, TEST_ALLOCATION)\n",
" print(\"Train len: \", len(train_df))\n",
" print(\"Test len: \", len(test_df))\n",
"\n",
" SPLIT_OUT_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
" test_df.to_parquet(str(SPLIT_OUT_PATH.joinpath(f\"group{chunk_i}_test.parquet\").resolve()), index=False)\n",
" train_df.to_parquet(str(SPLIT_OUT_PATH.joinpath(f\"group{chunk_i}_train.parquet\").resolve()), index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "complexity-aware-fine-tuning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading