From 49d6fc7310c78eb9b39ab9debdd028c1094aaa75 Mon Sep 17 00:00:00 2001 From: troyfeng116 Date: Thu, 5 Jan 2023 00:09:19 -0500 Subject: [PATCH 1/4] Initial HF trainer testing --- finetuning/hf_trainer.py | 64 +++++++++++++++++++ .../spider_t5_finetuning.yaml | 4 +- 2 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 finetuning/hf_trainer.py diff --git a/finetuning/hf_trainer.py b/finetuning/hf_trainer.py new file mode 100644 index 00000000..0cc952b9 --- /dev/null +++ b/finetuning/hf_trainer.py @@ -0,0 +1,64 @@ +from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorWithPadding, DataCollatorForSeq2Seq +from lightning_modules.models.seq2seq_model_util import get_model +from lightning_modules.datasets.spider_reader import SpiderDataset, Text2SqlDataModule + + +MODEL_NAME = "EleutherAI/gpt-neo-125M" + +model, tokenizer = get_model(MODEL_NAME, gradient_ckpt=True) +training_args = Seq2SeqTrainingArguments( + output_dir="out", + gradient_checkpointing=True, + learning_rate=5e-05, + weight_decay=0.01, + max_steps=25000, + fp16=True, + do_train=True, + # do_eval=True, + logging_strategy="no", +) + +dataset_init_args = { + "transformer_model_name": MODEL_NAME, + # "batch_size": 4, + # "val_batch_size": 4, + # "train_max_instances": 200, + # "val_max_instances": 100, + "file_path": "data/spider/train_spider_processed_v2.jsonl", +} + +# dataset = Text2SqlDataModule( +# transformer_model_name=MODEL_NAME, +# batch_size=4, +# val_batch_size=4, +# train_max_instances=200, +# val_max_instances=100, +# train_set_init_args={ +# "file_path": "data/spider/train_spider_processed_v2.jsonl", +# }, +# val_set_init_args={ +# "file_path": "data/spider/dev_processed.jsonl", +# }, +# set_common_init_args={ +# "use_skg_format": False, +# }, +# ) + +dataset = SpiderDataset( + transformer_model_name=MODEL_NAME, + file_path="data/spider/train_spider_processed_v2.jsonl", +) + +# print(dataset) + +collator = 
DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) + +trainer = Seq2SeqTrainer( + model=model, + data_collator=collator, + args=training_args, + train_dataset=dataset, + tokenizer=tokenizer, +) + +trainer.train() diff --git a/finetuning/training_configs/spider_t5_finetuning.yaml b/finetuning/training_configs/spider_t5_finetuning.yaml index b4ac0719..b98c9e00 100755 --- a/finetuning/training_configs/spider_t5_finetuning.yaml +++ b/finetuning/training_configs/spider_t5_finetuning.yaml @@ -1,6 +1,6 @@ seed_everything: 333 trainer: - gpus: 0, 1 + gpus: 0 gradient_clip_val: 1.0 # default_root_dir: &exp_name results/spider-t5_base-finetuning default_root_dir: &exp_name results/debug-tmp @@ -49,7 +49,7 @@ trainer: model: class_path: lightning_modules.models.seq2seq_model.Seq2SeqModel init_args: - transformer_model_name: &transformer t5-base + transformer_model_name: &transformer EleutherAI/gpt-neo-125M executor_cls: execution.executors.SpiderExecutor categorize_func: execution.spider_execution.spider_categorize_complexity category_list: ["JOIN", "NESTED", "COMPOUND", "SIMPLE"] From 4ac72032a03e86a6a21d87ac00c4f39c6a963526 Mon Sep 17 00:00:00 2001 From: troyfeng116 Date: Wed, 1 Feb 2023 00:22:30 -0500 Subject: [PATCH 2/4] More HF trainer experimentation (for inference) --- finetuning/consts.py | 10 + finetuning/eval_helpers.py | 160 +++++++++++++ finetuning/hf_infer.py | 217 +++++++++++++++++ finetuning/hf_trainer.py | 221 ++++++++++++++---- .../lightning_modules/datasets/base_reader.py | 3 +- .../lightning_modules/models/seq2seq_model.py | 29 ++- .../patches/patched_loggers.py | 2 + 7 files changed, 586 insertions(+), 56 deletions(-) create mode 100644 finetuning/consts.py create mode 100644 finetuning/eval_helpers.py create mode 100644 finetuning/hf_infer.py diff --git a/finetuning/consts.py b/finetuning/consts.py new file mode 100644 index 00000000..67c7c654 --- /dev/null +++ b/finetuning/consts.py @@ -0,0 +1,10 @@ +# MODEL_NAME = "EleutherAI/gpt-neo-125M" +# 
MODEL_NAME = "EleutherAI/gpt-neo-2.7B" +MODEL_NAME = "Salesforce/codegen-2B-multi" + +# logging +RUN_NAME = "results/spider-gpt_neo_125M-finetuning" + +# hyperparams +MAX_STEPS = 25000 +EVAL_STEPS = 25 diff --git a/finetuning/eval_helpers.py b/finetuning/eval_helpers.py new file mode 100644 index 00000000..8b0bac69 --- /dev/null +++ b/finetuning/eval_helpers.py @@ -0,0 +1,160 @@ +from torchmetrics import Metric, MeanMetric, MetricCollection +from transformers.trainer_utils import EvalPrediction +from typing import Any, Dict, List + +from execution.executors import BaseExecutor, SpiderExecutor +from lightning_modules.datasets.spider_reader import SpiderDataset, Text2SqlDataModule +from lightning_modules.models.seq2seq_model import Seq2SeqModel + +from consts import MODEL_NAME, MAX_STEPS + + +spider_data_module = Text2SqlDataModule( + transformer_model_name=MODEL_NAME, + batch_size=4, + val_batch_size=4, + train_max_instances=200, + val_max_instances=100, + train_set_init_args={"file_path": "data/spider/train_spider_processed_v2.jsonl"}, + val_set_init_args={ + "file_path": "data/spider/dev_processed.jsonl", + }, + set_common_init_args={ + "use_skg_format": False, + }, +) + +spider_data_module.setup(stage="fit") + + +train_dataset = spider_data_module.train_data + +eval_dataset = spider_data_module.val_data + +seq2seq_model = Seq2SeqModel( + transformer_model_name=MODEL_NAME, + gradient_ckpt=True, + executor_cls="execution.executors.SpiderExecutor", + categorize_func="execution.spider_execution.spider_categorize_complexity", + category_list=["JOIN", "NESTED", "COMPOUND", "SIMPLE"], + max_gen_len=128, + sampling_temp=0.01, + optimizer={ + "init_args": { + "lr": 5.0e-5, + # lr: 0.0, + "betas": [0.9, 0.99], + "eps": 1.0e-8, + "weight_decay": 0.01, + } + }, + lr_scheduler={ + "name": "linear", + "init_args": { + "num_warmup_steps": 100, + "num_training_steps": MAX_STEPS, + }, + }, +) +seq2seq_model.model = seq2seq_model.model.cuda() +# 
seq2seq_model.model.config.max_new_tokens = 1024 +# print(seq2seq_model.model.config.max_new_tokens) +# seq2seq_model.model.config.max_length = 1024 + +executor = SpiderExecutor() + + +def get_program_exec_dict( + generated_program: str, exec_match: int, exec_result: Any +) -> Dict[str, Any]: + exec_acc = 1.0 if exec_match == 1 else 0.0 + exec_rate = 0.0 if exec_match == -1 else 1.0 + + # save the results in the json output file + save_metrics = {"exec_acc": float(exec_acc), "exec_rate": float(exec_rate)} + + # add more information to the program dict + program_dict = {"program": generated_program, "exec_result": exec_result} + program_dict.update(save_metrics) + + return program_dict + + +val_instances = eval_dataset.instances + + +def validation_step_end( + eval_pred: EvalPrediction, + metrics_dict: Dict[str, Metric], + executor: BaseExecutor, + val_instances: List[Any], +) -> None: + n = len(val_instances) + # update the evaluation metrics + for i in range(n): + prediction = eval_pred.predictions[1][i] + label_id = eval_pred.label_ids[i] + print(list(prediction)) + print(list(seq2seq_model.tokenizer.convert_tokens_to_ids(prediction))) + print(seq2seq_model.tokenizer.decode(prediction)) + print(seq2seq_model.tokenizer.decode(label_id)) + # example = eval_pred.label_ids[i] + # example = eval_pred.inputs[i] + example = val_instances[i]["metadata"] + + # obtain the execution results + exec_match, exec_result = executor.exec_program(prediction, example) + program_len_diff = executor.program_len(prediction) - executor.gold_program_len( + example + ) + program_dict = get_program_exec_dict(prediction, exec_match, exec_result) + + # update the metrics + metrics_dict["exec_acc"](program_dict["exec_acc"]) + metrics_dict["exec_rate"](program_dict["exec_rate"]) + metrics_dict["program_len_diff"](program_len_diff) + # category_metrics.update(program_dict["exec_acc"], metadata) # note that this can't be forward as compute will be called + + # if print_eval_every_n_batches > 0: 
+ # # compute the metrics + # eval_metrics_dict = {} + # for k in metrics_dict.keys(): + # eval_metrics_dict[k] = float(metrics_dict[k].compute()) + # print("eval metrics: ", eval_metrics_dict) + + # # save the outputs to the model + # predictions.extend(outputs) + + +def compute_metrics(eval_pred: EvalPrediction) -> dict: + print(len(eval_pred.predictions)) + print(eval_pred.predictions[0].shape) + print(eval_pred.predictions[0]) + print(eval_pred.predictions[1].shape) + print(eval_pred.predictions[1]) + print("\n========\n") + print(len(eval_pred.label_ids)) + print(eval_pred.label_ids[0].shape) + print(eval_pred.label_ids[0]) + print(eval_pred.label_ids[1].shape) + print(eval_pred.label_ids[1]) + # # n = len(eval_pred.predictions) + metrics_dict: Dict[str, Metric] = MetricCollection({}) + metrics_dict["exec_acc"] = MeanMetric() + metrics_dict["exec_rate"] = MeanMetric() + metrics_dict["program_len_diff"] = MeanMetric() + # metrics_dict = {} + # metrics_dict["exec_acc"] = 0.0 + # metrics_dict["exec_rate"] = 0.0 + # metrics_dict["program_len_diff"] = 0.0 + # print(eval_pred.predictions) + + validation_step_end( + eval_pred=eval_pred, + metrics_dict=metrics_dict, + executor=executor, + val_instances=val_instances, + ) + # print("TEST" + str(eval_pred)) + print(metrics_dict) + return metrics_dict diff --git a/finetuning/hf_infer.py b/finetuning/hf_infer.py new file mode 100644 index 00000000..153cf692 --- /dev/null +++ b/finetuning/hf_infer.py @@ -0,0 +1,217 @@ +from torch.utils.data import Dataset +from transformers import ( + Trainer, + TrainingArguments, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + DataCollatorWithPadding, + DataCollatorForSeq2Seq, + TrainerCallback, + TrainerState, + TrainerControl, +) +from transformers.trainer_utils import EvalPrediction, IntervalStrategy +from lightning_modules.models.seq2seq_model import Seq2SeqModel +from lightning_modules.models.seq2seq_model_util import get_model +from lightning_modules.datasets.spider_reader import 
SpiderDataset, Text2SqlDataModule + +from eval_helpers import ( + compute_metrics, + seq2seq_model, + train_dataset, + eval_dataset, + spider_data_module, +) + +from lightning_modules.datasets.base_reader import ( + customized_collate_fn_enc_dec, + customized_collate_fn_gpt, +) +from finetuning.lightning_modules.models.seq2seq_model_util import ( + is_model_gpt_style, + right_pad_sequences, +) + +from consts import MODEL_NAME, RUN_NAME + + +import os +import torch +from typing import Dict + + +os.environ["WANDB_PROJECT"] = "codegen-hf-migration-tests" + + +# hyperparams +MAX_STEPS = 25000 +EVAL_STEPS = 25 + +lone_model, tokenizer = get_model( + MODEL_NAME, + gradient_ckpt=True, + additional_init_args={ + "executor_cls": "execution.executors.SpiderExecutor", + "categorize_func": "execution.spider_execution.spider_categorize_complexity", + "category_list": ["JOIN", "NESTED", "COMPOUND", "SIMPLE"], + "max_gen_len": 128, + "sampling_temp": 0.01, + }, +) + +spider_data_module = Text2SqlDataModule( + transformer_model_name=MODEL_NAME, + batch_size=4, + val_batch_size=4, + train_max_instances=200, + val_max_instances=100, + train_set_init_args={"file_path": "data/spider/train_spider_processed_v2.jsonl"}, + val_set_init_args={ + "file_path": "data/spider/dev_processed.jsonl", + }, + set_common_init_args={ + "use_skg_format": False, + }, +) + +training_args = Seq2SeqTrainingArguments( + output_dir="results/debug-tmp", # local output dir + # for inference! + do_train=False, + do_eval=True, + run_name=RUN_NAME, + report_to="wandb", + # # hyperparams + # learning_rate=5e-05, + # weight_decay=0.01, + # max_steps=MAX_STEPS, + fp16=True, + # find batch size automatically to avoid cuda OOM: only compatible with accelerate + auto_find_batch_size=True, + # # checkpointing + # save_strategy="epoch", + # # validation (?) 
+ # evaluation_strategy=IntervalStrategy.STEPS, + # eval_steps=EVAL_STEPS, + # logging_steps=EVAL_STEPS, + # per_device_eval_batch_size=1, + eval_accumulation_steps=4, + # # memory optimizations + # gradient_checkpointing=True, + ddp_find_unused_parameters=True, + # deepspeed="deepspeed_config.json", + predict_with_generate=True, + generation_max_length=256, +) + +collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=lone_model) + + +class ValidationCallback(TrainerCallback): + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + print("======== Evaluation complete ========") + print(state) + + # eval_dataloader = DataLoader(eval_dataset.val_data, batch_size=eval_dataset.val_batch_size, + # shuffle=False, drop_last=True, collate_fn=collate_fn) + val_outs = [] + eval_dataloader = spider_data_module.val_dataloader() + for val_idx, val_batch in enumerate(iter(eval_dataloader)): + val_batch["input_ids"] = val_batch["input_ids"].cuda() + val_batch["attention_mask"] = val_batch["attention_mask"].cuda() + # print(val_batch["metadata"]) + # print(val_batch["input_ids"].device) + # print(val_batch["attention_mask"].device) + out = seq2seq_model.validation_step(val_batch, val_idx) + # print(out) + # val_outs.append(out) + val_outs.extend(out) + seq2seq_model.validation_step_end(val_outs) + # return super().on_evaluate(args, state, control, **kwargs) + + +class CustomTrainer(Seq2SeqTrainer): + def __init__( + self, + seq2seq_model: Seq2SeqModel, + args: TrainingArguments, + train_dataset: Dataset, + eval_dataset: Dataset, + ): + self.seq2seq_model = seq2seq_model + collator = DataCollatorForSeq2Seq( + tokenizer=seq2seq_model.tokenizer, model=seq2seq_model.model + ) + self.collator = collator + self.args = args + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + + super().__init__( + model=seq2seq_model.model, + data_collator=collator, + args=args, + train_dataset=train_dataset, + 
eval_dataset=eval_dataset, + tokenizer=seq2seq_model.tokenizer, + compute_metrics=compute_metrics, + ) + + # def evaluate(self, eval_dataset, ignore_keys, metric_key_prefix): + # print("TEST") + # super().evaluate( + # eval_dataset=eval_dataset, + # ignore_keys=ignore_keys, + # metric_key_prefix=metric_key_prefix, + # ) + + # def training_step( + # self, batch: Dict[str, torch.Tensor], batch_idx: int + # ) -> Dict[str, torch.Tensor]: + # return self.seq2seq_model.training_step(batch=batch, batch_idx=batch_idx) + + +# trainer = CustomTrainer( +# seq2seq_model=seq2seq_model, +# args=training_args, +# train_dataset=train_dataset, +# eval_dataset=eval_dataset, +# ) + + +def preprocess_logits_for_metrics(logits, labels): + """ + Original Trainer may have a memory leak. + This is a workaround to avoid storing too many tensors that are not needed. + """ + pred_ids = torch.argmax(logits[0], dim=-1) + return pred_ids, labels + + +trainer = Seq2SeqTrainer( + model=seq2seq_model.model, + args=training_args, + data_collator=collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + # compute_metrics=compute_metrics, + callbacks=[ValidationCallback], + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + tokenizer=seq2seq_model.tokenizer, +) + +# res = trainer.predict(eval_dataset[0:16]) +# print(res) +# print(res.predictions) +# print(res.label_ids) +# decode_test = seq2seq_model.tokenizer.decode(res.label_ids[0]) +# print(decode_test) + +trainer.evaluate() +# trainer.train() diff --git a/finetuning/hf_trainer.py b/finetuning/hf_trainer.py index 0cc952b9..18e3cc56 100644 --- a/finetuning/hf_trainer.py +++ b/finetuning/hf_trainer.py @@ -1,64 +1,199 @@ -from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorWithPadding, DataCollatorForSeq2Seq +from torch.utils.data import Dataset +from transformers import ( + Trainer, + TrainingArguments, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + 
DataCollatorWithPadding, + DataCollatorForSeq2Seq, + TrainerCallback, + TrainerState, + TrainerControl, +) +from transformers.trainer_utils import EvalPrediction, IntervalStrategy +from lightning_modules.models.seq2seq_model import Seq2SeqModel from lightning_modules.models.seq2seq_model_util import get_model from lightning_modules.datasets.spider_reader import SpiderDataset, Text2SqlDataModule +from eval_helpers import ( + compute_metrics, + seq2seq_model, + train_dataset, + eval_dataset, + spider_data_module, +) + +from lightning_modules.datasets.base_reader import ( + customized_collate_fn_enc_dec, + customized_collate_fn_gpt, +) +from finetuning.lightning_modules.models.seq2seq_model_util import ( + is_model_gpt_style, + right_pad_sequences, +) + + +import os +import torch +from typing import Dict + + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["WANDB_PROJECT"] = "codegen-hf-migration-tests" -MODEL_NAME = "EleutherAI/gpt-neo-125M" +from consts import MODEL_NAME, MAX_STEPS, RUN_NAME, EVAL_STEPS + +# model, tokenizer = get_model( +# MODEL_NAME, +# gradient_ckpt=True, +# additional_init_args={ +# "executor_cls": "execution.executors.SpiderExecutor", +# "categorize_func": "execution.spider_execution.spider_categorize_complexity", +# "category_list": ["JOIN", "NESTED", "COMPOUND", "SIMPLE"], +# "max_gen_len": 128, +# "sampling_temp": 0.01, +# }, +# ) -model, tokenizer = get_model(MODEL_NAME, gradient_ckpt=True) training_args = Seq2SeqTrainingArguments( - output_dir="out", - gradient_checkpointing=True, + output_dir="results/debug-tmp", # local output dir + do_train=True, + do_eval=True, + run_name=RUN_NAME, + report_to="wandb", + # hyperparams learning_rate=5e-05, weight_decay=0.01, - max_steps=25000, + max_steps=MAX_STEPS, fp16=True, - do_train=True, - # do_eval=True, - logging_strategy="no", + # find batch size automatically to avoid cuda OOM: only compatible with accelerate + auto_find_batch_size=True, + # checkpointing + save_strategy="epoch", + # 
validation (?) + evaluation_strategy=IntervalStrategy.STEPS, + eval_steps=EVAL_STEPS, + logging_steps=EVAL_STEPS, + # per_device_eval_batch_size=1, + eval_accumulation_steps=4, + # memory optimizations + gradient_checkpointing=True, + ddp_find_unused_parameters=True, + # deepspeed="deepspeed_config.json", + predict_with_generate=True, + generation_max_length=128, ) -dataset_init_args = { - "transformer_model_name": MODEL_NAME, - # "batch_size": 4, - # "val_batch_size": 4, - # "train_max_instances": 200, - # "val_max_instances": 100, - "file_path": "data/spider/train_spider_processed_v2.jsonl", -} - -# dataset = Text2SqlDataModule( -# transformer_model_name=MODEL_NAME, -# batch_size=4, -# val_batch_size=4, -# train_max_instances=200, -# val_max_instances=100, -# train_set_init_args={ -# "file_path": "data/spider/train_spider_processed_v2.jsonl", -# }, -# val_set_init_args={ -# "file_path": "data/spider/dev_processed.jsonl", -# }, -# set_common_init_args={ -# "use_skg_format": False, -# }, +collator = DataCollatorForSeq2Seq( + tokenizer=seq2seq_model.tokenizer, model=seq2seq_model.model +) + + +class ValidationCallback(TrainerCallback): + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + print("======== Evaluation complete ========") + print(state) + + # eval_dataloader = DataLoader(eval_dataset.val_data, batch_size=eval_dataset.val_batch_size, + # shuffle=False, drop_last=True, collate_fn=collate_fn) + val_outs = [] + eval_dataloader = spider_data_module.val_dataloader() + for val_idx, val_batch in enumerate(iter(eval_dataloader)): + val_batch["input_ids"] = val_batch["input_ids"].cuda() + val_batch["attention_mask"] = val_batch["attention_mask"].cuda() + # print(val_batch["metadata"]) + # print(val_batch["input_ids"].device) + # print(val_batch["attention_mask"].device) + out = seq2seq_model.validation_step(val_batch, val_idx) + # print(out) + # val_outs.append(out) + val_outs.extend(out) + 
seq2seq_model.validation_step_end(val_outs) + # return super().on_evaluate(args, state, control, **kwargs) + + +class CustomTrainer(Seq2SeqTrainer): + def __init__( + self, + seq2seq_model: Seq2SeqModel, + args: TrainingArguments, + train_dataset: Dataset, + eval_dataset: Dataset, + ): + self.seq2seq_model = seq2seq_model + collator = DataCollatorForSeq2Seq( + tokenizer=seq2seq_model.tokenizer, model=seq2seq_model.model + ) + self.collator = collator + self.args = args + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + + super().__init__( + model=seq2seq_model.model, + data_collator=collator, + args=args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=seq2seq_model.tokenizer, + compute_metrics=compute_metrics, + ) + + # def evaluate(self, eval_dataset, ignore_keys, metric_key_prefix): + # print("TEST") + # super().evaluate( + # eval_dataset=eval_dataset, + # ignore_keys=ignore_keys, + # metric_key_prefix=metric_key_prefix, + # ) + + # def training_step( + # self, batch: Dict[str, torch.Tensor], batch_idx: int + # ) -> Dict[str, torch.Tensor]: + # return self.seq2seq_model.training_step(batch=batch, batch_idx=batch_idx) + + +# trainer = CustomTrainer( +# seq2seq_model=seq2seq_model, +# args=training_args, +# train_dataset=train_dataset, +# eval_dataset=eval_dataset, # ) -dataset = SpiderDataset( - transformer_model_name=MODEL_NAME, - file_path="data/spider/train_spider_processed_v2.jsonl", -) -# print(dataset) +def preprocess_logits_for_metrics(logits, labels): + """ + Original Trainer may have a memory leak. + This is a workaround to avoid storing too many tensors that are not needed. 
+ """ + pred_ids = torch.argmax(logits[0], dim=-1) + return pred_ids, labels -collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) trainer = Seq2SeqTrainer( - model=model, - data_collator=collator, + model=seq2seq_model.model, args=training_args, - train_dataset=dataset, - tokenizer=tokenizer, + data_collator=collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + # compute_metrics=compute_metrics, + callbacks=[ValidationCallback], + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + tokenizer=seq2seq_model.tokenizer, ) +# res = trainer.predict(eval_dataset[0:16]) +# print(res) +# print(res.predictions) +# print(res.label_ids) +# decode_test = seq2seq_model.tokenizer.decode(res.label_ids[0]) +# print(decode_test) + +# trainer.evaluate() trainer.train() diff --git a/finetuning/lightning_modules/datasets/base_reader.py b/finetuning/lightning_modules/datasets/base_reader.py index ace419ce..26ffc027 100644 --- a/finetuning/lightning_modules/datasets/base_reader.py +++ b/finetuning/lightning_modules/datasets/base_reader.py @@ -260,7 +260,8 @@ def val_dataloader(self): else customized_collate_fn_enc_dec dtloader = DataLoader(self.val_data, batch_size=self.val_batch_size, - shuffle=False, drop_last=True, collate_fn=collate_fn) + shuffle=False, drop_last=True, collate_fn=collate_fn, + pin_memory=True) return dtloader def test_dataloader(self): diff --git a/finetuning/lightning_modules/models/seq2seq_model.py b/finetuning/lightning_modules/models/seq2seq_model.py index 815c2b7f..e0a6e9b3 100644 --- a/finetuning/lightning_modules/models/seq2seq_model.py +++ b/finetuning/lightning_modules/models/seq2seq_model.py @@ -248,12 +248,17 @@ def validation_step_end(self, outputs: List[Dict[str, Any]]) -> None: self.metrics_dict["program_len_diff"](program_len_diff) self.category_metrics.update(program_dict["exec_acc"], metadata) # note that this can't be forward as compute will be called - if self.print_eval_every_n_batches > 0: - # compute 
the metrics - eval_metrics_dict = {} - for k in self.metrics_dict.keys(): - eval_metrics_dict[k] = float(self.metrics_dict[k].compute()) - print("eval metrics: ", eval_metrics_dict) + # if self.print_eval_every_n_batches > 0: + # # compute the metrics + # eval_metrics_dict = {} + # for k in self.metrics_dict.keys(): + # eval_metrics_dict[k] = float(self.metrics_dict[k].compute()) + # print("eval metrics: ", eval_metrics_dict) + # compute the metrics + eval_metrics_dict = {} + for k in self.metrics_dict.keys(): + eval_metrics_dict[k] = float(self.metrics_dict[k].compute()) + print("eval metrics: ", eval_metrics_dict) # save the outputs to the model self.predictions.extend(outputs) @@ -318,12 +323,12 @@ def validation_epoch_end(self, outputs: List[Dict[str, Any]]) -> None: self.category_metrics.reset() # save the predictions - save_pred_file_path = os.path.join(self.trainer.log_dir, - f'predictions_step_{self.trainer.global_step}_rank_{self.trainer.global_rank}.jsonl') - with open(save_pred_file_path, 'w+') as f: - for prediction in self.predictions: - f.write(json.dumps(prediction)+'\n') - print(f"{len(self.predictions)} predictions saved to {save_pred_file_path}") + # save_pred_file_path = os.path.join(self.trainer.log_dir, + # f'predictions_step_{self.trainer.global_step}_rank_{self.trainer.global_rank}.jsonl') + # with open(save_pred_file_path, 'w+') as f: + # for prediction in self.predictions: + # f.write(json.dumps(prediction)+'\n') + # print(f"{len(self.predictions)} predictions saved to {save_pred_file_path}") # reset the predictions self.predictions = [] diff --git a/finetuning/lightning_modules/patches/patched_loggers.py b/finetuning/lightning_modules/patches/patched_loggers.py index 514a4609..4848254d 100755 --- a/finetuning/lightning_modules/patches/patched_loggers.py +++ b/finetuning/lightning_modules/patches/patched_loggers.py @@ -45,6 +45,8 @@ def __init__(self, entity: str, project: str, name: str, log_model: bool, save_c if "tmp" in processed_name 
and not offline: print(f"WandbLogger: {processed_name} is a tmp exp so running in offline mode") kwargs['offline'] = True + # TODO: manually disable online logs for now + kwargs['offline'] = True # create the save_dir if it doesn't exist print(f"ready to create save_dir: {save_dir}", flush=True) From 57ec8546410b0bef73d776f7cd0674a797cb7f34 Mon Sep 17 00:00:00 2001 From: troyfeng116 Date: Sat, 11 Feb 2023 14:01:13 -0500 Subject: [PATCH 3/4] HF inference --- deepspeed_config.json | 5 ++- finetuning/eval_helpers.py | 1 - finetuning/hf_infer.py | 89 ++++++++++++++++++++----------------- finetuning/hf_infer_yaml.py | 15 +++++++ finetuning/hf_trainer.py | 24 +++++----- 5 files changed, 77 insertions(+), 57 deletions(-) create mode 100644 finetuning/hf_infer_yaml.py diff --git a/deepspeed_config.json b/deepspeed_config.json index 86b9f746..7722197c 100644 --- a/deepspeed_config.json +++ b/deepspeed_config.json @@ -1,7 +1,7 @@ { "zero_allow_untested_optimizer": true, "zero_optimization": { - "stage": 2, + "stage": 3, "offload_optimizer": { "device": "cpu", "pin_memory": true @@ -10,5 +10,6 @@ "overlap_comm": true, "allgather_bucket_size": 1e10, "reduce_bucket_size": 1e10 - } + }, + "train_batch_size": "auto" } \ No newline at end of file diff --git a/finetuning/eval_helpers.py b/finetuning/eval_helpers.py index 8b0bac69..e5d6fc70 100644 --- a/finetuning/eval_helpers.py +++ b/finetuning/eval_helpers.py @@ -26,7 +26,6 @@ spider_data_module.setup(stage="fit") - train_dataset = spider_data_module.train_data eval_dataset = spider_data_module.val_data diff --git a/finetuning/hf_infer.py b/finetuning/hf_infer.py index 153cf692..deb5c9f9 100644 --- a/finetuning/hf_infer.py +++ b/finetuning/hf_infer.py @@ -1,3 +1,7 @@ +import os + +os.environ["CUDA_VISIBLE_DEVICES"] = "1" + from torch.utils.data import Dataset from transformers import ( Trainer, @@ -35,10 +39,11 @@ from consts import MODEL_NAME, RUN_NAME -import os import torch from typing import Dict +# LightningModule extends 
torch.nn.Module +# -> use inheritance: base Module class to support Lightning for training, and also HF Trainer for inference os.environ["WANDB_PROJECT"] = "codegen-hf-migration-tests" @@ -98,8 +103,8 @@ eval_accumulation_steps=4, # # memory optimizations # gradient_checkpointing=True, - ddp_find_unused_parameters=True, - # deepspeed="deepspeed_config.json", + # ddp_find_unused_parameters=True, + deepspeed="deepspeed_config.json", predict_with_generate=True, generation_max_length=256, ) @@ -136,45 +141,45 @@ def on_evaluate( # return super().on_evaluate(args, state, control, **kwargs) -class CustomTrainer(Seq2SeqTrainer): - def __init__( - self, - seq2seq_model: Seq2SeqModel, - args: TrainingArguments, - train_dataset: Dataset, - eval_dataset: Dataset, - ): - self.seq2seq_model = seq2seq_model - collator = DataCollatorForSeq2Seq( - tokenizer=seq2seq_model.tokenizer, model=seq2seq_model.model - ) - self.collator = collator - self.args = args - self.train_dataset = train_dataset - self.eval_dataset = eval_dataset - - super().__init__( - model=seq2seq_model.model, - data_collator=collator, - args=args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=seq2seq_model.tokenizer, - compute_metrics=compute_metrics, - ) - - # def evaluate(self, eval_dataset, ignore_keys, metric_key_prefix): - # print("TEST") - # super().evaluate( - # eval_dataset=eval_dataset, - # ignore_keys=ignore_keys, - # metric_key_prefix=metric_key_prefix, - # ) - - # def training_step( - # self, batch: Dict[str, torch.Tensor], batch_idx: int - # ) -> Dict[str, torch.Tensor]: - # return self.seq2seq_model.training_step(batch=batch, batch_idx=batch_idx) +# class CustomTrainer(Seq2SeqTrainer): +# def __init__( +# self, +# seq2seq_model: Seq2SeqModel, +# args: TrainingArguments, +# train_dataset: Dataset, +# eval_dataset: Dataset, +# ): +# self.seq2seq_model = seq2seq_model +# collator = DataCollatorForSeq2Seq( +# tokenizer=seq2seq_model.tokenizer, model=seq2seq_model.model +# ) +# 
self.collator = collator +# self.args = args +# self.train_dataset = train_dataset +# self.eval_dataset = eval_dataset + +# super().__init__( +# model=seq2seq_model.model, +# data_collator=collator, +# args=args, +# train_dataset=train_dataset, +# eval_dataset=eval_dataset, +# tokenizer=seq2seq_model.tokenizer, +# compute_metrics=compute_metrics, +# ) + +# def evaluate(self, eval_dataset, ignore_keys, metric_key_prefix): +# print("TEST") +# super().evaluate( +# eval_dataset=eval_dataset, +# ignore_keys=ignore_keys, +# metric_key_prefix=metric_key_prefix, +# ) + +# def training_step( +# self, batch: Dict[str, torch.Tensor], batch_idx: int +# ) -> Dict[str, torch.Tensor]: +# return self.seq2seq_model.training_step(batch=batch, batch_idx=batch_idx) # trainer = CustomTrainer( diff --git a/finetuning/hf_infer_yaml.py b/finetuning/hf_infer_yaml.py new file mode 100644 index 00000000..50dd8994 --- /dev/null +++ b/finetuning/hf_infer_yaml.py @@ -0,0 +1,15 @@ +import yaml + + +FILE_PATH = "finetuning/training_configs/spider_t5_finetuning.yaml" + + +def main() -> int: + with open(FILE_PATH, "r") as f: + d = yaml.safe_load(stream=f) + print(d) + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/finetuning/hf_trainer.py b/finetuning/hf_trainer.py index 18e3cc56..fb98d828 100644 --- a/finetuning/hf_trainer.py +++ b/finetuning/hf_trainer.py @@ -145,18 +145,18 @@ def __init__( compute_metrics=compute_metrics, ) - # def evaluate(self, eval_dataset, ignore_keys, metric_key_prefix): - # print("TEST") - # super().evaluate( - # eval_dataset=eval_dataset, - # ignore_keys=ignore_keys, - # metric_key_prefix=metric_key_prefix, - # ) - - # def training_step( - # self, batch: Dict[str, torch.Tensor], batch_idx: int - # ) -> Dict[str, torch.Tensor]: - # return self.seq2seq_model.training_step(batch=batch, batch_idx=batch_idx) + def evaluate(self, eval_dataset, ignore_keys, metric_key_prefix): + print("TEST") + super().evaluate( + eval_dataset=eval_dataset, + 
ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + + def training_step( + self, batch: Dict[str, torch.Tensor], batch_idx: int + ) -> Dict[str, torch.Tensor]: + return self.seq2seq_model.training_step(batch=batch, batch_idx=batch_idx) # trainer = CustomTrainer( From 13317ca1397351e65bcde7f57133b72ec09a81f7 Mon Sep 17 00:00:00 2001 From: troyfeng116 Date: Sat, 11 Feb 2023 14:53:40 -0500 Subject: [PATCH 4/4] Add documentation --- finetuning/eval_helpers.py | 1 + finetuning/hf_infer.py | 145 +++++++++++++++++++++---------------- 2 files changed, 82 insertions(+), 64 deletions(-) diff --git a/finetuning/eval_helpers.py b/finetuning/eval_helpers.py index e5d6fc70..2aef1a1f 100644 --- a/finetuning/eval_helpers.py +++ b/finetuning/eval_helpers.py @@ -125,6 +125,7 @@ def validation_step_end( # predictions.extend(outputs) +# attempt to use compute_metrics to inject custom validation def compute_metrics(eval_pred: EvalPrediction) -> dict: print(len(eval_pred.predictions)) print(eval_pred.predictions[0].shape) diff --git a/finetuning/hf_infer.py b/finetuning/hf_infer.py index deb5c9f9..929c1b29 100644 --- a/finetuning/hf_infer.py +++ b/finetuning/hf_infer.py @@ -52,18 +52,7 @@ MAX_STEPS = 25000 EVAL_STEPS = 25 -lone_model, tokenizer = get_model( - MODEL_NAME, - gradient_ckpt=True, - additional_init_args={ - "executor_cls": "execution.executors.SpiderExecutor", - "categorize_func": "execution.spider_execution.spider_categorize_complexity", - "category_list": ["JOIN", "NESTED", "COMPOUND", "SIMPLE"], - "max_gen_len": 128, - "sampling_temp": 0.01, - }, -) - +# reuse PL DataModule code: only need access to train+val datasets and dataloaders spider_data_module = Text2SqlDataModule( transformer_model_name=MODEL_NAME, batch_size=4, @@ -79,6 +68,7 @@ }, ) +# @see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments training_args = Seq2SeqTrainingArguments( output_dir="results/debug-tmp", # local output dir # for 
inference! @@ -109,9 +99,14 @@ generation_max_length=256, ) -collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=lone_model) +# @see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer.data_collator +collator = DataCollatorForSeq2Seq( + tokenizer=seq2seq_model.tokenizer, model=seq2seq_model.model +) +# custom callback to inject behavior into HF trainer loop +# @see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer.callbacks class ValidationCallback(TrainerCallback): def on_evaluate( self, @@ -141,55 +136,8 @@ def on_evaluate( # return super().on_evaluate(args, state, control, **kwargs) -# class CustomTrainer(Seq2SeqTrainer): -# def __init__( -# self, -# seq2seq_model: Seq2SeqModel, -# args: TrainingArguments, -# train_dataset: Dataset, -# eval_dataset: Dataset, -# ): -# self.seq2seq_model = seq2seq_model -# collator = DataCollatorForSeq2Seq( -# tokenizer=seq2seq_model.tokenizer, model=seq2seq_model.model -# ) -# self.collator = collator -# self.args = args -# self.train_dataset = train_dataset -# self.eval_dataset = eval_dataset - -# super().__init__( -# model=seq2seq_model.model, -# data_collator=collator, -# args=args, -# train_dataset=train_dataset, -# eval_dataset=eval_dataset, -# tokenizer=seq2seq_model.tokenizer, -# compute_metrics=compute_metrics, -# ) - -# def evaluate(self, eval_dataset, ignore_keys, metric_key_prefix): -# print("TEST") -# super().evaluate( -# eval_dataset=eval_dataset, -# ignore_keys=ignore_keys, -# metric_key_prefix=metric_key_prefix, -# ) - -# def training_step( -# self, batch: Dict[str, torch.Tensor], batch_idx: int -# ) -> Dict[str, torch.Tensor]: -# return self.seq2seq_model.training_step(batch=batch, batch_idx=batch_idx) - - -# trainer = CustomTrainer( -# seq2seq_model=seq2seq_model, -# args=training_args, -# train_dataset=train_dataset, -# eval_dataset=eval_dataset, -# ) - - +# when using compute_metrics, fix CUDA OOM bug +# 
https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941 def preprocess_logits_for_metrics(logits, labels): """ Original Trainer may have a memory leak. @@ -199,12 +147,16 @@ def preprocess_logits_for_metrics(logits, labels): return pred_ids, labels +# HF Trainer "entry point" +# @see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer trainer = Seq2SeqTrainer( model=seq2seq_model.model, args=training_args, data_collator=collator, train_dataset=train_dataset, eval_dataset=eval_dataset, + # compute_metrics is called by Trainer.evaluate: extends metrics dict + # @see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer.compute_metrics # compute_metrics=compute_metrics, callbacks=[ValidationCallback], preprocess_logits_for_metrics=preprocess_logits_for_metrics, @@ -215,8 +167,73 @@ def preprocess_logits_for_metrics(logits, labels): # print(res) # print(res.predictions) # print(res.label_ids) -# decode_test = seq2seq_model.tokenizer.decode(res.label_ids[0]) -# print(decode_test) trainer.evaluate() # trainer.train() + + +# ======== Unused but may be useful ======== + +# UNUSED: attempt to remove Seq2SeqModel dependency (underlying PT LightningModule dependency) +# lone_model, tokenizer = get_model( +# MODEL_NAME, +# gradient_ckpt=True, +# additional_init_args={ +# "executor_cls": "execution.executors.SpiderExecutor", +# "categorize_func": "execution.spider_execution.spider_categorize_complexity", +# "category_list": ["JOIN", "NESTED", "COMPOUND", "SIMPLE"], +# "max_gen_len": 128, +# "sampling_temp": 0.01, +# }, +# ) + + +# UNUSED: attempt to override Seq2SeqTrainer class methods: evaluate, training_step, etc. 
+# see top of HF Trainer doc page: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer
+class CustomTrainer(Seq2SeqTrainer):
+    """Experimental ``Seq2SeqTrainer`` subclass wrapping a project ``Seq2SeqModel``.
+
+    The wrapped object's ``model`` and ``tokenizer`` are handed to the HF
+    trainer, and ``evaluate`` / ``training_step`` are overridden. The class is
+    currently unused: its only instantiation (below) is commented out.
+    """
+
+    def __init__(
+        self,
+        seq2seq_model: Seq2SeqModel,
+        args: TrainingArguments,
+        train_dataset: Dataset,
+        eval_dataset: Dataset,
+    ):
+        """Build the data collator and initialise the underlying HF trainer.
+
+        Args:
+            seq2seq_model: wrapper exposing ``.model`` and ``.tokenizer``.
+            args: training arguments forwarded to ``Seq2SeqTrainer``.
+            train_dataset: dataset forwarded as ``train_dataset``.
+            eval_dataset: dataset forwarded as ``eval_dataset``.
+        """
+        self.seq2seq_model = seq2seq_model
+        collator = DataCollatorForSeq2Seq(
+            tokenizer=seq2seq_model.tokenizer, model=seq2seq_model.model
+        )
+        self.collator = collator
+        # Kept from the original; the same values are also passed to
+        # super().__init__() just below.
+        self.args = args
+        self.train_dataset = train_dataset
+        self.eval_dataset = eval_dataset
+
+        super().__init__(
+            model=seq2seq_model.model,
+            data_collator=collator,
+            args=args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            tokenizer=seq2seq_model.tokenizer,
+            compute_metrics=compute_metrics,
+        )
+
+    def evaluate(
+        self,
+        eval_dataset=None,
+        ignore_keys=None,
+        metric_key_prefix="eval",
+        **gen_kwargs,
+    ):
+        """Run the parent evaluation loop and return its metrics.
+
+        The defaults mirror the parent signature so that ``evaluate()`` and
+        keyword-only calls work, and the parent's result is returned instead
+        of being dropped.
+
+        Args:
+            eval_dataset: dataset to evaluate; ``None`` defers to the parent.
+            ignore_keys: forwarded unchanged to the parent.
+            metric_key_prefix: forwarded unchanged to the parent.
+            **gen_kwargs: extra generation keyword arguments, forwarded as-is.
+
+        Returns:
+            Whatever ``Seq2SeqTrainer.evaluate`` returns (the metrics dict).
+        """
+        print("TEST")  # debug marker showing the override is being hit
+        return super().evaluate(
+            eval_dataset=eval_dataset,
+            ignore_keys=ignore_keys,
+            metric_key_prefix=metric_key_prefix,
+            **gen_kwargs,
+        )
+
+    def training_step(
+        self, batch: Dict[str, torch.Tensor], batch_idx: int
+    ) -> Dict[str, torch.Tensor]:
+        """Delegate one training step to the wrapped ``Seq2SeqModel``.
+
+        NOTE(review): the HF ``Trainer`` loop appears to invoke this hook as
+        ``training_step(model, inputs)`` and to expect a loss tensor back,
+        whereas this override uses a Lightning-style ``(batch, batch_idx)``
+        signature. Confirm and adapt before enabling this class for training.
+        """
+        return self.seq2seq_model.training_step(batch=batch, batch_idx=batch_idx)
+
+
+# trainer = CustomTrainer(
+#     seq2seq_model=seq2seq_model,
+#     args=training_args,
+#     train_dataset=train_dataset,
+#     eval_dataset=eval_dataset,
+# )