From d7dcc884448ff017f2e80940a05d864402410708 Mon Sep 17 00:00:00 2001 From: bclavie Date: Fri, 22 Dec 2023 13:16:39 +0000 Subject: [PATCH 1/5] chore: accelerator-compatible training --- docs/api/overview.md | 1 + docs/fine_tune/.pages | 1 + docs/fine_tune/multi_gpu.md | 64 ++++++++++++++++++++++ neural_cherche/models/base.py | 14 ++--- neural_cherche/models/colbert.py | 29 +++++++++- neural_cherche/models/sparse_embed.py | 33 ++++++++++- neural_cherche/models/splade.py | 46 ++++++++++++---- neural_cherche/train/train_colbert.py | 6 +- neural_cherche/train/train_sparse_embed.py | 6 +- neural_cherche/train/train_splade.py | 6 +- 10 files changed, 182 insertions(+), 24 deletions(-) create mode 100644 docs/fine_tune/multi_gpu.md diff --git a/docs/api/overview.md b/docs/api/overview.md index 7d12872..141b101 100644 --- a/docs/api/overview.md +++ b/docs/api/overview.md @@ -27,6 +27,7 @@ - [train_colbert](../train/train-colbert) - [train_sparse_embed](../train/train-sparse-embed) - [train_splade](../train/train-splade) +- [Multi-GPU training via Accelerator](../train/multi-gpu) ## utils diff --git a/docs/fine_tune/.pages b/docs/fine_tune/.pages index 06efd47..aecd94a 100644 --- a/docs/fine_tune/.pages +++ b/docs/fine_tune/.pages @@ -3,4 +3,5 @@ nav: - colbert.md - splade.md - sparse_embed.md + - multi_gpu.md \ No newline at end of file diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md new file mode 100644 index 0000000..9f5b666 --- /dev/null +++ b/docs/fine_tune/multi_gpu.md @@ -0,0 +1,64 @@ +# Multi-GPU (Accelerator) + + +Training any of the models on multiple GPU via the accelerator library is simple. 
You just need to modify the training loop in a few key ways: + +```python +from neural_cherche import models, utils, train +import torch +from torch.utils.data import DataLoader +from accelerate import Accelerator + + +# Wrap in main function to avoid multiprocessing issues +if __name__ == "__main__"": + accelerator = Accelerator() + device = accelerator.device + batch_size = 32 + epochs = 2 + save_on_epoch = True + + model = models.SparseEmbed( + model_name_or_path="distilbert-base-uncased", + device=device + ).to(device) + + # Optimizer + optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5) + + # prepare your dataset -- this example uses a huggingface `datasets` object + ... + + # Convert the data into a PyTorch dataloader for ease of preparation + data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True) + + # Wrap the model, optimizer, and data loader in the accelerator + model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader) + + for epoch in range(epochs): + for batch_data in enumerate(data_loader): + # Assuming batch_data is a tuple in the form (anchors, positives, negatives) + anchors, positives, negatives = batch_data + + loss = train_sparse_embed( + model=model, + optimizer=optimizer, + anchor=anchors, + positive=positives, + negative=negatives, + threshold_flops=30, + accelerator=accelerator, + ) + + if accelerator.is_main_process and save_on_epoch: + accelerator.save_model(model, "checkpoint/epoch" + str(epoch)) + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + "SPARSE_EMBEDFULL/epoch" + str(epoch), + + # Save at the end of the training loop + # We check to make sure that only the main process will export the model + if accelerator.is_main_process: + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained("checkpoint", accelerator=True) +``` \ No newline at end of file diff --git a/neural_cherche/models/base.py b/neural_cherche/models/base.py 
index a2b0b92..0165abd 100644 --- a/neural_cherche/models/base.py +++ b/neural_cherche/models/base.py @@ -77,16 +77,16 @@ def _encode(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tenso texts List of sentences to encode. """ - encoded_input = self.tokenizer.batch_encode_plus( - texts, return_tensors="pt", **kwargs + encoded_input = self.tokenizer(texts, return_tensors="pt", **kwargs).to( + self.device ) - if self.device != "cpu": - encoded_input = { - key: value.to(self.device) for key, value in encoded_input.items() - } + # Must hardcode position_ids to avoid a bug with accelerate multi-GPU + seq_len = encoded_input["input_ids"].size(1) + position_ids = torch.arange(0, seq_len).expand((len(texts), -1)).to(self.device) - output = self.model(**encoded_input) + # Pass both the inputs and position_ids to the model + output = self.model(**encoded_input, position_ids=position_ids) return output.logits, output.hidden_states[-1] @abstractmethod diff --git a/neural_cherche/models/colbert.py b/neural_cherche/models/colbert.py index 2dd21e4..302421e 100644 --- a/neural_cherche/models/colbert.py +++ b/neural_cherche/models/colbert.py @@ -268,7 +268,7 @@ def scores( return torch.cat(list_scores, dim=0) - def save_pretrained(self, path: str) -> "ColBERT": + def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT": """Save model the model. Parameters @@ -279,7 +279,32 @@ def save_pretrained(self, path: str) -> "ColBERT": self.model.save_pretrained(path) torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt")) self.tokenizer.pad_token = self.original_pad_token - self.tokenizer.save_pretrained(path) + if accelerator: + # Workaround an issue with accelerator. 
Tokenizer has a key "device" + # which is non serialisable, but not removeable with a basic delattr + + # dump config + tokenizer_config = { + k: v for k, v in self.tokenizer.__dict__.items() if k != "device" + } + tokenizer_config_file = os.path.join(path, "tokenizer_config.json") + with open(tokenizer_config_file, "w", encoding="utf-8") as file: + json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) + + # dump vocab + self.tokenizer.save_vocabulary(path) + + # save special tokens + special_tokens_file = os.path.join(path, "special_tokens_map.json") + with open(special_tokens_file, "w", encoding="utf-8") as file: + json.dump( + self.tokenizer.special_tokens_map, + file, + ensure_ascii=False, + indent=4, + ) + else: + self.tokenizer.save_pretrained(path) with open(os.path.join(path, "metadata.json"), "w") as f: json.dump( { diff --git a/neural_cherche/models/sparse_embed.py b/neural_cherche/models/sparse_embed.py index f4eec0e..c52358d 100644 --- a/neural_cherche/models/sparse_embed.py +++ b/neural_cherche/models/sparse_embed.py @@ -212,11 +212,40 @@ def _get_attention( return self.softmax(attention) - def save_pretrained(self, path: str): + def save_pretrained( + self, + path: str, + accelerator: bool = False, + ): """Save model the model.""" self.model.save_pretrained(path) self.tokenizer.pad_token = self.original_pad_token - self.tokenizer.save_pretrained(path) + if accelerator: + # Workaround an issue with accelerator. 
Tokenizer has a key "device" + # which is non serialisable, but not removeable with a basic delattr + + # dump config + tokenizer_config = { + k: v for k, v in self.tokenizer.__dict__.items() if k != "device" + } + tokenizer_config_file = os.path.join(path, "tokenizer_config.json") + with open(tokenizer_config_file, "w", encoding="utf-8") as file: + json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) + + # dump vocab + self.tokenizer.save_vocabulary(path) + + # save special tokens + special_tokens_file = os.path.join(path, "special_tokens_map.json") + with open(special_tokens_file, "w", encoding="utf-8") as file: + json.dump( + self.tokenizer.special_tokens_map, + file, + ensure_ascii=False, + indent=4, + ) + else: + self.tokenizer.save_pretrained(path) torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt")) with open(os.path.join(path, "metadata.json"), "w") as file: json.dump( diff --git a/neural_cherche/models/splade.py b/neural_cherche/models/splade.py index 34bc5a0..ad4b488 100644 --- a/neural_cherche/models/splade.py +++ b/neural_cherche/models/splade.py @@ -206,7 +206,11 @@ def forward( return {"sparse_activations": activations["sparse_activations"]} - def save_pretrained(self, path: str): + def save_pretrained( + self, + path: str, + accelerator: bool = False, + ): """Save model the model. Parameters @@ -217,7 +221,32 @@ def save_pretrained(self, path: str): """ self.model.save_pretrained(path) self.tokenizer.pad_token = self.original_pad_token - self.tokenizer.save_pretrained(path) + if accelerator: + # Workaround an issue with accelerator. 
Tokenizer has a key "device" + # which is non serialisable, but not removeable with a basic delattr + + # dump config + tokenizer_config = { + k: v for k, v in self.tokenizer.__dict__.items() if k != "device" + } + tokenizer_config_file = os.path.join(path, "tokenizer_config.json") + with open(tokenizer_config_file, "w", encoding="utf-8") as file: + json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) + + # dump vocab + self.tokenizer.save_vocabulary(path) + + # save special tokens + special_tokens_file = os.path.join(path, "special_tokens_map.json") + with open(special_tokens_file, "w", encoding="utf-8") as file: + json.dump( + self.tokenizer.special_tokens_map, + file, + ensure_ascii=False, + indent=4, + ) + else: + self.tokenizer.save_pretrained(path) with open(os.path.join(path, "metadata.json"), "w") as file: json.dump( @@ -306,15 +335,12 @@ def _update_activations( ) -> torch.Tensor: """Returns activated tokens.""" activations = torch.topk(input=sparse_activations, k=k_tokens, dim=1).indices - - # Set value of max sparse_activations which are not in top k to 0. 
- sparse_activations = sparse_activations * torch.zeros( - (sparse_activations.shape[0], sparse_activations.shape[1]), - dtype=int, - device=self.device, - ).scatter_(dim=1, index=activations.long(), value=1) + zero_tensor = torch.zeros_like(sparse_activations, dtype=int) + updated_sparse_activations = sparse_activations * zero_tensor.scatter( + dim=1, index=activations.long(), value=1 + ) return { "activations": activations, - "sparse_activations": sparse_activations, + "sparse_activations": updated_sparse_activations, } diff --git a/neural_cherche/train/train_colbert.py b/neural_cherche/train/train_colbert.py index 6858210..dc09294 100644 --- a/neural_cherche/train/train_colbert.py +++ b/neural_cherche/train/train_colbert.py @@ -10,6 +10,7 @@ def train_colbert( positive: list[str], negative: list[str], in_batch_negatives: bool = False, + accelerator=None, **kwargs, ): """Compute the ranking loss and the flops loss for a single step. @@ -98,7 +99,10 @@ def train_colbert( loss = losses.Ranking()(**scores) - loss.backward() + if accelerator: + accelerator.backward(loss) + else: + loss.backward() optimizer.step() optimizer.zero_grad() diff --git a/neural_cherche/train/train_sparse_embed.py b/neural_cherche/train/train_sparse_embed.py index 82c3138..2860058 100644 --- a/neural_cherche/train/train_sparse_embed.py +++ b/neural_cherche/train/train_sparse_embed.py @@ -16,6 +16,7 @@ def train_sparse_embed( dense_loss_weight: float = 1.0, in_batch_negatives: bool = False, threshold_flops: float = 30, + accelerator=None, **kwargs, ): """Compute the ranking loss and the flops loss for a single step. 
@@ -147,7 +148,10 @@ def train_sparse_embed( + flops_loss_weight * flops_loss ) - loss.backward() + if accelerator: + accelerator.backward(loss) + else: + loss.backward() optimizer.step() optimizer.zero_grad() diff --git a/neural_cherche/train/train_splade.py b/neural_cherche/train/train_splade.py index ff48f32..d633400 100644 --- a/neural_cherche/train/train_splade.py +++ b/neural_cherche/train/train_splade.py @@ -13,6 +13,7 @@ def train_splade( sparse_loss_weight: float = 1.0, in_batch_negatives: bool = False, threshold_flops: float = 30, + accelerator=None, **kwargs, ): """Compute the ranking loss and the flops loss for a single step. @@ -117,7 +118,10 @@ def train_splade( loss = sparse_loss_weight * sparse_loss + flops_loss_weight * flops_loss - loss.backward() + if accelerator: + accelerator.backward(loss) + else: + loss.backward() optimizer.step() optimizer.zero_grad() From eb1376b10bea9413091e65860945c64f970a2e3a Mon Sep 17 00:00:00 2001 From: bclavie Date: Fri, 22 Dec 2023 13:20:53 +0000 Subject: [PATCH 2/5] fix: typo --- docs/fine_tune/multi_gpu.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md index 9f5b666..2b7d6e0 100644 --- a/docs/fine_tune/multi_gpu.md +++ b/docs/fine_tune/multi_gpu.md @@ -54,7 +54,8 @@ if __name__ == "__main__"": accelerator.save_model(model, "checkpoint/epoch" + str(epoch)) unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( - "SPARSE_EMBEDFULL/epoch" + str(epoch), + "checkpoint/epoch" + str(epoch), + ) # Save at the end of the training loop # We check to make sure that only the main process will export the model From b8465cac692a2e6fc8902ea00375c98828e47166 Mon Sep 17 00:00:00 2001 From: bclavie Date: Fri, 22 Dec 2023 13:21:13 +0000 Subject: [PATCH 3/5] doc: redundant save in example --- docs/fine_tune/multi_gpu.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md 
index 2b7d6e0..78cead7 100644 --- a/docs/fine_tune/multi_gpu.md +++ b/docs/fine_tune/multi_gpu.md @@ -51,7 +51,6 @@ if __name__ == "__main__"": ) if accelerator.is_main_process and save_on_epoch: - accelerator.save_model(model, "checkpoint/epoch" + str(epoch)) unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( "checkpoint/epoch" + str(epoch), From 5e0c80455e61e76946fb6a4922afc382d6baa018 Mon Sep 17 00:00:00 2001 From: bclavie Date: Fri, 22 Dec 2023 15:26:36 +0000 Subject: [PATCH 4/5] fix: loop typo --- docs/fine_tune/multi_gpu.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md index 78cead7..5a28cbb 100644 --- a/docs/fine_tune/multi_gpu.md +++ b/docs/fine_tune/multi_gpu.md @@ -36,7 +36,7 @@ if __name__ == "__main__"": model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader) for epoch in range(epochs): - for batch_data in enumerate(data_loader): + for batch_id, batch_data in enumerate(data_loader): # Assuming batch_data is a tuple in the form (anchors, positives, negatives) anchors, positives, negatives = batch_data From 742635ba69ae64ed60d3c740276cad463ea3ecaf Mon Sep 17 00:00:00 2001 From: bclavie Date: Sat, 23 Dec 2023 18:42:19 +0000 Subject: [PATCH 5/5] chore: apply changes --- docs/fine_tune/multi_gpu.md | 62 +++++++++++++----------- neural_cherche/models/base.py | 70 ++++++++++++++++++++++++--- neural_cherche/models/colbert.py | 43 +++++----------- neural_cherche/models/sparse_embed.py | 30 ++---------- neural_cherche/models/splade.py | 30 ++---------- 5 files changed, 121 insertions(+), 114 deletions(-) diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md index 5a28cbb..1032d14 100644 --- a/docs/fine_tune/multi_gpu.md +++ b/docs/fine_tune/multi_gpu.md @@ -1,46 +1,54 @@ -# Multi-GPU (Accelerator) +# Multi-GPU (Partial) - -Training any of the models on multiple GPU via the accelerator library is simple. 
You just need to modify the training loop in a few key ways: +Neural-Cherche is working towards being fully compatible with multi-GPU training using [Accelerator](https://huggingface.co/docs/accelerate/package_reference/accelerator). At the moment, there is partial compatibility, and we can train every model of neural-cherche using GPUs in most circumstances, although it's not yet fully supported. Here is a tutorial. ```python -from neural_cherche import models, utils, train import torch -from torch.utils.data import DataLoader from accelerate import Accelerator +from datasets import Dataset +from torch.utils.data import DataLoader +from neural_cherche import models, train -# Wrap in main function to avoid multiprocessing issues -if __name__ == "__main__"": +if __name__ == "__main__": + # We will need to wrap your training loop in a function to avoid multiprocessing issues. accelerator = Accelerator() - device = accelerator.device - batch_size = 32 - epochs = 2 - save_on_epoch = True + save_each_epoch = True model = models.SparseEmbed( model_name_or_path="distilbert-base-uncased", - device=device - ).to(device) + accelerate=True, + device=accelerator.device, + ).to(accelerator.device) # Optimizer optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5) - # prepare your dataset -- this example uses a huggingface `datasets` object - ... + # Dataset creation using HuggingFace Datasets library. + dataset = Dataset.from_dict( + { + "anchors": ["anchor 1", "anchor 2", "anchor 3", "anchor 4"], + "positives": ["positive 1", "positive 2", "positive 3", "positive 4"], + "negatives": ["negative 1", "negative 2", "negative 3", "negative 4"], + } + ) - # Convert the data into a PyTorch dataloader for ease of preparation + # Convert your dataset to a DataLoader. 
+ data_loader = DataLoader(dataset, batch_size=32, shuffle=True) - # Wrap the model, optimizer, and data loader in the accelerator + # Wrap model, optimizer, and dataloader in accelerator. model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader) - for epoch in range(epochs): - for batch_id, batch_data in enumerate(data_loader): - # Assuming batch_data is a tuple in the form (anchors, positives, negatives) - anchors, positives, negatives = batch_data + for epoch in range(2): + for batch in data_loader: + # Batch is a triple like (anchors, positives, negatives) + anchors, positives, negatives = ( + batch["anchors"], + batch["positives"], + batch["negatives"], + ) - loss = train_sparse_embed( + loss = train.train_sparse_embed( model=model, optimizer=optimizer, anchor=anchors, @@ -49,16 +57,16 @@ if __name__ == "__main__"": threshold_flops=30, accelerator=accelerator, ) - - if accelerator.is_main_process and save_on_epoch: + + if accelerator.is_main_process and save_each_epoch: unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( - "checkpoint/epoch" + str(epoch), + "checkpoint/epoch" + str(epoch), ) # Save at the end of the training loop # We check to make sure that only the main process will export the model if accelerator.is_main_process: unwrapped_model = accelerator.unwrap_model(model) - unwrapped_model.save_pretrained("checkpoint", accelerator=True) + unwrapped_model.save_pretrained("checkpoint") ``` \ No newline at end of file diff --git a/neural_cherche/models/base.py b/neural_cherche/models/base.py index 0165abd..413c68f 100644 --- a/neural_cherche/models/base.py +++ b/neural_cherche/models/base.py @@ -1,3 +1,4 @@ +import json import os from abc import ABC, abstractmethod @@ -15,6 +16,10 @@ class Base(ABC, torch.nn.Module): Path to the model or the model name. device Device to use for the model. CPU or CUDA. + extra_files_to_load + List of extra files to load. 
+ accelerate + Use HuggingFace Accelerate. kwargs Additional parameters to the model. """ @@ -24,6 +29,7 @@ def __init__( self, model_name_or_path: str, device: str = None, extra_files_to_load: list[str] = [], + accelerate: bool = False, **kwargs, ) -> None: """Initialize the model.""" @@ -37,6 +43,8 @@ def __init__( else: self.device = "cpu" + self.accelerate = accelerate + os.environ["TRANSFORMERS_CACHE"] = "." self.model = AutoModelForMaskedLM.from_pretrained( model_name_or_path, cache_dir="./", **kwargs @@ -69,26 +77,54 @@ def __init__( self.query_pad_token = self.tokenizer.mask_token self.original_pad_token = self.tokenizer.pad_token - def _encode(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tensor]: - """Encode sentences. + def _encode_accelerate(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tensor]: + """Encode sentences with multiple GPUs. Parameters ---------- texts List of sentences to encode. + + References + ---------- + [Accelerate issue.](https://github.com/huggingface/accelerate/issues/97) """ encoded_input = self.tokenizer(texts, return_tensors="pt", **kwargs).to( self.device ) - # Must hardcode position_ids to avoid a bug with accelerate multi-GPU - seq_len = encoded_input["input_ids"].size(1) - position_ids = torch.arange(0, seq_len).expand((len(texts), -1)).to(self.device) + position_ids = ( + torch.arange(0, encoded_input["input_ids"].size(1)) + .expand((len(texts), -1)) + .to(self.device) + ) - # Pass both the inputs and position_ids to the model output = self.model(**encoded_input, position_ids=position_ids) return output.logits, output.hidden_states[-1] + def _encode(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tensor]: + """Encode sentences. + + Parameters + ---------- + texts + List of sentences to encode. 
+ """ + if self.accelerate: + return self._encode_accelerate(texts, **kwargs) + + encoded_input = self.tokenizer.batch_encode_plus( + texts, return_tensors="pt", **kwargs + ) + + if self.device != "cpu": + encoded_input = { + key: value.to(self.device) for key, value in encoded_input.items() + } + + output = self.model(**encoded_input) + return output.logits, output.hidden_states[-1] + @abstractmethod def forward(self, *args, **kwargs): """Pytorch forward method.""" @@ -108,3 +144,25 @@ def scores(self, *args, **kwars): def save_pretrained(self, path: str): """Save model the model.""" pass + + def save_tokenizer_accelerate(self, path: str) -> None: + """Save tokenizer when using accelerate.""" + tokenizer_config = { + k: v for k, v in self.tokenizer.__dict__.items() if k != "device" + } + tokenizer_config_file = os.path.join(path, "tokenizer_config.json") + with open(tokenizer_config_file, "w", encoding="utf-8") as file: + json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) + + # dump vocab + self.tokenizer.save_vocabulary(path) + + # save special tokens + special_tokens_file = os.path.join(path, "special_tokens_map.json") + with open(special_tokens_file, "w", encoding="utf-8") as file: + json.dump( + self.tokenizer.special_tokens_map, + file, + ensure_ascii=False, + indent=4, + ) diff --git a/neural_cherche/models/colbert.py b/neural_cherche/models/colbert.py index 302421e..ffeabfe 100644 --- a/neural_cherche/models/colbert.py +++ b/neural_cherche/models/colbert.py @@ -20,6 +20,8 @@ class ColBERT(Base): Size of the embeddings in output of ColBERT model. device Device to use for the model. CPU or CUDA. + accelerate + Use HuggingFace Accelerate. kwargs Additional parameters to the SentenceTransformer model. @@ -43,7 +45,6 @@ class ColBERT(Base): ... embedding_size=128, ... max_length_query=32, ... max_length_document=350, - ... device="mps", ... ) >>> scores = encoder.scores( @@ -52,9 +53,9 @@ class ColBERT(Base): ... 
) >>> scores - tensor([20.2148, 16.7599, 18.2901], device='mps:0') + tensor([22.9325, 19.8296, 20.8019]) - >>> _ = encoder.save_pretrained("checkpoint") + >>> _ = encoder.save_pretrained("checkpoint") >>> encoder = models.ColBERT( ... model_name_or_path="checkpoint", @@ -68,7 +69,7 @@ class ColBERT(Base): ... ) >>> scores - tensor([20.2148, 16.7599, 18.2901]) + tensor([22.9325, 19.8296, 20.8019]) >>> embeddings = encoder( ... texts=queries, @@ -95,6 +96,7 @@ def __init__( device: str = None, max_length_query: int = 32, max_length_document: int = 350, + accelerate: bool = False, **kwargs, ) -> None: """Initialize the model.""" @@ -102,6 +104,7 @@ def __init__( model_name_or_path=model_name_or_path, device=device, extra_files_to_load=["linear.pt", "metadata.json"], + accelerate=accelerate, **kwargs, ) @@ -268,7 +271,7 @@ def scores( return torch.cat(list_scores, dim=0) - def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT": + def save_pretrained(self, path: str) -> "ColBERT": """Save model the model. Parameters @@ -279,32 +282,6 @@ def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT": self.model.save_pretrained(path) torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt")) self.tokenizer.pad_token = self.original_pad_token - if accelerator: - # Workaround an issue with accelerator. 
Tokenizer has a key "device" - # which is non serialisable, but not removeable with a basic delattr - - # dump config - tokenizer_config = { - k: v for k, v in self.tokenizer.__dict__.items() if k != "device" - } - tokenizer_config_file = os.path.join(path, "tokenizer_config.json") - with open(tokenizer_config_file, "w", encoding="utf-8") as file: - json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) - - # dump vocab - self.tokenizer.save_vocabulary(path) - - # save special tokens - special_tokens_file = os.path.join(path, "special_tokens_map.json") - with open(special_tokens_file, "w", encoding="utf-8") as file: - json.dump( - self.tokenizer.special_tokens_map, - file, - ensure_ascii=False, - indent=4, - ) - else: - self.tokenizer.save_pretrained(path) with open(os.path.join(path, "metadata.json"), "w") as f: json.dump( { @@ -313,4 +290,8 @@ def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT": }, f, ) + if self.accelerate: + self.save_tokenizer_accelerate(path=path) + else: + self.tokenizer.save_pretrained(path) return self diff --git a/neural_cherche/models/sparse_embed.py b/neural_cherche/models/sparse_embed.py index c52358d..90369e2 100644 --- a/neural_cherche/models/sparse_embed.py +++ b/neural_cherche/models/sparse_embed.py @@ -97,12 +97,14 @@ def __init__( max_length_query: int = 128, max_length_document: int = 256, device: str = None, + accelerate: bool = False, **kwargs, ) -> None: super(SparseEmbed, self).__init__( model_name_or_path=model_name_or_path, device=device, extra_files_to_load=["linear.pt", "metadata.json"], + accelerate=accelerate, **kwargs, ) @@ -215,35 +217,13 @@ def _get_attention( def save_pretrained( self, path: str, - accelerator: bool = False, ): """Save model the model.""" self.model.save_pretrained(path) self.tokenizer.pad_token = self.original_pad_token - if accelerator: - # Workaround an issue with accelerator. 
Tokenizer has a key "device" - # which is non serialisable, but not removeable with a basic delattr - - # dump config - tokenizer_config = { - k: v for k, v in self.tokenizer.__dict__.items() if k != "device" - } - tokenizer_config_file = os.path.join(path, "tokenizer_config.json") - with open(tokenizer_config_file, "w", encoding="utf-8") as file: - json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) - - # dump vocab - self.tokenizer.save_vocabulary(path) - - # save special tokens - special_tokens_file = os.path.join(path, "special_tokens_map.json") - with open(special_tokens_file, "w", encoding="utf-8") as file: - json.dump( - self.tokenizer.special_tokens_map, - file, - ensure_ascii=False, - indent=4, - ) + + if self.accelerate: + self.save_tokenizer_accelerate(path) else: self.tokenizer.save_pretrained(path) torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt")) diff --git a/neural_cherche/models/splade.py b/neural_cherche/models/splade.py index ad4b488..bbbd18d 100644 --- a/neural_cherche/models/splade.py +++ b/neural_cherche/models/splade.py @@ -80,12 +80,14 @@ def __init__( max_length_query: int = 128, max_length_document: int = 256, extra_files_to_load: list[str] = ["metadata.json"], + accelerate: bool = False, **kwargs, ) -> None: super(Splade, self).__init__( model_name_or_path=model_name_or_path, device=device, extra_files_to_load=extra_files_to_load, + accelerate=accelerate, **kwargs, ) @@ -209,7 +211,6 @@ def forward( def save_pretrained( self, path: str, - accelerator: bool = False, ): """Save model the model. @@ -221,30 +222,9 @@ def save_pretrained( """ self.model.save_pretrained(path) self.tokenizer.pad_token = self.original_pad_token - if accelerator: - # Workaround an issue with accelerator. 
Tokenizer has a key "device" - # which is non serialisable, but not removeable with a basic delattr - - # dump config - tokenizer_config = { - k: v for k, v in self.tokenizer.__dict__.items() if k != "device" - } - tokenizer_config_file = os.path.join(path, "tokenizer_config.json") - with open(tokenizer_config_file, "w", encoding="utf-8") as file: - json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) - - # dump vocab - self.tokenizer.save_vocabulary(path) - - # save special tokens - special_tokens_file = os.path.join(path, "special_tokens_map.json") - with open(special_tokens_file, "w", encoding="utf-8") as file: - json.dump( - self.tokenizer.special_tokens_map, - file, - ensure_ascii=False, - indent=4, - ) + + if self.accelerate: + self.save_tokenizer_accelerate(path) else: self.tokenizer.save_pretrained(path)