From d7dcc884448ff017f2e80940a05d864402410708 Mon Sep 17 00:00:00 2001 From: bclavie Date: Fri, 22 Dec 2023 13:16:39 +0000 Subject: [PATCH 1/5] chore: accelerator-compatible training --- docs/api/overview.md | 1 + docs/fine_tune/.pages | 1 + docs/fine_tune/multi_gpu.md | 64 ++++++++++++++++++++++ neural_cherche/models/base.py | 14 ++--- neural_cherche/models/colbert.py | 29 +++++++++- neural_cherche/models/sparse_embed.py | 33 ++++++++++- neural_cherche/models/splade.py | 46 ++++++++++++---- neural_cherche/train/train_colbert.py | 6 +- neural_cherche/train/train_sparse_embed.py | 6 +- neural_cherche/train/train_splade.py | 6 +- 10 files changed, 182 insertions(+), 24 deletions(-) create mode 100644 docs/fine_tune/multi_gpu.md diff --git a/docs/api/overview.md b/docs/api/overview.md index 7d12872..141b101 100644 --- a/docs/api/overview.md +++ b/docs/api/overview.md @@ -27,6 +27,7 @@ - [train_colbert](../train/train-colbert) - [train_sparse_embed](../train/train-sparse-embed) - [train_splade](../train/train-splade) +- [Multi-GPU training via Accelerator](../train/multi-gpu) ## utils diff --git a/docs/fine_tune/.pages b/docs/fine_tune/.pages index 06efd47..aecd94a 100644 --- a/docs/fine_tune/.pages +++ b/docs/fine_tune/.pages @@ -3,4 +3,5 @@ nav: - colbert.md - splade.md - sparse_embed.md + - multi_gpu.md \ No newline at end of file diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md new file mode 100644 index 0000000..9f5b666 --- /dev/null +++ b/docs/fine_tune/multi_gpu.md @@ -0,0 +1,64 @@ +# Multi-GPU (Accelerator) + + +Training any of the models on multiple GPU via the accelerator library is simple. 
You just need to modify the training loop in a few key ways: + +```python +from neural_cherche import models, utils, train +import torch +from torch.utils.data import DataLoader +from accelerate import Accelerator + + +# Wrap in main function to avoid multiprocessing issues +if __name__ == "__main__"": + accelerator = Accelerator() + device = accelerator.device + batch_size = 32 + epochs = 2 + save_on_epoch = True + + model = models.SparseEmbed( + model_name_or_path="distilbert-base-uncased", + device=device + ).to(device) + + # Optimizer + optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5) + + # prepare your dataset -- this example uses a huggingface `datasets` object + ... + + # Convert the data into a PyTorch dataloader for ease of preparation + data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True) + + # Wrap the model, optimizer, and data loader in the accelerator + model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader) + + for epoch in range(epochs): + for batch_data in enumerate(data_loader): + # Assuming batch_data is a tuple in the form (anchors, positives, negatives) + anchors, positives, negatives = batch_data + + loss = train_sparse_embed( + model=model, + optimizer=optimizer, + anchor=anchors, + positive=positives, + negative=negatives, + threshold_flops=30, + accelerator=accelerator, + ) + + if accelerator.is_main_process and save_on_epoch: + accelerator.save_model(model, "checkpoint/epoch" + str(epoch)) + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + "SPARSE_EMBEDFULL/epoch" + str(epoch), + + # Save at the end of the training loop + # We check to make sure that only the main process will export the model + if accelerator.is_main_process: + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained("checkpoint", accelerator=True) +``` \ No newline at end of file diff --git a/neural_cherche/models/base.py b/neural_cherche/models/base.py 
index a2b0b92..0165abd 100644 --- a/neural_cherche/models/base.py +++ b/neural_cherche/models/base.py @@ -77,16 +77,16 @@ def _encode(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tenso texts List of sentences to encode. """ - encoded_input = self.tokenizer.batch_encode_plus( - texts, return_tensors="pt", **kwargs + encoded_input = self.tokenizer(texts, return_tensors="pt", **kwargs).to( + self.device ) - if self.device != "cpu": - encoded_input = { - key: value.to(self.device) for key, value in encoded_input.items() - } + # Must hardcode position_ids to avoid a bug with accelerate multi-GPU + seq_len = encoded_input["input_ids"].size(1) + position_ids = torch.arange(0, seq_len).expand((len(texts), -1)).to(self.device) - output = self.model(**encoded_input) + # Pass both the inputs and position_ids to the model + output = self.model(**encoded_input, position_ids=position_ids) return output.logits, output.hidden_states[-1] @abstractmethod diff --git a/neural_cherche/models/colbert.py b/neural_cherche/models/colbert.py index 2dd21e4..302421e 100644 --- a/neural_cherche/models/colbert.py +++ b/neural_cherche/models/colbert.py @@ -268,7 +268,7 @@ def scores( return torch.cat(list_scores, dim=0) - def save_pretrained(self, path: str) -> "ColBERT": + def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT": """Save model the model. Parameters @@ -279,7 +279,32 @@ def save_pretrained(self, path: str) -> "ColBERT": self.model.save_pretrained(path) torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt")) self.tokenizer.pad_token = self.original_pad_token - self.tokenizer.save_pretrained(path) + if accelerator: + # Workaround an issue with accelerator. 
Tokenizer has a key "device" + # which is non serialisable, but not removeable with a basic delattr + + # dump config + tokenizer_config = { + k: v for k, v in self.tokenizer.__dict__.items() if k != "device" + } + tokenizer_config_file = os.path.join(path, "tokenizer_config.json") + with open(tokenizer_config_file, "w", encoding="utf-8") as file: + json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) + + # dump vocab + self.tokenizer.save_vocabulary(path) + + # save special tokens + special_tokens_file = os.path.join(path, "special_tokens_map.json") + with open(special_tokens_file, "w", encoding="utf-8") as file: + json.dump( + self.tokenizer.special_tokens_map, + file, + ensure_ascii=False, + indent=4, + ) + else: + self.tokenizer.save_pretrained(path) with open(os.path.join(path, "metadata.json"), "w") as f: json.dump( { diff --git a/neural_cherche/models/sparse_embed.py b/neural_cherche/models/sparse_embed.py index f4eec0e..c52358d 100644 --- a/neural_cherche/models/sparse_embed.py +++ b/neural_cherche/models/sparse_embed.py @@ -212,11 +212,40 @@ def _get_attention( return self.softmax(attention) - def save_pretrained(self, path: str): + def save_pretrained( + self, + path: str, + accelerator: bool = False, + ): """Save model the model.""" self.model.save_pretrained(path) self.tokenizer.pad_token = self.original_pad_token - self.tokenizer.save_pretrained(path) + if accelerator: + # Workaround an issue with accelerator. 
Tokenizer has a key "device" + # which is non serialisable, but not removeable with a basic delattr + + # dump config + tokenizer_config = { + k: v for k, v in self.tokenizer.__dict__.items() if k != "device" + } + tokenizer_config_file = os.path.join(path, "tokenizer_config.json") + with open(tokenizer_config_file, "w", encoding="utf-8") as file: + json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) + + # dump vocab + self.tokenizer.save_vocabulary(path) + + # save special tokens + special_tokens_file = os.path.join(path, "special_tokens_map.json") + with open(special_tokens_file, "w", encoding="utf-8") as file: + json.dump( + self.tokenizer.special_tokens_map, + file, + ensure_ascii=False, + indent=4, + ) + else: + self.tokenizer.save_pretrained(path) torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt")) with open(os.path.join(path, "metadata.json"), "w") as file: json.dump( diff --git a/neural_cherche/models/splade.py b/neural_cherche/models/splade.py index 34bc5a0..ad4b488 100644 --- a/neural_cherche/models/splade.py +++ b/neural_cherche/models/splade.py @@ -206,7 +206,11 @@ def forward( return {"sparse_activations": activations["sparse_activations"]} - def save_pretrained(self, path: str): + def save_pretrained( + self, + path: str, + accelerator: bool = False, + ): """Save model the model. Parameters @@ -217,7 +221,32 @@ def save_pretrained(self, path: str): """ self.model.save_pretrained(path) self.tokenizer.pad_token = self.original_pad_token - self.tokenizer.save_pretrained(path) + if accelerator: + # Workaround an issue with accelerator. 
Tokenizer has a key "device" + # which is non serialisable, but not removeable with a basic delattr + + # dump config + tokenizer_config = { + k: v for k, v in self.tokenizer.__dict__.items() if k != "device" + } + tokenizer_config_file = os.path.join(path, "tokenizer_config.json") + with open(tokenizer_config_file, "w", encoding="utf-8") as file: + json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) + + # dump vocab + self.tokenizer.save_vocabulary(path) + + # save special tokens + special_tokens_file = os.path.join(path, "special_tokens_map.json") + with open(special_tokens_file, "w", encoding="utf-8") as file: + json.dump( + self.tokenizer.special_tokens_map, + file, + ensure_ascii=False, + indent=4, + ) + else: + self.tokenizer.save_pretrained(path) with open(os.path.join(path, "metadata.json"), "w") as file: json.dump( @@ -306,15 +335,12 @@ def _update_activations( ) -> torch.Tensor: """Returns activated tokens.""" activations = torch.topk(input=sparse_activations, k=k_tokens, dim=1).indices - - # Set value of max sparse_activations which are not in top k to 0. 
- sparse_activations = sparse_activations * torch.zeros( - (sparse_activations.shape[0], sparse_activations.shape[1]), - dtype=int, - device=self.device, - ).scatter_(dim=1, index=activations.long(), value=1) + zero_tensor = torch.zeros_like(sparse_activations, dtype=int) + updated_sparse_activations = sparse_activations * zero_tensor.scatter( + dim=1, index=activations.long(), value=1 + ) return { "activations": activations, - "sparse_activations": sparse_activations, + "sparse_activations": updated_sparse_activations, } diff --git a/neural_cherche/train/train_colbert.py b/neural_cherche/train/train_colbert.py index 6858210..dc09294 100644 --- a/neural_cherche/train/train_colbert.py +++ b/neural_cherche/train/train_colbert.py @@ -10,6 +10,7 @@ def train_colbert( positive: list[str], negative: list[str], in_batch_negatives: bool = False, + accelerator=None, **kwargs, ): """Compute the ranking loss and the flops loss for a single step. @@ -98,7 +99,10 @@ def train_colbert( loss = losses.Ranking()(**scores) - loss.backward() + if accelerator: + accelerator.backward(loss) + else: + loss.backward() optimizer.step() optimizer.zero_grad() diff --git a/neural_cherche/train/train_sparse_embed.py b/neural_cherche/train/train_sparse_embed.py index 82c3138..2860058 100644 --- a/neural_cherche/train/train_sparse_embed.py +++ b/neural_cherche/train/train_sparse_embed.py @@ -16,6 +16,7 @@ def train_sparse_embed( dense_loss_weight: float = 1.0, in_batch_negatives: bool = False, threshold_flops: float = 30, + accelerator=None, **kwargs, ): """Compute the ranking loss and the flops loss for a single step. 
@@ -147,7 +148,10 @@ def train_sparse_embed( + flops_loss_weight * flops_loss ) - loss.backward() + if accelerator: + accelerator.backward(loss) + else: + loss.backward() optimizer.step() optimizer.zero_grad() diff --git a/neural_cherche/train/train_splade.py b/neural_cherche/train/train_splade.py index ff48f32..d633400 100644 --- a/neural_cherche/train/train_splade.py +++ b/neural_cherche/train/train_splade.py @@ -13,6 +13,7 @@ def train_splade( sparse_loss_weight: float = 1.0, in_batch_negatives: bool = False, threshold_flops: float = 30, + accelerator=None, **kwargs, ): """Compute the ranking loss and the flops loss for a single step. @@ -117,7 +118,10 @@ def train_splade( loss = sparse_loss_weight * sparse_loss + flops_loss_weight * flops_loss - loss.backward() + if accelerator: + accelerator.backward(loss) + else: + loss.backward() optimizer.step() optimizer.zero_grad() From eb1376b10bea9413091e65860945c64f970a2e3a Mon Sep 17 00:00:00 2001 From: bclavie Date: Fri, 22 Dec 2023 13:20:53 +0000 Subject: [PATCH 2/5] fix: typo --- docs/fine_tune/multi_gpu.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md index 9f5b666..2b7d6e0 100644 --- a/docs/fine_tune/multi_gpu.md +++ b/docs/fine_tune/multi_gpu.md @@ -54,7 +54,8 @@ if __name__ == "__main__"": accelerator.save_model(model, "checkpoint/epoch" + str(epoch)) unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( - "SPARSE_EMBEDFULL/epoch" + str(epoch), + "checkpoint/epoch" + str(epoch), + ) # Save at the end of the training loop # We check to make sure that only the main process will export the model From b8465cac692a2e6fc8902ea00375c98828e47166 Mon Sep 17 00:00:00 2001 From: bclavie Date: Fri, 22 Dec 2023 13:21:13 +0000 Subject: [PATCH 3/5] doc: redundant save in example --- docs/fine_tune/multi_gpu.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md 
index 2b7d6e0..78cead7 100644 --- a/docs/fine_tune/multi_gpu.md +++ b/docs/fine_tune/multi_gpu.md @@ -51,7 +51,6 @@ if __name__ == "__main__"": ) if accelerator.is_main_process and save_on_epoch: - accelerator.save_model(model, "checkpoint/epoch" + str(epoch)) unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( "checkpoint/epoch" + str(epoch), From 5e0c80455e61e76946fb6a4922afc382d6baa018 Mon Sep 17 00:00:00 2001 From: bclavie Date: Fri, 22 Dec 2023 15:26:36 +0000 Subject: [PATCH 4/5] fix: loop typo --- docs/fine_tune/multi_gpu.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md index 78cead7..5a28cbb 100644 --- a/docs/fine_tune/multi_gpu.md +++ b/docs/fine_tune/multi_gpu.md @@ -36,7 +36,7 @@ if __name__ == "__main__"": model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader) for epoch in range(epochs): - for batch_data in enumerate(data_loader): + for batch_id, batch_data in enumerate(data_loader): # Assuming batch_data is a tuple in the form (anchors, positives, negatives) anchors, positives, negatives = batch_data From 742635ba69ae64ed60d3c740276cad463ea3ecaf Mon Sep 17 00:00:00 2001 From: bclavie Date: Sat, 23 Dec 2023 18:42:19 +0000 Subject: [PATCH 5/5] chore: apply changes --- docs/fine_tune/multi_gpu.md | 62 +++++++++++++----------- neural_cherche/models/base.py | 70 ++++++++++++++++++++++++--- neural_cherche/models/colbert.py | 43 +++++----------- neural_cherche/models/sparse_embed.py | 30 ++---------- neural_cherche/models/splade.py | 30 ++---------- 5 files changed, 121 insertions(+), 114 deletions(-) diff --git a/docs/fine_tune/multi_gpu.md b/docs/fine_tune/multi_gpu.md index 5a28cbb..1032d14 100644 --- a/docs/fine_tune/multi_gpu.md +++ b/docs/fine_tune/multi_gpu.md @@ -1,46 +1,54 @@ -# Multi-GPU (Accelerator) +# Multi-GPU (Partial) - -Training any of the models on multiple GPU via the accelerator library is simple. 
You just need to modify the training loop in a few key ways: +Neural-Cherche is working towards being fully compatible with multi-GPU training using [Accelerator](https://huggingface.co/docs/accelerate/package_reference/accelerator). At the moment, there is partial compatibility, and we can train every model of neural-cherche using GPUs in most circumstances, although it's not yet fully supported. Here is a tutorial. ```python -from neural_cherche import models, utils, train import torch -from torch.utils.data import DataLoader from accelerate import Accelerator +from datasets import Dataset +from torch.utils.data import DataLoader +from neural_cherche import models, train -# Wrap in main function to avoid multiprocessing issues -if __name__ == "__main__"": +if __name__ == "__main__": + # We will need to wrap your training loop in a function to avoid multiprocessing issues. accelerator = Accelerator() - device = accelerator.device - batch_size = 32 - epochs = 2 - save_on_epoch = True + save_each_epoch = True model = models.SparseEmbed( model_name_or_path="distilbert-base-uncased", - device=device - ).to(device) + accelerate=True, + device=accelerator.device, + ).to(accelerator.device) # Optimizer optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5) - # prepare your dataset -- this example uses a huggingface `datasets` object - ... + # Dataset creation using HuggingFace Datasets library. + dataset = Dataset.from_dict( + { + "anchors": ["anchor 1", "anchor 2", "anchor 3", "anchor 4"], + "positives": ["positive 1", "positive 2", "positive 3", "positive 4"], + "negatives": ["negative 1", "negative 2", "negative 3", "negative 4"], + } + ) - # Convert the data into a PyTorch dataloader for ease of preparation + # Convert your dataset to a DataLoader. 
+ data_loader = DataLoader(dataset, batch_size=32, shuffle=True) - # Wrap the model, optimizer, and data loader in the accelerator + # Wrap model, optimizer, and dataloader in accelerator. model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader) - for epoch in range(epochs): - for batch_id, batch_data in enumerate(data_loader): - # Assuming batch_data is a tuple in the form (anchors, positives, negatives) - anchors, positives, negatives = batch_data + for epoch in range(2): + for batch in data_loader: + # Batch is a triple like (anchors, positives, negatives) + anchors, positives, negatives = ( + batch["anchors"], + batch["positives"], + batch["negatives"], + ) - loss = train_sparse_embed( + loss = train.train_sparse_embed( model=model, optimizer=optimizer, anchor=anchors, @@ -49,16 +57,16 @@ if __name__ == "__main__"": threshold_flops=30, accelerator=accelerator, ) - - if accelerator.is_main_process and save_on_epoch: + + if accelerator.is_main_process and save_each_epoch: unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( - "checkpoint/epoch" + str(epoch), + "checkpoint/epoch" + str(epoch), ) # Save at the end of the training loop # We check to make sure that only the main process will export the model if accelerator.is_main_process: unwrapped_model = accelerator.unwrap_model(model) - unwrapped_model.save_pretrained("checkpoint", accelerator=True) + unwrapped_model.save_pretrained("checkpoint") ``` \ No newline at end of file diff --git a/neural_cherche/models/base.py b/neural_cherche/models/base.py index 0165abd..413c68f 100644 --- a/neural_cherche/models/base.py +++ b/neural_cherche/models/base.py @@ -1,3 +1,4 @@ +import json import os from abc import ABC, abstractmethod @@ -15,6 +16,10 @@ class Base(ABC, torch.nn.Module): Path to the model or the model name. device Device to use for the model. CPU or CUDA. + extra_files_to_load + List of extra files to load. 
+ accelerate + Use HuggingFace Accelerate. kwargs Additional parameters to the model. """ @@ -24,6 +29,7 @@ def __init__( self, model_name_or_path: str, device: str = None, extra_files_to_load: list[str] = [], + accelerate: bool = False, **kwargs, ) -> None: """Initialize the model.""" @@ -37,6 +43,8 @@ def __init__( else: self.device = "cpu" + self.accelerate = accelerate + os.environ["TRANSFORMERS_CACHE"] = "." self.model = AutoModelForMaskedLM.from_pretrained( model_name_or_path, cache_dir="./", **kwargs @@ -69,26 +77,54 @@ def __init__( self.query_pad_token = self.tokenizer.mask_token self.original_pad_token = self.tokenizer.pad_token - def _encode(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tensor]: - """Encode sentences. + def _encode_accelerate(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tensor]: + """Encode sentences with multiple GPUs. Parameters ---------- texts List of sentences to encode. + + References + ---------- + [Accelerate issue.](https://github.com/huggingface/accelerate/issues/97) """ encoded_input = self.tokenizer(texts, return_tensors="pt", **kwargs).to( self.device ) - # Must hardcode position_ids to avoid a bug with accelerate multi-GPU - seq_len = encoded_input["input_ids"].size(1) - position_ids = torch.arange(0, seq_len).expand((len(texts), -1)).to(self.device) + position_ids = ( + torch.arange(0, encoded_input["input_ids"].size(1)) + .expand((len(texts), -1)) + .to(self.device) + ) - # Pass both the inputs and position_ids to the model output = self.model(**encoded_input, position_ids=position_ids) return output.logits, output.hidden_states[-1] + def _encode(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tensor]: + """Encode sentences. + + Parameters + ---------- + texts + List of sentences to encode. 
+ """ + if self.accelerate: + return self._encode_accelerate(texts, **kwargs) + + encoded_input = self.tokenizer.batch_encode_plus( + texts, return_tensors="pt", **kwargs + ) + + if self.device != "cpu": + encoded_input = { + key: value.to(self.device) for key, value in encoded_input.items() + } + + output = self.model(**encoded_input) + return output.logits, output.hidden_states[-1] + @abstractmethod def forward(self, *args, **kwargs): """Pytorch forward method.""" @@ -108,3 +144,25 @@ def scores(self, *args, **kwars): def save_pretrained(self, path: str): """Save model the model.""" pass + + def save_tokenizer_accelerate(self, path: str) -> None: + """Save tokenizer when using accelerate.""" + tokenizer_config = { + k: v for k, v in self.tokenizer.__dict__.items() if k != "device" + } + tokenizer_config_file = os.path.join(path, "tokenizer_config.json") + with open(tokenizer_config_file, "w", encoding="utf-8") as file: + json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) + + # dump vocab + self.tokenizer.save_vocabulary(path) + + # save special tokens + special_tokens_file = os.path.join(path, "special_tokens_map.json") + with open(special_tokens_file, "w", encoding="utf-8") as file: + json.dump( + self.tokenizer.special_tokens_map, + file, + ensure_ascii=False, + indent=4, + ) diff --git a/neural_cherche/models/colbert.py b/neural_cherche/models/colbert.py index 302421e..ffeabfe 100644 --- a/neural_cherche/models/colbert.py +++ b/neural_cherche/models/colbert.py @@ -20,6 +20,8 @@ class ColBERT(Base): Size of the embeddings in output of ColBERT model. device Device to use for the model. CPU or CUDA. + accelerate + Use HuggingFace Accelerate. kwargs Additional parameters to the SentenceTransformer model. @@ -43,7 +45,6 @@ class ColBERT(Base): ... embedding_size=128, ... max_length_query=32, ... max_length_document=350, - ... device="mps", ... ) >>> scores = encoder.scores( @@ -52,9 +53,9 @@ class ColBERT(Base): ... 
) >>> scores - tensor([20.2148, 16.7599, 18.2901], device='mps:0') + tensor([22.9325, 19.8296, 20.8019]) - >>> _ = encoder.save_pretrained("checkpoint") + >>> _ = encoder.save_pretrained("checkpoint") >>> encoder = models.ColBERT( ... model_name_or_path="checkpoint", @@ -68,7 +69,7 @@ class ColBERT(Base): ... ) >>> scores - tensor([20.2148, 16.7599, 18.2901]) + tensor([22.9325, 19.8296, 20.8019]) >>> embeddings = encoder( ... texts=queries, @@ -95,6 +96,7 @@ def __init__( device: str = None, max_length_query: int = 32, max_length_document: int = 350, + accelerate: bool = False, **kwargs, ) -> None: """Initialize the model.""" @@ -102,6 +104,7 @@ def __init__( model_name_or_path=model_name_or_path, device=device, extra_files_to_load=["linear.pt", "metadata.json"], + accelerate=accelerate, **kwargs, ) @@ -268,7 +271,7 @@ def scores( return torch.cat(list_scores, dim=0) - def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT": + def save_pretrained(self, path: str) -> "ColBERT": """Save model the model. Parameters @@ -279,32 +282,6 @@ def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT": self.model.save_pretrained(path) torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt")) self.tokenizer.pad_token = self.original_pad_token - if accelerator: - # Workaround an issue with accelerator. 
Tokenizer has a key "device" - # which is non serialisable, but not removeable with a basic delattr - - # dump config - tokenizer_config = { - k: v for k, v in self.tokenizer.__dict__.items() if k != "device" - } - tokenizer_config_file = os.path.join(path, "tokenizer_config.json") - with open(tokenizer_config_file, "w", encoding="utf-8") as file: - json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) - - # dump vocab - self.tokenizer.save_vocabulary(path) - - # save special tokens - special_tokens_file = os.path.join(path, "special_tokens_map.json") - with open(special_tokens_file, "w", encoding="utf-8") as file: - json.dump( - self.tokenizer.special_tokens_map, - file, - ensure_ascii=False, - indent=4, - ) - else: - self.tokenizer.save_pretrained(path) with open(os.path.join(path, "metadata.json"), "w") as f: json.dump( { @@ -313,4 +290,8 @@ def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT": }, f, ) + if self.accelerate: + self.save_tokenizer_accelerate(path=path) + else: + self.tokenizer.save_pretrained(path) return self diff --git a/neural_cherche/models/sparse_embed.py b/neural_cherche/models/sparse_embed.py index c52358d..90369e2 100644 --- a/neural_cherche/models/sparse_embed.py +++ b/neural_cherche/models/sparse_embed.py @@ -97,12 +97,14 @@ def __init__( max_length_query: int = 128, max_length_document: int = 256, device: str = None, + accelerate: bool = False, **kwargs, ) -> None: super(SparseEmbed, self).__init__( model_name_or_path=model_name_or_path, device=device, extra_files_to_load=["linear.pt", "metadata.json"], + accelerate=accelerate, **kwargs, ) @@ -215,35 +217,13 @@ def _get_attention( def save_pretrained( self, path: str, - accelerator: bool = False, ): """Save model the model.""" self.model.save_pretrained(path) self.tokenizer.pad_token = self.original_pad_token - if accelerator: - # Workaround an issue with accelerator. 
Tokenizer has a key "device" - # which is non serialisable, but not removeable with a basic delattr - - # dump config - tokenizer_config = { - k: v for k, v in self.tokenizer.__dict__.items() if k != "device" - } - tokenizer_config_file = os.path.join(path, "tokenizer_config.json") - with open(tokenizer_config_file, "w", encoding="utf-8") as file: - json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) - - # dump vocab - self.tokenizer.save_vocabulary(path) - - # save special tokens - special_tokens_file = os.path.join(path, "special_tokens_map.json") - with open(special_tokens_file, "w", encoding="utf-8") as file: - json.dump( - self.tokenizer.special_tokens_map, - file, - ensure_ascii=False, - indent=4, - ) + + if self.accelerate: + self.save_tokenizer_accelerate(path) else: self.tokenizer.save_pretrained(path) torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt")) diff --git a/neural_cherche/models/splade.py b/neural_cherche/models/splade.py index ad4b488..bbbd18d 100644 --- a/neural_cherche/models/splade.py +++ b/neural_cherche/models/splade.py @@ -80,12 +80,14 @@ def __init__( max_length_query: int = 128, max_length_document: int = 256, extra_files_to_load: list[str] = ["metadata.json"], + accelerate: bool = False, **kwargs, ) -> None: super(Splade, self).__init__( model_name_or_path=model_name_or_path, device=device, extra_files_to_load=extra_files_to_load, + accelerate=accelerate, **kwargs, ) @@ -209,7 +211,6 @@ def forward( def save_pretrained( self, path: str, - accelerator: bool = False, ): """Save model the model. @@ -221,30 +222,9 @@ def save_pretrained( """ self.model.save_pretrained(path) self.tokenizer.pad_token = self.original_pad_token - if accelerator: - # Workaround an issue with accelerator. 
Tokenizer has a key "device" - # which is non serialisable, but not removeable with a basic delattr - - # dump config - tokenizer_config = { - k: v for k, v in self.tokenizer.__dict__.items() if k != "device" - } - tokenizer_config_file = os.path.join(path, "tokenizer_config.json") - with open(tokenizer_config_file, "w", encoding="utf-8") as file: - json.dump(tokenizer_config, file, ensure_ascii=False, indent=4) - - # dump vocab - self.tokenizer.save_vocabulary(path) - - # save special tokens - special_tokens_file = os.path.join(path, "special_tokens_map.json") - with open(special_tokens_file, "w", encoding="utf-8") as file: - json.dump( - self.tokenizer.special_tokens_map, - file, - ensure_ascii=False, - indent=4, - ) + + if self.accelerate: + self.save_tokenizer_accelerate(path) else: self.tokenizer.save_pretrained(path)