huggingface · mokashang · May 12, 2026
diff --git a/benchmarks/fsdp2/utils.py b/benchmarks/fsdp2/utils.py
@@ -114,7 +114,7 @@ def parse_args():
         "--save_memory_snapshot",
         action="store_true",
         default=False,
-        help="If True, `torch.cuda.memory._dump_snapshot` will be used to additionaly save the memory trace.",
+        help="If True, `torch.cuda.memory._dump_snapshot` will be used to additionally save the memory trace.",
     )
     ######################
     # Training arguments #
@@ -189,7 +189,7 @@ def collate_fn(examples):
 
 
 def get_model(model_name: str):
-    # We reguire model to be loaded in fp32, otherwise benchmarks don't match as accelerate does upcasting of parameters to fp32
+    # We require the model to be loaded in fp32; otherwise benchmarks don't match, as accelerate upcasts parameters to fp32
     config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32)
     model = AutoModelForCausalLM.from_config(config)
     return model

diff --git a/benchmarks/fsdp2/visualize.py b/benchmarks/fsdp2/visualize.py
@@ -25,7 +25,7 @@ def parse_args():
         "--memory_threshold",
         type=int,
         default=0,
-        help="Memory threshold to filter data that is below this value (only filters 1st `--filter_partition` of the points which should roughtly correspond to the model loading)",
+        help="Memory threshold to filter data that is below this value (only filters 1st `--filter_partition` of the points which should roughly correspond to the model loading)",
     )
     parser.add_argument(
         "--filter_partition",

diff --git a/docs/source/concept_guides/context_parallelism.md b/docs/source/concept_guides/context_parallelism.md
@@ -17,7 +17,7 @@ rendered properly in your Markdown viewer.
 
 This guide will cover basics of using context parallelism in 🤗`accelerate`, for the more curious readers, we will also cover some technicalities in the later sections.
 
-See also the very related [Guide to Sequence Parallellism](./sequence_parallelism.md).
+See also the very related [Guide to Sequence Parallelism](./sequence_parallelism.md).
 
 ## Why context parallelism?
 

diff --git a/docs/source/concept_guides/sequence_parallelism.md b/docs/source/concept_guides/sequence_parallelism.md
@@ -17,18 +17,18 @@ rendered properly in your Markdown viewer.
 
 This guide will cover basics of using sequence parallelism in 🤗`accelerate`.
 
-See also the very related [Context Parallellism](./context_parallelism.md).
+See also the very related [Context Parallelism](./context_parallelism.md).
 
 ## Why sequence parallelism?
 
 With the advent of large language models, and recently reasoning models, the sequence length has been growing rapidly. This, combined with quadratic memory complexity of attention, has led to a need for more efficient ways to train models with long sequences.
 With sequence length of 128k, the memory requirement of the attention matrix is `128k * 128k * 2 bytes * num_heads = ~32 GB * num_heads` for `bf16` precision, given vanilla attention implementation. Granted, with usage of `flash attention` or `SDPA` which do not materialize these attention weights, this decreases drastically, but the growth in memory requirements is still considerable.
 
-Ulysses Sequence parallelism allows us to shard the inputs to the attention computation along the sequence dimension and compute the attention normally, but using only a slice of attention heads on each GPU. With this, we can train models with long sequences, with a few more tools, scaling to 15M+ sequence length. To see how to augment Ulysses SP with TiledMLP, Liger-Kernel, Activation checkpoint offload to cpu and a few other tricks pleae refer to the paper: [Arctic Long Sequence Training: Scalable And Efficient Training For Multi-Million Token Sequences](https://arxiv.org/abs/2506.13996).
+Ulysses Sequence parallelism allows us to shard the inputs to the attention computation along the sequence dimension and compute the attention normally, but using only a slice of attention heads on each GPU. With this, we can train models with long sequences, with a few more tools, scaling to 15M+ sequence length. To see how to augment Ulysses SP with TiledMLP, Liger-Kernel, Activation checkpoint offload to CPU and a few other tricks, please refer to the paper: [Arctic Long Sequence Training: Scalable And Efficient Training For Multi-Million Token Sequences](https://arxiv.org/abs/2506.13996).
 
 ## How is Ulysses SP different from FSDP CP
 
-In the document [Context Parallellism](./context_parallelism.md) you can learn about deploying another technology called Context Parallelism, which too slices on the sequence dimension but uses Ring Attention instead of slicing on the head dimension.
+In the document [Context Parallelism](./context_parallelism.md) you can learn about deploying another technology called Context Parallelism, which too slices on the sequence dimension but uses Ring Attention instead of slicing on the head dimension.
 
 The following articles go into a very detailed explanation of the differences between the two technologies:
 - https://insujang.github.io/2024-01-11/tensor-parallelism-and-sequence-parallelism-detailed-analysis/
@@ -141,7 +141,7 @@ parallelism_config:
 
 ```
 
-As mentioned earlier Ulysses sequence parallelism is normally overlayed with data parallelism - same ranks are used for feeding unique data streams and also perform Ulysses Sequence Parallelism. But you could also create replicas like so:
+As mentioned earlier Ulysses sequence parallelism is normally overlaid with data parallelism - same ranks are used for feeding unique data streams and also perform Ulysses Sequence Parallelism. But you could also create replicas like so:
 
 ```python
 # Example: 4 GPUs with 2D parallelism (SP=2, DP=2)

diff --git a/docs/source/package_reference/cli.md b/docs/source/package_reference/cli.md
@@ -218,7 +218,7 @@ The following arguments are only useful when `use_fsdp` is passed or Fully Shard
 * `--fsdp_state_dict_type` (`str`) -- FSDP's state dict type.
 * `--fsdp_forward_prefetch` (`str`) -- FSDP forward prefetch.
 * `--fsdp_use_orig_params` (`str`) -- If True, allows non-uniform `requires_grad` mixed in a FSDP unit.
-* `--fsdp_cpu_ram_efficient_loading` (`str`) -- If true, only the first process loads the pretrained model checkoint while all other processes have empty weights. When using this, `--fsdp_sync_module_states` needs to True.
+* `--fsdp_cpu_ram_efficient_loading` (`str`) -- If true, only the first process loads the pretrained model checkpoint while all other processes have empty weights. When using this, `--fsdp_sync_module_states` needs to be True.
 * `--fsdp_sync_module_states` (`str`) -- If true, each individually wrapped FSDP unit will broadcast module parameters from rank 0.
 * `--fsdp_activation_checkpointing` (`bool`) -- Decides Whether intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder
 

diff --git a/docs/source/usage_guides/compilation.md b/docs/source/usage_guides/compilation.md
@@ -14,7 +14,7 @@ The first execution of compiled code typically takes longer as it includes the c
 
 ## Using `torch.compile` with Accelerate
 
-Accelerate provides `TorchDynamoPlugin` for easy and seemless integration of `torch.compile` into your training scripts.
+Accelerate provides `TorchDynamoPlugin` for easy and seamless integration of `torch.compile` into your training scripts.
 
 ```python
 from accelerate import Accelerator

diff --git a/docs/source/usage_guides/low_precision_training.md b/docs/source/usage_guides/low_precision_training.md
@@ -122,7 +122,7 @@ kwargs = [FP8RecipeKwargs(backend="te", ...)]
 accelerator = Accelerator(mixed_precision="fp8", kwarg_handlers=kwargs)
 ```
 
-Or during `accelerate launch` via `--fp8_backend=te ...`. Use `accelerate launch --fp8_backend=te -h` to see relevent arguments.
+Or during `accelerate launch` via `--fp8_backend=te ...`. Use `accelerate launch --fp8_backend=te -h` to see relevant arguments.
 
 Similarly this can be set in your `config.yaml`:
 
@@ -174,7 +174,7 @@ accelerator = Accelerator(
 )
 ```
 
-Or during `accelerate launch` via `--fp8_backend=ao ...`. Use `accelerate launch --fp8_backend=ao -h` to see relevent arguments.
+Or during `accelerate launch` via `--fp8_backend=ao ...`. Use `accelerate launch --fp8_backend=ao -h` to see relevant arguments.
 
 Similarly, this can be set in `config.yaml`:
 

diff --git a/docs/source/usage_guides/megatron_lm.md b/docs/source/usage_guides/megatron_lm.md
@@ -125,7 +125,7 @@ pip install --no-use-pep517 -e .
 ```
 
 ## Prepare Megaton-LM checkpoint
-If you want to fine-tune a model, make sure you have a torch dist format checkpoint ready. If you only have access to the huggingface model, please consider converting it to a torch dist format checkpoint acceptable to Megatron. One examle can be using slime's script, take GLM models as an example:
+If you want to fine-tune a model, make sure you have a torch dist format checkpoint ready. If you only have access to the Hugging Face model, please consider converting it to a torch dist format checkpoint acceptable to Megatron. One option is to use slime's conversion script; taking GLM models as an example:
 ```
 source /your/path/to/slime/scripts/models/glm4.5-355B-A32B.sh
 srun torchrun --nproc-per-node 8 \

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
@@ -1309,7 +1309,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
                 PyTorch Module that was prepared with `Accelerator.prepare` for DistributedDataParallel training.
             even_batches (`bool`, *optional*)
                 If set, this will override the value of `even_batches` set in the `Accelerator`. If it is not provided,
-                the default `Accelerator` value wil be used.
+                the default `Accelerator` value will be used.
 
         <Tip warning={true}>
 
@@ -2346,7 +2346,7 @@ def _prepare_deepspeed(self, *args):
             deepspeed_plugin.deepspeed_config_process(must_match=False, **config_kwargs)
             self.deepspeed_config = deepspeed_plugin.deepspeed_config
 
-            # note: batch_size derivation is all over the map, especiall in HF Trainer, so try to fix it at the last moment if needed
+            # note: batch_size derivation is all over the map, especially in HF Trainer, so try to fix it at the last moment if needed
             pc = self.parallelism_config
             if pc is not None and pc.sp_backend == "deepspeed" and pc.sp_size > 1:
                 self.deepspeed_config["train_batch_size"] = (
@@ -3382,7 +3382,7 @@ def log(self, values: dict, step: int | None = None, log_kwargs: dict | None = {
 
     def end_training(self):
         """
-        Runs any special end training behaviors, such as stopping trackers on the main process only or destoying
+        Runs any special end training behaviors, such as stopping trackers on the main process only or destroying
         process group. Should always be called at the end of your script if using experiment tracking.
 
         Example:

diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py
@@ -585,7 +585,7 @@ def launch_command_parser(subparsers=None):
         "--fsdp_cpu_ram_efficient_loading",
         default="true",
         type=str,
-        help="If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. "
+        help="If True, only the first process loads the pretrained model checkpoint while all other processes have empty weights. "
         "Only applicable for 🤗 Transformers. When using this, `--fsdp_sync_module_states` needs to True. "
         "(useful only when `use_fsdp` flag is passed).",
     )

diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py
@@ -1631,7 +1631,7 @@ class FullyShardedDataParallelPlugin:
             A technique to reduce memory usage by clearing activations of certain layers and recomputing them during a
             backward pass. Effectively, this trades extra computation time for reduced memory usage.
         cpu_ram_efficient_loading (`bool`, defaults to `None`):
-            If True, only the first process loads the pretrained model checkoint while all other processes have empty
+            If True, only the first process loads the pretrained model checkpoint while all other processes have empty
             weights. Only applicable for Transformers. When using this, `sync_module_states` needs to be `True`.
         transformer_cls_names_to_wrap (`Optional[List[str]]`, defaults to `None`):
             A list of transformer layer class names to wrap. Only applicable when `auto_wrap_policy` is
@@ -1780,7 +1780,7 @@ class FullyShardedDataParallelPlugin:
     cpu_ram_efficient_loading: bool = field(
         default=None,
         metadata={
-            "help": "If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. "
+            "help": "If True, only the first process loads the pretrained model checkpoint while all other processes have empty weights. "
             "Only applicable for 🤗 Transformers. When using this, `sync_module_states` needs to be `True`. Defaults to `False`."
         },
     )

diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
@@ -1917,7 +1917,7 @@ def load_checkpoint_in_model(
         checkpoint_files = sorted(list(set(index.values())))
         checkpoint_files = [os.path.join(checkpoint_folder, f) for f in checkpoint_files]
 
-    # Logic for missing/unexepected keys goes here.
+    # Logic for missing/unexpected keys goes here.
 
     offload_index = {}
     if offload_state_dict: