From 4a4e983e66bb73806331f155ccfeb4ae3f474c07 Mon Sep 17 00:00:00 2001 From: Tony Date: Sat, 27 Jun 2026 20:19:58 +0000 Subject: [PATCH] Fix DDP gradient sync: training bypasses the wrapper The model is DDP-wrapped, but the training loop computes loss on the unwrapped module (`_get_model().compute_loss()`), which bypasses DDP's reducer -- so gradients are never synced across ranks. Each rank trains an independent replica on its own data shard and only rank 0's replica is saved, so multi-GPU runs do not data-parallelize. Fix: average gradients across ranks after backward(). --- curriculum_learning.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/curriculum_learning.py b/curriculum_learning.py index 4b25fe25..6bd0d15a 100644 --- a/curriculum_learning.py +++ b/curriculum_learning.py @@ -1115,6 +1115,15 @@ def _train_stage( loss = self._get_model().compute_loss(batch) loss.backward() + # _get_model() unwraps DDP, so its gradient sync is bypassed; average grads manually. + if dist.is_initialized(): + for p in self._get_model().parameters(): + if not p.requires_grad: + continue + if p.grad is None: + p.grad = torch.zeros_like(p) + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + # Handle gradient clipping for distributed training clip_grad_norm_(self._get_model().parameters(), GRAD_CLIP_NORM)