From 4a4e983e66bb73806331f155ccfeb4ae3f474c07 Mon Sep 17 00:00:00 2001
From: Tony <tc3416@columbia.edu>
Date: Sat, 27 Jun 2026 20:19:58 +0000
Subject: [PATCH] Fix DDP gradient sync: training bypasses the wrapper

The model is DDP-wrapped, but the training loop computes loss on the unwrapped module
(`_get_model().compute_loss()`), which bypasses DDP's reducer -- so gradients are never
synced across ranks. Each rank trains an independent replica on its own data shard, and
only rank 0's replica is saved, so multi-GPU runs are not actually data-parallel.

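Fix: after backward() and before gradient clipping, all-reduce each trainable parameter's
gradient across ranks with ReduceOp.AVG (missing grads are zero-filled first).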
---
 curriculum_learning.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/curriculum_learning.py b/curriculum_learning.py
index 4b25fe25..6bd0d15a 100644
--- a/curriculum_learning.py
+++ b/curriculum_learning.py
@@ -1115,6 +1115,15 @@ def _train_stage(
                     loss = self._get_model().compute_loss(batch)
                     loss.backward()
 
+                    # _get_model() unwraps DDP, so its gradient sync is bypassed; average grads manually.
+                    if dist.is_initialized():
+                        for p in self._get_model().parameters():
+                            if not p.requires_grad:
+                                continue
+                            if p.grad is None:
+                                p.grad = torch.zeros_like(p)
+                            dist.all_reduce(p.grad, op=dist.ReduceOp.AVG)
+
                     # Handle gradient clipping for distributed training
                     clip_grad_norm_(self._get_model().parameters(), GRAD_CLIP_NORM)