From db800636d42e6b1e7f9579115575bac32826156b Mon Sep 17 00:00:00 2001
From: ruit <ruit@nvidia.com>
Date: Sat, 30 May 2026 22:41:23 -0700
Subject: [PATCH 1/3] fix: lazily import megatron-core in model_utils

model_utils.py is on the GRPO driver's import path, so its top-level
megatron-core import (added in #2036/#2078) forced the driver env to
include the optional "mcore" extra just to import the module.

Move the megatron-core imports into the two linear-CE-fusion functions
that use them and guard the GPTModel annotation with TYPE_CHECKING, so
the module imports without mcore. megatron-core is now imported only when
the GPTModel forward patch is applied or the patched forward is called,
i.e. on Megatron paths whose environment already includes mcore.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: ruit <ruit@nvidia.com>
---
 nemo_rl/distributed/model_utils.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/nemo_rl/distributed/model_utils.py b/nemo_rl/distributed/model_utils.py
index c4717a7bfc..a1ae4e2d70 100644
--- a/nemo_rl/distributed/model_utils.py
+++ b/nemo_rl/distributed/model_utils.py
@@ -12,15 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
-from megatron.core.models.gpt import GPTModel
-from megatron.core.parallel_state import (
-    get_tensor_model_parallel_group,
-    get_tensor_model_parallel_rank,
-)
-from megatron.core.utils import deprecate_inference_params, get_pg_size
 from torch.distributed.tensor import DTensor, distribute_tensor
 
 from nemo_rl.algorithms.logits_sampling_utils import (
@@ -29,6 +23,11 @@
     need_top_k_or_top_p_filtering,
 )
 
+if TYPE_CHECKING:
+    # megatron-core (optional "mcore" extra) is imported lazily below so this
+    # module imports without mcore installed.
+    from megatron.core.models.gpt import GPTModel
+
 
 @torch.no_grad()
 def _compute_distributed_log_softmax(
@@ -2044,6 +2043,8 @@ def backward(
 
 
 def patch_gpt_model_forward_for_linear_ce_fusion(*, chunk_size: int) -> None:
+    from megatron.core.models.gpt import GPTModel
+
     if getattr(GPTModel, "_linear_ce_fusion_forward_patched", False):
         GPTModel._linear_ce_fusion_chunk_size = chunk_size
         return
@@ -2054,7 +2055,7 @@ def patch_gpt_model_forward_for_linear_ce_fusion(*, chunk_size: int) -> None:
 
 
 def _gpt_forward_with_linear_ce_fusion(
-    self: GPTModel,
+    self: "GPTModel",
     input_ids: torch.Tensor,
     position_ids: torch.Tensor,
     attention_mask: torch.Tensor,
@@ -2070,6 +2071,12 @@ def _gpt_forward_with_linear_ce_fusion(
     padding_mask: Optional[torch.Tensor] = None,
     return_logprobs_for_linear_ce_fusion: bool = False,
 ) -> torch.Tensor:
+    from megatron.core.parallel_state import (
+        get_tensor_model_parallel_group,
+        get_tensor_model_parallel_rank,
+    )
+    from megatron.core.utils import deprecate_inference_params, get_pg_size
+
     if not return_logprobs_for_linear_ce_fusion:
         return self._original_forward_for_linear_ce_fusion(
             input_ids=input_ids,

From 686d3db636c06d76a75045f914215d2ef0c14ee5 Mon Sep 17 00:00:00 2001
From: ruit <ruit@nvidia.com>
Date: Sun, 31 May 2026 19:45:53 -0700
Subject: [PATCH 2/3] fix(test): force vLLM spawn for vllm-marked tests to
 avoid CUDA fork failure

Tests that build a bare in-process vllm.LLM (rather than going through a Ray
actor) crash with "Cannot re-initialize CUDA in forked subprocess" when CUDA is
already initialized in the parent pytest process and vLLM forks its EngineCore.
This surfaces when such a test runs first/alone in a shard (e.g. under FAST mode,
where most other vLLM tests are deselected).

Add an autouse fixture that sets VLLM_WORKER_MULTIPROC_METHOD=spawn for any
vllm-marked test and restores the previous value afterward, making these tests
robust to ordering.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: ruit <ruit@nvidia.com>
---
 tests/unit/conftest.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 7b3e706762..700d23546c 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -203,6 +203,32 @@ def pytest_collection_modifyitems(config, items):
     items[:] = new_items
 
 
+@pytest.fixture(autouse=True)
+def _vllm_force_spawn(request):
+    """Force vLLM's worker multiprocessing start method to ``spawn`` for vllm tests.
+
+    Tests that build a bare in-process ``vllm.LLM`` (rather than going through a
+    Ray actor) crash with "Cannot re-initialize CUDA in forked subprocess" when
+    CUDA is already initialized in the parent pytest process and vLLM forks its
+    EngineCore. This happens whenever such a test runs first/alone in a shard
+    (e.g. under FAST mode, where most other vLLM tests are deselected). Forcing
+    ``spawn`` makes these tests robust to ordering.
+    """
+    if request.node.get_closest_marker("vllm") is None:
+        yield
+        return
+
+    prev = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+    try:
+        yield
+    finally:
+        if prev is None:
+            os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None)
+        else:
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = prev
+
+
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")
 UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json")
 UNIT_RESULTS_FILE_DATED = os.path.join(

From 7a463962391fcaaf8cbba4686cfca1ba046bd7a2 Mon Sep 17 00:00:00 2001
From: ruit <ruit@nvidia.com>
Date: Mon, 1 Jun 2026 02:49:29 -0700
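Subject: [PATCH 3/3] remove unit test fix

Remove the `_vllm_force_spawn` autouse fixture that the previous patch
added to tests/unit/conftest.py; conftest.py returns to its prior state.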

Signed-off-by: ruit <ruit@nvidia.com>
---
 tests/unit/conftest.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 700d23546c..7b3e706762 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -203,32 +203,6 @@ def pytest_collection_modifyitems(config, items):
     items[:] = new_items
 
 
-@pytest.fixture(autouse=True)
-def _vllm_force_spawn(request):
-    """Force vLLM's worker multiprocessing start method to ``spawn`` for vllm tests.
-
-    Tests that build a bare in-process ``vllm.LLM`` (rather than going through a
-    Ray actor) crash with "Cannot re-initialize CUDA in forked subprocess" when
-    CUDA is already initialized in the parent pytest process and vLLM forks its
-    EngineCore. This happens whenever such a test runs first/alone in a shard
-    (e.g. under FAST mode, where most other vLLM tests are deselected). Forcing
-    ``spawn`` makes these tests robust to ordering.
-    """
-    if request.node.get_closest_marker("vllm") is None:
-        yield
-        return
-
-    prev = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
-    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-    try:
-        yield
-    finally:
-        if prev is None:
-            os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None)
-        else:
-            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = prev
-
-
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")
 UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json")
 UNIT_RESULTS_FILE_DATED = os.path.join(