From db800636d42e6b1e7f9579115575bac32826156b Mon Sep 17 00:00:00 2001 From: ruit Date: Sat, 30 May 2026 22:41:23 -0700 Subject: [PATCH 1/3] fix: lazily import megatron-core in model_utils model_utils.py is on the GRPO driver's import path, so its top-level megatron-core import (added in #2036/#2078) forced the driver env to include the optional "mcore" extra just to import the module. Move the megatron-core imports into the two linear-CE-fusion functions that use them and guard the GPTModel annotation with TYPE_CHECKING, so the module imports without mcore. megatron is imported only when the GPTModel forward patch runs (megatron paths that already have mcore). Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: ruit --- nemo_rl/distributed/model_utils.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/nemo_rl/distributed/model_utils.py b/nemo_rl/distributed/model_utils.py index c4717a7bfc..a1ae4e2d70 100644 --- a/nemo_rl/distributed/model_utils.py +++ b/nemo_rl/distributed/model_utils.py @@ -12,15 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import torch -from megatron.core.models.gpt import GPTModel -from megatron.core.parallel_state import ( - get_tensor_model_parallel_group, - get_tensor_model_parallel_rank, -) -from megatron.core.utils import deprecate_inference_params, get_pg_size from torch.distributed.tensor import DTensor, distribute_tensor from nemo_rl.algorithms.logits_sampling_utils import ( @@ -29,6 +23,11 @@ need_top_k_or_top_p_filtering, ) +if TYPE_CHECKING: + # megatron-core (optional "mcore" extra) is imported lazily below so this + # module imports without mcore installed. + from megatron.core.models.gpt import GPTModel + @torch.no_grad() def _compute_distributed_log_softmax( @@ -2044,6 +2043,8 @@ def backward( def patch_gpt_model_forward_for_linear_ce_fusion(*, chunk_size: int) -> None: + from megatron.core.models.gpt import GPTModel + if getattr(GPTModel, "_linear_ce_fusion_forward_patched", False): GPTModel._linear_ce_fusion_chunk_size = chunk_size return @@ -2054,7 +2055,7 @@ def patch_gpt_model_forward_for_linear_ce_fusion(*, chunk_size: int) -> None: def _gpt_forward_with_linear_ce_fusion( - self: GPTModel, + self: "GPTModel", input_ids: torch.Tensor, position_ids: torch.Tensor, attention_mask: torch.Tensor, @@ -2070,6 +2071,12 @@ def _gpt_forward_with_linear_ce_fusion( padding_mask: Optional[torch.Tensor] = None, return_logprobs_for_linear_ce_fusion: bool = False, ) -> torch.Tensor: + from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + ) + from megatron.core.utils import deprecate_inference_params, get_pg_size + if not return_logprobs_for_linear_ce_fusion: return self._original_forward_for_linear_ce_fusion( input_ids=input_ids, From 686d3db636c06d76a75045f914215d2ef0c14ee5 Mon Sep 17 00:00:00 2001 From: ruit Date: Sun, 31 May 2026 19:45:53 -0700 Subject: [PATCH 2/3] fix(test): force vLLM spawn for vllm-marked tests to avoid CUDA fork failure Tests that build a bare in-process vllm.LLM (rather than going through a Ray actor) crash with "Cannot re-initialize CUDA in forked subprocess" when CUDA is already initialized in the parent pytest process and vLLM forks its EngineCore. This surfaces when such a test runs first/alone in a shard (e.g. under FAST mode, where most other vLLM tests are deselected). Add an autouse fixture that sets VLLM_WORKER_MULTIPROC_METHOD=spawn for any vllm-marked test and restores the previous value afterward, making these tests robust to ordering. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: ruit --- tests/unit/conftest.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 7b3e706762..700d23546c 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -203,6 +203,32 @@ def pytest_collection_modifyitems(config, items): items[:] = new_items +@pytest.fixture(autouse=True) +def _vllm_force_spawn(request): + """Force vLLM's worker multiprocessing start method to ``spawn`` for vllm tests. + + Tests that build a bare in-process ``vllm.LLM`` (rather than going through a + Ray actor) crash with "Cannot re-initialize CUDA in forked subprocess" when + CUDA is already initialized in the parent pytest process and vLLM forks its + EngineCore. This happens whenever such a test runs first/alone in a shard + (e.g. under FAST mode, where most other vLLM tests are deselected). Forcing + ``spawn`` makes these tests robust to ordering. + """ + if request.node.get_closest_marker("vllm") is None: + yield + return + + prev = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + try: + yield + finally: + if prev is None: + os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None) + else: + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = prev + + TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets") UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json") UNIT_RESULTS_FILE_DATED = os.path.join( From 7a463962391fcaaf8cbba4686cfca1ba046bd7a2 Mon Sep 17 00:00:00 2001 From: ruit Date: Mon, 1 Jun 2026 02:49:29 -0700 Subject: [PATCH 3/3] remove unit test fix Signed-off-by: ruit --- tests/unit/conftest.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 700d23546c..7b3e706762 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -203,32 +203,6 @@ def pytest_collection_modifyitems(config, items): items[:] = new_items -@pytest.fixture(autouse=True) -def _vllm_force_spawn(request): - """Force vLLM's worker multiprocessing start method to ``spawn`` for vllm tests. - - Tests that build a bare in-process ``vllm.LLM`` (rather than going through a - Ray actor) crash with "Cannot re-initialize CUDA in forked subprocess" when - CUDA is already initialized in the parent pytest process and vLLM forks its - EngineCore. This happens whenever such a test runs first/alone in a shard - (e.g. under FAST mode, where most other vLLM tests are deselected). Forcing - ``spawn`` makes these tests robust to ordering. - """ - if request.node.get_closest_marker("vllm") is None: - yield - return - - prev = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - try: - yield - finally: - if prev is None: - os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None) - else: - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = prev - - TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets") UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json") UNIT_RESULTS_FILE_DATED = os.path.join(