From 0fd3d5504c4073aa06a9f0e74198151626413e43 Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 03:17:20 -0700
Subject: [PATCH] Support Cosmos3-Super task-specialized (Text2Image /
 Image2Video) checkpoints

These task-specialized diffusers checkpoints reuse the Cosmos3-Super
architecture but omit unused modality weights and bundle their own VLM
processor. Loading them previously failed, and loading the VLM processor
triggered a redundant download of the full base Cosmos3-Super repository.

- inference/model.py: tolerate absent action/sound projection-head
  weights in the diffusers load planner, mirroring the existing vision
  carve-out. Fixes the masked "TypeError: cannot pickle code objects"
  that surfaced when DCP tried to broadcast the missing-tensor
  ValueError across ranks. No-op for self-consistent base checkpoints:
  Nano/Super provide all modality weights, so the guards never fire.

- inference: add CheckpointConfig.vlm_processor_from_checkpoint. When
  set, the loader sources the VLM processor from the loaded checkpoint's
  own bundled files instead of the repository hardcoded in the model
  config, avoiding a redundant base-Super download. Enabled for the two
  task checkpoints; base Nano/Super keep their configured repository.

- docs/faq.md: add EADDRINUSE / --master-port entry.

Verified: Text2Image (t2i) and Image2Video (i2v) load and generate; a
full base Cosmos3-Nano t2i run is unchanged with strict weight loading
intact (carve-out never triggers).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cosmos_framework/inference/args.py        | 27 +++++++++++++++++++++++
 cosmos_framework/inference/common/args.py | 15 +++++++++++++
 cosmos_framework/inference/inference.py   |  9 ++++++++
 cosmos_framework/inference/model.py       | 10 +++++++++
 docs/faq.md                               | 19 ++++++++++++++++
 5 files changed, 80 insertions(+)

diff --git a/cosmos_framework/inference/args.py b/cosmos_framework/inference/args.py
index c818ed1..44dcf76 100644
--- a/cosmos_framework/inference/args.py
+++ b/cosmos_framework/inference/args.py
@@ -869,6 +869,33 @@ def build_sample(self, *, model_config: Any) -> OmniSampleArgs:
             revision="main",
         ),
     ),
+    # Task-specialized Super variants published as diffusers HF checkpoints.
+    # s3_uri is unused for HF-backed checkpoints (kept for parity with the
+    # registry schema); both entries reuse the Cosmos3-Super model YAML.
+    "Cosmos3-Super-Image2Video": CheckpointConfig(
+        model_memory_bytes=MODEL_MEMORY_BYTES_BY_SIZE["32B"],
+        config_file=str(CONFIG_DIR / "model/Cosmos3-Super.yaml"),
+        s3_uri="s3://bucket1/cosmos3_vfm/cosmos3_ga_image2video/",
+        hf=CheckpointDirHf(
+            repository="nvidia/Cosmos3-Super-Image2Video",
+            revision="main",
+        ),
+        # Self-contained checkpoint: use its bundled processor instead of
+        # downloading the base Cosmos3-Super repo just for the tokenizer.
+        vlm_processor_from_checkpoint=True,
+    ),
+    "Cosmos3-Super-Text2Image": CheckpointConfig(
+        model_memory_bytes=MODEL_MEMORY_BYTES_BY_SIZE["32B"],
+        config_file=str(CONFIG_DIR / "model/Cosmos3-Super.yaml"),
+        s3_uri="s3://bucket1/cosmos3_vfm/cosmos3_ga_text2image/",
+        hf=CheckpointDirHf(
+            repository="nvidia/Cosmos3-Super-Text2Image",
+            revision="main",
+        ),
+        # Self-contained checkpoint: use its bundled processor instead of
+        # downloading the base Cosmos3-Super repo just for the tokenizer.
+        vlm_processor_from_checkpoint=True,
+    ),
 }
 DEFAULT_CHECKPOINT_NAME = "Cosmos3-Nano"
 DEFAULT_CHECKPOINT = _CHECKPOINTS[DEFAULT_CHECKPOINT_NAME]
diff --git a/cosmos_framework/inference/common/args.py b/cosmos_framework/inference/common/args.py
index 7c1c5cb..8d7cd5a 100644
--- a/cosmos_framework/inference/common/args.py
+++ b/cosmos_framework/inference/common/args.py
@@ -385,6 +385,17 @@ class CheckpointConfig(pydantic.BaseModel):
     hf: CheckpointDirHf
     """Config for checkpoint on Hugging Face."""
 
+    vlm_processor_from_checkpoint: bool = False
+    """When True, load the VLM text/vision processor from the checkpoint's own
+    bundled files (its local download directory) instead of the repository
+    hardcoded in the model config's ``vlm_config.tokenizer`` node.
+
+    Set this only for self-contained checkpoints that ship their own processor
+    at the repository root (e.g. the task-specialized Text2Image / Image2Video
+    diffusers checkpoints). Avoids a redundant download of the base model repo
+    just to obtain the tokenizer.
+    """
+
     def download(self) -> str:
         return self.hf.download()
 
@@ -404,6 +415,7 @@ class CheckpointArgs(ConfigArgs):
     model_memory_bytes: int | None
 
     checkpoint_hf: CheckpointDirHf | None
+    vlm_processor_from_checkpoint: bool = False
 
     credential_path: str
     use_ema_weights: bool
@@ -443,6 +455,8 @@ class CheckpointOverrides(ConfigOverrides):
 
     checkpoint_hf: Suppress[CheckpointDirHf | None] = None
     """Hugging Face checkpoint directory."""
+    vlm_processor_from_checkpoint: Suppress[bool] = False
+    """Load the VLM processor from the loaded checkpoint instead of a hardcoded repo."""
 
     credential_path: Training[str] = "credentials/gcp_checkpoint.secret"
     """Path to S3 credentials file for remote checkpoint loading."""
@@ -459,6 +473,7 @@ def _build_checkpoint(self, checkpoints: dict[str, CheckpointConfig]):
             self.model_memory_bytes = checkpoint.model_memory_bytes
             self.config_file = checkpoint.config_file
             self.checkpoint_hf = checkpoint.hf
+            self.vlm_processor_from_checkpoint = checkpoint.vlm_processor_from_checkpoint
         elif self.checkpoint_path.startswith("s3://"):
             self.checkpoint_type = CheckpointType.DCP
             self.checkpoint_path = self.checkpoint_path.rstrip("/")
diff --git a/cosmos_framework/inference/inference.py b/cosmos_framework/inference/inference.py
index 202805b..f7cf798 100644
--- a/cosmos_framework/inference/inference.py
+++ b/cosmos_framework/inference/inference.py
@@ -1050,6 +1050,15 @@ def _create(cls, setup_args: SetupArgs, **kwargs: Any) -> Self:
                 config = None
             else:
                 model_dict = setup_args.load_model_config_dict()
+                if setup_args.vlm_processor_from_checkpoint:
+                    # Source the VLM processor from the loaded checkpoint's own
+                    # bundled files instead of the repository hardcoded in the
+                    # model config. Drops the redundant base-model download.
+                    tokenizer_cfg = model_dict["config"]["vlm_config"]["tokenizer"]
+                    tokenizer_cfg.pop("repository", None)
+                    tokenizer_cfg.pop("revision", None)
+                    tokenizer_cfg.pop("subdir", None)
+                    tokenizer_cfg["tokenizer_type"] = str(checkpoint_path)
                 config = Cosmos3OmniConfig(model=model_dict)
             model = Cosmos3OmniModel.from_pretrained_dcp(
                 checkpoint_path,
diff --git a/cosmos_framework/inference/model.py b/cosmos_framework/inference/model.py
index de7dbb8..8904e6e 100644
--- a/cosmos_framework/inference/model.py
+++ b/cosmos_framework/inference/model.py
@@ -318,6 +318,16 @@ def set_up_planner(
         missing_keys = set(target_state_dict) - loaded_keys
         if not self.has_vision_weights:
             missing_keys = {key for key in missing_keys if not key.startswith("language_model.visual.")}
+        # Task-specialized checkpoints (e.g. Text2Image, Image2Video) omit the
+        # optional action and sound projection heads, which those tasks do not
+        # use. As with vision weights, a modality's keys are tolerated as missing
+        # only when the checkpoint provides no weights for that modality at all.
+        for modality_prefixes in (
+            ("action2llm.", "llm2action.", "action_modality_embed"),
+            ("sound2llm.", "llm2sound.", "sound_modality_embed"),
+        ):
+            if not any(key.startswith(modality_prefixes) for key in loaded_keys):
+                missing_keys = {key for key in missing_keys if not key.startswith(modality_prefixes)}
         if missing_keys:
             sample = sorted(missing_keys)[:10]
             raise ValueError(
diff --git a/docs/faq.md b/docs/faq.md
index f9adbdb..4ddf47e 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -225,6 +225,25 @@ Delete the torchinductor cache under the /tmp directory, `rm -rf /tmp/torchinduc
 
 ---
 
+### Q: I get `torch.distributed.DistNetworkError: ... port: 29500 ... EADDRINUSE, address already in use`
+
+`torchrun` defaults its rendezvous to port `29500`. The error means that port is already taken on the node — usually because another `torchrun` job (yours or someone else's on a shared node) is still using it.
+
+Pass a different free port with `--master-port`, placed **before** `-m` (it is a `torchrun` argument, not an inference argument):
+
+```shell
+torchrun --nproc-per-node=8 --master-port=29501 -m cosmos_framework.scripts.inference \
+  --parallelism-preset=throughput \
+  -i "inputs/omni/t2i.json" \
+  -o outputs/omni_t2i \
+  --checkpoint-path Cosmos3-Super-Text2Image \
+  --seed=0
+```
+
+Any free port works (e.g. `29501`, `29510`); give each concurrent job on the same node a distinct port. Alternatively, `--rdzv-backend=c10d --rdzv-endpoint=localhost:0` lets `torchrun` auto-pick a free port.
+
+---
+
 ## Training
 
 ### Q: I get `torch.cuda.OutOfMemoryError` during training (SFT)