NVIDIA · lfengad · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · foreverlms
diff --git a/cosmos_framework/inference/args.py b/cosmos_framework/inference/args.py
@@ -869,6 +869,33 @@ def build_sample(self, *, model_config: Any) -> OmniSampleArgs:
             revision="main",
         ),
     ),
+    # Task-specialized Super variants published as diffusers HF checkpoints.
+    # s3_uri is unused for HF-backed checkpoints (kept for parity with the
+    # registry schema); both variants reuse the base Cosmos3-Super model YAML.
+    "Cosmos3-Super-Image2Video": CheckpointConfig(
+        model_memory_bytes=MODEL_MEMORY_BYTES_BY_SIZE["32B"],
+        config_file=str(CONFIG_DIR / "model/Cosmos3-Super.yaml"),
+        s3_uri="s3://bucket1/cosmos3_vfm/cosmos3_ga_image2video/",
+        hf=CheckpointDirHf(
+            repository="nvidia/Cosmos3-Super-Image2Video",
+            revision="main",
+        ),
+        # Self-contained checkpoint: use its bundled processor instead of
+        # downloading the base Cosmos3-Super repo just for the tokenizer.
+        vlm_processor_from_checkpoint=True,
+    ),
+    "Cosmos3-Super-Text2Image": CheckpointConfig(
+        model_memory_bytes=MODEL_MEMORY_BYTES_BY_SIZE["32B"],
+        config_file=str(CONFIG_DIR / "model/Cosmos3-Super.yaml"),
+        s3_uri="s3://bucket1/cosmos3_vfm/cosmos3_ga_text2image/",
+        hf=CheckpointDirHf(
+            repository="nvidia/Cosmos3-Super-Text2Image",
+            revision="main",
+        ),
+        # Self-contained checkpoint: use its bundled processor instead of
+        # downloading the base Cosmos3-Super repo just for the tokenizer.
+        vlm_processor_from_checkpoint=True,
+    ),
 }
 DEFAULT_CHECKPOINT_NAME = "Cosmos3-Nano"
 DEFAULT_CHECKPOINT = _CHECKPOINTS[DEFAULT_CHECKPOINT_NAME]

diff --git a/cosmos_framework/inference/common/args.py b/cosmos_framework/inference/common/args.py
@@ -385,6 +385,17 @@ class CheckpointConfig(pydantic.BaseModel):
     hf: CheckpointDirHf
     """Config for checkpoint on Hugging Face."""
 
+    vlm_processor_from_checkpoint: bool = False
+    """When True, load the VLM text/vision processor from the checkpoint's own
+    bundled files (its local download directory) instead of the repository
+    hardcoded in the model config's ``vlm_config.tokenizer`` node.
+
+    Set this only for self-contained checkpoints that ship their own processor
+    at the repository root (e.g. the task-specialized Text2Image / Image2Video
+    diffusers checkpoints). Avoids a redundant download of the base model repo
+    just to obtain the tokenizer.
+    """
+
     def download(self) -> str:
         return self.hf.download()
 
@@ -404,6 +415,7 @@ class CheckpointArgs(ConfigArgs):
     model_memory_bytes: int | None
 
     checkpoint_hf: CheckpointDirHf | None
+    vlm_processor_from_checkpoint: bool = False
 
     credential_path: str
     use_ema_weights: bool
@@ -443,6 +455,8 @@ class CheckpointOverrides(ConfigOverrides):
 
     checkpoint_hf: Suppress[CheckpointDirHf | None] = None
     """Hugging Face checkpoint directory."""
+    vlm_processor_from_checkpoint: Suppress[bool] = False
+    """Load the VLM processor from the loaded checkpoint instead of a hardcoded repo."""
 
     credential_path: Training[str] = "credentials/gcp_checkpoint.secret"
     """Path to S3 credentials file for remote checkpoint loading."""
@@ -459,6 +473,7 @@ def _build_checkpoint(self, checkpoints: dict[str, CheckpointConfig]):
             self.model_memory_bytes = checkpoint.model_memory_bytes
             self.config_file = checkpoint.config_file
             self.checkpoint_hf = checkpoint.hf
+            self.vlm_processor_from_checkpoint = checkpoint.vlm_processor_from_checkpoint
         elif self.checkpoint_path.startswith("s3://"):
             self.checkpoint_type = CheckpointType.DCP
             self.checkpoint_path = self.checkpoint_path.rstrip("/")

diff --git a/cosmos_framework/inference/inference.py b/cosmos_framework/inference/inference.py
@@ -1050,6 +1050,15 @@ def _create(cls, setup_args: SetupArgs, **kwargs: Any) -> Self:
                 config = None
             else:
                 model_dict = setup_args.load_model_config_dict()
+                if setup_args.vlm_processor_from_checkpoint:
+                    # Source the VLM processor from the loaded checkpoint's own
+                    # bundled files instead of the repository hardcoded in the
+                    # model config. Drops the redundant base-model download.
+                    tokenizer_cfg = model_dict["config"]["vlm_config"]["tokenizer"]
+                    tokenizer_cfg.pop("repository", None)
+                    tokenizer_cfg.pop("revision", None)
+                    tokenizer_cfg.pop("subdir", None)
+                    tokenizer_cfg["tokenizer_type"] = str(checkpoint_path)
                 config = Cosmos3OmniConfig(model=model_dict)
             model = Cosmos3OmniModel.from_pretrained_dcp(
                 checkpoint_path,

diff --git a/cosmos_framework/inference/model.py b/cosmos_framework/inference/model.py
@@ -318,6 +318,16 @@ def set_up_planner(
         missing_keys = set(target_state_dict) - loaded_keys
         if not self.has_vision_weights:
             missing_keys = {key for key in missing_keys if not key.startswith("language_model.visual.")}
+        # Task-specialized checkpoints (e.g. Text2Image, Image2Video) omit the
+        # optional generative-modality projection heads (action, sound), unused
+        # for those tasks. As with vision weights above, tolerate a modality's
+        # missing keys only when the checkpoint ships no keys for that modality.
+        for modality_prefixes in (
+            ("action2llm.", "llm2action.", "action_modality_embed"),
+            ("sound2llm.", "llm2sound.", "sound_modality_embed"),
+        ):
+            if not any(key.startswith(modality_prefixes) for key in loaded_keys):
+                missing_keys = {key for key in missing_keys if not key.startswith(modality_prefixes)}
         if missing_keys:
             sample = sorted(missing_keys)[:10]
             raise ValueError(

diff --git a/docs/faq.md b/docs/faq.md
@@ -225,6 +225,25 @@ Delete the torchinductor cache under the /tmp directory, `rm -rf /tmp/torchinduc
 
 ---
 
+### Q: I get `torch.distributed.DistNetworkError: ... port: 29500 ... EADDRINUSE, address already in use`
+
+`torchrun` defaults its rendezvous to port `29500`. The error means that port is already taken on the node — usually because another `torchrun` job (yours or someone else's on a shared node) is still using it.
+
+Pass a different free port with `--master-port`, placed **before** `-m` (it is a `torchrun` argument, not an inference argument):
+
+```shell
+torchrun --nproc-per-node=8 --master-port=29501 -m cosmos_framework.scripts.inference \
+  --parallelism-preset=throughput \
+  -i "inputs/omni/t2i.json" \
+  -o outputs/omni_t2i \
+  --checkpoint-path Cosmos3-Super-Text2Image \
+  --seed=0
+```
+
+Any free port works (e.g. `29501`, `29510`); give each concurrent job on the same node a distinct port. Alternatively, for single-node runs, `--rdzv-backend=c10d --rdzv-endpoint=localhost:0` lets `torchrun` auto-pick a free port.
+
+---
+
 ## Training
 
 ### Q: I get `torch.cuda.OutOfMemoryError` during training (SFT)