Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions cosmos_framework/inference/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,33 @@ def build_sample(self, *, model_config: Any) -> OmniSampleArgs:
revision="main",
),
),
# Task-specialized Super variants published as diffusers HF checkpoints.
# s3_uri is unused for HF-backed checkpoints (kept for parity with the
# registry schema); the architecture lives in each model YAML.
"Cosmos3-Super-Image2Video": CheckpointConfig(
model_memory_bytes=MODEL_MEMORY_BYTES_BY_SIZE["32B"],
config_file=str(CONFIG_DIR / "model/Cosmos3-Super.yaml"),
s3_uri="s3://bucket1/cosmos3_vfm/cosmos3_ga_image2video/",
hf=CheckpointDirHf(
repository="nvidia/Cosmos3-Super-Image2Video",
revision="main",
),
# Self-contained checkpoint: use its bundled processor instead of
# downloading the base Cosmos3-Super repo just for the tokenizer.
vlm_processor_from_checkpoint=True,
),
"Cosmos3-Super-Text2Image": CheckpointConfig(
model_memory_bytes=MODEL_MEMORY_BYTES_BY_SIZE["32B"],
config_file=str(CONFIG_DIR / "model/Cosmos3-Super.yaml"),
Copy link
Copy Markdown
Collaborator

@foreverlms foreverlms Jun 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that these two specialized models can simply reuse the Super YAML file, i.e. that they do not need their own specialized training config?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we have run the tests and verified this.

s3_uri="s3://bucket1/cosmos3_vfm/cosmos3_ga_text2image/",
hf=CheckpointDirHf(
repository="nvidia/Cosmos3-Super-Text2Image",
revision="main",
),
# Self-contained checkpoint: use its bundled processor instead of
# downloading the base Cosmos3-Super repo just for the tokenizer.
vlm_processor_from_checkpoint=True,
),
}
DEFAULT_CHECKPOINT_NAME = "Cosmos3-Nano"
DEFAULT_CHECKPOINT = _CHECKPOINTS[DEFAULT_CHECKPOINT_NAME]
Expand Down
15 changes: 15 additions & 0 deletions cosmos_framework/inference/common/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,17 @@ class CheckpointConfig(pydantic.BaseModel):
hf: CheckpointDirHf
"""Config for checkpoint on Hugging Face."""

vlm_processor_from_checkpoint: bool = False
"""When True, load the VLM text/vision processor from the checkpoint's own
bundled files (its local download directory) instead of the repository
hardcoded in the model config's ``vlm_config.tokenizer`` node.

Set this only for self-contained checkpoints that ship their own processor
at the repository root (e.g. the task-specialized Text2Image / Image2Video
diffusers checkpoints). Avoids a redundant download of the base model repo
just to obtain the tokenizer.
"""

def download(self) -> str:
return self.hf.download()

Expand All @@ -404,6 +415,7 @@ class CheckpointArgs(ConfigArgs):
model_memory_bytes: int | None

checkpoint_hf: CheckpointDirHf | None
vlm_processor_from_checkpoint: bool = False

credential_path: str
use_ema_weights: bool
Expand Down Expand Up @@ -443,6 +455,8 @@ class CheckpointOverrides(ConfigOverrides):

checkpoint_hf: Suppress[CheckpointDirHf | None] = None
"""Hugging Face checkpoint directory."""
vlm_processor_from_checkpoint: Suppress[bool] = False
"""Load the VLM processor from the loaded checkpoint instead of a hardcoded repo."""

credential_path: Training[str] = "credentials/gcp_checkpoint.secret"
"""Path to S3 credentials file for remote checkpoint loading."""
Expand All @@ -459,6 +473,7 @@ def _build_checkpoint(self, checkpoints: dict[str, CheckpointConfig]):
self.model_memory_bytes = checkpoint.model_memory_bytes
self.config_file = checkpoint.config_file
self.checkpoint_hf = checkpoint.hf
self.vlm_processor_from_checkpoint = checkpoint.vlm_processor_from_checkpoint
elif self.checkpoint_path.startswith("s3://"):
self.checkpoint_type = CheckpointType.DCP
self.checkpoint_path = self.checkpoint_path.rstrip("/")
Expand Down
9 changes: 9 additions & 0 deletions cosmos_framework/inference/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,15 @@ def _create(cls, setup_args: SetupArgs, **kwargs: Any) -> Self:
config = None
else:
model_dict = setup_args.load_model_config_dict()
if setup_args.vlm_processor_from_checkpoint:
# Source the VLM processor from the loaded checkpoint's own
# bundled files instead of the repository hardcoded in the
# model config. Drops the redundant base-model download.
tokenizer_cfg = model_dict["config"]["vlm_config"]["tokenizer"]
tokenizer_cfg.pop("repository", None)
tokenizer_cfg.pop("revision", None)
tokenizer_cfg.pop("subdir", None)
tokenizer_cfg["tokenizer_type"] = str(checkpoint_path)
config = Cosmos3OmniConfig(model=model_dict)
model = Cosmos3OmniModel.from_pretrained_dcp(
checkpoint_path,
Expand Down
10 changes: 10 additions & 0 deletions cosmos_framework/inference/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,16 @@ def set_up_planner(
missing_keys = set(target_state_dict) - loaded_keys
if not self.has_vision_weights:
missing_keys = {key for key in missing_keys if not key.startswith("language_model.visual.")}
# Task-specialized checkpoints (e.g. Text2Image, Image2Video) omit the
# optional generative-modality projection heads (action, sound). They
# are unused for those tasks, so tolerate their absence the same way
# vision weights are tolerated when the checkpoint provides none of them.
for modality_prefixes in (
("action2llm.", "llm2action.", "action_modality_embed"),
("sound2llm.", "llm2sound.", "sound_modality_embed"),
):
if not any(key.startswith(modality_prefixes) for key in loaded_keys):
missing_keys = {key for key in missing_keys if not key.startswith(modality_prefixes)}
if missing_keys:
sample = sorted(missing_keys)[:10]
raise ValueError(
Expand Down
19 changes: 19 additions & 0 deletions docs/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,25 @@ Delete the torchinductor cache under the /tmp directory, `rm -rf /tmp/torchinduc

---

### Q: I get `torch.distributed.DistNetworkError: ... port: 29500 ... EADDRINUSE, address already in use`

`torchrun` defaults its rendezvous to port `29500`. The error means that port is already taken on the node — usually because another `torchrun` job (yours or someone else's on a shared node) is still using it.

Pass a different free port with `--master-port`, placed **before** `-m` (it is a `torchrun` argument, not an inference argument):

```shell
torchrun --nproc-per-node=8 --master-port=29501 -m cosmos_framework.scripts.inference \
--parallelism-preset=throughput \
-i "inputs/omni/t2i.json" \
-o outputs/omni_t2i \
--checkpoint-path Cosmos3-Super-Text2Image \
--seed=0
```

Any free port works (e.g. `29501`, `29510`); give each concurrent job on the same node a distinct port. Alternatively, for single-node runs, `--rdzv-backend=c10d --rdzv-endpoint=localhost:0` lets `torchrun` auto-pick a free port.

---

## Training

### Q: I get `torch.cuda.OutOfMemoryError` during training (SFT)
Expand Down