From 0fd3d5504c4073aa06a9f0e74198151626413e43 Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 03:17:20 -0700 Subject: [PATCH] Support Cosmos3-Super task-specialized (Text2Image / Image2Video) checkpoints These task-specialized diffusers checkpoints reuse the Cosmos3-Super architecture but omit unused modality weights and bundle their own VLM processor. Loading them previously failed, and the processor pulled a redundant full base-Super download. - inference/model.py: tolerate absent action/sound projection-head weights in the diffusers load planner, mirroring the existing vision carve-out. Fixes the masked "TypeError: cannot pickle code objects" that surfaced when DCP tried to broadcast the missing-tensor ValueError across ranks. No-op for self-consistent base checkpoints: Nano/Super provide all modality weights, so the guards never fire. - inference: add CheckpointConfig.vlm_processor_from_checkpoint. When set, the loader sources the VLM processor from the loaded checkpoint's own bundled files instead of the repository hardcoded in the model config, avoiding a redundant base-Super download. Enabled for the two task checkpoints; base Nano/Super keep their configured repository. - docs/faq.md: add EADDRINUSE / --master-port entry. Verified: Text2Image (t2i) and Image2Video (i2v) load and generate; a full base Cosmos3-Nano t2i run is unchanged with strict weight loading intact (carve-out never triggers). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- cosmos_framework/inference/args.py | 27 +++++++++++++++++++++++ cosmos_framework/inference/common/args.py | 15 +++++++++++++ cosmos_framework/inference/inference.py | 9 ++++++++ cosmos_framework/inference/model.py | 10 +++++++++ docs/faq.md | 19 ++++++++++++++++ 5 files changed, 80 insertions(+) diff --git a/cosmos_framework/inference/args.py b/cosmos_framework/inference/args.py index c818ed1..44dcf76 100644 --- a/cosmos_framework/inference/args.py +++ b/cosmos_framework/inference/args.py @@ -869,6 +869,33 @@ def build_sample(self, *, model_config: Any) -> OmniSampleArgs: revision="main", ), ), + # Task-specialized Super variants published as diffusers HF checkpoints. + # s3_uri is unused for HF-backed checkpoints (kept for parity with the + # registry schema); the architecture lives in each model YAML. + "Cosmos3-Super-Image2Video": CheckpointConfig( + model_memory_bytes=MODEL_MEMORY_BYTES_BY_SIZE["32B"], + config_file=str(CONFIG_DIR / "model/Cosmos3-Super.yaml"), + s3_uri="s3://bucket1/cosmos3_vfm/cosmos3_ga_image2video/", + hf=CheckpointDirHf( + repository="nvidia/Cosmos3-Super-Image2Video", + revision="main", + ), + # Self-contained checkpoint: use its bundled processor instead of + # downloading the base Cosmos3-Super repo just for the tokenizer. + vlm_processor_from_checkpoint=True, + ), + "Cosmos3-Super-Text2Image": CheckpointConfig( + model_memory_bytes=MODEL_MEMORY_BYTES_BY_SIZE["32B"], + config_file=str(CONFIG_DIR / "model/Cosmos3-Super.yaml"), + s3_uri="s3://bucket1/cosmos3_vfm/cosmos3_ga_text2image/", + hf=CheckpointDirHf( + repository="nvidia/Cosmos3-Super-Text2Image", + revision="main", + ), + # Self-contained checkpoint: use its bundled processor instead of + # downloading the base Cosmos3-Super repo just for the tokenizer. 
+ vlm_processor_from_checkpoint=True, + ), } DEFAULT_CHECKPOINT_NAME = "Cosmos3-Nano" DEFAULT_CHECKPOINT = _CHECKPOINTS[DEFAULT_CHECKPOINT_NAME] diff --git a/cosmos_framework/inference/common/args.py b/cosmos_framework/inference/common/args.py index 7c1c5cb..8d7cd5a 100644 --- a/cosmos_framework/inference/common/args.py +++ b/cosmos_framework/inference/common/args.py @@ -385,6 +385,17 @@ class CheckpointConfig(pydantic.BaseModel): hf: CheckpointDirHf """Config for checkpoint on Hugging Face.""" + vlm_processor_from_checkpoint: bool = False + """When True, load the VLM text/vision processor from the checkpoint's own + bundled files (its local download directory) instead of the repository + hardcoded in the model config's ``vlm_config.tokenizer`` node. + + Set this only for self-contained checkpoints that ship their own processor + at the repository root (e.g. the task-specialized Text2Image / Image2Video + diffusers checkpoints). Avoids a redundant download of the base model repo + just to obtain the tokenizer. 
+ """ + def download(self) -> str: return self.hf.download() @@ -404,6 +415,7 @@ class CheckpointArgs(ConfigArgs): model_memory_bytes: int | None checkpoint_hf: CheckpointDirHf | None + vlm_processor_from_checkpoint: bool = False credential_path: str use_ema_weights: bool @@ -443,6 +455,8 @@ class CheckpointOverrides(ConfigOverrides): checkpoint_hf: Suppress[CheckpointDirHf | None] = None """Hugging Face checkpoint directory.""" + vlm_processor_from_checkpoint: Suppress[bool] = False + """Load the VLM processor from the loaded checkpoint instead of a hardcoded repo.""" credential_path: Training[str] = "credentials/gcp_checkpoint.secret" """Path to S3 credentials file for remote checkpoint loading.""" @@ -459,6 +473,7 @@ def _build_checkpoint(self, checkpoints: dict[str, CheckpointConfig]): self.model_memory_bytes = checkpoint.model_memory_bytes self.config_file = checkpoint.config_file self.checkpoint_hf = checkpoint.hf + self.vlm_processor_from_checkpoint = checkpoint.vlm_processor_from_checkpoint elif self.checkpoint_path.startswith("s3://"): self.checkpoint_type = CheckpointType.DCP self.checkpoint_path = self.checkpoint_path.rstrip("/") diff --git a/cosmos_framework/inference/inference.py b/cosmos_framework/inference/inference.py index 202805b..f7cf798 100644 --- a/cosmos_framework/inference/inference.py +++ b/cosmos_framework/inference/inference.py @@ -1050,6 +1050,15 @@ def _create(cls, setup_args: SetupArgs, **kwargs: Any) -> Self: config = None else: model_dict = setup_args.load_model_config_dict() + if setup_args.vlm_processor_from_checkpoint: + # Source the VLM processor from the loaded checkpoint's own + # bundled files instead of the repository hardcoded in the + # model config. Drops the redundant base-model download. 
+ tokenizer_cfg = model_dict["config"]["vlm_config"]["tokenizer"] + tokenizer_cfg.pop("repository", None) + tokenizer_cfg.pop("revision", None) + tokenizer_cfg.pop("subdir", None) + tokenizer_cfg["tokenizer_type"] = str(checkpoint_path) config = Cosmos3OmniConfig(model=model_dict) model = Cosmos3OmniModel.from_pretrained_dcp( checkpoint_path, diff --git a/cosmos_framework/inference/model.py b/cosmos_framework/inference/model.py index de7dbb8..8904e6e 100644 --- a/cosmos_framework/inference/model.py +++ b/cosmos_framework/inference/model.py @@ -318,6 +318,16 @@ def set_up_planner( missing_keys = set(target_state_dict) - loaded_keys if not self.has_vision_weights: missing_keys = {key for key in missing_keys if not key.startswith("language_model.visual.")} + # Task-specialized checkpoints (e.g. Text2Image, Image2Video) omit the + # optional generative-modality projection heads (action, sound). They + # are unused for those tasks, so tolerate their absence the same way + # vision weights are tolerated when the checkpoint provides none of them. + for modality_prefixes in ( + ("action2llm.", "llm2action.", "action_modality_embed"), + ("sound2llm.", "llm2sound.", "sound_modality_embed"), + ): + if not any(key.startswith(modality_prefixes) for key in loaded_keys): + missing_keys = {key for key in missing_keys if not key.startswith(modality_prefixes)} if missing_keys: sample = sorted(missing_keys)[:10] raise ValueError( diff --git a/docs/faq.md b/docs/faq.md index f9adbdb..4ddf47e 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -225,6 +225,25 @@ Delete the torchinductor cache under the /tmp directory, `rm -rf /tmp/torchinduc --- +### Q: I get `torch.distributed.DistNetworkError: ... port: 29500 ... EADDRINUSE, address already in use` + +`torchrun` defaults its rendezvous to port `29500`. The error means that port is already taken on the node — usually because another `torchrun` job (yours or someone else's on a shared node) is still using it. 
+ +Pass a different free port with `--master-port`, placed **before** `-m` (it is a `torchrun` argument, not an inference argument): + +```shell +torchrun --nproc-per-node=8 --master-port=29501 -m cosmos_framework.scripts.inference \ + --parallelism-preset=throughput \ + -i "inputs/omni/t2i.json" \ + -o outputs/omni_t2i \ + --checkpoint-path Cosmos3-Super-Text2Image \ + --seed=0 +``` + +Any free port works (e.g. `29501`, `29510`); give each concurrent job on the same node a distinct port. Alternatively, `--rdzv-backend=c10d --rdzv-endpoint=localhost:0` lets `torchrun` auto-pick a free port. + +--- + ## Training ### Q: I get `torch.cuda.OutOfMemoryError` during training (SFT)