From 5763d5074e6b6abae807cdf2525480f618acd0db Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 06:56:03 -0700 Subject: [PATCH 1/8] Add Cosmos3-Nano GPU smoke tests + GPU CI; self-prep regression inputs - tests/nano_inference_smoke_test.py: 8-GPU Cosmos3-Nano t2vs (text2video + sound) smoke. Asserts a vision.mp4 is produced, then decodes its muxed audio track (PyAV) and checks it is real sound: finite, non-empty, non-constant, above the silence floor. - tests/nano_training_smoke_test.py: 8-GPU Vision SFT 1-iter smoke. Downloads the bridge subset + Wan VAE, converts Cosmos3-Nano -> DCP, runs the 1-iter launcher, and asserts training finishes with a finite loss + a written checkpoint. All run output goes under the pytest tmp dir. - tests/launch_regression_test.py: prepare inputs in-test via the new h100_inputs fixture (download + convert, honoring pre-set env vars, cleaned on teardown) instead of requiring tests/_stage_h100_inputs.sh env vars; re-captured h100 goldens at transformers==4.57.6; map H200 to the h100 goldens key (the GPU CI runs on 8xH200). - tests/{launch_sft_vision_nano_1iter.sh,vision_sft_nano_1iter.toml}: 1-iter SFT recipe fixtures (moved from examples/; the launcher reuses the shared examples/_sft_launcher_common.sh). - .github/workflows/gpu-tests.yml: on push/PR to main, run the 8-GPU smoke tests and the 4-GPU SFT regression on a self-hosted 8xH200 runner. All GPU tests are gated by the gpus()/level() markers + --num-gpus/--levels, so the no-GPU pre-commit CI is unaffected. Verified on 8xH100: nano smoke 2 passed, SFT regression 2 passed. 
--- .github/workflows/gpu-tests.yml | 72 ++++++++++ tests/launch_regression_test.py | 160 +++++++++++++++------ tests/launch_sft_vision_nano_1iter.sh | 17 +++ tests/nano_inference_smoke_test.py | 177 +++++++++++++++++++++++ tests/nano_training_smoke_test.py | 197 ++++++++++++++++++++++++++ tests/vision_sft_nano_1iter.toml | 91 ++++++++++++ 6 files changed, 673 insertions(+), 41 deletions(-) create mode 100644 .github/workflows/gpu-tests.yml create mode 100755 tests/launch_sft_vision_nano_1iter.sh create mode 100644 tests/nano_inference_smoke_test.py create mode 100644 tests/nano_training_smoke_test.py create mode 100644 tests/vision_sft_nano_1iter.toml diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml new file mode 100644 index 0000000..816c06f --- /dev/null +++ b/.github/workflows/gpu-tests.yml @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# GPU regression + smoke tests on a self-hosted 8×H200 runner. +# +# Requires: +# * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs, +# NVIDIA drivers, and `uv` on PATH; +# * an `HF_TOKEN` repository secret (gated dataset/model downloads). +# +# Inputs (Cosmos3-Nano checkpoint, bridge dataset, Wan VAE, Qwen3-VL-8B-Instruct) +# are downloaded/converted in-test and cached in the runner's HF cache; the +# first run is slow (~30 GB Nano + ~16 GB Qwen + DCP convert), later runs reuse +# the cache. +name: GPU Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +# Don't pile up 8-GPU runs: cancel an in-progress run for the same ref when a +# newer commit arrives. 
+concurrency: + group: gpu-tests-${{ github.ref }} + cancel-in-progress: true + +jobs: + gpu-tests: + runs-on: [self-hosted, gpu, h200] + timeout-minutes: 90 + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_DISABLE_XET: "1" + steps: + - uses: actions/checkout@v6 + + - uses: astral-sh/setup-uv@v7 + + - name: Sync environment (cu128-train) + run: uv sync --all-extras --group=cu128-train + + # 8-GPU smoke tests: Cosmos3-Nano t2vs inference (+ sound check) and a + # 1-iter Vision SFT. MAX_GPUS defaults to 8. + - name: Nano smoke tests (8 GPU) + run: | + export LD_LIBRARY_PATH= + uv run --all-extras --group=cu128-train python -m pytest -v -s \ + tests/nano_inference_smoke_test.py tests/nano_training_smoke_test.py \ + --num-gpus=8 --levels=2 -o addopts= + + # SFT loss/grad-norm regression on a 4-GPU subset (h100 goldens; H200 maps + # to the same key). TEST_MAX_GPUS=4 selects the 4-GPU test variant. + - name: SFT regression (4-GPU subset) + env: + TEST_MAX_GPUS: "4" + run: | + export LD_LIBRARY_PATH= + uv run --all-extras --group=cu128-train python -m pytest -v -s \ + tests/launch_regression_test.py --num-gpus=4 --levels=2 -o addopts= + + # Clear the run's heavy artifacts (even on failure) to keep the runner's + # disk bounded: examples/checkpoints (the Cosmos3-Nano DCP + Wan VAE, + # ~30 GB) and the pytest tmp dirs (smoke-test videos + the SFT checkpoint). + # The small examples/data dataset and the HF cache are intentionally kept + # so subsequent runs reuse them. + - name: Clean up run outputs + if: always() + run: | + rm -rf examples/checkpoints || true + rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true diff --git a/tests/launch_regression_test.py b/tests/launch_regression_test.py index 5545766..2952b68 100644 --- a/tests/launch_regression_test.py +++ b/tests/launch_regression_test.py @@ -93,24 +93,36 @@ # below skips the GB200 arch instead of re-running it. 
-def _h100_paths_from_env() -> dict[str, str]: - """Resolve H100 input paths from env vars (set by tests/_stage_h100_inputs.sh). +def _hf_download(args: list[str]) -> str: + """``uvx hf download --quiet`` -> the local path it prints (from the HF cache).""" + result = subprocess.run( + ["uvx", "hf@latest", "download", *args, "--quiet"], + cwd=str(REPO_ROOT), + capture_output=True, + text=True, + ) + if result.returncode != 0: + pytest.fail(f"hf download failed for {args} (exit {result.returncode}):\n{result.stdout}\n{result.stderr}") + lines = [ln.strip() for ln in result.stdout.splitlines() if ln.strip()] + if not lines: + pytest.fail(f"hf download for {args} printed no path:\n{result.stdout}\n{result.stderr}") + return lines[-1] - All four env vars are required because the SFT TOMLs interpolate - ``DATASET_PATH`` / ``WAN_VAE_PATH`` / ``BASE_CHECKPOINT_PATH`` at load time - and the VLM spec passes ``MODEL_PATH`` as a Hydra backbone override. - """ - missing = [ - var - for var in ("DATASET_PATH", "WAN_VAE_PATH", "BASE_CHECKPOINT_PATH", "MODEL_PATH") - if not os.environ.get(var) - ] - if missing: - pytest.skip( - f"H100 regression needs env vars: {missing}. " - "Run tests/_stage_h100_inputs.sh and `source $STAGE_DIR/env.sh` first." 
- ) - return {"vlm_model_path": os.environ["MODEL_PATH"]} + +def _convert_nano_dcp(dest: Path) -> None: + """Convert the Cosmos3-Nano checkpoint to DCP at ``dest`` (Step 2 of docs/training.md).""" + env = os.environ.copy() + env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}" + result = subprocess.run( + [ + sys.executable, "-m", "cosmos_framework.scripts.convert_model_to_dcp", + "-o", str(dest), "--checkpoint-path", "Cosmos3-Nano", + ], + cwd=str(REPO_ROOT), + env=env, + ) + if result.returncode != 0: + pytest.fail(f"convert_model_to_dcp (Cosmos3-Nano) failed with exit code {result.returncode}") def _detect_arch() -> str: @@ -122,17 +134,17 @@ def _detect_arch() -> str: name = torch.cuda.get_device_name(0).upper() if "GB200" in name: return "gb200" - if "H100" in name: + # H200 shares the Hopper kernels with H100 and is treated identically here: + # both map to the ``h100`` goldens key (the GitHub GPU CI runs on 8×H200). + if "H100" in name or "H200" in name: return "h100" return "unknown" -def _resolve_paths(arch: str) -> dict[str, str]: - if arch == "h100": - return _h100_paths_from_env() - if arch == "gb200": - pytest.skip("gb200 inputs not in OSS layout; goldens kept for historical reference only.") - pytest.skip(f"no regression goldens for GPU arch {arch!r}; only h100 supported") +# Pinned revisions mirror tests/_stage_h100_inputs.sh so prepared inputs match +# the captured h100 goldens. +_BRIDGE_REVISION = "46468e12ac0dd36901e9e3240d4fc7620942b5d7" +_QWEN_VL_REVISION = "0c351dd01ed87e9c1b53cbc748cba10e6187ff3b" # Tolerances for ``pytest.approx``. The launch passes ``--deterministic`` and @@ -336,20 +348,30 @@ def _run_torchrun(spec: LaunchSpec, run_dir: Path) -> Path: env["IMAGINAIRE_OUTPUT_ROOT"] = str(run_dir / "output") env.update(spec.extra_env) + # Tee: stream the torchrun output live to stdout (so CI shows training + # progress under ``pytest -s``) while capturing it into the log file. 
with log_file.open("w") as fp: - result = subprocess.run( + proc = subprocess.Popen( cmd, env=env, cwd=str(REPO_ROOT), - stdout=fp, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + text=True, + bufsize=1, ) - if result.returncode != 0: + assert proc.stdout is not None + for line in proc.stdout: + sys.stdout.write(line) + sys.stdout.flush() + fp.write(line) + returncode = proc.wait() + if returncode != 0: # Tolerate harmless PyGIL teardown warnings if training did complete. text = log_file.read_text(errors="replace") if "Done with training" not in text: pytest.fail( - f"{spec.key}: torchrun failed with exit code {result.returncode} " + f"{spec.key}: torchrun failed with exit code {returncode} " "and log does not contain 'Done with training'.\n" f"Log tail:\n{text[-2000:]}" ) @@ -372,13 +394,68 @@ def _require_4_gpus() -> None: pytest.skip(f"requires 4 visible CUDA devices, found {torch.cuda.device_count()}") +@pytest.fixture(scope="module") +def h100_inputs(tmp_path_factory: pytest.TempPathFactory): + """Provide the regression input paths, preparing any not already set in env. + + Mirrors the download/convert steps of ``tests/_stage_h100_inputs.sh`` (it + does NOT set up the environment -- ``uv sync`` and the ``transformers`` + pin still belong to that script / the caller). Honors pre-set env vars (so + ``source env.sh`` still works); anything prepared here goes under a temp + stage dir that is removed on teardown. The four vars are exported because + the SFT TOMLs interpolate ``DATASET_PATH`` / ``WAN_VAE_PATH`` / + ``BASE_CHECKPOINT_PATH`` at load time and the VLM spec passes ``MODEL_PATH`` + as a Hydra backbone override. 
+ """ + arch = _detect_arch() + if arch == "gb200": + pytest.skip("gb200 inputs not in OSS layout; goldens kept for historical reference only.") + if arch != "h100": + pytest.skip(f"no regression goldens for GPU arch {arch!r}; only h100 supported") + if shutil.which("uvx") is None: + pytest.skip("uvx not on PATH -- required to prepare regression inputs") + + stage = tmp_path_factory.mktemp("h100_stage") + set_vars: list[str] = [] + + def _ensure(var: str, value_fn) -> None: + if not os.environ.get(var): + os.environ[var] = str(value_fn()) + set_vars.append(var) + + _ensure( + "DATASET_PATH", + lambda: Path( + _hf_download( + ["--repo-type", "dataset", "nvidia/bridge-v2-subset-synthetic-captions", + "--revision", _BRIDGE_REVISION] + ) + ) / "sft_dataset_bridge", + ) + _ensure("WAN_VAE_PATH", lambda: _hf_download(["Wan-AI/Wan2.2-TI2V-5B", "Wan2.2_VAE.pth"])) + _ensure("MODEL_PATH", lambda: _hf_download(["Qwen/Qwen3-VL-8B-Instruct", "--revision", _QWEN_VL_REVISION])) + + def _make_dcp() -> Path: + dest = stage / "Cosmos3-Nano-DCP" + _convert_nano_dcp(dest) + return dest + + _ensure("BASE_CHECKPOINT_PATH", _make_dcp) + + try: + yield {"vlm_model_path": os.environ["MODEL_PATH"]} + finally: + for var in set_vars: + os.environ.pop(var, None) + shutil.rmtree(stage, ignore_errors=True) + + # --- tests ------------------------------------------------------------------- -def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path) -> None: +def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str, str]) -> None: """Re-run ``spec``'s torchrun command and check loss / grad-norm against goldens.""" arch = _detect_arch() - paths = _resolve_paths(arch) spec = _build_specs(paths)[spec_key] log_path = _run_torchrun(spec, tmp_path) @@ -430,9 +507,9 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path) -> None: @pytest.mark.level(2) @pytest.mark.gpus(4) @pytest.mark.parametrize("spec_key", _SPEC_KEYS, ids=lambda k: k.removeprefix("launch_")) - 
def test_launch_regression(spec_key: str, tmp_path: Path) -> None: + def test_launch_regression(spec_key: str, tmp_path: Path, h100_inputs: dict[str, str]) -> None: """Re-run ``spec``'s torchrun command and check loss / grad-norm against goldens.""" - _assert_spec_matches_goldens(spec_key, tmp_path) + _assert_spec_matches_goldens(spec_key, tmp_path, h100_inputs) if MAX_GPUS == 8: @@ -443,9 +520,9 @@ def test_launch_regression(spec_key: str, tmp_path: Path) -> None: @pytest.mark.parametrize( "spec_key", _SPEC_KEYS_8GPU, ids=lambda k: k.removeprefix("launch_") ) - def test_launch_regression_8gpu(spec_key: str, tmp_path: Path) -> None: + def test_launch_regression_8gpu(spec_key: str, tmp_path: Path, h100_inputs: dict[str, str]) -> None: """8-GPU variant for ``vision_sft_super`` (dp_shard=4 × cp=2).""" - _assert_spec_matches_goldens(spec_key, tmp_path) + _assert_spec_matches_goldens(spec_key, tmp_path, h100_inputs) # Goldens keyed by GPU arch then ``LaunchSpec.key``. Refresh with @@ -463,11 +540,12 @@ def test_launch_regression_8gpu(spec_key: str, tmp_path: Path) -> None: ], }, }, - # Captured 2026-05-27 on a 4 × NVIDIA H100 80GB HBM3 node with seed 42. - # Inputs come from ``tests/_stage_h100_inputs.sh``; VLM model is - # ``Qwen/Qwen3-VL-8B-Instruct``. + # Recaptured 2026-06-03 on a 4 × NVIDIA H100 80GB HBM3 node with seed 42 and + # transformers==4.57.6. VLM model is ``Qwen/Qwen3-VL-8B-Instruct``; inputs are + # prepared in-test by the ``h100_inputs`` fixture (or via + # ``tests/_stage_h100_inputs.sh`` if its env vars are pre-set). "h100": { - # Recaptured 2026-05-27 with deterministic mode off (both ``--deterministic`` + # Recaptured 2026-06-03 with deterministic mode off (both ``--deterministic`` # and ``model.config.deterministic`` are False — the Hopper FMHA # backward refuses to run under PyTorch deterministic mode on H100, see # ``LaunchSpec.deterministic`` and the spec's hydra override). 
The full @@ -475,16 +553,16 @@ def test_launch_regression_8gpu(spec_key: str, tmp_path: Path) -> None: # loss is asserted; iter 1+ drifts because the backward isn't bit-exact, # and even iter-0 grad-norm drifts (so grad_norm is skipped via ``None``). "llava_ov_datapacker": { - "loss": [0.88798, 1.01436, 1.06162, 1.04558, 1.00519, 0.91837, 1.10527, 1.03337, 0.9421, 0.69604], + "loss": [0.88798, 1.01583, 1.06096, 1.05566, 1.00613, 0.91551, 1.10534, 1.03794, 0.94166, 0.69613], "grad_norm": None, }, - # Recaptured 2026-05-27 after the TOML-config rewrite shifted some + # Recaptured 2026-06-03 after the TOML-config rewrite shifted some # defaults. Runs under ``--deterministic`` so loss reproduces bit-exact # across all 10 iters, but grad_norm is non-det because # ``compile.enabled=true`` makes the all-rank reduction not bit-exact # on H100. "vision_sft_nano": { - "loss": [0.2337, 0.2233, 0.2075, 0.2374, 0.2228, 0.2778, 0.2907, 0.223, 0.2125, 0.2699], + "loss": [0.2272, 0.2181, 0.2028, 0.2306, 0.218, 0.2734, 0.2865, 0.2162, 0.2055, 0.2643], "grad_norm": None, }, "vision_sft_super": { diff --git a/tests/launch_sft_vision_nano_1iter.sh b/tests/launch_sft_vision_nano_1iter.sh new file mode 100755 index 0000000..546df96 --- /dev/null +++ b/tests/launch_sft_vision_nano_1iter.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# SMOKE wrapper (test fixture) mirroring examples/launch_sft_vision_nano.sh but +# pointing at the tests/vision_sft_nano_1iter.toml recipe (max_iter=1, +# save_iter=1). Lives under tests/ and reuses the shared launcher helper from +# examples/. Paths below are resolved relative to the repo root by +# _sft_launcher_common.sh. 
+ +TOML_FILE="tests/vision_sft_nano_1iter.toml" +: "${DATASET_PATH:=examples/data/bridge-v2-subset-synthetic-captions/sft_dataset_bridge}" +: "${BASE_CHECKPOINT_PATH:=examples/checkpoints/Cosmos3-Nano}" + +EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }' + +source "$(dirname "${BASH_SOURCE[0]}")/../examples/_sft_launcher_common.sh" diff --git a/tests/nano_inference_smoke_test.py b/tests/nano_inference_smoke_test.py new file mode 100644 index 0000000..d8b90cb --- /dev/null +++ b/tests/nano_inference_smoke_test.py @@ -0,0 +1,177 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""8-GPU smoke test for Cosmos3-Nano text-to-video-with-sound (t2vs) inference. + +Runs the canonical Cosmos3-Nano inference command from ``docs/inference.md`` on +the ``inputs/omni/t2vs.json`` sample (``model_mode=text2video`` + +``enable_sound=True``) on 8 GPUs, and asserts that the run completes, writes a +video, and the muxed audio track is real sound (finite, non-empty, not silence, +not a degenerate/constant signal) -- not numeric goldens (that is +``launch_regression_test.py``'s job). + +The checkpoint (and its sound tokenizer) download from the Hugging Face Hub on +first run and are reused from the HF cache afterward. + +Invocation (inside the inference container, from the repo root, on an 8-GPU +node):: + + pytest -s tests/nano_inference_smoke_test.py --num-gpus=8 --levels=2 -o addopts= + +* ``--num-gpus=8 --levels=2`` matches the markers below; the conftest pins + ``CUDA_VISIBLE_DEVICES`` accordingly. +* ``-o addopts=`` clears the repo ``.pytest.toml`` addopts that reference an + optional plugin not installed in the container. + +Without ``--num-gpus``/``--levels`` (e.g. the no-GPU pre-commit CI) the test is +not collected. 
+""" + +import os +import shutil +import subprocess +import sys +from pathlib import Path + +import pytest + +from cosmos_framework.inference.fixtures.args import MAX_GPUS + +REPO_ROOT = Path(__file__).resolve().parents[1] + +# Distinct from the SFT launcher (50012) and torchrun's default (29500) so a +# concurrent training smoke run does not collide on the rendezvous port. +_MASTER_PORT = 29560 + +# Audio sanity thresholds for the muxed sound track. +_RMS_SILENCE_FLOOR = 1e-4 # below this the track is effectively silence +_PEAK_SANITY_CEIL = 1.5 # decoded float audio should sit within ~[-1, 1] + + +def _run(cmd: list[str], log_file: Path) -> str: + """Run ``cmd`` from the repo root, tee combined output to ``log_file``. + + Inherits the caller's environment (notably the HF cache, so a + previously-downloaded Cosmos3-Nano is reused). Fails the test with the log + tail on a non-zero exit. + """ + env = os.environ.copy() + env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}" + log_file.parent.mkdir(parents=True, exist_ok=True) + returncode, text = _stream(cmd, env, log_file) + if returncode != 0: + pytest.fail( + f"inference failed with exit code {returncode}:\n" + f" {' '.join(cmd)}\n" + f"Log tail:\n{text[-3000:]}" + ) + return text + + +def _stream(cmd: list[str], env: dict, log_file: Path) -> tuple[int, str]: + """Run ``cmd`` and tee its combined output: live to stdout (so CI shows + progress under ``pytest -s``) and into ``log_file`` + a returned string. 
+ """ + captured: list[str] = [] + with log_file.open("w") as fp: + proc = subprocess.Popen( + cmd, env=env, cwd=str(REPO_ROOT), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, + ) + assert proc.stdout is not None + for line in proc.stdout: + sys.stdout.write(line) + sys.stdout.flush() + fp.write(line) + captured.append(line) + returncode = proc.wait() + return returncode, "".join(captured) + + +def _decode_audio_track(mp4_path: Path): + """Decode the muxed audio track of ``mp4_path`` to a (channels, samples) waveform. + + Returns ``(waveform_float64, sample_rate)``. Fails the test if the file has + no audio stream or it decodes to zero frames. + """ + import av + import numpy as np + + with av.open(str(mp4_path)) as container: + audio_streams = container.streams.audio + assert audio_streams, f"{mp4_path} has no audio stream" + astream = audio_streams[0] + sample_rate = int(astream.rate) + chunks = [frame.to_ndarray() for frame in container.decode(astream)] + assert chunks, f"audio stream in {mp4_path} decoded to zero frames" + + orig_dtype = chunks[0].dtype + wav = np.concatenate(chunks, axis=1).astype(np.float64) + if np.issubdtype(orig_dtype, np.integer): + wav = wav / float(np.iinfo(orig_dtype).max) + return wav, sample_rate + + +def _assert_sound_not_noise(mp4_path: Path) -> None: + """Assert the muxed audio is real sound: finite, non-empty, non-silent, non-constant.""" + import numpy as np + + wav, sample_rate = _decode_audio_track(mp4_path) + assert wav.size > 0, f"empty audio in {mp4_path}" + assert sample_rate > 0, f"non-positive sample rate {sample_rate} in {mp4_path}" + assert np.all(np.isfinite(wav)), f"audio in {mp4_path} contains NaN/Inf" + + peak = float(np.max(np.abs(wav))) + rms = float(np.sqrt(np.mean(wav**2))) + std = float(wav.std()) + assert peak <= _PEAK_SANITY_CEIL, f"audio peak {peak} outside expected normalized range" + assert std > 1e-6, f"audio is constant/degenerate (std={std}) in {mp4_path}" + assert rms > 
_RMS_SILENCE_FLOOR, f"audio is silent/near-silent (rms={rms}) in {mp4_path}" + + +@pytest.fixture(scope="module", autouse=True) +def _require_8_gpus() -> None: + """Skip the module unless we can launch an 8-GPU run here.""" + if shutil.which("torchrun") is None: + pytest.skip("torchrun not on PATH -- must run inside the inference container") + try: + import torch + except Exception as exc: # pragma: no cover -- surfaces during dev only + pytest.skip(f"torch unavailable ({exc!r})") + if not torch.cuda.is_available() or torch.cuda.device_count() < 8: + pytest.skip(f"requires 8 visible CUDA devices, found {torch.cuda.device_count()}") + + +# Defined only when the active MAX_GPUS is 8 -- the conftest rejects ``gpus(N)`` +# markers outside ``ALL_NUM_GPUS = (0, 1, MAX_GPUS)``. +if MAX_GPUS == 8: + + @pytest.mark.level(2) + @pytest.mark.gpus(8) + def test_nano_inference_t2vs(tmp_path: Path) -> None: + """Run the docs/inference.md Cosmos3-Nano t2vs command; check the video + its sound.""" + out_dir = tmp_path / "out" + cmd = [ + "torchrun", + "--nproc_per_node=8", + f"--master_port={_MASTER_PORT}", + "-m", + "cosmos_framework.scripts.inference", + "--parallelism-preset=throughput", + "-i", + "inputs/omni/t2vs.json", + "-o", + str(out_dir), + "--checkpoint-path", + "Cosmos3-Nano", + "--seed=0", + ] + _run(cmd, tmp_path / "inference.log") + + videos = list(out_dir.rglob("vision.mp4")) + assert len(videos) == 1, f"expected exactly one vision.mp4 under {out_dir}, found {videos}" + video = videos[0] + assert video.stat().st_size > 0, f"empty output video at {video}" + assert list(out_dir.rglob("sample_outputs.json")), f"no sample_outputs.json under {out_dir}" + + _assert_sound_not_noise(video) diff --git a/tests/nano_training_smoke_test.py b/tests/nano_training_smoke_test.py new file mode 100644 index 0000000..9d1a44b --- /dev/null +++ b/tests/nano_training_smoke_test.py @@ -0,0 +1,197 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""8-GPU smoke test for Cosmos3-Nano SFT training. + +Runs the documented Vision SFT (Cosmos3-Nano) flow from ``docs/training.md`` +end to end on 8 GPUs, capped to a single optimizer step via the +``vision_sft_nano_1iter`` recipe (``max_iter=1``, ``save_iter=1``): + + 1. Step 1 -- download the bridge-v2 subset dataset + the Wan2.2 VAE. + 2. Step 2 -- ``convert_model_to_dcp`` the Cosmos3-Nano checkpoint to DCP. + 3. Step 3 -- run the paired launch shell ``launch_sft_vision_nano_1iter.sh``. + +It asserts only that training completes and writes a checkpoint with a finite +loss (smoke -- no numeric goldens; that is ``launch_regression_test.py``'s job). + +Inputs land in the documented, ``.gitignore``-d default locations +(``examples/data/``, ``examples/checkpoints/``) so they are cached across runs; +the training output goes under the pytest tmp dir, not the repo tree. Steps 1-2 are +skipped when their artifacts already exist. + +Invocation (inside the training container, from the repo root, on an 8-GPU +node):: + + pytest -s tests/nano_training_smoke_test.py --num-gpus=8 --levels=2 -o addopts= + +Without ``--num-gpus``/``--levels`` (e.g. the no-GPU pre-commit CI) the test is +not collected. +""" + +import os +import re +import shutil +import subprocess +import sys +from pathlib import Path + +import pytest + +from cosmos_framework.inference.fixtures.args import MAX_GPUS + +REPO_ROOT = Path(__file__).resolve().parents[1] + +# Documented default locations (all git-ignored). Match the launcher defaults so +# Step 3 needs no path overrides.
+_DATA_DIR = REPO_ROOT / "examples/data/bridge-v2-subset-synthetic-captions" +_DATASET_PATH = _DATA_DIR / "sft_dataset_bridge" +_DATASET_REVISION = "46468e12ac0dd36901e9e3240d4fc7620942b5d7" +_WAN_VAE = REPO_ROOT / "examples/checkpoints/wan22_vae/Wan2.2_VAE.pth" +_DCP_DIR = REPO_ROOT / "examples/checkpoints/Cosmos3-Nano" +_LAUNCHER = "tests/launch_sft_vision_nano_1iter.sh" + +# Distinct from torchrun's default (29500) and the inference smoke port (29560). +_MASTER_PORT = 50112 + + +def _run(cmd: list[str], log_file: Path, extra_env: dict | None = None) -> tuple[int, str]: + """Run ``cmd`` from the repo root, tee combined output to ``log_file``. + + Returns ``(returncode, combined_output)``. Inherits the caller's env (HF + cache, etc.) plus ``PYTHONPATH=.``. + """ + env = os.environ.copy() + env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}" + if extra_env: + env.update(extra_env) + log_file.parent.mkdir(parents=True, exist_ok=True) + # Tee: stream the subprocess output live to stdout (so CI shows progress + # under ``pytest -s``) while capturing it into the log file + a string. 
+ captured: list[str] = [] + with log_file.open("w") as fp: + proc = subprocess.Popen( + cmd, env=env, cwd=str(REPO_ROOT), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, + ) + assert proc.stdout is not None + for line in proc.stdout: + sys.stdout.write(line) + sys.stdout.flush() + fp.write(line) + captured.append(line) + returncode = proc.wait() + return returncode, "".join(captured) + + +def _ensure_inputs(log_dir: Path) -> None: + """Step 1: download the dataset + Wan2.2 VAE if not already present.""" + if not (_DATASET_PATH / "train" / "video_dataset_file.jsonl").is_file(): + rc, out = _run( + [ + "uvx", "hf@latest", "download", "--repo-type", "dataset", + "nvidia/bridge-v2-subset-synthetic-captions", + "--revision", _DATASET_REVISION, + "--local-dir", str(_DATA_DIR), "--quiet", + ], + log_dir / "download_dataset.log", + ) + assert rc == 0, f"dataset download failed (exit {rc}):\n{out[-2000:]}" + assert (_DATASET_PATH / "train" / "video_dataset_file.jsonl").is_file(), ( + f"dataset missing {_DATASET_PATH}/train/video_dataset_file.jsonl after download" + ) + + if not _WAN_VAE.is_file(): + rc, out = _run( + [ + "uvx", "hf@latest", "download", "Wan-AI/Wan2.2-TI2V-5B", "Wan2.2_VAE.pth", + "--local-dir", str(_WAN_VAE.parent), "--quiet", + ], + log_dir / "download_wan_vae.log", + ) + assert rc == 0, f"Wan VAE download failed (exit {rc}):\n{out[-2000:]}" + assert _WAN_VAE.is_file(), f"Wan VAE missing at {_WAN_VAE} after download" + + +def _ensure_dcp(log_dir: Path) -> None: + """Step 2: convert Cosmos3-Nano to DCP if not already present.""" + if _DCP_DIR.is_dir() and any(_DCP_DIR.iterdir()): + return + rc, out = _run( + [ + "python", "-m", "cosmos_framework.scripts.convert_model_to_dcp", + "--checkpoint-path", "Cosmos3-Nano", + "-o", str(_DCP_DIR), + ], + log_dir / "convert_to_dcp.log", + ) + assert rc == 0, f"convert_model_to_dcp failed (exit {rc}):\n{out[-3000:]}" + assert _DCP_DIR.is_dir() and any(_DCP_DIR.iterdir()), f"DCP not written to 
{_DCP_DIR}" + + +def _finite_losses(text: str) -> list[float]: + """Parse per-iteration ``Loss:`` values from the training log. + + Matches the ``iter_speed`` callback line, e.g. + ``Iteration 1: Hit counter: 1/50 | Loss: 0.2302 | Time: ...``. + """ + vals = [] + for m in re.finditer(r"Loss:\s*([-+0-9.eE]+)", text): + try: + v = float(m.group(1)) + except ValueError: + continue + if v == v and abs(v) != float("inf"): # finite (NaN != NaN) + vals.append(v) + return vals + + +@pytest.fixture(scope="module", autouse=True) +def _require_8_gpus() -> None: + """Skip the module unless we can launch an 8-GPU training run here.""" + if shutil.which("torchrun") is None: + pytest.skip("torchrun not on PATH -- must run inside the training container") + if shutil.which("uvx") is None: + pytest.skip("uvx not on PATH -- required to download the dataset / Wan VAE") + try: + import torch + except Exception as exc: # pragma: no cover + pytest.skip(f"torch unavailable ({exc!r})") + if not torch.cuda.is_available() or torch.cuda.device_count() < 8: + pytest.skip(f"requires 8 visible CUDA devices, found {torch.cuda.device_count()}") + + +if MAX_GPUS == 8: + + @pytest.mark.level(2) + @pytest.mark.gpus(8) + def test_nano_sft_vision_1iter(tmp_path: Path) -> None: + """Run the full Vision SFT (Cosmos3-Nano) 1-iter flow and check it trains a step.""" + _ensure_inputs(tmp_path) + _ensure_dcp(tmp_path) + + # Route all run-specific output (launcher logs + the saved checkpoint via + # the harness's IMAGINAIRE_OUTPUT_ROOT) under the pytest tmp dir, which + # pytest auto-cleans. Nothing run-specific is left in the repo tree. 
+ rc, out = _run( + ["bash", _LAUNCHER], + tmp_path / "train.log", + extra_env={ + "MASTER_PORT": str(_MASTER_PORT), + "OUTPUT_ROOT": str(tmp_path / "launcher_out"), + "NPROC_PER_NODE": "8", + }, + ) + assert rc == 0, f"SFT launch failed (exit {rc}):\nLog tail:\n{out[-4000:]}" + + assert "Done with training" in out, f"training did not finish cleanly:\nLog tail:\n{out[-4000:]}" + + losses = _finite_losses(out) + assert losses, f"no finite per-iteration 'Loss:' value found in training log:\n{out[-3000:]}" + + # save_iter=1 -> the trainer logs the DCP checkpoint path it wrote. Its + # location is governed by IMAGINAIRE_OUTPUT_ROOT (the test harness points + # this at a pytest tmp dir), so read it from the log rather than guessing. + saved = re.findall(r"Saved checkpoint to (\S+)", out) + assert saved, f"no 'Saved checkpoint to ...' line in training log (save_iter=1):\n{out[-3000:]}" + ckpt = Path(saved[-1]) + assert ckpt.is_dir() and any(ckpt.iterdir()), f"saved checkpoint dir missing/empty: {ckpt}" diff --git a/tests/vision_sft_nano_1iter.toml b/tests/vision_sft_nano_1iter.toml new file mode 100644 index 0000000..c88eed6 --- /dev/null +++ b/tests/vision_sft_nano_1iter.toml @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# vision_sft_nano — T2V / I2V / V2V vision-only SFT (Qwen3-VL-8B / nano) +# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml. +# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted here). +# +# SMOKE COPY of vision_sft_nano.toml: max_iter=1 + save_iter=1 so it trains a +# single optimizer step and immediately writes a DCP checkpoint. 
+ +[job] +task = "vfm" +experiment = "vision_sft_nano" +project = "cosmos3" +group = "sft" +name = "vision_sft_nano_1iter" +wandb_mode = "disabled" + +[model] +max_num_tokens_after_packing = 45056 +joint_attn_implementation = "two_way" +precision = "bfloat16" # was [model.parallelism].precision + +[model.ema] +enabled = true +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = -1 # -1 = auto from WORLD_SIZE (matches legacy) +data_parallel_replicate_degree = 1 + +[model.compile] +enabled = true # was [model.parallelism].use_torch_compile +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[model.tokenizer] +vae_path = "${oc.env:WAN_VAE_PATH}" + +[optimizer] +betas = [0.9, 0.95] +eps = 1.0e-6 +fused = true +keys_to_select = [ + "moe_gen", + "time_embedder", + "vae2llm", + "llm2vae", +] +lr = 2.0e-5 +weight_decay = 0 # int matches legacy YAML repr +# lr_multipliers intentionally empty for vision SFT (Hydra default {} stands). 
+ +[scheduler] +cycle_lengths = [1000] +f_max = [1.0] +f_min = [0.0] +f_start = [0.0] +verbosity_interval = 0 +warm_up_steps = [50] + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 2 +logging_iter = 1 +max_iter = 1 + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false +# warmup_resolutions omitted (None at experiment level) + +[trainer.callbacks.grad_clip] +clip_norm = 0.1 +force_finite = true + +[checkpoint] +keys_to_skip_loading = ["net_ema."] +load_path = "${oc.env:BASE_CHECKPOINT_PATH}" +save_iter = 1 + +[dataloader_train] +max_sequence_length = 45056 +# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by sample count) +# seed omitted — PackingDataLoader has no seed ctor kwarg From 4bede64171536cdc64931b89481967a845b10b9f Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 07:46:38 -0700 Subject: [PATCH 2/8] Split GPU CI into separate smoke and regression workflows Replace the single gpu-tests.yml (one job, two test steps) with two workflows so the 8-GPU nano smoke tests and the 4-GPU SFT regression run and report independently: - .github/workflows/gpu-smoke-tests.yml: nano t2vs + 1-iter SFT smoke (--num-gpus=8 --levels=2). - .github/workflows/gpu-regression.yml: SFT loss/grad-norm regression (TEST_MAX_GPUS=4, --num-gpus=4 --levels=2). Both run on [self-hosted, gpu, h200] for push/PR to main with pytest -v -s (live logs) and an if: always() cleanup; distinct concurrency groups so they don't cancel each other. 
--- .github/workflows/gpu-regression.yml | 57 +++++++++++++++++++++ .github/workflows/gpu-smoke-tests.yml | 58 +++++++++++++++++++++ .github/workflows/gpu-tests.yml | 72 --------------------------- 3 files changed, 115 insertions(+), 72 deletions(-) create mode 100644 .github/workflows/gpu-regression.yml create mode 100644 .github/workflows/gpu-smoke-tests.yml delete mode 100644 .github/workflows/gpu-tests.yml diff --git a/.github/workflows/gpu-regression.yml b/.github/workflows/gpu-regression.yml new file mode 100644 index 0000000..691daaf --- /dev/null +++ b/.github/workflows/gpu-regression.yml @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# SFT loss/grad-norm regression on a self-hosted 8×H200 runner (4-GPU subset). +# +# Requires: +# * a self-hosted runner labelled [self-hosted, gpu, h200] with >=4 GPUs, +# NVIDIA drivers, and `uv` on PATH; +# * an `HF_TOKEN` repository secret (gated dataset/model downloads). +# +# Inputs (Cosmos3-Nano -> DCP, bridge dataset, Wan VAE, Qwen3-VL-8B-Instruct) +# are downloaded / converted in-test into a temp stage that is removed on +# teardown; raw downloads are cached in the runner's HF cache. The h100 goldens +# are reused on H200 (see _detect_arch). +name: GPU Regression + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: gpu-regression-${{ github.ref }} + cancel-in-progress: true + +jobs: + sft-regression: + runs-on: [self-hosted, gpu, h200] + timeout-minutes: 60 + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_DISABLE_XET: "1" + # Select the 4-GPU regression test variant (uses 4 of the 8 GPUs). 
+ TEST_MAX_GPUS: "4" + steps: + - uses: actions/checkout@v6 + + - uses: astral-sh/setup-uv@v7 + + - name: Sync environment (cu128-train) + run: uv sync --all-extras --group=cu128-train + + # llava_ov_datapacker + vision_sft_nano vs the h100 goldens (H200 maps to + # the same key). -s streams the live training log. + - name: SFT regression (4-GPU subset) + run: | + export LD_LIBRARY_PATH= + uv run --all-extras --group=cu128-train python -m pytest -v -s \ + tests/launch_regression_test.py --num-gpus=4 --levels=2 -o addopts= + + # The h100_inputs fixture removes its DCP stage on teardown; clear the + # pytest tmp dirs too (logs + any run output). The HF cache is kept. + - name: Clean up run outputs + if: always() + run: | + rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true diff --git a/.github/workflows/gpu-smoke-tests.yml b/.github/workflows/gpu-smoke-tests.yml new file mode 100644 index 0000000..7d4f3f4 --- /dev/null +++ b/.github/workflows/gpu-smoke-tests.yml @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Cosmos3-Nano 8-GPU smoke tests on a self-hosted 8×H200 runner. +# +# Requires: +# * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs, +# NVIDIA drivers, and `uv` on PATH; +# * an `HF_TOKEN` repository secret (gated dataset/model downloads). +# +# Inputs (Cosmos3-Nano checkpoint, bridge dataset, Wan VAE) are downloaded / +# converted in-test and cached in the runner's HF cache; the first run is slow +# (~30 GB Nano + DCP convert), later runs reuse the cache. 
+name: GPU Smoke Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: gpu-smoke-tests-${{ github.ref }} + cancel-in-progress: true + +jobs: + nano-smoke: + runs-on: [self-hosted, gpu, h200] + timeout-minutes: 60 + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_DISABLE_XET: "1" + steps: + - uses: actions/checkout@v6 + + - uses: astral-sh/setup-uv@v7 + + - name: Sync environment (cu128-train) + run: uv sync --all-extras --group=cu128-train + + # Cosmos3-Nano t2vs inference (+ sound check) and a 1-iter Vision SFT. + # MAX_GPUS defaults to 8. -s streams the live process log. + - name: Nano smoke tests (8 GPU) + run: | + export LD_LIBRARY_PATH= + uv run --all-extras --group=cu128-train python -m pytest -v -s \ + tests/nano_inference_smoke_test.py tests/nano_training_smoke_test.py \ + --num-gpus=8 --levels=2 -o addopts= + + # Clear the run's heavy artifacts (even on failure): examples/checkpoints + # (the Cosmos3-Nano DCP + Wan VAE, ~30 GB) and the pytest tmp dirs + # (t2vs video + the SFT checkpoint). The small examples/data dataset and + # the HF cache are intentionally kept so subsequent runs reuse them. + - name: Clean up run outputs + if: always() + run: | + rm -rf examples/checkpoints || true + rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml deleted file mode 100644 index 816c06f..0000000 --- a/.github/workflows/gpu-tests.yml +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: OpenMDW-1.1 - -# GPU regression + smoke tests on a self-hosted 8×H200 runner. -# -# Requires: -# * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs, -# NVIDIA drivers, and `uv` on PATH; -# * an `HF_TOKEN` repository secret (gated dataset/model downloads). 
-# -# Inputs (Cosmos3-Nano checkpoint, bridge dataset, Wan VAE, Qwen3-VL-8B-Instruct) -# are downloaded/converted in-test and cached in the runner's HF cache; the -# first run is slow (~30 GB Nano + ~16 GB Qwen + DCP convert), later runs reuse -# the cache. -name: GPU Tests - -on: - push: - branches: [main] - pull_request: - branches: [main] - -# Don't pile up 8-GPU runs: cancel an in-progress run for the same ref when a -# newer commit arrives. -concurrency: - group: gpu-tests-${{ github.ref }} - cancel-in-progress: true - -jobs: - gpu-tests: - runs-on: [self-hosted, gpu, h200] - timeout-minutes: 90 - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_DISABLE_XET: "1" - steps: - - uses: actions/checkout@v6 - - - uses: astral-sh/setup-uv@v7 - - - name: Sync environment (cu128-train) - run: uv sync --all-extras --group=cu128-train - - # 8-GPU smoke tests: Cosmos3-Nano t2vs inference (+ sound check) and a - # 1-iter Vision SFT. MAX_GPUS defaults to 8. - - name: Nano smoke tests (8 GPU) - run: | - export LD_LIBRARY_PATH= - uv run --all-extras --group=cu128-train python -m pytest -v -s \ - tests/nano_inference_smoke_test.py tests/nano_training_smoke_test.py \ - --num-gpus=8 --levels=2 -o addopts= - - # SFT loss/grad-norm regression on a 4-GPU subset (h100 goldens; H200 maps - # to the same key). TEST_MAX_GPUS=4 selects the 4-GPU test variant. - - name: SFT regression (4-GPU subset) - env: - TEST_MAX_GPUS: "4" - run: | - export LD_LIBRARY_PATH= - uv run --all-extras --group=cu128-train python -m pytest -v -s \ - tests/launch_regression_test.py --num-gpus=4 --levels=2 -o addopts= - - # Clear the run's heavy artifacts (even on failure) to keep the runner's - # disk bounded: examples/checkpoints (the Cosmos3-Nano DCP + Wan VAE, - # ~30 GB) and the pytest tmp dirs (smoke-test videos + the SFT checkpoint). - # The small examples/data dataset and the HF cache are intentionally kept - # so subsequent runs reuse them. 
- - name: Clean up run outputs - if: always() - run: | - rm -rf examples/checkpoints || true - rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true From 6ab86a8c66f7740a7c9d33edc38f52c9b54ce6b7 Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 08:04:06 -0700 Subject: [PATCH 3/8] Pick a free torchrun master port in the GPU tests The smoke + regression tests used hardcoded --master_port values (50012/ 50022/50023, 29560/50112), which raise `DistNetworkError: ... EADDRINUSE ... port: 50022` when a port is held by a lingering process, in TIME_WAIT, or a concurrent run. Each test now binds an OS-assigned free port (_free_port) right before launching torchrun and passes it as --master_port / the launcher MASTER_PORT. Dropped the now-unused LaunchSpec.master_port field. Verified on 8xH100: nano training smoke 1 passed, no EADDRINUSE. --- tests/launch_regression_test.py | 15 ++++++++++----- tests/nano_inference_smoke_test.py | 17 +++++++++++++---- tests/nano_training_smoke_test.py | 12 +++++++++--- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/tests/launch_regression_test.py b/tests/launch_regression_test.py index 2952b68..0512d2a 100644 --- a/tests/launch_regression_test.py +++ b/tests/launch_regression_test.py @@ -71,6 +71,7 @@ import os import re import shutil +import socket import subprocess import sys from dataclasses import dataclass, field @@ -85,6 +86,14 @@ # the repo root; we always invoke torchrun from there. 
REPO_ROOT = THIS_DIR.parent + +def _free_port() -> int: + """Return a currently-free TCP port for torchrun's rendezvous, instead of a + hardcoded ``master_port`` that ``EADDRINUSE``s when a prior run lingers.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + # --- per-arch input paths ---------------------------------------------------- # # GB200: the original input snapshot lived on an internal read-only filesystem @@ -178,7 +187,6 @@ class LaunchSpec: key: str # goldens key + pytest parametrize id source sft_toml: str # ``--sft-toml=...`` value, relative to REPO_ROOT - master_port: int extra_hydra_args: tuple[str, ...] loss_re: re.Pattern[str] deterministic_iters: int # how many leading iters are bit-exact deterministic @@ -216,7 +224,6 @@ def _build_specs(paths: dict[str, str]) -> dict[str, LaunchSpec]: # Replicates launch_sft_llava_ov.sh, capped to 10 iters. key="llava_ov_datapacker", sft_toml="examples/toml/sft_config/llava_ov_datapacker.toml", - master_port=50012, extra_hydra_args=( # TAIL_OVERRIDES from launch_sft_llava_ov.sh — fields not modeled # by SFTExperimentConfig. @@ -261,7 +268,6 @@ def _build_specs(paths: dict[str, str]) -> dict[str, LaunchSpec]: # needed beyond the regression-cap overrides below. key="vision_sft_nano", sft_toml="examples/toml/sft_config/vision_sft_nano.toml", - master_port=50022, extra_hydra_args=( "model.config.parallelism.data_parallel_shard_degree=4", "model.config.compile.enabled=true", @@ -280,7 +286,6 @@ def _build_specs(paths: dict[str, str]) -> dict[str, LaunchSpec]: # backbone's compile path is not bit-exact across runs on H100. 
key="vision_sft_super", sft_toml="examples/toml/sft_config/vision_sft_super.toml", - master_port=50023, nproc_per_node=8, extra_hydra_args=( "model.config.parallelism.data_parallel_shard_degree=4", @@ -327,7 +332,7 @@ def _run_torchrun(spec: LaunchSpec, run_dir: Path) -> Path: cmd = [ "torchrun", f"--nproc_per_node={spec.nproc_per_node}", - f"--master_port={spec.master_port}", + f"--master_port={_free_port()}", "-m", "cosmos_framework.scripts.train", f"--sft-toml={spec.sft_toml}", diff --git a/tests/nano_inference_smoke_test.py b/tests/nano_inference_smoke_test.py index d8b90cb..b44d7c0 100644 --- a/tests/nano_inference_smoke_test.py +++ b/tests/nano_inference_smoke_test.py @@ -29,6 +29,7 @@ import os import shutil +import socket import subprocess import sys from pathlib import Path @@ -39,9 +40,17 @@ REPO_ROOT = Path(__file__).resolve().parents[1] -# Distinct from the SFT launcher (50012) and torchrun's default (29500) so a -# concurrent training smoke run does not collide on the rendezvous port. -_MASTER_PORT = 29560 + +def _free_port() -> int: + """Return a currently-free TCP port for torchrun's rendezvous. + + Avoids hardcoded ports that ``EADDRINUSE`` when a prior run's process + lingers or a port is in TIME_WAIT. (Small TOCTOU window between close and + torchrun's bind, acceptable for a single-node test.) + """ + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] # Audio sanity thresholds for the muxed sound track. 
_RMS_SILENCE_FLOOR = 1e-4 # below this the track is effectively silence @@ -154,7 +163,7 @@ def test_nano_inference_t2vs(tmp_path: Path) -> None: cmd = [ "torchrun", "--nproc_per_node=8", - f"--master_port={_MASTER_PORT}", + f"--master_port={_free_port()}", "-m", "cosmos_framework.scripts.inference", "--parallelism-preset=throughput", diff --git a/tests/nano_training_smoke_test.py b/tests/nano_training_smoke_test.py index 9d1a44b..ecdc545 100644 --- a/tests/nano_training_smoke_test.py +++ b/tests/nano_training_smoke_test.py @@ -31,6 +31,7 @@ import os import re import shutil +import socket import subprocess import sys from pathlib import Path @@ -50,8 +51,13 @@ _DCP_DIR = REPO_ROOT / "examples/checkpoints/Cosmos3-Nano" _LAUNCHER = "tests/launch_sft_vision_nano_1iter.sh" -# Distinct from torchrun's default (29500) and the inference smoke port (29560). -_MASTER_PORT = 50112 + +def _free_port() -> int: + """Return a currently-free TCP port for the launcher's torchrun rendezvous + (avoids EADDRINUSE from a hardcoded port / lingering process).""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] def _run(cmd: list[str], log_file: Path, extra_env: dict | None = None) -> tuple[int, str]: @@ -176,7 +182,7 @@ def test_nano_sft_vision_1iter(tmp_path: Path) -> None: ["bash", _LAUNCHER], tmp_path / "train.log", extra_env={ - "MASTER_PORT": str(_MASTER_PORT), + "MASTER_PORT": str(_free_port()), "OUTPUT_ROOT": str(tmp_path / "launcher_out"), "NPROC_PER_NODE": "8", }, From ef331c40331a2ddc01b517deab58a56323b456ec Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 08:34:22 -0700 Subject: [PATCH 4/8] Split regression CI into generator/reasoner workflows; show run log on failure - Replace gpu-regression.yml with two single-spec workflows so the generator (VFM, vision_sft_nano) and reasoner (VLM, llava_ov_datapacker) regressions run and report independently, each via `pytest -k <spec>`:
.github/workflows/gpu-regression-generator.yml .github/workflows/gpu-regression-reasoner.yml - launch_regression_test.py: on a goldens/parse mismatch, include the run-log tail and the got-vs-expected series in the failure message (the log also streams live under `pytest -s`), so failures carry the run detail. --- ...ssion.yml => gpu-regression-generator.yml} | 23 ++++---- .github/workflows/gpu-regression-reasoner.yml | 58 +++++++++++++++++++ tests/launch_regression_test.py | 20 +++++-- 3 files changed, 85 insertions(+), 16 deletions(-) rename .github/workflows/{gpu-regression.yml => gpu-regression-generator.yml} (65%) create mode 100644 .github/workflows/gpu-regression-reasoner.yml diff --git a/.github/workflows/gpu-regression.yml b/.github/workflows/gpu-regression-generator.yml similarity index 65% rename from .github/workflows/gpu-regression.yml rename to .github/workflows/gpu-regression-generator.yml index 691daaf..0de44fc 100644 --- a/.github/workflows/gpu-regression.yml +++ b/.github/workflows/gpu-regression-generator.yml @@ -1,18 +1,17 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: OpenMDW-1.1 -# SFT loss/grad-norm regression on a self-hosted 8×H200 runner (4-GPU subset). +# Generator (VFM) SFT loss regression on a self-hosted 8×H200 runner (4-GPU +# subset). Runs the single ``vision_sft_nano`` spec of +# tests/launch_regression_test.py. # # Requires: # * a self-hosted runner labelled [self-hosted, gpu, h200] with >=4 GPUs, # NVIDIA drivers, and `uv` on PATH; # * an `HF_TOKEN` repository secret (gated dataset/model downloads). # -# Inputs (Cosmos3-Nano -> DCP, bridge dataset, Wan VAE, Qwen3-VL-8B-Instruct) -# are downloaded / converted in-test into a temp stage that is removed on -# teardown; raw downloads are cached in the runner's HF cache. The h100 goldens -# are reused on H200 (see _detect_arch). 
-name: GPU Regression +# The h100 goldens are reused on H200 (see _detect_arch). +name: GPU Regression (Generator) on: push: @@ -21,11 +20,11 @@ on: branches: [main] concurrency: - group: gpu-regression-${{ github.ref }} + group: gpu-regression-generator-${{ github.ref }} cancel-in-progress: true jobs: - sft-regression: + generator-regression: runs-on: [self-hosted, gpu, h200] timeout-minutes: 60 env: @@ -41,13 +40,13 @@ jobs: - name: Sync environment (cu128-train) run: uv sync --all-extras --group=cu128-train - # llava_ov_datapacker + vision_sft_nano vs the h100 goldens (H200 maps to - # the same key). -s streams the live training log. - - name: SFT regression (4-GPU subset) + # Generator (vision_sft_nano) loss vs the h100 goldens. -s streams the live log. + - name: Generator regression (vision_sft_nano, 4-GPU subset) run: | export LD_LIBRARY_PATH= uv run --all-extras --group=cu128-train python -m pytest -v -s \ - tests/launch_regression_test.py --num-gpus=4 --levels=2 -o addopts= + tests/launch_regression_test.py -k vision_sft_nano \ + --num-gpus=4 --levels=2 -o addopts= # The h100_inputs fixture removes its DCP stage on teardown; clear the # pytest tmp dirs too (logs + any run output). The HF cache is kept. diff --git a/.github/workflows/gpu-regression-reasoner.yml b/.github/workflows/gpu-regression-reasoner.yml new file mode 100644 index 0000000..57b8ce7 --- /dev/null +++ b/.github/workflows/gpu-regression-reasoner.yml @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Reasoner (VLM) SFT loss regression on a self-hosted 8×H200 runner (4-GPU +# subset). Runs the single ``llava_ov_datapacker`` spec of +# tests/launch_regression_test.py. +# +# Requires: +# * a self-hosted runner labelled [self-hosted, gpu, h200] with >=4 GPUs, +# NVIDIA drivers, and `uv` on PATH; +# * an `HF_TOKEN` repository secret (gated dataset/model downloads, incl. 
the +# streamed LLaVA-OneVision-Data dataset). +# +# The h100 goldens are reused on H200 (see _detect_arch). +name: GPU Regression (Reasoner) + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: gpu-regression-reasoner-${{ github.ref }} + cancel-in-progress: true + +jobs: + reasoner-regression: + runs-on: [self-hosted, gpu, h200] + timeout-minutes: 60 + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_DISABLE_XET: "1" + # Select the 4-GPU regression test variant (uses 4 of the 8 GPUs). + TEST_MAX_GPUS: "4" + steps: + - uses: actions/checkout@v6 + + - uses: astral-sh/setup-uv@v7 + + - name: Sync environment (cu128-train) + run: uv sync --all-extras --group=cu128-train + + # Reasoner (llava_ov_datapacker) iter-0 loss vs the h100 goldens. -s streams + # the live log. + - name: Reasoner regression (llava_ov_datapacker, 4-GPU subset) + run: | + export LD_LIBRARY_PATH= + uv run --all-extras --group=cu128-train python -m pytest -v -s \ + tests/launch_regression_test.py -k llava_ov_datapacker \ + --num-gpus=4 --levels=2 -o addopts= + + # The h100_inputs fixture removes its DCP stage on teardown; clear the + # pytest tmp dirs too (logs + any run output). The HF cache is kept. 
+ - name: Clean up run outputs + if: always() + run: | + rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true diff --git a/tests/launch_regression_test.py b/tests/launch_regression_test.py index 0512d2a..65b4358 100644 --- a/tests/launch_regression_test.py +++ b/tests/launch_regression_test.py @@ -464,8 +464,12 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str, spec = _build_specs(paths)[spec_key] log_path = _run_torchrun(spec, tmp_path) - loss, grad_norm = _parse_series(log_path.read_text(errors="replace"), spec.loss_re) - assert len(loss) == 10, f"expected 10 iterations, parsed {len(loss)} (loss={loss})" + log_text = log_path.read_text(errors="replace") + loss, grad_norm = _parse_series(log_text, spec.loss_re) + # The run log also streamed live under ``pytest -s``; include its tail in any + # failure message so the run detail is attached to the failure report too. + run_detail = f"\n--- {spec.key} run log (last 4000 chars) ---\n{log_text[-4000:]}" + assert len(loss) == 10, f"expected 10 iterations, parsed {len(loss)} (loss={loss}){run_detail}" # Refresh path: print captured values for manual copy into ``_GOLDENS``. if os.environ.get("COSMOS_REGRESSION_UPDATE_GOLDENS") == "1": @@ -495,14 +499,22 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str, assert loss[:n] == pytest.approx( expected["loss"][:n], rel=_DEFAULT_RTOL, abs=_DEFAULT_ATOL - ), f"{spec.key} ({arch}): rank-0 loss[:{n}] does not match goldens" + ), ( + f"{spec.key} ({arch}): rank-0 loss[:{n}] does not match goldens\n" + f" got : {loss[:n]}\n" + f" expected: {expected['loss'][:n]}{run_detail}" + ) # ``grad_norm`` is optional: ``None`` skips the check when the FSDP # global-norm all-reduce isn't bit-exact on this arch. 
if expected["grad_norm"] is None: return assert grad_norm[:n] == pytest.approx( expected["grad_norm"][:n], rel=_DEFAULT_RTOL, abs=_DEFAULT_ATOL - ), f"{spec.key} ({arch}): global grad-norm[:{n}] does not match goldens" + ), ( + f"{spec.key} ({arch}): global grad-norm[:{n}] does not match goldens\n" + f" got : {grad_norm[:n]}\n" + f" expected: {expected['grad_norm'][:n]}{run_detail}" + ) # Define only the test function matching MAX_GPUS — the conftest rejects From 9f0a7329564c548ebe8f55fcf00b747d5441e040 Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 20:36:38 -0700 Subject: [PATCH 5/8] Expand Nano smoke tests (multi-modality inference + train/export/infer) and split smoke CI - tests/nano_inference_smoke_test.py: one inference call over three modalities (t2vs text2video+sound, action policy, action forward_dynamics); validates each sample's vision.mp4 (PyAV decode), the t2vs audio (not-noise), and the policy action array. - tests/nano_training_smoke_test.py: convert -> train 5 -> export -> t2i-from- export pipeline with per-step checks: DCP + exported-model completeness (file/shard + index counts + tensor-manifest self-consistency, no tensor load), loss-degrades (min(loss) < [... remainder of commit message lost in extraction ...] --- .github/workflows/gpu-smoke-inference.yml | 54 ++++ .../{gpu-smoke-tests.yml => gpu-smoke-training.yml} | 34 +-- tests/launch_sft_vision_nano_1iter.sh | 17 -- tests/launch_sft_vision_nano_5iter.sh | 13 + tests/nano_inference_smoke_test.py | 154 +++++++----- tests/nano_training_smoke_test.py | 237 +++++++++++++++--- ..._1iter.toml => vision_sft_nano_5iter.toml} | 15 +- 7 files changed, 390 insertions(+), 134 deletions(-) create mode 100644 .github/workflows/gpu-smoke-inference.yml rename .github/workflows/{gpu-smoke-tests.yml => gpu-smoke-training.yml} (50%) delete mode 100755 tests/launch_sft_vision_nano_1iter.sh create mode 100755 tests/launch_sft_vision_nano_5iter.sh rename tests/{vision_sft_nano_1iter.toml => vision_sft_nano_5iter.toml} (80%) diff --git a/.github/workflows/gpu-smoke-inference.yml b/.github/workflows/gpu-smoke-inference.yml new file mode
100644 index 0000000..66375be --- /dev/null +++ b/.github/workflows/gpu-smoke-inference.yml @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Cosmos3-Nano 8-GPU multi-modality inference smoke (t2vs + policy + forward_dynamics) on a +# self-hosted 8×H200 runner. +# +# Requires: +# * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs, +# NVIDIA drivers, and `uv` on PATH; +# * an `HF_TOKEN` repository secret (gated model downloads). +# +# The Cosmos3-Nano checkpoint (and its sound tokenizer) download to the runner's +# HF cache; later runs reuse it. +name: GPU Smoke (Inference) + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: gpu-smoke-inference-${{ github.ref }} + cancel-in-progress: true + +jobs: + inference-smoke: + runs-on: [self-hosted, gpu, h200] + timeout-minutes: 60 + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_DISABLE_XET: "1" + steps: + - uses: actions/checkout@v6 + + - uses: astral-sh/setup-uv@v7 + + - name: Sync environment (cu128-train) + run: uv sync --all-extras --group=cu128-train + + # One inference call over t2vs (+sound), action policy, and forward_dynamics; checks each output. + # MAX_GPUS defaults to 8. -s streams the live process log. + - name: Nano inference smoke (t2vs + action policy + forward_dynamics, 8 GPU) + run: | + export LD_LIBRARY_PATH= + uv run --all-extras --group=cu128-train python -m pytest -v -s \ + tests/nano_inference_smoke_test.py --num-gpus=8 --levels=2 -o addopts= + + # Inference writes only the pytest tmp dir (the t2vs video + logs); the + # checkpoint download stays in the HF cache (kept). No examples/ artifacts. 
+ - name: Clean up run outputs + if: always() + run: | + rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true diff --git a/.github/workflows/gpu-smoke-tests.yml b/.github/workflows/gpu-smoke-training.yml similarity index 50% rename from .github/workflows/gpu-smoke-tests.yml rename to .github/workflows/gpu-smoke-training.yml index 7d4f3f4..1d861b2 100644 --- a/.github/workflows/gpu-smoke-tests.yml +++ b/.github/workflows/gpu-smoke-training.yml @@ -1,17 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: OpenMDW-1.1 -# Cosmos3-Nano 8-GPU smoke tests on a self-hosted 8×H200 runner. +# Cosmos3-Nano 8-GPU SFT pipeline smoke test (convert -> train 5 -> export -> +# t2i infer) on a self-hosted 8×H200 runner. # # Requires: # * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs, # NVIDIA drivers, and `uv` on PATH; # * an `HF_TOKEN` repository secret (gated dataset/model downloads). # -# Inputs (Cosmos3-Nano checkpoint, bridge dataset, Wan VAE) are downloaded / -# converted in-test and cached in the runner's HF cache; the first run is slow -# (~30 GB Nano + DCP convert), later runs reuse the cache. -name: GPU Smoke Tests +# Inputs (Cosmos3-Nano -> DCP, bridge dataset, Wan VAE) are downloaded / +# converted in-test and cached under examples/ + the HF cache; the first run is +# slow (~30 GB Nano + DCP convert + 5-step train + export + a t2i generation), +# later runs reuse the cache. 
+name: GPU Smoke (Training) on: push: @@ -20,13 +22,13 @@ on: branches: [main] concurrency: - group: gpu-smoke-tests-${{ github.ref }} + group: gpu-smoke-training-${{ github.ref }} cancel-in-progress: true jobs: - nano-smoke: + training-smoke: runs-on: [self-hosted, gpu, h200] - timeout-minutes: 60 + timeout-minutes: 90 env: HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_DISABLE_XET: "1" @@ -38,19 +40,19 @@ jobs: - name: Sync environment (cu128-train) run: uv sync --all-extras --group=cu128-train - # Cosmos3-Nano t2vs inference (+ sound check) and a 1-iter Vision SFT. + # Full SFT pipeline: download + convert Nano->DCP, train 5 steps (loss + # trend), export to HF safetensors, then a t2i generation from the export. # MAX_GPUS defaults to 8. -s streams the live process log. - - name: Nano smoke tests (8 GPU) + - name: Nano SFT pipeline smoke (convert -> train 5 -> export -> t2i, 8 GPU) run: | export LD_LIBRARY_PATH= uv run --all-extras --group=cu128-train python -m pytest -v -s \ - tests/nano_inference_smoke_test.py tests/nano_training_smoke_test.py \ - --num-gpus=8 --levels=2 -o addopts= + tests/nano_training_smoke_test.py --num-gpus=8 --levels=2 -o addopts= - # Clear the run's heavy artifacts (even on failure): examples/checkpoints - # (the Cosmos3-Nano DCP + Wan VAE, ~30 GB) and the pytest tmp dirs - # (t2vs video + the SFT checkpoint). The small examples/data dataset and - # the HF cache are intentionally kept so subsequent runs reuse them. + # Clear the heavy artifacts (even on failure): examples/checkpoints (the + # Cosmos3-Nano DCP + Wan VAE, ~30 GB) and the pytest tmp dirs (the SFT + # checkpoint + logs). The small examples/data dataset and the HF cache are + # intentionally kept so subsequent runs reuse them. 
- name: Clean up run outputs if: always() run: | diff --git a/tests/launch_sft_vision_nano_1iter.sh b/tests/launch_sft_vision_nano_1iter.sh deleted file mode 100755 index 546df96..0000000 --- a/tests/launch_sft_vision_nano_1iter.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: OpenMDW-1.1 - -# SMOKE wrapper (test fixture) mirroring examples/launch_sft_vision_nano.sh but -# pointing at the tests/vision_sft_nano_1iter.toml recipe (max_iter=1, -# save_iter=1). Lives under tests/ and reuses the shared launcher helper from -# examples/. Paths below are resolved relative to the repo root by -# _sft_launcher_common.sh. - -TOML_FILE="tests/vision_sft_nano_1iter.toml" -: "${DATASET_PATH:=examples/data/bridge-v2-subset-synthetic-captions/sft_dataset_bridge}" -: "${BASE_CHECKPOINT_PATH:=examples/checkpoints/Cosmos3-Nano}" - -EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }' - -source "$(dirname "${BASH_SOURCE[0]}")/../examples/_sft_launcher_common.sh" diff --git a/tests/launch_sft_vision_nano_5iter.sh b/tests/launch_sft_vision_nano_5iter.sh new file mode 100755 index 0000000..0fdf748 --- /dev/null +++ b/tests/launch_sft_vision_nano_5iter.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# SMOKE wrapper (test fixture) for tests/nano_training_smoke_test.py — mirrors +# examples/launch_sft_vision_nano.sh but points at tests/vision_sft_nano_5iter.toml +# (max_iter=5, save_iter=5). Reuses the shared launcher helper from examples/. +# Paths below are resolved relative to the repo root by _sft_launcher_common.sh. 
+ +TOML_FILE="tests/vision_sft_nano_5iter.toml" +: "${DATASET_PATH:=examples/data/bridge-v2-subset-synthetic-captions/sft_dataset_bridge}" +: "${BASE_CHECKPOINT_PATH:=examples/checkpoints/Cosmos3-Nano}" + +EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }' + +source "$(dirname "${BASH_SOURCE[0]}")/../examples/_sft_launcher_common.sh" diff --git a/tests/nano_inference_smoke_test.py b/tests/nano_inference_smoke_test.py index b44d7c0..a2f0e1b 100644 --- a/tests/nano_inference_smoke_test.py +++ b/tests/nano_inference_smoke_test.py @@ -1,32 +1,36 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: OpenMDW-1.1 -"""8-GPU smoke test for Cosmos3-Nano text-to-video-with-sound (t2vs) inference. +"""8-GPU multi-modality inference smoke test for Cosmos3-Nano. -Runs the canonical Cosmos3-Nano inference command from ``docs/inference.md`` on -the ``inputs/omni/t2vs.json`` sample (``model_mode=text2video`` + -``enable_sound=True``) on 8 GPUs, and asserts that the run completes, writes a -video, and the muxed audio track is real sound (finite, non-empty, not silence, -not a degenerate/constant signal) -- not numeric goldens (that is -``launch_regression_test.py``'s job). +Runs ONE ``cosmos_framework.scripts.inference`` call over three input samples of +different modalities (the ``-i`` flag takes a list of files) and validates each +sample's output: -The checkpoint (and its sound tokenizer) download from the Hugging Face Hub on -first run and are reused from the HF cache afterward. + * ``inputs/omni/t2vs.json`` (text2video + sound) -> a ``vision.mp4`` whose + muxed audio is real sound (finite, non-empty, non-silent, non-constant). 
+ * ``inputs/omni/action_forward_dynamics_camera.json`` (forward_dynamics) -> a + ``vision.mp4`` that decodes to at least one valid video frame (``action_path`` + is an input, not an output). + * ``inputs/omni/action_policy_robot.json`` (policy) -> BOTH a ``vision.mp4`` and + a finite, non-empty predicted ``action`` array in ``sample_outputs.json``. + +All three samples produce a video; the policy sample additionally produces an +action and the t2vs sample an audio track. + +Smoke-level only (output validity, not numeric goldens). The checkpoint + its +tokenizers download from the HF Hub on first run and are reused afterward. Invocation (inside the inference container, from the repo root, on an 8-GPU node):: pytest -s tests/nano_inference_smoke_test.py --num-gpus=8 --levels=2 -o addopts= -* ``--num-gpus=8 --levels=2`` matches the markers below; the conftest pins - ``CUDA_VISIBLE_DEVICES`` accordingly. -* ``-o addopts=`` clears the repo ``.pytest.toml`` addopts that reference an - optional plugin not installed in the container. - Without ``--num-gpus``/``--levels`` (e.g. the no-GPU pre-commit CI) the test is not collected. """ +import json import os import shutil import socket @@ -40,47 +44,32 @@ REPO_ROOT = Path(__file__).resolve().parents[1] +_INPUTS = [ + "inputs/omni/t2vs.json", + "inputs/omni/action_policy_robot.json", + "inputs/omni/action_forward_dynamics_camera.json", +] -def _free_port() -> int: - """Return a currently-free TCP port for torchrun's rendezvous. +# Audio sanity thresholds for the muxed sound track. +_RMS_SILENCE_FLOOR = 1e-4 # below this the track is effectively silence +_PEAK_SANITY_CEIL = 1.5 # decoded float audio should sit within ~[-1, 1] - Avoids hardcoded ports that ``EADDRINUSE`` when a prior run's process - lingers or a port is in TIME_WAIT. (Small TOCTOU window between close and - torchrun's bind, acceptable for a single-node test.) 
- """ + +def _free_port() -> int: + """Return a currently-free TCP port for torchrun's rendezvous (avoids + EADDRINUSE from a hardcoded port / lingering process).""" with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] -# Audio sanity thresholds for the muxed sound track. -_RMS_SILENCE_FLOOR = 1e-4 # below this the track is effectively silence -_PEAK_SANITY_CEIL = 1.5 # decoded float audio should sit within ~[-1, 1] - def _run(cmd: list[str], log_file: Path) -> str: - """Run ``cmd`` from the repo root, tee combined output to ``log_file``. - - Inherits the caller's environment (notably the HF cache, so a - previously-downloaded Cosmos3-Nano is reused). Fails the test with the log - tail on a non-zero exit. - """ + """Run ``cmd`` from the repo root, tee combined output (live to stdout under + ``pytest -s`` + into ``log_file``). Inherits the caller's env (HF cache, ...) + plus ``PYTHONPATH=.``. Fails with the log tail on a non-zero exit.""" env = os.environ.copy() env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}" log_file.parent.mkdir(parents=True, exist_ok=True) - returncode, text = _stream(cmd, env, log_file) - if returncode != 0: - pytest.fail( - f"inference failed with exit code {returncode}:\n" - f" {' '.join(cmd)}\n" - f"Log tail:\n{text[-3000:]}" - ) - return text - - -def _stream(cmd: list[str], env: dict, log_file: Path) -> tuple[int, str]: - """Run ``cmd`` and tee its combined output: live to stdout (so CI shows - progress under ``pytest -s``) and into ``log_file`` + a returned string. 
- """ captured: list[str] = [] with log_file.open("w") as fp: proc = subprocess.Popen( @@ -94,14 +83,17 @@ def _stream(cmd: list[str], env: dict, log_file: Path) -> tuple[int, str]: fp.write(line) captured.append(line) returncode = proc.wait() - return returncode, "".join(captured) + text = "".join(captured) + if returncode != 0: + pytest.fail(f"inference failed with exit code {returncode}:\n {' '.join(cmd)}\nLog tail:\n{text[-3000:]}") + return text def _decode_audio_track(mp4_path: Path): """Decode the muxed audio track of ``mp4_path`` to a (channels, samples) waveform. - Returns ``(waveform_float64, sample_rate)``. Fails the test if the file has - no audio stream or it decodes to zero frames. + Returns ``(waveform_float64, sample_rate)``. Fails if there is no audio + stream or it decodes to zero frames. """ import av import numpy as np @@ -138,6 +130,33 @@ def _assert_sound_not_noise(mp4_path: Path) -> None: assert rms > _RMS_SILENCE_FLOOR, f"audio is silent/near-silent (rms={rms}) in {mp4_path}" +def _assert_valid_video(mp4_path: Path) -> None: + """Assert ``mp4_path`` decodes to at least one valid, non-degenerate video frame.""" + import av + + assert mp4_path.is_file() and mp4_path.stat().st_size > 1024, f"video missing/too small: {mp4_path}" + with av.open(str(mp4_path)) as container: + vstreams = container.streams.video + assert vstreams, f"no video stream in {mp4_path}" + width = height = frames = 0 + for frame in container.decode(vstreams[0]): + width, height, frames = frame.width, frame.height, frames + 1 + break + assert frames >= 1 and width > 0 and height > 0, f"no decodable video frame in {mp4_path}" + + +def _assert_valid_action(content: dict, where: str) -> None: + """Assert a policy sample's predicted ``action`` is a non-empty, all-finite array.""" + import numpy as np + + assert isinstance(content, dict) and content.get("action") is not None, ( + f"no 'action' in policy output ({where}); content keys={list(content) if isinstance(content, dict) 
else content}" + ) + arr = np.asarray(content["action"], dtype=np.float64) + assert arr.size > 0, f"empty action output ({where})" + assert np.all(np.isfinite(arr)), f"action output has NaN/Inf ({where})" + + @pytest.fixture(scope="module", autouse=True) def _require_8_gpus() -> None: """Skip the module unless we can launch an 8-GPU run here.""" @@ -157,8 +176,8 @@ def _require_8_gpus() -> None: @pytest.mark.level(2) @pytest.mark.gpus(8) - def test_nano_inference_t2vs(tmp_path: Path) -> None: - """Run the docs/inference.md Cosmos3-Nano t2vs command; check the video + its sound.""" + def test_nano_inference_omni(tmp_path: Path) -> None: + """One Cosmos3-Nano inference call over t2vs + policy + forward_dynamics; check each output.""" out_dir = tmp_path / "out" cmd = [ "torchrun", @@ -168,7 +187,7 @@ def test_nano_inference_t2vs(tmp_path: Path) -> None: "cosmos_framework.scripts.inference", "--parallelism-preset=throughput", "-i", - "inputs/omni/t2vs.json", + *_INPUTS, "-o", str(out_dir), "--checkpoint-path", @@ -177,10 +196,33 @@ def test_nano_inference_t2vs(tmp_path: Path) -> None: ] _run(cmd, tmp_path / "inference.log") - videos = list(out_dir.rglob("vision.mp4")) - assert len(videos) == 1, f"expected exactly one vision.mp4 under {out_dir}, found {videos}" - video = videos[0] - assert video.stat().st_size > 0, f"empty output video at {video}" - assert list(out_dir.rglob("sample_outputs.json")), f"no sample_outputs.json under {out_dir}" + results = sorted(out_dir.rglob("sample_outputs.json")) + assert len(results) == len(_INPUTS), ( + f"expected {len(_INPUTS)} sample_outputs.json (one per input), found {[str(p) for p in results]}" + ) - _assert_sound_not_noise(video) + # Dispatch validation by what each sample produced (robust to model_mode + # string formatting): a vision.mp4 -> valid video (+ sound if enabled); + # an `action` content -> valid action array. 
+ n_video = n_sound = n_action = 0 + for so in results: + data = json.loads(so.read_text()) + args = data.get("args", {}) + content = data["outputs"][0]["content"] + sample_dir = so.parent + video = sample_dir / "vision.mp4" + if video.is_file(): + _assert_valid_video(video) + n_video += 1 + if args.get("enable_sound"): + _assert_sound_not_noise(video) + n_sound += 1 + if isinstance(content, dict) and content.get("action") is not None: + _assert_valid_action(content, str(so)) + n_action += 1 + + # Every sample produces a valid video (t2vs, forward_dynamics, policy); + # the policy sample additionally yields an action, t2vs an audio track. + assert n_video == len(_INPUTS), f"expected every sample to produce a valid video, got {n_video}/{len(_INPUTS)}" + assert n_sound >= 1, f"expected the t2vs sample's audio to be checked, got {n_sound}" + assert n_action >= 1, f"expected the policy sample's action to be checked, got {n_action}" diff --git a/tests/nano_training_smoke_test.py b/tests/nano_training_smoke_test.py index ecdc545..ab7ecf2 100644 --- a/tests/nano_training_smoke_test.py +++ b/tests/nano_training_smoke_test.py @@ -1,23 +1,27 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: OpenMDW-1.1 -"""8-GPU smoke test for Cosmos3-Nano SFT training. +"""8-GPU Cosmos3-Nano SFT pipeline smoke test (train -> export -> infer). -Runs the documented Vision SFT (Cosmos3-Nano) flow from ``docs/training.md`` -end to end on 8 GPUs, capped to a single optimizer step via the -``vision_sft_nano_1iter`` recipe (``max_iter=1``, ``save_iter=1``): +Runs the documented Vision SFT (Cosmos3-Nano) lifecycle from ``docs/training.md`` +end to end on 8 GPUs and validates each artifact: 1. Step 1 -- download the bridge-v2 subset dataset + the Wan2.2 VAE. - 2. Step 2 -- ``convert_model_to_dcp`` the Cosmos3-Nano checkpoint to DCP. - 3. Step 3 -- run the paired launch shell ``launch_sft_vision_nano_1iter.sh``. 
- -It asserts only that training completes and writes a checkpoint with a finite -loss (smoke -- no numeric goldens; that is ``launch_regression_test.py``'s job). - -Inputs land in the documented, ``.gitignore``-d default locations -(``examples/data/``, ``examples/checkpoints/``) so they are cached across runs; -the training output goes under ``outputs/`` (also git-ignored). Steps 1-2 are -skipped when their artifacts already exist. + 2. Step 2 -- ``convert_model_to_dcp`` Cosmos3-Nano -> DCP; check DCP completeness. + 3. Step 3 -- train 5 steps (``vision_sft_nano_5iter``); check the rank-0 loss + drops below its starting value (``min(loss) < loss[0]``; per-step diffusion + loss is too noisy for a strict trend over only 5 steps). + 4. Export -- ``export_model`` the trained DCP -> HF safetensors; check export + completeness (the ``checkpoint.json`` sentinel + config + safetensors). + 5. Inference -- a t2i generation from the exported model; check the image is + valid. + +Smoke-level checks only (artifact validity + a downward loss trend), not numeric +goldens -- that is ``launch_regression_test.py``'s job. + +Inputs land in the documented ``.gitignore``-d locations (``examples/data/``, +``examples/checkpoints/``, cached across runs); run output goes under the pytest +tmp dir. Steps 1-2 are skipped when their artifacts already exist. Invocation (inside the training container, from the repo root, on an 8-GPU node):: @@ -28,6 +32,7 @@ not collected. """ +import json import os import re import shutil @@ -49,7 +54,13 @@ _DATASET_REVISION = "46468e12ac0dd36901e9e3240d4fc7620942b5d7" _WAN_VAE = REPO_ROOT / "examples/checkpoints/wan22_vae/Wan2.2_VAE.pth" _DCP_DIR = REPO_ROOT / "examples/checkpoints/Cosmos3-Nano" -_LAUNCHER = "tests/launch_sft_vision_nano_1iter.sh" +_LAUNCHER = "tests/launch_sft_vision_nano_5iter.sh" + +# rank-0 per-iteration loss from the IterSpeed callback, e.g. +# [RANK 0] Iteration 1: Hit counter: 1/50 | Loss: 0.2302 | Time: ... 
+_RANK0_LOSS_RE = re.compile( + r"\[RANK\s+0\]\s+Iteration\s+\d+:\s+Hit counter:[^|]+\|\s+Loss:\s+([-+0-9.eE]+)" +) def _free_port() -> int: @@ -63,16 +74,15 @@ def _free_port() -> int: def _run(cmd: list[str], log_file: Path, extra_env: dict | None = None) -> tuple[int, str]: """Run ``cmd`` from the repo root, tee combined output to ``log_file``. - Returns ``(returncode, combined_output)``. Inherits the caller's env (HF - cache, etc.) plus ``PYTHONPATH=.``. + Returns ``(returncode, combined_output)``. Streams live to stdout (so CI + shows progress under ``pytest -s``) while capturing into the log + a string. + Inherits the caller's env (HF cache, LD_LIBRARY_PATH, ...) plus ``PYTHONPATH=.``. """ env = os.environ.copy() env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}" if extra_env: env.update(extra_env) log_file.parent.mkdir(parents=True, exist_ok=True) - # Tee: stream the subprocess output live to stdout (so CI shows progress - # under ``pytest -s``) while capturing it into the log file + a string. captured: list[str] = [] with log_file.open("w") as fp: proc = subprocess.Popen( @@ -134,14 +144,10 @@ def _ensure_dcp(log_dir: Path) -> None: assert _DCP_DIR.is_dir() and any(_DCP_DIR.iterdir()), f"DCP not written to {_DCP_DIR}" -def _finite_losses(text: str) -> list[float]: - """Parse per-iteration ``Loss:`` values from the training log. - - Matches the ``iter_speed`` callback line, e.g. - ``Iteration 1: Hit counter: 1/50 | Loss: 0.2302 | Time: ...``. 
- """ +def _rank0_losses(text: str) -> list[float]: + """Parse the rank-0 per-iteration ``Loss:`` series (one value per step).""" vals = [] - for m in re.finditer(r"Loss:\s*([-+0-9.eE]+)", text): + for m in _RANK0_LOSS_RE.finditer(text): try: v = float(m.group(1)) except ValueError: @@ -151,6 +157,119 @@ def _finite_losses(text: str) -> list[float]: return vals +def _safetensors_tensor_names(path: Path) -> set[str]: + """Validate a .safetensors header (8-byte LE length + JSON) and return its tensor names.""" + assert path.is_file() and path.stat().st_size > 8, f"safetensors shard missing/empty: {path}" + with path.open("rb") as f: + header_len = int.from_bytes(f.read(8), "little") + assert 0 < header_len < path.stat().st_size, f"bad safetensors header length in {path}: {header_len}" + header = json.loads(f.read(header_len)) # raises if the header isn't valid JSON + return {k for k in header if k != "__metadata__"} + + +def _assert_dcp_complete(dcp_root: Path) -> None: + """Structural + index-consistency completeness of a torch DCP (no tensor load). + + For each ``.metadata`` under ``dcp_root``: the shard files beside it must all + exist and be non-empty, and the set/count of ``*.distcp`` files on disk must + match the storage files the ``.metadata`` index references (no missing/extra). + Reading ``.metadata`` only parses the index, not the tensors. 
+ """ + assert dcp_root.is_dir(), f"DCP dir missing: {dcp_root}" + metas = list(dcp_root.rglob(".metadata")) + assert metas, f"no DCP .metadata under {dcp_root}" + from torch.distributed.checkpoint import FileSystemReader + + for meta in metas: + assert meta.stat().st_size > 0, f"empty DCP .metadata: {meta}" + present = sorted(p.name for p in meta.parent.glob("*.distcp")) + assert present, f"no .distcp shards beside {meta}" + empty = [s for s in present if (meta.parent / s).stat().st_size == 0] + assert not empty, f"empty .distcp shards beside {meta}: {empty}" + + # Index consistency: the .metadata declares which shard files exist. + metadata = FileSystemReader(str(meta.parent)).read_metadata() + referenced = {getattr(info, "relative_path", None) for info in metadata.storage_data.values()} + referenced.discard(None) + if referenced: # skip only if this reader doesn't expose shard paths + missing = sorted(set(referenced) - set(present)) + assert not missing, ( + f"DCP {meta.parent}: .metadata references {len(referenced)} shard file(s) but " + f"these are missing on disk: {missing}" + ) + assert len(present) == len(referenced), ( + f"DCP {meta.parent}: {len(present)} .distcp file(s) on disk != " + f"{len(referenced)} referenced by .metadata ({present} vs {sorted(referenced)})" + ) + + # Tensor-manifest self-consistency: every tensor the .metadata declares + # (state_dict_metadata) must be backed by storage (no omitted param). 
+ declared = set(metadata.state_dict_metadata.keys()) + stored = {getattr(idx, "fqn", None) for idx in metadata.storage_data.keys()} + stored.discard(None) + assert declared, f"DCP .metadata declares no tensors: {meta}" + if stored: # skip only if storage keys don't expose fqn + unstored = sorted(declared - stored) + assert not unstored, ( + f"DCP {meta.parent}: {len(unstored)} declared tensor(s) have no storage " + f"(omitted): {unstored[:10]}" + ) + + +def _assert_export_complete(model_dir: Path) -> None: + """Structural + index completeness of an exported HF safetensors checkpoint.""" + assert model_dir.is_dir(), f"export dir missing: {model_dir}" + # export_model writes checkpoint.json LAST as the "model is complete" sentinel. + for name in ("checkpoint.json", "config.json"): + p = model_dir / name + assert p.is_file() and p.stat().st_size > 0, f"export missing/empty {name} in {model_dir}" + json.loads(p.read_text()) # valid JSON + index = model_dir / "model.safetensors.index.json" + on_disk = sorted(p.name for p in model_dir.glob("*.safetensors")) + if index.is_file(): + weight_map = json.loads(index.read_text()).get("weight_map", {}) + declared = set(weight_map.keys()) + shards = sorted(set(weight_map.values())) + assert declared and shards, f"empty weight_map in {index}" + missing = sorted(set(shards) - set(on_disk)) + assert not missing, f"export {model_dir}: index references missing shards: {missing}" + # File-count consistency: exactly the index's shards on disk (no extra/missing). + assert len(on_disk) == len(shards), ( + f"export {model_dir}: {len(on_disk)} .safetensors on disk != {len(shards)} in index " + f"weight_map ({on_disk} vs {shards})" + ) + # Tensor-manifest self-consistency: the tensors actually stored across the + # shards must equal the index's declared keys (no omitted/extra param). 
+ stored: set[str] = set() + for shard in shards: + stored |= _safetensors_tensor_names(model_dir / shard) + assert declared == stored, ( + f"export {model_dir}: index declares {len(declared)} tensors but shards hold {len(stored)} " + f"(missing from shards: {sorted(declared - stored)[:10]}; not in index: {sorted(stored - declared)[:10]})" + ) + else: + assert on_disk == ["model.safetensors"], ( + f"export {model_dir}: expected a single model.safetensors (no index), found {on_disk}" + ) + names = _safetensors_tensor_names(model_dir / "model.safetensors") + assert names, f"export {model_dir}: model.safetensors holds no tensors" + + +def _assert_valid_image(path: Path) -> None: + """Assert ``path`` is a valid, non-degenerate image.""" + assert path.is_file() and path.stat().st_size > 1024, f"output image missing/too small: {path}" + try: + from PIL import Image + except Exception: # pragma: no cover -- PIL expected in the env + assert path.read_bytes()[:3] == b"\xff\xd8\xff", f"not a JPEG: {path}" + return + with Image.open(path) as im: + im.verify() # detects truncation/corruption + with Image.open(path) as im: + width, height = im.size + assert width > 0 and height > 0, f"degenerate image size {width}x{height}: {path}" + + @pytest.fixture(scope="module", autouse=True) def _require_8_gpus() -> None: """Skip the module unless we can launch an 8-GPU training run here.""" @@ -170,14 +289,15 @@ def _require_8_gpus() -> None: @pytest.mark.level(2) @pytest.mark.gpus(8) - def test_nano_sft_vision_1iter(tmp_path: Path) -> None: - """Run the full Vision SFT (Cosmos3-Nano) 1-iter flow and check it trains a step.""" + def test_nano_sft_train_export_infer(tmp_path: Path) -> None: + """Full Cosmos3-Nano SFT pipeline: convert -> train 5 -> export -> t2i infer.""" + # 1-2. Inputs + HF->DCP convert, then DCP completeness. 
_ensure_inputs(tmp_path) _ensure_dcp(tmp_path) + _assert_dcp_complete(_DCP_DIR) - # Route all run-specific output (launcher logs + the saved checkpoint via - # the harness's IMAGINAIRE_OUTPUT_ROOT) under the pytest tmp dir, which - # pytest auto-cleans. Nothing run-specific is left in the repo tree. + # 3. Train 5 steps (run output -> pytest tmp via OUTPUT_ROOT + the harness's + # IMAGINAIRE_OUTPUT_ROOT). Free port avoids EADDRINUSE. rc, out = _run( ["bash", _LAUNCHER], tmp_path / "train.log", @@ -188,16 +308,55 @@ def test_nano_sft_vision_1iter(tmp_path: Path) -> None: }, ) assert rc == 0, f"SFT launch failed (exit {rc}):\nLog tail:\n{out[-4000:]}" - assert "Done with training" in out, f"training did not finish cleanly:\nLog tail:\n{out[-4000:]}" - losses = _finite_losses(out) - assert losses, f"no finite per-iteration 'Loss:' value found in training log:\n{out[-3000:]}" + losses = _rank0_losses(out) + assert len(losses) == 5, f"expected 5 rank-0 losses, parsed {losses}\nLog tail:\n{out[-2000:]}" + # Per-step diffusion loss is noisy (a random timestep is sampled each step), + # so a strict trend over just 5 steps flakes on a single noisy step. The + # robust "training is learning" signal is that the loss dropped below its + # starting value at some point. + assert min(losses) < losses[0], ( + f"loss never dropped below the first step over 5 steps (training not degrading): {losses}" + ) - # save_iter=1 -> the trainer logs the DCP checkpoint path it wrote. Its - # location is governed by IMAGINAIRE_OUTPUT_ROOT (the test harness points - # this at a pytest tmp dir), so read it from the log rather than guessing. + # 4. Locate the trained DCP + config, export to HF safetensors, check completeness. saved = re.findall(r"Saved checkpoint to (\S+)", out) - assert saved, f"no 'Saved checkpoint to ...' line in training log (save_iter=1):\n{out[-3000:]}" + assert saved, f"no 'Saved checkpoint to ...' 
line in training log:\n{out[-2000:]}" ckpt = Path(saved[-1]) - assert ckpt.is_dir() and any(ckpt.iterdir()), f"saved checkpoint dir missing/empty: {ckpt}" + assert ckpt.is_dir() and any(ckpt.iterdir()), f"trained checkpoint dir missing/empty: {ckpt}" + run_dir = ckpt.parent.parent # /checkpoints/iter_X -> + config_yaml = run_dir / "config.yaml" + assert config_yaml.is_file(), f"run config.yaml missing at {config_yaml}" + + export_dir = run_dir / "model" + rc, out = _run( + [ + "python", "-m", "cosmos_framework.scripts.export_model", + "--checkpoint-path", str(ckpt), + "--config-file", str(config_yaml), + "-o", str(export_dir), + ], + tmp_path / "export.log", + ) + assert rc == 0, f"export_model failed (exit {rc}):\nLog tail:\n{out[-4000:]}" + _assert_export_complete(export_dir) + + # 5. t2i inference from the exported model; check the image is valid. + infer_out = tmp_path / "exported_out" + rc, out = _run( + [ + "torchrun", "--nproc_per_node=8", f"--master_port={_free_port()}", + "-m", "cosmos_framework.scripts.inference", + "--parallelism-preset=throughput", + "-i", "inputs/omni/t2i.json", + "-o", str(infer_out), + "--checkpoint-path", str(export_dir), + "--seed=0", + ], + tmp_path / "infer.log", + ) + assert rc == 0, f"t2i inference from exported model failed (exit {rc}):\nLog tail:\n{out[-4000:]}" + images = list(infer_out.rglob("vision.jpg")) + assert len(images) == 1, f"expected one vision.jpg under {infer_out}, found {images}" + _assert_valid_image(images[0]) diff --git a/tests/vision_sft_nano_1iter.toml b/tests/vision_sft_nano_5iter.toml similarity index 80% rename from tests/vision_sft_nano_1iter.toml rename to tests/vision_sft_nano_5iter.toml index c88eed6..b88b8b5 100644 --- a/tests/vision_sft_nano_1iter.toml +++ b/tests/vision_sft_nano_5iter.toml @@ -5,15 +5,18 @@ # Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml. # Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted here). 
# -# SMOKE COPY of vision_sft_nano.toml: max_iter=1 + save_iter=1 so it trains a -# single optimizer step and immediately writes a DCP checkpoint. +# SMOKE COPY of vision_sft_nano.toml used by tests/nano_training_smoke_test.py: +# max_iter=5 + save_iter=5 so it trains a few optimizer steps and writes a DCP +# checkpoint at the end. warm_up_steps=1 (vs the production warmup) so the LR is +# at full value almost immediately and the training loss visibly trends down +# across the 5 logged steps (the test asserts min(loss) < loss[0]). [job] task = "vfm" experiment = "vision_sft_nano" project = "cosmos3" group = "sft" -name = "vision_sft_nano_1iter" +name = "vision_sft_nano_5iter" wandb_mode = "disabled" [model] @@ -63,13 +66,13 @@ f_max = [1.0] f_min = [0.0] f_start = [0.0] verbosity_interval = 0 -warm_up_steps = [50] +warm_up_steps = [1] # smoke: full LR almost immediately so loss trends down in 5 steps [trainer] distributed_parallelism = "fsdp" grad_accum_iter = 2 logging_iter = 1 -max_iter = 1 +max_iter = 5 [trainer.callbacks.compile_tokenizer] compile_after_iterations = 3 @@ -83,7 +86,7 @@ force_finite = true [checkpoint] keys_to_skip_loading = ["net_ema."] load_path = "${oc.env:BASE_CHECKPOINT_PATH}" -save_iter = 1 +save_iter = 5 [dataloader_train] max_sequence_length = 45056 From b31ff926e45af422d8320d2ecdb56cc5df0dd7f9 Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 20:40:01 -0700 Subject: [PATCH 6/8] Lint --- tests/launch_sft_vision_nano_5iter.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/launch_sft_vision_nano_5iter.sh b/tests/launch_sft_vision_nano_5iter.sh index 0fdf748..2175dcc 100755 --- a/tests/launch_sft_vision_nano_5iter.sh +++ b/tests/launch_sft_vision_nano_5iter.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: OpenMDW-1.1 + # SMOKE wrapper (test fixture) for tests/nano_training_smoke_test.py — mirrors # examples/launch_sft_vision_nano.sh but points at tests/vision_sft_nano_5iter.toml # (max_iter=5, save_iter=5). Reuses the shared launcher helper from examples/. From 7257ad05fc2906cd3224d77a7e32a89f12f71796 Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 21:09:31 -0700 Subject: [PATCH 7/8] test(regression): assert all 10 reasoner iters at a loose tolerance The llava_ov_datapacker (reasoner) spec previously asserted only iter-0 loss, since it runs non-deterministically (no deterministic Hopper FMHA backward kernel; streamed LLaVA-OneVision data) and iters 1+ drift run-to-run. Give LaunchSpec per-spec loss_rtol/loss_atol (defaulting to the tight 1e-3 the deterministic generator uses) and set the reasoner to assert all 10 iters at rtol=atol=0.01. Two H200 samples differ by at most ~0.006 across the 10 iters, so 0.01 holds with margin while still catching a real numerical regression. Recapture the reasoner goldens to H200 values (iter-0 is bit-exact H100==H200). The generator (vision_sft_nano) is unchanged: still all 10 iters at 1e-3. --- tests/launch_regression_test.py | 46 ++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/tests/launch_regression_test.py b/tests/launch_regression_test.py index 65b4358..2b7eb46 100644 --- a/tests/launch_regression_test.py +++ b/tests/launch_regression_test.py @@ -54,9 +54,10 @@ deterministic context). ``VLMModel.__init__`` honors the config-level flag via ``init_flash_attn_meta`` independently of the launcher arg, so both must be off. It also streams ``lmms-lab/LLaVA-OneVision-Data`` from - HuggingFace Hub, so only the first 2 iters reproduce in practice (later - iters drift with shard arrival order + non-det kernels). Set - ``COSMOS_REGRESSION_VLM_FULL=1`` to assert all 10 (expected to fail). 
+ HuggingFace Hub: iter-0 is bit-exact but iters 1+ drift run-to-run with + shard arrival order + non-det kernels. All 10 iters are asserted, but with + the spec's loose ``loss_rtol``/``loss_atol`` (vs the tight 1e-3 the + deterministic vision spec uses) to absorb that drift. Refreshing the goldens (after an intentional numerical change):: @@ -198,6 +199,11 @@ class LaunchSpec: # the tighter goldens tolerance only on the iters that still reproduce in # practice (see ``deterministic_iters``). deterministic: bool = True + # Per-spec goldens tolerance for ``pytest.approx``. Deterministic specs use + # the tight default; non-deterministic specs (e.g. the reasoner) need a + # looser band to absorb per-step drift across the iters they assert. + loss_rtol: float = _DEFAULT_RTOL + loss_atol: float = _DEFAULT_ATOL # 4-GPU specs run by ``test_launch_regression``; 8-GPU specs run by @@ -251,15 +257,21 @@ def _build_specs(paths: dict[str, str]) -> dict[str, LaunchSpec]: "upload_reproducible_setup=false", ), loss_re=_VLM_LOSS_RE, - # Only iter-0 loss reproduces under non-deterministic mode: it's a - # pure forward on a seed-fixed batch with seed-fixed init weights, - # so it's bit-exact. Iter 1+ depends on iter-0's non-deterministic - # backward (no deterministic Hopper FMHA kernel on H100) and drifts - # immediately. - deterministic_iters=1, + # Non-deterministic spec: iter-0 is bit-exact (pure forward on a + # seed-fixed batch + init), but iters 1+ drift run-to-run (the Hopper + # FMHA backward has no deterministic kernel and the LLaVA-OneVision + # data is streamed). We still assert all 10 iters but with a loose + # tolerance (loss_rtol/loss_atol below) to absorb that drift. + deterministic_iters=10, # See the ``deterministic=false`` override above for the # Hopper-FMHA rationale; the launcher flag is dropped to match. deterministic=False, + # Loose band for the non-deterministic per-step loss (vs the tight + # 1e-3 default the deterministic VFM spec uses). 
Two H200 samples + # differ by at most ~0.006 across the 10 iters, so 0.01 holds with + # margin while still catching a real numerical regression. + loss_rtol=0.01, + loss_atol=0.01, ), "vision_sft_nano": LaunchSpec( # Replicates launch_sft_vision_nano.sh, capped to 10 iters. @@ -494,11 +506,9 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str, ) n = spec.deterministic_iters - if spec.key == "llava_ov_datapacker" and os.environ.get("COSMOS_REGRESSION_VLM_FULL") == "1": - n = 10 assert loss[:n] == pytest.approx( - expected["loss"][:n], rel=_DEFAULT_RTOL, abs=_DEFAULT_ATOL + expected["loss"][:n], rel=spec.loss_rtol, abs=spec.loss_atol ), ( f"{spec.key} ({arch}): rank-0 loss[:{n}] does not match goldens\n" f" got : {loss[:n]}\n" @@ -509,7 +519,7 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str, if expected["grad_norm"] is None: return assert grad_norm[:n] == pytest.approx( - expected["grad_norm"][:n], rel=_DEFAULT_RTOL, abs=_DEFAULT_ATOL + expected["grad_norm"][:n], rel=spec.loss_rtol, abs=spec.loss_atol ), ( f"{spec.key} ({arch}): global grad-norm[:{n}] does not match goldens\n" f" got : {grad_norm[:n]}\n" @@ -565,12 +575,12 @@ def test_launch_regression_8gpu(spec_key: str, tmp_path: Path, h100_inputs: dict # Recaptured 2026-06-03 with deterministic mode off (both ``--deterministic`` # and ``model.config.deterministic`` are False — the Hopper FMHA # backward refuses to run under PyTorch deterministic mode on H100, see - # ``LaunchSpec.deterministic`` and the spec's hydra override). The full - # 10-iter series is captured for reference, but only ``deterministic_iters=1`` - # loss is asserted; iter 1+ drifts because the backward isn't bit-exact, - # and even iter-0 grad-norm drifts (so grad_norm is skipped via ``None``). + # ``LaunchSpec.deterministic`` and the spec's hydra override). These are + # H200 values (iter-0 is bit-exact H100==H200). 
All 10 iters are asserted + # but against the spec's loose tolerance (loss_rtol/loss_atol=0.01) since + # iters 1+ drift run-to-run; grad-norm is non-det too, so skipped (None). "llava_ov_datapacker": { - "loss": [0.88798, 1.01583, 1.06096, 1.05566, 1.00613, 0.91551, 1.10534, 1.03794, 0.94166, 0.69613], + "loss": [0.88798, 1.01444, 1.0565, 1.04765, 0.99979, 0.92324, 1.1051, 1.03238, 0.93775, 0.69643], "grad_norm": None, }, # Recaptured 2026-06-03 after the TOML-config rewrite shifted some From 84b853a6fc64a00591779ebd312e681e34d7fd8f Mon Sep 17 00:00:00 2001 From: "liang.feng" Date: Wed, 3 Jun 2026 21:25:55 -0700 Subject: [PATCH 8/8] Rename for order --- .../{gpu-smoke-training.yml => 0-gpu-smoke-training.yml} | 0 ...pu-regression-generator.yml => 1-gpu-regression-generator.yml} | 0 .../{gpu-smoke-inference.yml => 2-gpu-smoke-inference.yml} | 0 ...{gpu-regression-reasoner.yml => 3-gpu-regression-reasoner.yml} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{gpu-smoke-training.yml => 0-gpu-smoke-training.yml} (100%) rename .github/workflows/{gpu-regression-generator.yml => 1-gpu-regression-generator.yml} (100%) rename .github/workflows/{gpu-smoke-inference.yml => 2-gpu-smoke-inference.yml} (100%) rename .github/workflows/{gpu-regression-reasoner.yml => 3-gpu-regression-reasoner.yml} (100%) diff --git a/.github/workflows/gpu-smoke-training.yml b/.github/workflows/0-gpu-smoke-training.yml similarity index 100% rename from .github/workflows/gpu-smoke-training.yml rename to .github/workflows/0-gpu-smoke-training.yml diff --git a/.github/workflows/gpu-regression-generator.yml b/.github/workflows/1-gpu-regression-generator.yml similarity index 100% rename from .github/workflows/gpu-regression-generator.yml rename to .github/workflows/1-gpu-regression-generator.yml diff --git a/.github/workflows/gpu-smoke-inference.yml b/.github/workflows/2-gpu-smoke-inference.yml similarity index 100% rename from .github/workflows/gpu-smoke-inference.yml 
rename to .github/workflows/2-gpu-smoke-inference.yml diff --git a/.github/workflows/gpu-regression-reasoner.yml b/.github/workflows/3-gpu-regression-reasoner.yml similarity index 100% rename from .github/workflows/gpu-regression-reasoner.yml rename to .github/workflows/3-gpu-regression-reasoner.yml