From 5763d5074e6b6abae807cdf2525480f618acd0db Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 06:56:03 -0700
Subject: [PATCH 1/8] Add Cosmos3-Nano GPU smoke tests + GPU CI; self-prep
 regression inputs

- tests/nano_inference_smoke_test.py: 8-GPU Cosmos3-Nano t2vs (text2video +
  sound) smoke. Asserts a vision.mp4 is produced, then decodes its muxed audio
  track (PyAV) and checks it is real sound: finite, non-empty, non-constant,
  above the silence floor.
- tests/nano_training_smoke_test.py: 8-GPU Vision SFT 1-iter smoke. Downloads
  the bridge subset + Wan VAE, converts Cosmos3-Nano -> DCP, runs the 1-iter
  launcher, and asserts training finishes with a finite loss + a written
  checkpoint. All run output goes under the pytest tmp dir.
- tests/launch_regression_test.py: prepare inputs in-test via the new
  h100_inputs fixture (download + convert, honoring pre-set env vars, cleaned
  on teardown) instead of requiring tests/_stage_h100_inputs.sh env vars;
  re-captured h100 goldens at transformers==4.57.6; map H200 to the h100
  goldens key (the GPU CI runs on 8xH200).
- tests/{launch_sft_vision_nano_1iter.sh,vision_sft_nano_1iter.toml}: 1-iter
  SFT recipe fixtures (moved from examples/; the launcher reuses the shared
  examples/_sft_launcher_common.sh).
- .github/workflows/gpu-tests.yml: on push/PR to main, run the 8-GPU smoke
  tests and the 4-GPU SFT regression on a self-hosted 8xH200 runner.

All GPU tests are gated by the gpus()/level() markers + --num-gpus/--levels, so
the no-GPU pre-commit CI is unaffected. Verified on 8xH100: nano smoke 2 passed,
SFT regression 2 passed.
---
 .github/workflows/gpu-tests.yml       |  72 ++++++++++
 tests/launch_regression_test.py       | 160 +++++++++++++++------
 tests/launch_sft_vision_nano_1iter.sh |  17 +++
 tests/nano_inference_smoke_test.py    | 177 +++++++++++++++++++++++
 tests/nano_training_smoke_test.py     | 197 ++++++++++++++++++++++++++
 tests/vision_sft_nano_1iter.toml      |  91 ++++++++++++
 6 files changed, 673 insertions(+), 41 deletions(-)
 create mode 100644 .github/workflows/gpu-tests.yml
 create mode 100755 tests/launch_sft_vision_nano_1iter.sh
 create mode 100644 tests/nano_inference_smoke_test.py
 create mode 100644 tests/nano_training_smoke_test.py
 create mode 100644 tests/vision_sft_nano_1iter.toml

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
new file mode 100644
index 0000000..816c06f
--- /dev/null
+++ b/.github/workflows/gpu-tests.yml
@@ -0,0 +1,75 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# GPU regression + smoke tests on a self-hosted 8×H200 runner.
+#
+# Requires:
+#   * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs,
+#     NVIDIA drivers, and `uv` on PATH;
+#   * an `HF_TOKEN` repository secret (gated dataset/model downloads).
+#
+# Inputs (Cosmos3-Nano checkpoint, bridge dataset, Wan VAE, Qwen3-VL-8B-Instruct)
+# are downloaded/converted in-test and cached in the runner's HF cache; the
+# first run is slow (~30 GB Nano + ~16 GB Qwen + DCP convert), later runs reuse
+# the cache.
+name: GPU Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+# Don't pile up 8-GPU runs: cancel an in-progress run for the same ref when a
+# newer commit arrives.
+concurrency:
+  group: gpu-tests-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  gpu-tests:
+    # Skip PRs opened from forks: they would run untrusted code on the
+    # persistent self-hosted runner, and fork PRs receive no HF_TOKEN secret.
+    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
+    runs-on: [self-hosted, gpu, h200]
+    timeout-minutes: 90
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      HF_HUB_DISABLE_XET: "1"
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: astral-sh/setup-uv@v7
+
+      - name: Sync environment (cu128-train)
+        run: uv sync --all-extras --group=cu128-train
+
+      # 8-GPU smoke tests: Cosmos3-Nano t2vs inference (+ sound check) and a
+      # 1-iter Vision SFT. MAX_GPUS defaults to 8.
+      - name: Nano smoke tests (8 GPU)
+        run: |
+          export LD_LIBRARY_PATH=
+          uv run --all-extras --group=cu128-train python -m pytest -v -s \
+            tests/nano_inference_smoke_test.py tests/nano_training_smoke_test.py \
+            --num-gpus=8 --levels=2 -o addopts=
+
+      # SFT loss/grad-norm regression on a 4-GPU subset (h100 goldens; H200 maps
+      # to the same key). TEST_MAX_GPUS=4 selects the 4-GPU test variant.
+      - name: SFT regression (4-GPU subset)
+        env:
+          TEST_MAX_GPUS: "4"
+        run: |
+          export LD_LIBRARY_PATH=
+          uv run --all-extras --group=cu128-train python -m pytest -v -s \
+            tests/launch_regression_test.py --num-gpus=4 --levels=2 -o addopts=
+
+      # Clear the run's heavy artifacts (even on failure) to keep the runner's
+      # disk bounded: examples/checkpoints (the Cosmos3-Nano DCP + Wan VAE,
+      # ~30 GB) and the pytest tmp dirs (smoke-test videos + the SFT checkpoint).
+      # The small examples/data dataset and the HF cache are intentionally kept
+      # so subsequent runs reuse them.
+      - name: Clean up run outputs
+        if: always()
+        run: |
+          rm -rf examples/checkpoints || true
+          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
diff --git a/tests/launch_regression_test.py b/tests/launch_regression_test.py
index 5545766..2952b68 100644
--- a/tests/launch_regression_test.py
+++ b/tests/launch_regression_test.py
@@ -93,24 +93,36 @@
 # below skips the GB200 arch instead of re-running it.
 
 
-def _h100_paths_from_env() -> dict[str, str]:
-    """Resolve H100 input paths from env vars (set by tests/_stage_h100_inputs.sh).
+def _hf_download(args: list[str]) -> str:
+    """``uvx hf download <args> --quiet`` -> the local path it prints (from the HF cache)."""
+    result = subprocess.run(
+        ["uvx", "hf@latest", "download", *args, "--quiet"],
+        cwd=str(REPO_ROOT),
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        pytest.fail(f"hf download failed for {args} (exit {result.returncode}):\n{result.stdout}\n{result.stderr}")
+    lines = [ln.strip() for ln in result.stdout.splitlines() if ln.strip()]
+    if not lines:
+        pytest.fail(f"hf download for {args} printed no path:\n{result.stdout}\n{result.stderr}")
+    return lines[-1]
 
-    All four env vars are required because the SFT TOMLs interpolate
-    ``DATASET_PATH`` / ``WAN_VAE_PATH`` / ``BASE_CHECKPOINT_PATH`` at load time
-    and the VLM spec passes ``MODEL_PATH`` as a Hydra backbone override.
-    """
-    missing = [
-        var
-        for var in ("DATASET_PATH", "WAN_VAE_PATH", "BASE_CHECKPOINT_PATH", "MODEL_PATH")
-        if not os.environ.get(var)
-    ]
-    if missing:
-        pytest.skip(
-            f"H100 regression needs env vars: {missing}. "
-            "Run tests/_stage_h100_inputs.sh and `source $STAGE_DIR/env.sh` first."
-        )
-    return {"vlm_model_path": os.environ["MODEL_PATH"]}
+
+def _convert_nano_dcp(dest: Path) -> None:
+    """Convert the Cosmos3-Nano checkpoint to DCP at ``dest`` (Step 2 of docs/training.md)."""
+    env = os.environ.copy()
+    env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}"
+    result = subprocess.run(
+        [
+            sys.executable, "-m", "cosmos_framework.scripts.convert_model_to_dcp",
+            "-o", str(dest), "--checkpoint-path", "Cosmos3-Nano",
+        ],
+        cwd=str(REPO_ROOT),
+        env=env,
+    )
+    if result.returncode != 0:
+        pytest.fail(f"convert_model_to_dcp (Cosmos3-Nano) failed with exit code {result.returncode}")
 
 
 def _detect_arch() -> str:
@@ -122,17 +134,17 @@ def _detect_arch() -> str:
     name = torch.cuda.get_device_name(0).upper()
     if "GB200" in name:
         return "gb200"
-    if "H100" in name:
+    # H200 shares the Hopper kernels with H100 and is treated identically here:
+    # both map to the ``h100`` goldens key (the GitHub GPU CI runs on 8×H200).
+    if "H100" in name or "H200" in name:
         return "h100"
     return "unknown"
 
 
-def _resolve_paths(arch: str) -> dict[str, str]:
-    if arch == "h100":
-        return _h100_paths_from_env()
-    if arch == "gb200":
-        pytest.skip("gb200 inputs not in OSS layout; goldens kept for historical reference only.")
-    pytest.skip(f"no regression goldens for GPU arch {arch!r}; only h100 supported")
+# Pinned revisions mirror tests/_stage_h100_inputs.sh so prepared inputs match
+# the captured h100 goldens.
+_BRIDGE_REVISION = "46468e12ac0dd36901e9e3240d4fc7620942b5d7"
+_QWEN_VL_REVISION = "0c351dd01ed87e9c1b53cbc748cba10e6187ff3b"
 
 
 # Tolerances for ``pytest.approx``. The launch passes ``--deterministic`` and
@@ -336,20 +348,30 @@ def _run_torchrun(spec: LaunchSpec, run_dir: Path) -> Path:
     env["IMAGINAIRE_OUTPUT_ROOT"] = str(run_dir / "output")
     env.update(spec.extra_env)
 
+    # Tee: stream the torchrun output live to stdout (so CI shows training
+    # progress under ``pytest -s``) while capturing it into the log file.
     with log_file.open("w") as fp:
-        result = subprocess.run(
+        proc = subprocess.Popen(
             cmd,
             env=env,
             cwd=str(REPO_ROOT),
-            stdout=fp,
+            stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
         )
-    if result.returncode != 0:
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            sys.stdout.write(line)
+            sys.stdout.flush()
+            fp.write(line)
+        returncode = proc.wait()
+    if returncode != 0:
         # Tolerate harmless PyGIL teardown warnings if training did complete.
         text = log_file.read_text(errors="replace")
         if "Done with training" not in text:
             pytest.fail(
-                f"{spec.key}: torchrun failed with exit code {result.returncode} "
+                f"{spec.key}: torchrun failed with exit code {returncode} "
                 "and log does not contain 'Done with training'.\n"
                 f"Log tail:\n{text[-2000:]}"
             )
@@ -372,13 +394,68 @@ def _require_4_gpus() -> None:
         pytest.skip(f"requires 4 visible CUDA devices, found {torch.cuda.device_count()}")
 
 
+@pytest.fixture(scope="module")
+def h100_inputs(tmp_path_factory: pytest.TempPathFactory):
+    """Provide the regression input paths, preparing any not already set in env.
+
+    Mirrors the download/convert steps of ``tests/_stage_h100_inputs.sh`` (it
+    does NOT set up the environment -- ``uv sync`` and the ``transformers``
+    pin still belong to that script / the caller). Honors pre-set env vars (so
+    ``source env.sh`` still works); downloads stay in the HF cache and the DCP
+    converted here goes under a temp stage dir. Teardown (also on a failed
+    setup) removes that dir and unsets the vars this fixture exported. The
+    four vars are exported because the SFT TOMLs interpolate ``DATASET_PATH`` /
+    ``WAN_VAE_PATH`` / ``BASE_CHECKPOINT_PATH`` at load time and the VLM spec
+    passes ``MODEL_PATH`` as a Hydra backbone override.
+    """
+    arch = _detect_arch()
+    if arch == "gb200":
+        pytest.skip("gb200 inputs not in OSS layout; goldens kept for historical reference only.")
+    if arch != "h100":
+        pytest.skip(f"no regression goldens for GPU arch {arch!r}; only h100 supported")
+    if shutil.which("uvx") is None:
+        pytest.skip("uvx not on PATH -- required to prepare regression inputs")
+
+    stage = tmp_path_factory.mktemp("h100_stage")
+    set_vars: list[str] = []
+
+    def _ensure(var: str, value_fn) -> None:
+        if not os.environ.get(var):
+            os.environ[var] = str(value_fn())
+            set_vars.append(var)
+
+    def _make_dcp() -> Path:
+        dest = stage / "Cosmos3-Nano-DCP"
+        _convert_nano_dcp(dest)
+        return dest
+
+    # Prepare inside the ``try`` so a failed download/convert still runs the cleanup below.
+    try:
+        _ensure(
+            "DATASET_PATH",
+            lambda: Path(
+                _hf_download(
+                    ["--repo-type", "dataset", "nvidia/bridge-v2-subset-synthetic-captions",
+                     "--revision", _BRIDGE_REVISION]
+                )
+            ) / "sft_dataset_bridge",
+        )
+        _ensure("WAN_VAE_PATH", lambda: _hf_download(["Wan-AI/Wan2.2-TI2V-5B", "Wan2.2_VAE.pth"]))
+        _ensure("MODEL_PATH", lambda: _hf_download(["Qwen/Qwen3-VL-8B-Instruct", "--revision", _QWEN_VL_REVISION]))
+        _ensure("BASE_CHECKPOINT_PATH", _make_dcp)
+        yield {"vlm_model_path": os.environ["MODEL_PATH"]}
+    finally:
+        for var in set_vars:
+            os.environ.pop(var, None)
+        shutil.rmtree(stage, ignore_errors=True)
+
+
 # --- tests -------------------------------------------------------------------
 
 
-def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path) -> None:
+def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str, str]) -> None:
     """Re-run ``spec``'s torchrun command and check loss / grad-norm against goldens."""
     arch = _detect_arch()
-    paths = _resolve_paths(arch)
     spec = _build_specs(paths)[spec_key]
 
     log_path = _run_torchrun(spec, tmp_path)
@@ -430,9 +507,9 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path) -> None:
     @pytest.mark.level(2)
     @pytest.mark.gpus(4)
     @pytest.mark.parametrize("spec_key", _SPEC_KEYS, ids=lambda k: k.removeprefix("launch_"))
-    def test_launch_regression(spec_key: str, tmp_path: Path) -> None:
+    def test_launch_regression(spec_key: str, tmp_path: Path, h100_inputs: dict[str, str]) -> None:
         """Re-run ``spec``'s torchrun command and check loss / grad-norm against goldens."""
-        _assert_spec_matches_goldens(spec_key, tmp_path)
+        _assert_spec_matches_goldens(spec_key, tmp_path, h100_inputs)
 
 
 if MAX_GPUS == 8:
@@ -443,9 +520,9 @@ def test_launch_regression(spec_key: str, tmp_path: Path) -> None:
     @pytest.mark.parametrize(
         "spec_key", _SPEC_KEYS_8GPU, ids=lambda k: k.removeprefix("launch_")
     )
-    def test_launch_regression_8gpu(spec_key: str, tmp_path: Path) -> None:
+    def test_launch_regression_8gpu(spec_key: str, tmp_path: Path, h100_inputs: dict[str, str]) -> None:
         """8-GPU variant for ``vision_sft_super`` (dp_shard=4 × cp=2)."""
-        _assert_spec_matches_goldens(spec_key, tmp_path)
+        _assert_spec_matches_goldens(spec_key, tmp_path, h100_inputs)
 
 
 # Goldens keyed by GPU arch then ``LaunchSpec.key``. Refresh with
@@ -463,11 +540,12 @@ def test_launch_regression_8gpu(spec_key: str, tmp_path: Path) -> None:
             ],
         },
     },
-    # Captured 2026-05-27 on a 4 × NVIDIA H100 80GB HBM3 node with seed 42.
-    # Inputs come from ``tests/_stage_h100_inputs.sh``; VLM model is
-    # ``Qwen/Qwen3-VL-8B-Instruct``.
+    # Recaptured 2026-06-03 on a 4 × NVIDIA H100 80GB HBM3 node with seed 42 and
+    # transformers==4.57.6. VLM model is ``Qwen/Qwen3-VL-8B-Instruct``; inputs are
+    # prepared in-test by the ``h100_inputs`` fixture (or via
+    # ``tests/_stage_h100_inputs.sh`` if its env vars are pre-set).
     "h100": {
-        # Recaptured 2026-05-27 with deterministic mode off (both ``--deterministic``
+        # Recaptured 2026-06-03 with deterministic mode off (both ``--deterministic``
         # and ``model.config.deterministic`` are False — the Hopper FMHA
         # backward refuses to run under PyTorch deterministic mode on H100, see
         # ``LaunchSpec.deterministic`` and the spec's hydra override). The full
@@ -475,16 +553,16 @@ def test_launch_regression_8gpu(spec_key: str, tmp_path: Path) -> None:
         # loss is asserted; iter 1+ drifts because the backward isn't bit-exact,
         # and even iter-0 grad-norm drifts (so grad_norm is skipped via ``None``).
         "llava_ov_datapacker": {
-            "loss": [0.88798, 1.01436, 1.06162, 1.04558, 1.00519, 0.91837, 1.10527, 1.03337, 0.9421, 0.69604],
+            "loss": [0.88798, 1.01583, 1.06096, 1.05566, 1.00613, 0.91551, 1.10534, 1.03794, 0.94166, 0.69613],
             "grad_norm": None,
         },
-        # Recaptured 2026-05-27 after the TOML-config rewrite shifted some
+        # Recaptured 2026-06-03 after the TOML-config rewrite shifted some
         # defaults. Runs under ``--deterministic`` so loss reproduces bit-exact
         # across all 10 iters, but grad_norm is non-det because
         # ``compile.enabled=true`` makes the all-rank reduction not bit-exact
         # on H100.
         "vision_sft_nano": {
-            "loss": [0.2337, 0.2233, 0.2075, 0.2374, 0.2228, 0.2778, 0.2907, 0.223, 0.2125, 0.2699],
+            "loss": [0.2272, 0.2181, 0.2028, 0.2306, 0.218, 0.2734, 0.2865, 0.2162, 0.2055, 0.2643],
             "grad_norm": None,
         },
         "vision_sft_super": {
diff --git a/tests/launch_sft_vision_nano_1iter.sh b/tests/launch_sft_vision_nano_1iter.sh
new file mode 100755
index 0000000..546df96
--- /dev/null
+++ b/tests/launch_sft_vision_nano_1iter.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# SMOKE wrapper (test fixture) mirroring examples/launch_sft_vision_nano.sh but
+# pointing at the tests/vision_sft_nano_1iter.toml recipe (max_iter=1,
+# save_iter=1). Lives under tests/ and reuses the shared launcher helper from
+# examples/. Paths below are resolved relative to the repo root by
+# _sft_launcher_common.sh.
+
+TOML_FILE="tests/vision_sft_nano_1iter.toml"
+: "${DATASET_PATH:=examples/data/bridge-v2-subset-synthetic-captions/sft_dataset_bridge}"
+: "${BASE_CHECKPOINT_PATH:=examples/checkpoints/Cosmos3-Nano}"
+
+EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
+
+source "$(dirname "${BASH_SOURCE[0]}")/../examples/_sft_launcher_common.sh"
diff --git a/tests/nano_inference_smoke_test.py b/tests/nano_inference_smoke_test.py
new file mode 100644
index 0000000..d8b90cb
--- /dev/null
+++ b/tests/nano_inference_smoke_test.py
@@ -0,0 +1,177 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""8-GPU smoke test for Cosmos3-Nano text-to-video-with-sound (t2vs) inference.
+
+Runs the canonical Cosmos3-Nano inference command from ``docs/inference.md`` on
+the ``inputs/omni/t2vs.json`` sample (``model_mode=text2video`` +
+``enable_sound=True``) on 8 GPUs, and asserts that the run completes, writes a
+video, and the muxed audio track is real sound (finite, non-empty, not silence,
+not a degenerate/constant signal) -- not numeric goldens (that is
+``launch_regression_test.py``'s job).
+
+The checkpoint (and its sound tokenizer) download from the Hugging Face Hub on
+first run and are reused from the HF cache afterward.
+
+Invocation (inside the inference container, from the repo root, on an 8-GPU
+node)::
+
+    pytest -s tests/nano_inference_smoke_test.py --num-gpus=8 --levels=2 -o addopts=
+
+* ``--num-gpus=8 --levels=2`` matches the markers below; the conftest pins
+  ``CUDA_VISIBLE_DEVICES`` accordingly.
+* ``-o addopts=`` clears the repo ``.pytest.toml`` addopts that reference an
+  optional plugin not installed in the container.
+
+Without ``--num-gpus``/``--levels`` (e.g. the no-GPU pre-commit CI) the test is
+not collected.
+"""
+
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+from cosmos_framework.inference.fixtures.args import MAX_GPUS
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+
+# Distinct from the SFT launcher (50012) and torchrun's default (29500) so a
+# concurrent training smoke run does not collide on the rendezvous port.
+_MASTER_PORT = 29560
+
+# Audio sanity thresholds for the muxed sound track.
+_RMS_SILENCE_FLOOR = 1e-4  # below this the track is effectively silence
+_PEAK_SANITY_CEIL = 1.5    # decoded float audio should sit within ~[-1, 1]
+
+
+def _run(cmd: list[str], log_file: Path) -> str:
+    """Run ``cmd`` from the repo root, tee combined output to ``log_file``.
+
+    Inherits the caller's environment (notably the HF cache, so a
+    previously-downloaded Cosmos3-Nano is reused). Fails the test with the log
+    tail on a non-zero exit.
+    """
+    env = os.environ.copy()
+    env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}"
+    log_file.parent.mkdir(parents=True, exist_ok=True)
+    returncode, text = _stream(cmd, env, log_file)
+    if returncode != 0:
+        pytest.fail(
+            f"inference failed with exit code {returncode}:\n"
+            f"  {' '.join(cmd)}\n"
+            f"Log tail:\n{text[-3000:]}"
+        )
+    return text
+
+
+def _stream(cmd: list[str], env: dict, log_file: Path) -> tuple[int, str]:
+    """Run ``cmd`` and tee its combined output: live to stdout (so CI shows
+    progress under ``pytest -s``) and into ``log_file`` + a returned string.
+    """
+    captured: list[str] = []
+    with log_file.open("w") as fp:
+        proc = subprocess.Popen(
+            cmd, env=env, cwd=str(REPO_ROOT),
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1,
+        )
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            sys.stdout.write(line)
+            sys.stdout.flush()
+            fp.write(line)
+            captured.append(line)
+        returncode = proc.wait()
+    return returncode, "".join(captured)
+
+
+def _decode_audio_track(mp4_path: Path):
+    """Decode the muxed audio track of ``mp4_path`` to a (channels, samples) waveform.
+
+    Returns ``(waveform_float64, sample_rate)``. Fails the test if the file has
+    no audio stream or it decodes to zero frames.
+    """
+    import av
+    import numpy as np
+
+    with av.open(str(mp4_path)) as container:
+        audio_streams = container.streams.audio
+        assert audio_streams, f"{mp4_path} has no audio stream"
+        astream = audio_streams[0]
+        sample_rate = int(astream.rate)
+        chunks = [frame.to_ndarray() for frame in container.decode(astream)]
+    assert chunks, f"audio stream in {mp4_path} decoded to zero frames"
+
+    orig_dtype = chunks[0].dtype
+    wav = np.concatenate(chunks, axis=1).astype(np.float64)
+    if np.issubdtype(orig_dtype, np.integer):
+        wav = wav / float(np.iinfo(orig_dtype).max)
+    return wav, sample_rate
+
+
+def _assert_sound_not_noise(mp4_path: Path) -> None:
+    """Assert the muxed audio is real sound: finite, non-empty, non-silent, non-constant."""
+    import numpy as np
+
+    wav, sample_rate = _decode_audio_track(mp4_path)
+    assert wav.size > 0, f"empty audio in {mp4_path}"
+    assert sample_rate > 0, f"non-positive sample rate {sample_rate} in {mp4_path}"
+    assert np.all(np.isfinite(wav)), f"audio in {mp4_path} contains NaN/Inf"
+
+    peak = float(np.max(np.abs(wav)))
+    rms = float(np.sqrt(np.mean(wav**2)))
+    std = float(wav.std())
+    assert peak <= _PEAK_SANITY_CEIL, f"audio peak {peak} outside expected normalized range"
+    assert std > 1e-6, f"audio is constant/degenerate (std={std}) in {mp4_path}"
+    assert rms > _RMS_SILENCE_FLOOR, f"audio is silent/near-silent (rms={rms}) in {mp4_path}"
+
+
+@pytest.fixture(scope="module", autouse=True)
+def _require_8_gpus() -> None:
+    """Skip the module unless we can launch an 8-GPU run here."""
+    if shutil.which("torchrun") is None:
+        pytest.skip("torchrun not on PATH -- must run inside the inference container")
+    try:
+        import torch
+    except Exception as exc:  # pragma: no cover -- surfaces during dev only
+        pytest.skip(f"torch unavailable ({exc!r})")
+    if not torch.cuda.is_available() or torch.cuda.device_count() < 8:
+        pytest.skip(f"requires 8 visible CUDA devices, found {torch.cuda.device_count()}")
+
+
+# Defined only when the active MAX_GPUS is 8 -- the conftest rejects ``gpus(N)``
+# markers outside ``ALL_NUM_GPUS = (0, 1, MAX_GPUS)``.
+if MAX_GPUS == 8:
+
+    @pytest.mark.level(2)
+    @pytest.mark.gpus(8)
+    def test_nano_inference_t2vs(tmp_path: Path) -> None:
+        """Run the docs/inference.md Cosmos3-Nano t2vs command; check the video + its sound."""
+        out_dir = tmp_path / "out"
+        cmd = [
+            "torchrun",
+            "--nproc_per_node=8",
+            f"--master_port={_MASTER_PORT}",
+            "-m",
+            "cosmos_framework.scripts.inference",
+            "--parallelism-preset=throughput",
+            "-i",
+            "inputs/omni/t2vs.json",
+            "-o",
+            str(out_dir),
+            "--checkpoint-path",
+            "Cosmos3-Nano",
+            "--seed=0",
+        ]
+        _run(cmd, tmp_path / "inference.log")
+
+        videos = list(out_dir.rglob("vision.mp4"))
+        assert len(videos) == 1, f"expected exactly one vision.mp4 under {out_dir}, found {videos}"
+        video = videos[0]
+        assert video.stat().st_size > 0, f"empty output video at {video}"
+        assert list(out_dir.rglob("sample_outputs.json")), f"no sample_outputs.json under {out_dir}"
+
+        _assert_sound_not_noise(video)
diff --git a/tests/nano_training_smoke_test.py b/tests/nano_training_smoke_test.py
new file mode 100644
index 0000000..9d1a44b
--- /dev/null
+++ b/tests/nano_training_smoke_test.py
@@ -0,0 +1,197 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""8-GPU smoke test for Cosmos3-Nano SFT training.
+
+Runs the documented Vision SFT (Cosmos3-Nano) flow from ``docs/training.md``
+end to end on 8 GPUs, capped to a single optimizer step via the
+``vision_sft_nano_1iter`` recipe (``max_iter=1``, ``save_iter=1``):
+
+  1. Step 1 -- download the bridge-v2 subset dataset + the Wan2.2 VAE.
+  2. Step 2 -- ``convert_model_to_dcp`` the Cosmos3-Nano checkpoint to DCP.
+  3. Step 3 -- run the paired launch shell ``launch_sft_vision_nano_1iter.sh``.
+
+It asserts only that training completes and writes a checkpoint with a finite
+loss (smoke -- no numeric goldens; that is ``launch_regression_test.py``'s job).
+
+Inputs land in the documented, ``.gitignore``-d default locations
+(``examples/data/``, ``examples/checkpoints/``) so they are cached across runs;
+run-specific output (logs + the saved checkpoint) goes under the pytest tmp
+dir. Steps 1-2 are skipped when their artifacts already exist.
+
+Invocation (inside the training container, from the repo root, on an 8-GPU
+node)::
+
+    pytest -s tests/nano_training_smoke_test.py --num-gpus=8 --levels=2 -o addopts=
+
+Without ``--num-gpus``/``--levels`` (e.g. the no-GPU pre-commit CI) the test is
+not collected.
+"""
+
+import os
+import re
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+from cosmos_framework.inference.fixtures.args import MAX_GPUS
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+
+# Documented default locations (all git-ignored). Match the launcher defaults so
+# Step 3 needs no path overrides.
+_DATA_DIR = REPO_ROOT / "examples/data/bridge-v2-subset-synthetic-captions"
+_DATASET_PATH = _DATA_DIR / "sft_dataset_bridge"
+_DATASET_REVISION = "46468e12ac0dd36901e9e3240d4fc7620942b5d7"
+_WAN_VAE = REPO_ROOT / "examples/checkpoints/wan22_vae/Wan2.2_VAE.pth"
+_DCP_DIR = REPO_ROOT / "examples/checkpoints/Cosmos3-Nano"
+_LAUNCHER = "tests/launch_sft_vision_nano_1iter.sh"
+
+# Distinct from torchrun's default (29500) and the inference smoke port (29560).
+_MASTER_PORT = 50112
+
+
+def _run(cmd: list[str], log_file: Path, extra_env: dict | None = None) -> tuple[int, str]:
+    """Run ``cmd`` from the repo root, tee combined output to ``log_file``.
+
+    Returns ``(returncode, combined_output)``. Inherits the caller's env (HF
+    cache, etc.) plus ``PYTHONPATH=.``.
+    """
+    env = os.environ.copy()
+    env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}"
+    if extra_env:
+        env.update(extra_env)
+    log_file.parent.mkdir(parents=True, exist_ok=True)
+    # Tee: stream the subprocess output live to stdout (so CI shows progress
+    # under ``pytest -s``) while capturing it into the log file + a string.
+    captured: list[str] = []
+    with log_file.open("w") as fp:
+        proc = subprocess.Popen(
+            cmd, env=env, cwd=str(REPO_ROOT),
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1,
+        )
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            sys.stdout.write(line)
+            sys.stdout.flush()
+            fp.write(line)
+            captured.append(line)
+        returncode = proc.wait()
+    return returncode, "".join(captured)
+
+
+def _ensure_inputs(log_dir: Path) -> None:
+    """Step 1: download the dataset + Wan2.2 VAE if not already present."""
+    if not (_DATASET_PATH / "train" / "video_dataset_file.jsonl").is_file():
+        rc, out = _run(
+            [
+                "uvx", "hf@latest", "download", "--repo-type", "dataset",
+                "nvidia/bridge-v2-subset-synthetic-captions",
+                "--revision", _DATASET_REVISION,
+                "--local-dir", str(_DATA_DIR), "--quiet",
+            ],
+            log_dir / "download_dataset.log",
+        )
+        assert rc == 0, f"dataset download failed (exit {rc}):\n{out[-2000:]}"
+    assert (_DATASET_PATH / "train" / "video_dataset_file.jsonl").is_file(), (
+        f"dataset missing {_DATASET_PATH}/train/video_dataset_file.jsonl after download"
+    )
+
+    if not _WAN_VAE.is_file():
+        rc, out = _run(
+            [
+                "uvx", "hf@latest", "download", "Wan-AI/Wan2.2-TI2V-5B", "Wan2.2_VAE.pth",
+                "--local-dir", str(_WAN_VAE.parent), "--quiet",
+            ],
+            log_dir / "download_wan_vae.log",
+        )
+        assert rc == 0, f"Wan VAE download failed (exit {rc}):\n{out[-2000:]}"
+    assert _WAN_VAE.is_file(), f"Wan VAE missing at {_WAN_VAE} after download"
+
+
+def _ensure_dcp(log_dir: Path) -> None:
+    """Step 2: convert Cosmos3-Nano to DCP (using this interpreter) if not already present."""
+    if _DCP_DIR.is_dir() and any(_DCP_DIR.iterdir()):
+        return
+    rc, out = _run(
+        [
+            sys.executable, "-m", "cosmos_framework.scripts.convert_model_to_dcp",
+            "--checkpoint-path", "Cosmos3-Nano",
+            "-o", str(_DCP_DIR),
+        ],
+        log_dir / "convert_to_dcp.log",
+    )
+    assert rc == 0, f"convert_model_to_dcp failed (exit {rc}):\n{out[-3000:]}"
+    assert _DCP_DIR.is_dir() and any(_DCP_DIR.iterdir()), f"DCP not written to {_DCP_DIR}"
+
+
+def _finite_losses(text: str) -> list[float]:
+    """Parse per-iteration ``Loss:`` values from the training log.
+
+    Matches the ``iter_speed`` callback line, e.g.
+    ``Iteration 1: Hit counter: 1/50 | Loss: 0.2302 | Time: ...``.
+    """
+    vals = []
+    for m in re.finditer(r"Loss:\s*([-+0-9.eE]+)", text):
+        try:
+            v = float(m.group(1))
+        except ValueError:
+            continue
+        if v == v and abs(v) != float("inf"):  # finite (NaN != NaN)
+            vals.append(v)
+    return vals
+
+
+@pytest.fixture(scope="module", autouse=True)
+def _require_8_gpus() -> None:
+    """Skip the module unless we can launch an 8-GPU training run here."""
+    if shutil.which("torchrun") is None:
+        pytest.skip("torchrun not on PATH -- must run inside the training container")
+    if shutil.which("uvx") is None:
+        pytest.skip("uvx not on PATH -- required to download the dataset / Wan VAE")
+    try:
+        import torch
+    except Exception as exc:  # pragma: no cover
+        pytest.skip(f"torch unavailable ({exc!r})")
+    if not torch.cuda.is_available() or torch.cuda.device_count() < 8:
+        pytest.skip(f"requires 8 visible CUDA devices, found {torch.cuda.device_count()}")
+
+
+if MAX_GPUS == 8:
+
+    @pytest.mark.level(2)
+    @pytest.mark.gpus(8)
+    def test_nano_sft_vision_1iter(tmp_path: Path) -> None:
+        """Run the full Vision SFT (Cosmos3-Nano) 1-iter flow and check it trains a step."""
+        _ensure_inputs(tmp_path)
+        _ensure_dcp(tmp_path)
+
+        # Route all run-specific output (launcher logs + the saved checkpoint via
+        # the harness's IMAGINAIRE_OUTPUT_ROOT) under the pytest tmp dir, which
+        # pytest auto-cleans. Nothing run-specific is left in the repo tree.
+        rc, out = _run(
+            ["bash", _LAUNCHER],
+            tmp_path / "train.log",
+            extra_env={
+                "MASTER_PORT": str(_MASTER_PORT),
+                "OUTPUT_ROOT": str(tmp_path / "launcher_out"),
+                "NPROC_PER_NODE": "8",
+            },
+        )
+        assert rc == 0, f"SFT launch failed (exit {rc}):\nLog tail:\n{out[-4000:]}"
+
+        assert "Done with training" in out, f"training did not finish cleanly:\nLog tail:\n{out[-4000:]}"
+
+        losses = _finite_losses(out)
+        assert losses, f"no finite per-iteration 'Loss:' value found in training log:\n{out[-3000:]}"
+
+        # save_iter=1 -> the trainer logs the DCP checkpoint path it wrote. Its
+        # location is governed by IMAGINAIRE_OUTPUT_ROOT (the test harness points
+        # this at a pytest tmp dir), so read it from the log rather than guessing.
+        saved = re.findall(r"Saved checkpoint to (\S+)", out)
+        assert saved, f"no 'Saved checkpoint to ...' line in training log (save_iter=1):\n{out[-3000:]}"
+        ckpt = Path(saved[-1])
+        assert ckpt.is_dir() and any(ckpt.iterdir()), f"saved checkpoint dir missing/empty: {ckpt}"
diff --git a/tests/vision_sft_nano_1iter.toml b/tests/vision_sft_nano_1iter.toml
new file mode 100644
index 0000000..c88eed6
--- /dev/null
+++ b/tests/vision_sft_nano_1iter.toml
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# vision_sft_nano — T2V / I2V / V2V vision-only SFT (Qwen3-VL-8B / nano)
+# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml.
+# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted here).
+#
+# SMOKE COPY of vision_sft_nano.toml: max_iter=1 + save_iter=1 so it trains a
+# single optimizer step and immediately writes a DCP checkpoint.
+
+[job]
+task         = "vfm"
+experiment   = "vision_sft_nano"
+project      = "cosmos3"
+group        = "sft"
+name         = "vision_sft_nano_1iter"
+wandb_mode   = "disabled"
+
+[model]
+max_num_tokens_after_packing = 45056
+joint_attn_implementation    = "two_way"
+precision                    = "bfloat16"                # was [model.parallelism].precision
+
+[model.ema]
+enabled         = true
+rate            = 0.1
+iteration_shift = 0
+
+[model.parallelism]
+data_parallel_shard_degree      = -1                     # -1 = auto from WORLD_SIZE (matches legacy)
+data_parallel_replicate_degree  = 1
+
+[model.compile]
+enabled                         = true                   # was [model.parallelism].use_torch_compile
+compile_dynamic                 = true
+
+[model.activation_checkpointing]
+mode                = "full"
+save_ops_regex      = ["fmha"]
+preserve_rng_state  = true
+determinism_check   = "default"
+
+[model.tokenizer]
+vae_path = "${oc.env:WAN_VAE_PATH}"
+
+[optimizer]
+betas         = [0.9, 0.95]
+eps           = 1.0e-6
+fused         = true
+keys_to_select = [
+    "moe_gen",
+    "time_embedder",
+    "vae2llm",
+    "llm2vae",
+]
+lr            = 2.0e-5
+weight_decay  = 0                                        # int matches legacy YAML repr
+# lr_multipliers intentionally empty for vision SFT (Hydra default {} stands).
+
+[scheduler]
+cycle_lengths      = [1000]
+f_max              = [1.0]
+f_min              = [0.0]
+f_start            = [0.0]
+verbosity_interval = 0
+warm_up_steps      = [50]
+
+[trainer]
+distributed_parallelism = "fsdp"
+grad_accum_iter         = 2
+logging_iter            = 1
+max_iter                = 1
+
+[trainer.callbacks.compile_tokenizer]
+compile_after_iterations = 3
+enabled                  = false
+# warmup_resolutions omitted (None at experiment level)
+
+[trainer.callbacks.grad_clip]
+clip_norm    = 0.1
+force_finite = true
+
+[checkpoint]
+keys_to_skip_loading = ["net_ema."]
+load_path            = "${oc.env:BASE_CHECKPOINT_PATH}"
+save_iter            = 1
+
+[dataloader_train]
+max_sequence_length = 45056
+# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by sample count)
+# seed omitted — PackingDataLoader has no seed ctor kwarg

From 4bede64171536cdc64931b89481967a845b10b9f Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 07:46:38 -0700
Subject: [PATCH 2/8] Split GPU CI into separate smoke and regression workflows

Replace the single gpu-tests.yml (one job, two test steps) with two
workflows so the 8-GPU nano smoke tests and the 4-GPU SFT regression run
and report independently:

- .github/workflows/gpu-smoke-tests.yml: nano t2vs + 1-iter SFT smoke
  (--num-gpus=8 --levels=2).
- .github/workflows/gpu-regression.yml: SFT loss/grad-norm regression
  (TEST_MAX_GPUS=4, --num-gpus=4 --levels=2).

Both run on [self-hosted, gpu, h200] for push/PR to main with pytest -v -s
(live logs) and an if: always() cleanup; distinct concurrency groups so
they don't cancel each other.
---
 .github/workflows/gpu-regression.yml  | 57 +++++++++++++++++++++
 .github/workflows/gpu-smoke-tests.yml | 58 +++++++++++++++++++++
 .github/workflows/gpu-tests.yml       | 72 ---------------------------
 3 files changed, 115 insertions(+), 72 deletions(-)
 create mode 100644 .github/workflows/gpu-regression.yml
 create mode 100644 .github/workflows/gpu-smoke-tests.yml
 delete mode 100644 .github/workflows/gpu-tests.yml

diff --git a/.github/workflows/gpu-regression.yml b/.github/workflows/gpu-regression.yml
new file mode 100644
index 0000000..691daaf
--- /dev/null
+++ b/.github/workflows/gpu-regression.yml
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# SFT loss/grad-norm regression on a self-hosted 8×H200 runner (4-GPU subset).
+#
+# Requires:
+#   * a self-hosted runner labelled [self-hosted, gpu, h200] with >=4 GPUs,
+#     NVIDIA drivers, and `uv` on PATH;
+#   * an `HF_TOKEN` repository secret (gated dataset/model downloads).
+#
+# Inputs (Cosmos3-Nano -> DCP, bridge dataset, Wan VAE, Qwen3-VL-8B-Instruct)
+# are downloaded / converted in-test into a temp stage that is removed on
+# teardown; raw downloads are cached in the runner's HF cache. The h100 goldens
+# are reused on H200 (see _detect_arch).
+name: GPU Regression
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: gpu-regression-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  sft-regression:
+    runs-on: [self-hosted, gpu, h200]
+    timeout-minutes: 60
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      HF_HUB_DISABLE_XET: "1"
+      # Select the 4-GPU regression test variant (uses 4 of the 8 GPUs).
+      TEST_MAX_GPUS: "4"
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: astral-sh/setup-uv@v7
+
+      - name: Sync environment (cu128-train)
+        run: uv sync --all-extras --group=cu128-train
+
+      # llava_ov_datapacker + vision_sft_nano vs the h100 goldens (H200 maps to
+      # the same key). -s streams the live training log.
+      - name: SFT regression (4-GPU subset)
+        run: |
+          export LD_LIBRARY_PATH=
+          uv run --all-extras --group=cu128-train python -m pytest -v -s \
+            tests/launch_regression_test.py --num-gpus=4 --levels=2 -o addopts=
+
+      # The h100_inputs fixture removes its DCP stage on teardown; clear the
+      # pytest tmp dirs too (logs + any run output). The HF cache is kept.
+      - name: Clean up run outputs
+        if: always()
+        run: |
+          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
diff --git a/.github/workflows/gpu-smoke-tests.yml b/.github/workflows/gpu-smoke-tests.yml
new file mode 100644
index 0000000..7d4f3f4
--- /dev/null
+++ b/.github/workflows/gpu-smoke-tests.yml
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Cosmos3-Nano 8-GPU smoke tests on a self-hosted 8×H200 runner.
+#
+# Requires:
+#   * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs,
+#     NVIDIA drivers, and `uv` on PATH;
+#   * an `HF_TOKEN` repository secret (gated dataset/model downloads).
+#
+# Inputs (Cosmos3-Nano checkpoint, bridge dataset, Wan VAE) are downloaded /
+# converted in-test and cached in the runner's HF cache; the first run is slow
+# (~30 GB Nano + DCP convert), later runs reuse the cache.
+name: GPU Smoke Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: gpu-smoke-tests-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  nano-smoke:
+    runs-on: [self-hosted, gpu, h200]
+    timeout-minutes: 60
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      HF_HUB_DISABLE_XET: "1"
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: astral-sh/setup-uv@v7
+
+      - name: Sync environment (cu128-train)
+        run: uv sync --all-extras --group=cu128-train
+
+      # Cosmos3-Nano t2vs inference (+ sound check) and a 1-iter Vision SFT.
+      # MAX_GPUS defaults to 8. -s streams the live process log.
+      - name: Nano smoke tests (8 GPU)
+        run: |
+          export LD_LIBRARY_PATH=
+          uv run --all-extras --group=cu128-train python -m pytest -v -s \
+            tests/nano_inference_smoke_test.py tests/nano_training_smoke_test.py \
+            --num-gpus=8 --levels=2 -o addopts=
+
+      # Clear the run's heavy artifacts (even on failure): examples/checkpoints
+      # (the Cosmos3-Nano DCP + Wan VAE, ~30 GB) and the pytest tmp dirs
+      # (t2vs video + the SFT checkpoint). The small examples/data dataset and
+      # the HF cache are intentionally kept so subsequent runs reuse them.
+      - name: Clean up run outputs
+        if: always()
+        run: |
+          rm -rf examples/checkpoints || true
+          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
deleted file mode 100644
index 816c06f..0000000
--- a/.github/workflows/gpu-tests.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-# GPU regression + smoke tests on a self-hosted 8×H200 runner.
-#
-# Requires:
-#   * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs,
-#     NVIDIA drivers, and `uv` on PATH;
-#   * an `HF_TOKEN` repository secret (gated dataset/model downloads).
-#
-# Inputs (Cosmos3-Nano checkpoint, bridge dataset, Wan VAE, Qwen3-VL-8B-Instruct)
-# are downloaded/converted in-test and cached in the runner's HF cache; the
-# first run is slow (~30 GB Nano + ~16 GB Qwen + DCP convert), later runs reuse
-# the cache.
-name: GPU Tests
-
-on:
-  push:
-    branches: [main]
-  pull_request:
-    branches: [main]
-
-# Don't pile up 8-GPU runs: cancel an in-progress run for the same ref when a
-# newer commit arrives.
-concurrency:
-  group: gpu-tests-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  gpu-tests:
-    runs-on: [self-hosted, gpu, h200]
-    timeout-minutes: 90
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      HF_HUB_DISABLE_XET: "1"
-    steps:
-      - uses: actions/checkout@v6
-
-      - uses: astral-sh/setup-uv@v7
-
-      - name: Sync environment (cu128-train)
-        run: uv sync --all-extras --group=cu128-train
-
-      # 8-GPU smoke tests: Cosmos3-Nano t2vs inference (+ sound check) and a
-      # 1-iter Vision SFT. MAX_GPUS defaults to 8.
-      - name: Nano smoke tests (8 GPU)
-        run: |
-          export LD_LIBRARY_PATH=
-          uv run --all-extras --group=cu128-train python -m pytest -v -s \
-            tests/nano_inference_smoke_test.py tests/nano_training_smoke_test.py \
-            --num-gpus=8 --levels=2 -o addopts=
-
-      # SFT loss/grad-norm regression on a 4-GPU subset (h100 goldens; H200 maps
-      # to the same key). TEST_MAX_GPUS=4 selects the 4-GPU test variant.
-      - name: SFT regression (4-GPU subset)
-        env:
-          TEST_MAX_GPUS: "4"
-        run: |
-          export LD_LIBRARY_PATH=
-          uv run --all-extras --group=cu128-train python -m pytest -v -s \
-            tests/launch_regression_test.py --num-gpus=4 --levels=2 -o addopts=
-
-      # Clear the run's heavy artifacts (even on failure) to keep the runner's
-      # disk bounded: examples/checkpoints (the Cosmos3-Nano DCP + Wan VAE,
-      # ~30 GB) and the pytest tmp dirs (smoke-test videos + the SFT checkpoint).
-      # The small examples/data dataset and the HF cache are intentionally kept
-      # so subsequent runs reuse them.
-      - name: Clean up run outputs
-        if: always()
-        run: |
-          rm -rf examples/checkpoints || true
-          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true

From 6ab86a8c66f7740a7c9d33edc38f52c9b54ce6b7 Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 08:04:06 -0700
Subject: [PATCH 3/8] Pick a free torchrun master port in the GPU tests

The smoke + regression tests used hardcoded --master_port values (50012/
50022/50023, 29560/50112), which fail with
`DistNetworkError: ... EADDRINUSE ... port: 50022` when the port is still held
by a lingering process or a concurrent run, or is in TIME_WAIT. Each test now
asks the OS for a free port (_free_port binds port 0, reads the assigned port,
then closes the socket) right before launching torchrun and passes it as
--master_port / the launcher MASTER_PORT. Dropped the now-unused
LaunchSpec.master_port field.

Verified on 8xH100: nano training smoke 1 passed, no EADDRINUSE.
---
 tests/launch_regression_test.py    | 15 ++++++++++-----
 tests/nano_inference_smoke_test.py | 17 +++++++++++++----
 tests/nano_training_smoke_test.py  | 12 +++++++++---
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/tests/launch_regression_test.py b/tests/launch_regression_test.py
index 2952b68..0512d2a 100644
--- a/tests/launch_regression_test.py
+++ b/tests/launch_regression_test.py
@@ -71,6 +71,7 @@
 import os
 import re
 import shutil
+import socket
 import subprocess
 import sys
 from dataclasses import dataclass, field
@@ -85,6 +86,14 @@
 # the repo root; we always invoke torchrun from there.
 REPO_ROOT = THIS_DIR.parent
 
+
+def _free_port() -> int:
+    """Return a currently-free TCP port for torchrun's rendezvous, instead of a
+    hardcoded ``master_port`` that ``EADDRINUSE``s when a prior run lingers."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
+
 # --- per-arch input paths ----------------------------------------------------
 #
 # GB200: the original input snapshot lived on an internal read-only filesystem
@@ -178,7 +187,6 @@ class LaunchSpec:
 
     key: str  # goldens key + pytest parametrize id source
     sft_toml: str  # ``--sft-toml=...`` value, relative to REPO_ROOT
-    master_port: int
     extra_hydra_args: tuple[str, ...]
     loss_re: re.Pattern[str]
     deterministic_iters: int  # how many leading iters are bit-exact deterministic
@@ -216,7 +224,6 @@ def _build_specs(paths: dict[str, str]) -> dict[str, LaunchSpec]:
             # Replicates launch_sft_llava_ov.sh, capped to 10 iters.
             key="llava_ov_datapacker",
             sft_toml="examples/toml/sft_config/llava_ov_datapacker.toml",
-            master_port=50012,
             extra_hydra_args=(
                 # TAIL_OVERRIDES from launch_sft_llava_ov.sh — fields not modeled
                 # by SFTExperimentConfig.
@@ -261,7 +268,6 @@ def _build_specs(paths: dict[str, str]) -> dict[str, LaunchSpec]:
             # needed beyond the regression-cap overrides below.
             key="vision_sft_nano",
             sft_toml="examples/toml/sft_config/vision_sft_nano.toml",
-            master_port=50022,
             extra_hydra_args=(
                 "model.config.parallelism.data_parallel_shard_degree=4",
                 "model.config.compile.enabled=true",
@@ -280,7 +286,6 @@ def _build_specs(paths: dict[str, str]) -> dict[str, LaunchSpec]:
             # backbone's compile path is not bit-exact across runs on H100.
             key="vision_sft_super",
             sft_toml="examples/toml/sft_config/vision_sft_super.toml",
-            master_port=50023,
             nproc_per_node=8,
             extra_hydra_args=(
                 "model.config.parallelism.data_parallel_shard_degree=4",
@@ -327,7 +332,7 @@ def _run_torchrun(spec: LaunchSpec, run_dir: Path) -> Path:
     cmd = [
         "torchrun",
         f"--nproc_per_node={spec.nproc_per_node}",
-        f"--master_port={spec.master_port}",
+        f"--master_port={_free_port()}",
         "-m",
         "cosmos_framework.scripts.train",
         f"--sft-toml={spec.sft_toml}",
diff --git a/tests/nano_inference_smoke_test.py b/tests/nano_inference_smoke_test.py
index d8b90cb..b44d7c0 100644
--- a/tests/nano_inference_smoke_test.py
+++ b/tests/nano_inference_smoke_test.py
@@ -29,6 +29,7 @@
 
 import os
 import shutil
+import socket
 import subprocess
 import sys
 from pathlib import Path
@@ -39,9 +40,17 @@
 
 REPO_ROOT = Path(__file__).resolve().parents[1]
 
-# Distinct from the SFT launcher (50012) and torchrun's default (29500) so a
-# concurrent training smoke run does not collide on the rendezvous port.
-_MASTER_PORT = 29560
+
+def _free_port() -> int:
+    """Return a currently-free TCP port for torchrun's rendezvous.
+
+    Avoids hardcoded ports that ``EADDRINUSE`` when a prior run's process
+    lingers or a port is in TIME_WAIT. (Small TOCTOU window between close and
+    torchrun's bind, acceptable for a single-node test.)
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
 
 # Audio sanity thresholds for the muxed sound track.
 _RMS_SILENCE_FLOOR = 1e-4  # below this the track is effectively silence
@@ -154,7 +163,7 @@ def test_nano_inference_t2vs(tmp_path: Path) -> None:
         cmd = [
             "torchrun",
             "--nproc_per_node=8",
-            f"--master_port={_MASTER_PORT}",
+            f"--master_port={_free_port()}",
             "-m",
             "cosmos_framework.scripts.inference",
             "--parallelism-preset=throughput",
diff --git a/tests/nano_training_smoke_test.py b/tests/nano_training_smoke_test.py
index 9d1a44b..ecdc545 100644
--- a/tests/nano_training_smoke_test.py
+++ b/tests/nano_training_smoke_test.py
@@ -31,6 +31,7 @@
 import os
 import re
 import shutil
+import socket
 import subprocess
 import sys
 from pathlib import Path
@@ -50,8 +51,13 @@
 _DCP_DIR = REPO_ROOT / "examples/checkpoints/Cosmos3-Nano"
 _LAUNCHER = "tests/launch_sft_vision_nano_1iter.sh"
 
-# Distinct from torchrun's default (29500) and the inference smoke port (29560).
-_MASTER_PORT = 50112
+
+def _free_port() -> int:
+    """Return a currently-free TCP port for the launcher's torchrun rendezvous
+    (avoids EADDRINUSE from a hardcoded port / lingering process)."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
 
 
 def _run(cmd: list[str], log_file: Path, extra_env: dict | None = None) -> tuple[int, str]:
@@ -176,7 +182,7 @@ def test_nano_sft_vision_1iter(tmp_path: Path) -> None:
             ["bash", _LAUNCHER],
             tmp_path / "train.log",
             extra_env={
-                "MASTER_PORT": str(_MASTER_PORT),
+                "MASTER_PORT": str(_free_port()),
                 "OUTPUT_ROOT": str(tmp_path / "launcher_out"),
                 "NPROC_PER_NODE": "8",
             },

From ef331c40331a2ddc01b517deab58a56323b456ec Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 08:34:22 -0700
Subject: [PATCH 4/8] Split regression CI into generator/reasoner workflows;
 show run log on failure

- Replace gpu-regression.yml with two single-spec workflows so the generator
  (VFM, vision_sft_nano) and reasoner (VLM, llava_ov_datapacker) regressions
  run and report independently, each via `pytest -k <spec>`:
    .github/workflows/gpu-regression-generator.yml
    .github/workflows/gpu-regression-reasoner.yml
- launch_regression_test.py: on a goldens/parse mismatch, include the run-log
  tail and the got-vs-expected series in the failure message (the log also
  streams live under `pytest -s`), so failures carry the run detail.
---
 ...ssion.yml => gpu-regression-generator.yml} | 23 ++++----
 .github/workflows/gpu-regression-reasoner.yml | 58 +++++++++++++++++++
 tests/launch_regression_test.py               | 20 +++++--
 3 files changed, 85 insertions(+), 16 deletions(-)
 rename .github/workflows/{gpu-regression.yml => gpu-regression-generator.yml} (65%)
 create mode 100644 .github/workflows/gpu-regression-reasoner.yml

diff --git a/.github/workflows/gpu-regression.yml b/.github/workflows/gpu-regression-generator.yml
similarity index 65%
rename from .github/workflows/gpu-regression.yml
rename to .github/workflows/gpu-regression-generator.yml
index 691daaf..0de44fc 100644
--- a/.github/workflows/gpu-regression.yml
+++ b/.github/workflows/gpu-regression-generator.yml
@@ -1,18 +1,17 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-# SFT loss/grad-norm regression on a self-hosted 8×H200 runner (4-GPU subset).
+# Generator (VFM) SFT loss regression on a self-hosted 8×H200 runner (4-GPU
+# subset). Runs the single ``vision_sft_nano`` spec of
+# tests/launch_regression_test.py.
 #
 # Requires:
 #   * a self-hosted runner labelled [self-hosted, gpu, h200] with >=4 GPUs,
 #     NVIDIA drivers, and `uv` on PATH;
 #   * an `HF_TOKEN` repository secret (gated dataset/model downloads).
 #
-# Inputs (Cosmos3-Nano -> DCP, bridge dataset, Wan VAE, Qwen3-VL-8B-Instruct)
-# are downloaded / converted in-test into a temp stage that is removed on
-# teardown; raw downloads are cached in the runner's HF cache. The h100 goldens
-# are reused on H200 (see _detect_arch).
-name: GPU Regression
+# The h100 goldens are reused on H200 (see _detect_arch).
+name: GPU Regression (Generator)
 
 on:
   push:
@@ -21,11 +20,11 @@ on:
     branches: [main]
 
 concurrency:
-  group: gpu-regression-${{ github.ref }}
+  group: gpu-regression-generator-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
-  sft-regression:
+  generator-regression:
     runs-on: [self-hosted, gpu, h200]
     timeout-minutes: 60
     env:
@@ -41,13 +40,13 @@ jobs:
       - name: Sync environment (cu128-train)
         run: uv sync --all-extras --group=cu128-train
 
-      # llava_ov_datapacker + vision_sft_nano vs the h100 goldens (H200 maps to
-      # the same key). -s streams the live training log.
-      - name: SFT regression (4-GPU subset)
+      # Generator (vision_sft_nano) loss vs the h100 goldens. -s streams the live log.
+      - name: Generator regression (vision_sft_nano, 4-GPU subset)
         run: |
           export LD_LIBRARY_PATH=
           uv run --all-extras --group=cu128-train python -m pytest -v -s \
-            tests/launch_regression_test.py --num-gpus=4 --levels=2 -o addopts=
+            tests/launch_regression_test.py -k vision_sft_nano \
+            --num-gpus=4 --levels=2 -o addopts=
 
       # The h100_inputs fixture removes its DCP stage on teardown; clear the
       # pytest tmp dirs too (logs + any run output). The HF cache is kept.
diff --git a/.github/workflows/gpu-regression-reasoner.yml b/.github/workflows/gpu-regression-reasoner.yml
new file mode 100644
index 0000000..57b8ce7
--- /dev/null
+++ b/.github/workflows/gpu-regression-reasoner.yml
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Reasoner (VLM) SFT loss regression on a self-hosted 8×H200 runner (4-GPU
+# subset). Runs the single ``llava_ov_datapacker`` spec of
+# tests/launch_regression_test.py.
+#
+# Requires:
+#   * a self-hosted runner labelled [self-hosted, gpu, h200] with >=4 GPUs,
+#     NVIDIA drivers, and `uv` on PATH;
+#   * an `HF_TOKEN` repository secret (gated dataset/model downloads, incl. the
+#     streamed LLaVA-OneVision-Data dataset).
+#
+# The h100 goldens are reused on H200 (see _detect_arch).
+name: GPU Regression (Reasoner)
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: gpu-regression-reasoner-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  reasoner-regression:
+    runs-on: [self-hosted, gpu, h200]
+    timeout-minutes: 60
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      HF_HUB_DISABLE_XET: "1"
+      # Select the 4-GPU regression test variant (uses 4 of the 8 GPUs).
+      TEST_MAX_GPUS: "4"
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: astral-sh/setup-uv@v7
+
+      - name: Sync environment (cu128-train)
+        run: uv sync --all-extras --group=cu128-train
+
+      # Reasoner (llava_ov_datapacker) iter-0 loss vs the h100 goldens. -s streams
+      # the live log.
+      - name: Reasoner regression (llava_ov_datapacker, 4-GPU subset)
+        run: |
+          export LD_LIBRARY_PATH=
+          uv run --all-extras --group=cu128-train python -m pytest -v -s \
+            tests/launch_regression_test.py -k llava_ov_datapacker \
+            --num-gpus=4 --levels=2 -o addopts=
+
+      # The h100_inputs fixture removes its DCP stage on teardown; clear the
+      # pytest tmp dirs too (logs + any run output). The HF cache is kept.
+      - name: Clean up run outputs
+        if: always()
+        run: |
+          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
diff --git a/tests/launch_regression_test.py b/tests/launch_regression_test.py
index 0512d2a..65b4358 100644
--- a/tests/launch_regression_test.py
+++ b/tests/launch_regression_test.py
@@ -464,8 +464,12 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str,
     spec = _build_specs(paths)[spec_key]
 
     log_path = _run_torchrun(spec, tmp_path)
-    loss, grad_norm = _parse_series(log_path.read_text(errors="replace"), spec.loss_re)
-    assert len(loss) == 10, f"expected 10 iterations, parsed {len(loss)} (loss={loss})"
+    log_text = log_path.read_text(errors="replace")
+    loss, grad_norm = _parse_series(log_text, spec.loss_re)
+    # The run log also streamed live under ``pytest -s``; include its tail in any
+    # failure message so the run detail is attached to the failure report too.
+    run_detail = f"\n--- {spec.key} run log (last 4000 chars) ---\n{log_text[-4000:]}"
+    assert len(loss) == 10, f"expected 10 iterations, parsed {len(loss)} (loss={loss}){run_detail}"
 
     # Refresh path: print captured values for manual copy into ``_GOLDENS``.
     if os.environ.get("COSMOS_REGRESSION_UPDATE_GOLDENS") == "1":
@@ -495,14 +499,22 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str,
 
     assert loss[:n] == pytest.approx(
         expected["loss"][:n], rel=_DEFAULT_RTOL, abs=_DEFAULT_ATOL
-    ), f"{spec.key} ({arch}): rank-0 loss[:{n}] does not match goldens"
+    ), (
+        f"{spec.key} ({arch}): rank-0 loss[:{n}] does not match goldens\n"
+        f"  got     : {loss[:n]}\n"
+        f"  expected: {expected['loss'][:n]}{run_detail}"
+    )
     # ``grad_norm`` is optional: ``None`` skips the check when the FSDP
     # global-norm all-reduce isn't bit-exact on this arch.
     if expected["grad_norm"] is None:
         return
     assert grad_norm[:n] == pytest.approx(
         expected["grad_norm"][:n], rel=_DEFAULT_RTOL, abs=_DEFAULT_ATOL
-    ), f"{spec.key} ({arch}): global grad-norm[:{n}] does not match goldens"
+    ), (
+        f"{spec.key} ({arch}): global grad-norm[:{n}] does not match goldens\n"
+        f"  got     : {grad_norm[:n]}\n"
+        f"  expected: {expected['grad_norm'][:n]}{run_detail}"
+    )
 
 
 # Define only the test function matching MAX_GPUS — the conftest rejects

From 9f0a7329564c548ebe8f55fcf00b747d5441e040 Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 20:36:38 -0700
Subject: [PATCH 5/8] Expand Nano smoke tests (multi-modality inference +
 train/export/infer) and split smoke CI

- tests/nano_inference_smoke_test.py: one inference call over three modalities
  (t2vs text2video+sound, action policy, action forward_dynamics); validates
  each sample's vision.mp4 (PyAV decode), the t2vs audio (not-noise), and the
  policy action array.
- tests/nano_training_smoke_test.py: convert -> train 5 -> export ->
  t2i-from-export pipeline with per-step checks: DCP + exported-model
  completeness (file/shard + index counts + tensor-manifest self-consistency,
  no tensor load), loss decreases (min(loss) < first), and a valid output image.
- tests/{vision_sft_nano_5iter.toml,launch_sft_vision_nano_5iter.sh}: 5-step
  smoke recipe (replaces the 1-iter fixtures).
- CI: split gpu-smoke-tests.yml into gpu-smoke-inference.yml and
  gpu-smoke-training.yml (training timeout 90 min). Each if: always() cleanup
  clears run output (pytest tmp; + examples/checkpoints for training), keeping
  examples/data + the HF cache.

Verified on 8xH100: inference smoke and training pipeline both pass.
---
 .github/workflows/gpu-smoke-inference.yml     |  54 ++++
 ...smoke-tests.yml => gpu-smoke-training.yml} |  34 +--
 tests/launch_sft_vision_nano_1iter.sh         |  17 --
 tests/launch_sft_vision_nano_5iter.sh         |  13 +
 tests/nano_inference_smoke_test.py            | 154 +++++++-----
 tests/nano_training_smoke_test.py             | 237 +++++++++++++++---
 ..._1iter.toml => vision_sft_nano_5iter.toml} |  15 +-
 7 files changed, 390 insertions(+), 134 deletions(-)
 create mode 100644 .github/workflows/gpu-smoke-inference.yml
 rename .github/workflows/{gpu-smoke-tests.yml => gpu-smoke-training.yml} (50%)
 delete mode 100755 tests/launch_sft_vision_nano_1iter.sh
 create mode 100755 tests/launch_sft_vision_nano_5iter.sh
 rename tests/{vision_sft_nano_1iter.toml => vision_sft_nano_5iter.toml} (80%)

diff --git a/.github/workflows/gpu-smoke-inference.yml b/.github/workflows/gpu-smoke-inference.yml
new file mode 100644
index 0000000..66375be
--- /dev/null
+++ b/.github/workflows/gpu-smoke-inference.yml
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Cosmos3-Nano 8-GPU multi-modality inference smoke (t2vs + policy + forward_dynamics) on a
+# self-hosted 8×H200 runner.
+#
+# Requires:
+#   * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs,
+#     NVIDIA drivers, and `uv` on PATH;
+#   * an `HF_TOKEN` repository secret (gated model downloads).
+#
+# The Cosmos3-Nano checkpoint (and its sound tokenizer) download to the runner's
+# HF cache; later runs reuse it.
+name: GPU Smoke (Inference)
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: gpu-smoke-inference-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  inference-smoke:
+    runs-on: [self-hosted, gpu, h200]
+    timeout-minutes: 60
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      HF_HUB_DISABLE_XET: "1"
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: astral-sh/setup-uv@v7
+
+      - name: Sync environment (cu128-train)
+        run: uv sync --all-extras --group=cu128-train
+
+      # One inference call over t2vs (+sound), action policy, and forward_dynamics; checks each output.
+      # MAX_GPUS defaults to 8. -s streams the live process log.
+      - name: Nano inference smoke (t2vs + action policy + forward_dynamics, 8 GPU)
+        run: |
+          export LD_LIBRARY_PATH=
+          uv run --all-extras --group=cu128-train python -m pytest -v -s \
+            tests/nano_inference_smoke_test.py --num-gpus=8 --levels=2 -o addopts=
+
+      # Inference writes only the pytest tmp dir (the t2vs video + logs); the
+      # checkpoint download stays in the HF cache (kept). No examples/ artifacts.
+      - name: Clean up run outputs
+        if: always()
+        run: |
+          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
diff --git a/.github/workflows/gpu-smoke-tests.yml b/.github/workflows/gpu-smoke-training.yml
similarity index 50%
rename from .github/workflows/gpu-smoke-tests.yml
rename to .github/workflows/gpu-smoke-training.yml
index 7d4f3f4..1d861b2 100644
--- a/.github/workflows/gpu-smoke-tests.yml
+++ b/.github/workflows/gpu-smoke-training.yml
@@ -1,17 +1,19 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-# Cosmos3-Nano 8-GPU smoke tests on a self-hosted 8×H200 runner.
+# Cosmos3-Nano 8-GPU SFT pipeline smoke test (convert -> train 5 -> export ->
+# t2i infer) on a self-hosted 8×H200 runner.
 #
 # Requires:
 #   * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs,
 #     NVIDIA drivers, and `uv` on PATH;
 #   * an `HF_TOKEN` repository secret (gated dataset/model downloads).
 #
-# Inputs (Cosmos3-Nano checkpoint, bridge dataset, Wan VAE) are downloaded /
-# converted in-test and cached in the runner's HF cache; the first run is slow
-# (~30 GB Nano + DCP convert), later runs reuse the cache.
-name: GPU Smoke Tests
+# Inputs (Cosmos3-Nano -> DCP, bridge dataset, Wan VAE) are downloaded /
+# converted in-test and cached under examples/ + the HF cache; the first run is
+# slow (~30 GB Nano + DCP convert + 5-step train + export + a t2i generation),
+# later runs reuse the cache.
+name: GPU Smoke (Training)
 
 on:
   push:
@@ -20,13 +22,13 @@ on:
     branches: [main]
 
 concurrency:
-  group: gpu-smoke-tests-${{ github.ref }}
+  group: gpu-smoke-training-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
-  nano-smoke:
+  training-smoke:
     runs-on: [self-hosted, gpu, h200]
-    timeout-minutes: 60
+    timeout-minutes: 90
     env:
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
       HF_HUB_DISABLE_XET: "1"
@@ -38,19 +40,19 @@ jobs:
       - name: Sync environment (cu128-train)
         run: uv sync --all-extras --group=cu128-train
 
-      # Cosmos3-Nano t2vs inference (+ sound check) and a 1-iter Vision SFT.
+      # Full SFT pipeline: download + convert Nano->DCP, train 5 steps (loss
+      # trend), export to HF safetensors, then a t2i generation from the export.
       # MAX_GPUS defaults to 8. -s streams the live process log.
-      - name: Nano smoke tests (8 GPU)
+      - name: Nano SFT pipeline smoke (convert -> train 5 -> export -> t2i, 8 GPU)
         run: |
           export LD_LIBRARY_PATH=
           uv run --all-extras --group=cu128-train python -m pytest -v -s \
-            tests/nano_inference_smoke_test.py tests/nano_training_smoke_test.py \
-            --num-gpus=8 --levels=2 -o addopts=
+            tests/nano_training_smoke_test.py --num-gpus=8 --levels=2 -o addopts=
 
-      # Clear the run's heavy artifacts (even on failure): examples/checkpoints
-      # (the Cosmos3-Nano DCP + Wan VAE, ~30 GB) and the pytest tmp dirs
-      # (t2vs video + the SFT checkpoint). The small examples/data dataset and
-      # the HF cache are intentionally kept so subsequent runs reuse them.
+      # Clear the heavy artifacts (even on failure): examples/checkpoints (the
+      # Cosmos3-Nano DCP + Wan VAE, ~30 GB) and the pytest tmp dirs (the SFT
+      # checkpoint + logs). The small examples/data dataset and the HF cache are
+      # intentionally kept so subsequent runs reuse them.
       - name: Clean up run outputs
         if: always()
         run: |
diff --git a/tests/launch_sft_vision_nano_1iter.sh b/tests/launch_sft_vision_nano_1iter.sh
deleted file mode 100755
index 546df96..0000000
--- a/tests/launch_sft_vision_nano_1iter.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-# SMOKE wrapper (test fixture) mirroring examples/launch_sft_vision_nano.sh but
-# pointing at the tests/vision_sft_nano_1iter.toml recipe (max_iter=1,
-# save_iter=1). Lives under tests/ and reuses the shared launcher helper from
-# examples/. Paths below are resolved relative to the repo root by
-# _sft_launcher_common.sh.
-
-TOML_FILE="tests/vision_sft_nano_1iter.toml"
-: "${DATASET_PATH:=examples/data/bridge-v2-subset-synthetic-captions/sft_dataset_bridge}"
-: "${BASE_CHECKPOINT_PATH:=examples/checkpoints/Cosmos3-Nano}"
-
-EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
-
-source "$(dirname "${BASH_SOURCE[0]}")/../examples/_sft_launcher_common.sh"
diff --git a/tests/launch_sft_vision_nano_5iter.sh b/tests/launch_sft_vision_nano_5iter.sh
new file mode 100755
index 0000000..0fdf748
--- /dev/null
+++ b/tests/launch_sft_vision_nano_5iter.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# SMOKE wrapper (test fixture) for tests/nano_training_smoke_test.py — mirrors
+# examples/launch_sft_vision_nano.sh but points at tests/vision_sft_nano_5iter.toml
+# (max_iter=5, save_iter=5). Reuses the shared launcher helper from examples/.
+# Paths below are resolved relative to the repo root by _sft_launcher_common.sh.
+
+TOML_FILE="tests/vision_sft_nano_5iter.toml"
+: "${DATASET_PATH:=examples/data/bridge-v2-subset-synthetic-captions/sft_dataset_bridge}"
+: "${BASE_CHECKPOINT_PATH:=examples/checkpoints/Cosmos3-Nano}"
+
+EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
+
+source "$(dirname "${BASH_SOURCE[0]}")/../examples/_sft_launcher_common.sh"
diff --git a/tests/nano_inference_smoke_test.py b/tests/nano_inference_smoke_test.py
index b44d7c0..a2f0e1b 100644
--- a/tests/nano_inference_smoke_test.py
+++ b/tests/nano_inference_smoke_test.py
@@ -1,32 +1,36 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-"""8-GPU smoke test for Cosmos3-Nano text-to-video-with-sound (t2vs) inference.
+"""8-GPU multi-modality inference smoke test for Cosmos3-Nano.
 
-Runs the canonical Cosmos3-Nano inference command from ``docs/inference.md`` on
-the ``inputs/omni/t2vs.json`` sample (``model_mode=text2video`` +
-``enable_sound=True``) on 8 GPUs, and asserts that the run completes, writes a
-video, and the muxed audio track is real sound (finite, non-empty, not silence,
-not a degenerate/constant signal) -- not numeric goldens (that is
-``launch_regression_test.py``'s job).
+Runs ONE ``cosmos_framework.scripts.inference`` call over three input samples of
+different modalities (the ``-i`` flag takes a list of files) and validates each
+sample's output:
 
-The checkpoint (and its sound tokenizer) download from the Hugging Face Hub on
-first run and are reused from the HF cache afterward.
+  * ``inputs/omni/t2vs.json`` (text2video + sound) -> a ``vision.mp4`` whose
+    muxed audio is real sound (finite, non-empty, non-silent, non-constant).
+  * ``inputs/omni/action_forward_dynamics_camera.json`` (forward_dynamics) -> a
+    ``vision.mp4`` that decodes to at least one valid video frame (``action_path``
+    is an input, not an output).
+  * ``inputs/omni/action_policy_robot.json`` (policy) -> BOTH a ``vision.mp4`` and
+    a finite, non-empty predicted ``action`` array in ``sample_outputs.json``.
+
+All three samples produce a video; the policy sample additionally produces an
+action and the t2vs sample an audio track.
+
+Smoke-level only (output validity, not numeric goldens). The checkpoint + its
+tokenizers download from the HF Hub on first run and are reused afterward.
 
 Invocation (inside the inference container, from the repo root, on an 8-GPU
 node)::
 
     pytest -s tests/nano_inference_smoke_test.py --num-gpus=8 --levels=2 -o addopts=
 
-* ``--num-gpus=8 --levels=2`` matches the markers below; the conftest pins
-  ``CUDA_VISIBLE_DEVICES`` accordingly.
-* ``-o addopts=`` clears the repo ``.pytest.toml`` addopts that reference an
-  optional plugin not installed in the container.
-
 Without ``--num-gpus``/``--levels`` (e.g. the no-GPU pre-commit CI) the test is
 not collected.
 """
 
+import json
 import os
 import shutil
 import socket
@@ -40,47 +44,32 @@
 
 REPO_ROOT = Path(__file__).resolve().parents[1]
 
+_INPUTS = [
+    "inputs/omni/t2vs.json",
+    "inputs/omni/action_policy_robot.json",
+    "inputs/omni/action_forward_dynamics_camera.json",
+]
 
-def _free_port() -> int:
-    """Return a currently-free TCP port for torchrun's rendezvous.
+# Audio sanity thresholds for the muxed sound track.
+_RMS_SILENCE_FLOOR = 1e-4  # below this the track is effectively silence
+_PEAK_SANITY_CEIL = 1.5    # decoded float audio should sit within ~[-1, 1]
 
-    Avoids hardcoded ports that ``EADDRINUSE`` when a prior run's process
-    lingers or a port is in TIME_WAIT. (Small TOCTOU window between close and
-    torchrun's bind, acceptable for a single-node test.)
-    """
+
+def _free_port() -> int:
+    """Return a currently-free TCP port for torchrun's rendezvous (avoids
+    EADDRINUSE from a hardcoded port / lingering process)."""
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         s.bind(("", 0))
         return s.getsockname()[1]
 
-# Audio sanity thresholds for the muxed sound track.
-_RMS_SILENCE_FLOOR = 1e-4  # below this the track is effectively silence
-_PEAK_SANITY_CEIL = 1.5    # decoded float audio should sit within ~[-1, 1]
-
 
 def _run(cmd: list[str], log_file: Path) -> str:
-    """Run ``cmd`` from the repo root, tee combined output to ``log_file``.
-
-    Inherits the caller's environment (notably the HF cache, so a
-    previously-downloaded Cosmos3-Nano is reused). Fails the test with the log
-    tail on a non-zero exit.
-    """
+    """Run ``cmd`` from the repo root, tee combined output (live to stdout under
+    ``pytest -s`` + into ``log_file``). Inherits the caller's env (HF cache, ...)
+    plus ``PYTHONPATH=.``. Fails with the log tail on a non-zero exit."""
     env = os.environ.copy()
     env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}"
     log_file.parent.mkdir(parents=True, exist_ok=True)
-    returncode, text = _stream(cmd, env, log_file)
-    if returncode != 0:
-        pytest.fail(
-            f"inference failed with exit code {returncode}:\n"
-            f"  {' '.join(cmd)}\n"
-            f"Log tail:\n{text[-3000:]}"
-        )
-    return text
-
-
-def _stream(cmd: list[str], env: dict, log_file: Path) -> tuple[int, str]:
-    """Run ``cmd`` and tee its combined output: live to stdout (so CI shows
-    progress under ``pytest -s``) and into ``log_file`` + a returned string.
-    """
     captured: list[str] = []
     with log_file.open("w") as fp:
         proc = subprocess.Popen(
@@ -94,14 +83,17 @@ def _stream(cmd: list[str], env: dict, log_file: Path) -> tuple[int, str]:
             fp.write(line)
             captured.append(line)
         returncode = proc.wait()
-    return returncode, "".join(captured)
+    text = "".join(captured)
+    if returncode != 0:
+        pytest.fail(f"inference failed with exit code {returncode}:\n  {' '.join(cmd)}\nLog tail:\n{text[-3000:]}")
+    return text
 
 
 def _decode_audio_track(mp4_path: Path):
     """Decode the muxed audio track of ``mp4_path`` to a (channels, samples) waveform.
 
-    Returns ``(waveform_float64, sample_rate)``. Fails the test if the file has
-    no audio stream or it decodes to zero frames.
+    Returns ``(waveform_float64, sample_rate)``. Fails if there is no audio
+    stream or it decodes to zero frames.
     """
     import av
     import numpy as np
@@ -138,6 +130,33 @@ def _assert_sound_not_noise(mp4_path: Path) -> None:
     assert rms > _RMS_SILENCE_FLOOR, f"audio is silent/near-silent (rms={rms}) in {mp4_path}"
 
 
+def _assert_valid_video(mp4_path: Path) -> None:
+    """Assert ``mp4_path`` decodes to at least one valid, non-degenerate video frame."""
+    import av
+
+    assert mp4_path.is_file() and mp4_path.stat().st_size > 1024, f"video missing/too small: {mp4_path}"
+    with av.open(str(mp4_path)) as container:
+        vstreams = container.streams.video
+        assert vstreams, f"no video stream in {mp4_path}"
+        width = height = frames = 0
+        for frame in container.decode(vstreams[0]):
+            width, height, frames = frame.width, frame.height, frames + 1
+            break
+    assert frames >= 1 and width > 0 and height > 0, f"no decodable video frame in {mp4_path}"
+
+
+def _assert_valid_action(content: dict, where: str) -> None:
+    """Assert a policy sample's predicted ``action`` is a non-empty, all-finite array."""
+    import numpy as np
+
+    assert isinstance(content, dict) and content.get("action") is not None, (
+        f"no 'action' in policy output ({where}); content keys={list(content) if isinstance(content, dict) else content}"
+    )
+    arr = np.asarray(content["action"], dtype=np.float64)
+    assert arr.size > 0, f"empty action output ({where})"
+    assert np.all(np.isfinite(arr)), f"action output has NaN/Inf ({where})"
+
+
 @pytest.fixture(scope="module", autouse=True)
 def _require_8_gpus() -> None:
     """Skip the module unless we can launch an 8-GPU run here."""
@@ -157,8 +176,8 @@ def _require_8_gpus() -> None:
 
     @pytest.mark.level(2)
     @pytest.mark.gpus(8)
-    def test_nano_inference_t2vs(tmp_path: Path) -> None:
-        """Run the docs/inference.md Cosmos3-Nano t2vs command; check the video + its sound."""
+    def test_nano_inference_omni(tmp_path: Path) -> None:
+        """One Cosmos3-Nano inference call over t2vs + policy + forward_dynamics; check each output."""
         out_dir = tmp_path / "out"
         cmd = [
             "torchrun",
@@ -168,7 +187,7 @@ def test_nano_inference_t2vs(tmp_path: Path) -> None:
             "cosmos_framework.scripts.inference",
             "--parallelism-preset=throughput",
             "-i",
-            "inputs/omni/t2vs.json",
+            *_INPUTS,
             "-o",
             str(out_dir),
             "--checkpoint-path",
@@ -177,10 +196,33 @@ def test_nano_inference_t2vs(tmp_path: Path) -> None:
         ]
         _run(cmd, tmp_path / "inference.log")
 
-        videos = list(out_dir.rglob("vision.mp4"))
-        assert len(videos) == 1, f"expected exactly one vision.mp4 under {out_dir}, found {videos}"
-        video = videos[0]
-        assert video.stat().st_size > 0, f"empty output video at {video}"
-        assert list(out_dir.rglob("sample_outputs.json")), f"no sample_outputs.json under {out_dir}"
+        results = sorted(out_dir.rglob("sample_outputs.json"))
+        assert len(results) == len(_INPUTS), (
+            f"expected {len(_INPUTS)} sample_outputs.json (one per input), found {[str(p) for p in results]}"
+        )
 
-        _assert_sound_not_noise(video)
+        # Dispatch validation by what each sample produced (robust to model_mode
+        # string formatting): a vision.mp4 -> valid video (+ sound if enabled);
+        # an `action` content -> valid action array.
+        n_video = n_sound = n_action = 0
+        for so in results:
+            data = json.loads(so.read_text())
+            args = data.get("args", {})
+            content = data["outputs"][0]["content"]
+            sample_dir = so.parent
+            video = sample_dir / "vision.mp4"
+            if video.is_file():
+                _assert_valid_video(video)
+                n_video += 1
+                if args.get("enable_sound"):
+                    _assert_sound_not_noise(video)
+                    n_sound += 1
+            if isinstance(content, dict) and content.get("action") is not None:
+                _assert_valid_action(content, str(so))
+                n_action += 1
+
+        # Every sample produces a valid video (t2vs, forward_dynamics, policy);
+        # the policy sample additionally yields an action, t2vs an audio track.
+        assert n_video == len(_INPUTS), f"expected every sample to produce a valid video, got {n_video}/{len(_INPUTS)}"
+        assert n_sound >= 1, f"expected the t2vs sample's audio to be checked, got {n_sound}"
+        assert n_action >= 1, f"expected the policy sample's action to be checked, got {n_action}"
diff --git a/tests/nano_training_smoke_test.py b/tests/nano_training_smoke_test.py
index ecdc545..ab7ecf2 100644
--- a/tests/nano_training_smoke_test.py
+++ b/tests/nano_training_smoke_test.py
@@ -1,23 +1,27 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-"""8-GPU smoke test for Cosmos3-Nano SFT training.
+"""8-GPU Cosmos3-Nano SFT pipeline smoke test (train -> export -> infer).
 
-Runs the documented Vision SFT (Cosmos3-Nano) flow from ``docs/training.md``
-end to end on 8 GPUs, capped to a single optimizer step via the
-``vision_sft_nano_1iter`` recipe (``max_iter=1``, ``save_iter=1``):
+Runs the documented Vision SFT (Cosmos3-Nano) lifecycle from ``docs/training.md``
+end to end on 8 GPUs and validates each artifact:
 
   1. Step 1 -- download the bridge-v2 subset dataset + the Wan2.2 VAE.
-  2. Step 2 -- ``convert_model_to_dcp`` the Cosmos3-Nano checkpoint to DCP.
-  3. Step 3 -- run the paired launch shell ``launch_sft_vision_nano_1iter.sh``.
-
-It asserts only that training completes and writes a checkpoint with a finite
-loss (smoke -- no numeric goldens; that is ``launch_regression_test.py``'s job).
-
-Inputs land in the documented, ``.gitignore``-d default locations
-(``examples/data/``, ``examples/checkpoints/``) so they are cached across runs;
-the training output goes under ``outputs/`` (also git-ignored). Steps 1-2 are
-skipped when their artifacts already exist.
+  2. Step 2 -- ``convert_model_to_dcp`` Cosmos3-Nano -> DCP; check DCP completeness.
+  3. Step 3 -- train 5 steps (``vision_sft_nano_5iter``); check the rank-0 loss
+     drops below its starting value (``min(loss) < loss[0]``; per-step diffusion
+     loss is too noisy for a strict trend over only 5 steps).
+  4. Export -- ``export_model`` the trained DCP -> HF safetensors; check export
+     completeness (the ``checkpoint.json`` sentinel + config + safetensors).
+  5. Inference -- a t2i generation from the exported model; check the image is
+     valid.
+
+Smoke-level checks only (artifact validity + loss dropping below its first-step
+value), not numeric goldens -- that is ``launch_regression_test.py``'s job.
+
+Inputs land in the documented ``.gitignore``-d locations (``examples/data/``,
+``examples/checkpoints/``, cached across runs); run output goes under the pytest
+tmp dir. Steps 1-2 are skipped when their artifacts already exist.
 
 Invocation (inside the training container, from the repo root, on an 8-GPU
 node)::
@@ -28,6 +32,7 @@
 not collected.
 """
 
+import json
 import os
 import re
 import shutil
@@ -49,7 +54,13 @@
 _DATASET_REVISION = "46468e12ac0dd36901e9e3240d4fc7620942b5d7"
 _WAN_VAE = REPO_ROOT / "examples/checkpoints/wan22_vae/Wan2.2_VAE.pth"
 _DCP_DIR = REPO_ROOT / "examples/checkpoints/Cosmos3-Nano"
-_LAUNCHER = "tests/launch_sft_vision_nano_1iter.sh"
+_LAUNCHER = "tests/launch_sft_vision_nano_5iter.sh"
+
+# rank-0 per-iteration loss from the IterSpeed callback, e.g.
+#   [RANK 0] Iteration 1: Hit counter: 1/50 | Loss: 0.2302 | Time: ...
+_RANK0_LOSS_RE = re.compile(
+    r"\[RANK\s+0\]\s+Iteration\s+\d+:\s+Hit counter:[^|]+\|\s+Loss:\s+([-+0-9.eE]+)"
+)
 
 
 def _free_port() -> int:
@@ -63,16 +74,15 @@ def _free_port() -> int:
 def _run(cmd: list[str], log_file: Path, extra_env: dict | None = None) -> tuple[int, str]:
     """Run ``cmd`` from the repo root, tee combined output to ``log_file``.
 
-    Returns ``(returncode, combined_output)``. Inherits the caller's env (HF
-    cache, etc.) plus ``PYTHONPATH=.``.
+    Returns ``(returncode, combined_output)``. Streams live to stdout (so CI
+    shows progress under ``pytest -s``) while capturing into the log + a string.
+    Inherits the caller's env (HF cache, LD_LIBRARY_PATH, ...) plus ``PYTHONPATH=.``.
     """
     env = os.environ.copy()
     env["PYTHONPATH"] = f".:{env.get('PYTHONPATH', '')}"
     if extra_env:
         env.update(extra_env)
     log_file.parent.mkdir(parents=True, exist_ok=True)
-    # Tee: stream the subprocess output live to stdout (so CI shows progress
-    # under ``pytest -s``) while capturing it into the log file + a string.
     captured: list[str] = []
     with log_file.open("w") as fp:
         proc = subprocess.Popen(
@@ -134,14 +144,10 @@ def _ensure_dcp(log_dir: Path) -> None:
     assert _DCP_DIR.is_dir() and any(_DCP_DIR.iterdir()), f"DCP not written to {_DCP_DIR}"
 
 
-def _finite_losses(text: str) -> list[float]:
-    """Parse per-iteration ``Loss:`` values from the training log.
-
-    Matches the ``iter_speed`` callback line, e.g.
-    ``Iteration 1: Hit counter: 1/50 | Loss: 0.2302 | Time: ...``.
-    """
+def _rank0_losses(text: str) -> list[float]:
+    """Parse the rank-0 per-iteration ``Loss:`` series (one value per step)."""
     vals = []
-    for m in re.finditer(r"Loss:\s*([-+0-9.eE]+)", text):
+    for m in _RANK0_LOSS_RE.finditer(text):
         try:
             v = float(m.group(1))
         except ValueError:
@@ -151,6 +157,119 @@ def _finite_losses(text: str) -> list[float]:
     return vals
 
 
+def _safetensors_tensor_names(path: Path) -> set[str]:
+    """Validate a .safetensors header (8-byte LE length + JSON) and return its tensor names."""
+    assert path.is_file() and path.stat().st_size > 8, f"safetensors shard missing/empty: {path}"
+    with path.open("rb") as f:
+        header_len = int.from_bytes(f.read(8), "little")
+        assert 0 < header_len < path.stat().st_size, f"bad safetensors header length in {path}: {header_len}"
+        header = json.loads(f.read(header_len))  # raises if the header isn't valid JSON
+    return {k for k in header if k != "__metadata__"}
+
+
+def _assert_dcp_complete(dcp_root: Path) -> None:
+    """Structural + index-consistency completeness of a torch DCP (no tensor load).
+
+    For each ``.metadata`` under ``dcp_root``: the shard files beside it must all
+    exist and be non-empty, and the set/count of ``*.distcp`` files on disk must
+    match the storage files the ``.metadata`` index references (no missing/extra).
+    Reading ``.metadata`` only parses the index, not the tensors.
+    """
+    assert dcp_root.is_dir(), f"DCP dir missing: {dcp_root}"
+    metas = list(dcp_root.rglob(".metadata"))
+    assert metas, f"no DCP .metadata under {dcp_root}"
+    from torch.distributed.checkpoint import FileSystemReader
+
+    for meta in metas:
+        assert meta.stat().st_size > 0, f"empty DCP .metadata: {meta}"
+        present = sorted(p.name for p in meta.parent.glob("*.distcp"))
+        assert present, f"no .distcp shards beside {meta}"
+        empty = [s for s in present if (meta.parent / s).stat().st_size == 0]
+        assert not empty, f"empty .distcp shards beside {meta}: {empty}"
+
+        # Index consistency: the .metadata declares which shard files exist.
+        metadata = FileSystemReader(str(meta.parent)).read_metadata()
+        referenced = {getattr(info, "relative_path", None) for info in metadata.storage_data.values()}
+        referenced.discard(None)
+        if referenced:  # skip only if this reader doesn't expose shard paths
+            missing = sorted(set(referenced) - set(present))
+            assert not missing, (
+                f"DCP {meta.parent}: .metadata references {len(referenced)} shard file(s) but "
+                f"these are missing on disk: {missing}"
+            )
+            assert len(present) == len(referenced), (
+                f"DCP {meta.parent}: {len(present)} .distcp file(s) on disk != "
+                f"{len(referenced)} referenced by .metadata ({present} vs {sorted(referenced)})"
+            )
+
+        # Tensor-manifest self-consistency: every tensor the .metadata declares
+        # (state_dict_metadata) must be backed by storage (no omitted param).
+        declared = set(metadata.state_dict_metadata.keys())
+        stored = {getattr(idx, "fqn", None) for idx in metadata.storage_data.keys()}
+        stored.discard(None)
+        assert declared, f"DCP .metadata declares no tensors: {meta}"
+        if stored:  # skip only if storage keys don't expose fqn
+            unstored = sorted(declared - stored)
+            assert not unstored, (
+                f"DCP {meta.parent}: {len(unstored)} declared tensor(s) have no storage "
+                f"(omitted): {unstored[:10]}"
+            )
+
+
+def _assert_export_complete(model_dir: Path) -> None:
+    """Structural + index completeness of an exported HF safetensors checkpoint."""
+    assert model_dir.is_dir(), f"export dir missing: {model_dir}"
+    # export_model writes checkpoint.json LAST as the "model is complete" sentinel.
+    for name in ("checkpoint.json", "config.json"):
+        p = model_dir / name
+        assert p.is_file() and p.stat().st_size > 0, f"export missing/empty {name} in {model_dir}"
+        json.loads(p.read_text())  # valid JSON
+    index = model_dir / "model.safetensors.index.json"
+    on_disk = sorted(p.name for p in model_dir.glob("*.safetensors"))
+    if index.is_file():
+        weight_map = json.loads(index.read_text()).get("weight_map", {})
+        declared = set(weight_map.keys())
+        shards = sorted(set(weight_map.values()))
+        assert declared and shards, f"empty weight_map in {index}"
+        missing = sorted(set(shards) - set(on_disk))
+        assert not missing, f"export {model_dir}: index references missing shards: {missing}"
+        # File-count consistency: exactly the index's shards on disk (no extra/missing).
+        assert len(on_disk) == len(shards), (
+            f"export {model_dir}: {len(on_disk)} .safetensors on disk != {len(shards)} in index "
+            f"weight_map ({on_disk} vs {shards})"
+        )
+        # Tensor-manifest self-consistency: the tensors actually stored across the
+        # shards must equal the index's declared keys (no omitted/extra param).
+        stored: set[str] = set()
+        for shard in shards:
+            stored |= _safetensors_tensor_names(model_dir / shard)
+        assert declared == stored, (
+            f"export {model_dir}: index declares {len(declared)} tensors but shards hold {len(stored)} "
+            f"(missing from shards: {sorted(declared - stored)[:10]}; not in index: {sorted(stored - declared)[:10]})"
+        )
+    else:
+        assert on_disk == ["model.safetensors"], (
+            f"export {model_dir}: expected a single model.safetensors (no index), found {on_disk}"
+        )
+        names = _safetensors_tensor_names(model_dir / "model.safetensors")
+        assert names, f"export {model_dir}: model.safetensors holds no tensors"
+
+
+def _assert_valid_image(path: Path) -> None:
+    """Assert ``path`` is a valid, non-degenerate image."""
+    assert path.is_file() and path.stat().st_size > 1024, f"output image missing/too small: {path}"
+    try:
+        from PIL import Image
+    except Exception:  # pragma: no cover -- PIL expected in the env
+        assert path.read_bytes()[:3] == b"\xff\xd8\xff", f"not a JPEG: {path}"
+        return
+    with Image.open(path) as im:
+        im.verify()  # detects truncation/corruption
+    with Image.open(path) as im:
+        width, height = im.size
+    assert width > 0 and height > 0, f"degenerate image size {width}x{height}: {path}"
+
+
 @pytest.fixture(scope="module", autouse=True)
 def _require_8_gpus() -> None:
     """Skip the module unless we can launch an 8-GPU training run here."""
@@ -170,14 +289,15 @@ def _require_8_gpus() -> None:
 
     @pytest.mark.level(2)
     @pytest.mark.gpus(8)
-    def test_nano_sft_vision_1iter(tmp_path: Path) -> None:
-        """Run the full Vision SFT (Cosmos3-Nano) 1-iter flow and check it trains a step."""
+    def test_nano_sft_train_export_infer(tmp_path: Path) -> None:
+        """Full Cosmos3-Nano SFT pipeline: convert -> train 5 -> export -> t2i infer."""
+        # 1-2. Inputs + HF->DCP convert, then DCP completeness.
         _ensure_inputs(tmp_path)
         _ensure_dcp(tmp_path)
+        _assert_dcp_complete(_DCP_DIR)
 
-        # Route all run-specific output (launcher logs + the saved checkpoint via
-        # the harness's IMAGINAIRE_OUTPUT_ROOT) under the pytest tmp dir, which
-        # pytest auto-cleans. Nothing run-specific is left in the repo tree.
+        # 3. Train 5 steps (run output -> pytest tmp via OUTPUT_ROOT + the harness's
+        #    IMAGINAIRE_OUTPUT_ROOT). Free port avoids EADDRINUSE.
         rc, out = _run(
             ["bash", _LAUNCHER],
             tmp_path / "train.log",
@@ -188,16 +308,55 @@ def test_nano_sft_vision_1iter(tmp_path: Path) -> None:
             },
         )
         assert rc == 0, f"SFT launch failed (exit {rc}):\nLog tail:\n{out[-4000:]}"
-
         assert "Done with training" in out, f"training did not finish cleanly:\nLog tail:\n{out[-4000:]}"
 
-        losses = _finite_losses(out)
-        assert losses, f"no finite per-iteration 'Loss:' value found in training log:\n{out[-3000:]}"
+        losses = _rank0_losses(out)
+        assert len(losses) == 5, f"expected 5 rank-0 losses, parsed {losses}\nLog tail:\n{out[-2000:]}"
+        # Per-step diffusion loss is noisy (a random timestep is sampled each step),
+        # so a strict trend over just 5 steps flakes on a single noisy step. The
+        # robust "training is learning" signal is that the loss dropped below its
+        # starting value at some point.
+        assert min(losses) < losses[0], (
+            f"loss never dropped below the first step over 5 steps (training not degrading): {losses}"
+        )
 
-        # save_iter=1 -> the trainer logs the DCP checkpoint path it wrote. Its
-        # location is governed by IMAGINAIRE_OUTPUT_ROOT (the test harness points
-        # this at a pytest tmp dir), so read it from the log rather than guessing.
+        # 4. Locate the trained DCP + config, export to HF safetensors, check completeness.
         saved = re.findall(r"Saved checkpoint to (\S+)", out)
-        assert saved, f"no 'Saved checkpoint to ...' line in training log (save_iter=1):\n{out[-3000:]}"
+        assert saved, f"no 'Saved checkpoint to ...' line in training log:\n{out[-2000:]}"
         ckpt = Path(saved[-1])
-        assert ckpt.is_dir() and any(ckpt.iterdir()), f"saved checkpoint dir missing/empty: {ckpt}"
+        assert ckpt.is_dir() and any(ckpt.iterdir()), f"trained checkpoint dir missing/empty: {ckpt}"
+        run_dir = ckpt.parent.parent  # <RUN_DIR>/checkpoints/iter_X -> <RUN_DIR>
+        config_yaml = run_dir / "config.yaml"
+        assert config_yaml.is_file(), f"run config.yaml missing at {config_yaml}"
+
+        export_dir = run_dir / "model"
+        rc, out = _run(
+            [
+                "python", "-m", "cosmos_framework.scripts.export_model",
+                "--checkpoint-path", str(ckpt),
+                "--config-file", str(config_yaml),
+                "-o", str(export_dir),
+            ],
+            tmp_path / "export.log",
+        )
+        assert rc == 0, f"export_model failed (exit {rc}):\nLog tail:\n{out[-4000:]}"
+        _assert_export_complete(export_dir)
+
+        # 5. t2i inference from the exported model; check the image is valid.
+        infer_out = tmp_path / "exported_out"
+        rc, out = _run(
+            [
+                "torchrun", "--nproc_per_node=8", f"--master_port={_free_port()}",
+                "-m", "cosmos_framework.scripts.inference",
+                "--parallelism-preset=throughput",
+                "-i", "inputs/omni/t2i.json",
+                "-o", str(infer_out),
+                "--checkpoint-path", str(export_dir),
+                "--seed=0",
+            ],
+            tmp_path / "infer.log",
+        )
+        assert rc == 0, f"t2i inference from exported model failed (exit {rc}):\nLog tail:\n{out[-4000:]}"
+        images = list(infer_out.rglob("vision.jpg"))
+        assert len(images) == 1, f"expected one vision.jpg under {infer_out}, found {images}"
+        _assert_valid_image(images[0])
diff --git a/tests/vision_sft_nano_1iter.toml b/tests/vision_sft_nano_5iter.toml
similarity index 80%
rename from tests/vision_sft_nano_1iter.toml
rename to tests/vision_sft_nano_5iter.toml
index c88eed6..b88b8b5 100644
--- a/tests/vision_sft_nano_1iter.toml
+++ b/tests/vision_sft_nano_5iter.toml
@@ -5,15 +5,18 @@
 # Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml.
 # Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted here).
 #
-# SMOKE COPY of vision_sft_nano.toml: max_iter=1 + save_iter=1 so it trains a
-# single optimizer step and immediately writes a DCP checkpoint.
+# SMOKE COPY of vision_sft_nano.toml used by tests/nano_training_smoke_test.py:
+# max_iter=5 + save_iter=5 so it trains a few optimizer steps and writes a DCP
+# checkpoint at the end. warm_up_steps=1 (vs the production warmup) so the LR is
+# at full value almost immediately and the training loss visibly trends down
+# across the 5 logged steps (the test asserts min(loss) < loss[0]).
 
 [job]
 task         = "vfm"
 experiment   = "vision_sft_nano"
 project      = "cosmos3"
 group        = "sft"
-name         = "vision_sft_nano_1iter"
+name         = "vision_sft_nano_5iter"
 wandb_mode   = "disabled"
 
 [model]
@@ -63,13 +66,13 @@ f_max              = [1.0]
 f_min              = [0.0]
 f_start            = [0.0]
 verbosity_interval = 0
-warm_up_steps      = [50]
+warm_up_steps      = [1]                                 # smoke: full LR almost immediately so loss trends down in 5 steps
 
 [trainer]
 distributed_parallelism = "fsdp"
 grad_accum_iter         = 2
 logging_iter            = 1
-max_iter                = 1
+max_iter                = 5
 
 [trainer.callbacks.compile_tokenizer]
 compile_after_iterations = 3
@@ -83,7 +86,7 @@ force_finite = true
 [checkpoint]
 keys_to_skip_loading = ["net_ema."]
 load_path            = "${oc.env:BASE_CHECKPOINT_PATH}"
-save_iter            = 1
+save_iter            = 5
 
 [dataloader_train]
 max_sequence_length = 45056

From b31ff926e45af422d8320d2ecdb56cc5df0dd7f9 Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 20:40:01 -0700
Subject: [PATCH 6/8] Lint

---
 tests/launch_sft_vision_nano_5iter.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/launch_sft_vision_nano_5iter.sh b/tests/launch_sft_vision_nano_5iter.sh
index 0fdf748..2175dcc 100755
--- a/tests/launch_sft_vision_nano_5iter.sh
+++ b/tests/launch_sft_vision_nano_5iter.sh
@@ -1,4 +1,7 @@
 #!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
 # SMOKE wrapper (test fixture) for tests/nano_training_smoke_test.py — mirrors
 # examples/launch_sft_vision_nano.sh but points at tests/vision_sft_nano_5iter.toml
 # (max_iter=5, save_iter=5). Reuses the shared launcher helper from examples/.

From 7257ad05fc2906cd3224d77a7e32a89f12f71796 Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 21:09:31 -0700
Subject: [PATCH 7/8] test(regression): assert all 10 reasoner iters at a loose
 tolerance

The llava_ov_datapacker (reasoner) spec previously asserted only iter-0
loss, since it runs non-deterministically (no deterministic Hopper FMHA
backward kernel; streamed LLaVA-OneVision data) and iters 1+ drift
run-to-run.

Give LaunchSpec per-spec loss_rtol/loss_atol (defaulting to the tight
1e-3 the deterministic generator uses) and set the reasoner to assert all
10 iters at rtol=atol=0.01. Two H200 samples differ by at most ~0.006
across the 10 iters, so 0.01 holds with margin while still catching a
real numerical regression. Recapture the reasoner goldens to H200 values
(iter-0 is bit-exact H100==H200).

The generator (vision_sft_nano) is unchanged: still all 10 iters at 1e-3.
---
 tests/launch_regression_test.py | 46 ++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/tests/launch_regression_test.py b/tests/launch_regression_test.py
index 65b4358..2b7eb46 100644
--- a/tests/launch_regression_test.py
+++ b/tests/launch_regression_test.py
@@ -54,9 +54,10 @@
     deterministic context). ``VLMModel.__init__`` honors the config-level
     flag via ``init_flash_attn_meta`` independently of the launcher arg, so
     both must be off. It also streams ``lmms-lab/LLaVA-OneVision-Data`` from
-    HuggingFace Hub, so only the first 2 iters reproduce in practice (later
-    iters drift with shard arrival order + non-det kernels). Set
-    ``COSMOS_REGRESSION_VLM_FULL=1`` to assert all 10 (expected to fail).
+    HuggingFace Hub: iter-0 is bit-exact but iters 1+ drift run-to-run with
+    shard arrival order + non-det kernels. All 10 iters are asserted, but with
+    the spec's loose ``loss_rtol``/``loss_atol`` (vs the tight 1e-3 the
+    deterministic vision spec uses) to absorb that drift.
 
 Refreshing the goldens (after an intentional numerical change)::
 
@@ -198,6 +199,11 @@ class LaunchSpec:
     # the tighter goldens tolerance only on the iters that still reproduce in
     # practice (see ``deterministic_iters``).
     deterministic: bool = True
+    # Per-spec ``pytest.approx`` tolerance, applied to both the loss and the
+    # grad-norm goldens. Deterministic specs use the tight default; non-det
+    # specs (e.g. the reasoner) need a looser band to absorb per-step drift.
+    loss_rtol: float = _DEFAULT_RTOL
+    loss_atol: float = _DEFAULT_ATOL
 
 
 # 4-GPU specs run by ``test_launch_regression``; 8-GPU specs run by
@@ -251,15 +257,21 @@ def _build_specs(paths: dict[str, str]) -> dict[str, LaunchSpec]:
                 "upload_reproducible_setup=false",
             ),
             loss_re=_VLM_LOSS_RE,
-            # Only iter-0 loss reproduces under non-deterministic mode: it's a
-            # pure forward on a seed-fixed batch with seed-fixed init weights,
-            # so it's bit-exact. Iter 1+ depends on iter-0's non-deterministic
-            # backward (no deterministic Hopper FMHA kernel on H100) and drifts
-            # immediately.
-            deterministic_iters=1,
+            # Non-deterministic spec: iter-0 is bit-exact (pure forward on a
+            # seed-fixed batch + init), but iters 1+ drift run-to-run (the Hopper
+            # FMHA backward has no deterministic kernel and the LLaVA-OneVision
+            # data is streamed). We still assert all 10 iters but with a loose
+            # tolerance (loss_rtol/loss_atol below) to absorb that drift.
+            deterministic_iters=10,
             # See the ``deterministic=false`` override above for the
             # Hopper-FMHA rationale; the launcher flag is dropped to match.
             deterministic=False,
+            # Loose band for the non-deterministic per-step loss (vs the tight
+            # 1e-3 default the deterministic VFM spec uses). Two H200 samples
+            # differ by at most ~0.006 across the 10 iters, so 0.01 holds with
+            # margin while still catching a real numerical regression.
+            loss_rtol=0.01,
+            loss_atol=0.01,
         ),
         "vision_sft_nano": LaunchSpec(
             # Replicates launch_sft_vision_nano.sh, capped to 10 iters.
@@ -494,11 +506,9 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str,
     )
 
     n = spec.deterministic_iters
-    if spec.key == "llava_ov_datapacker" and os.environ.get("COSMOS_REGRESSION_VLM_FULL") == "1":
-        n = 10
 
     assert loss[:n] == pytest.approx(
-        expected["loss"][:n], rel=_DEFAULT_RTOL, abs=_DEFAULT_ATOL
+        expected["loss"][:n], rel=spec.loss_rtol, abs=spec.loss_atol
     ), (
         f"{spec.key} ({arch}): rank-0 loss[:{n}] does not match goldens\n"
         f"  got     : {loss[:n]}\n"
@@ -509,7 +519,7 @@ def _assert_spec_matches_goldens(spec_key: str, tmp_path: Path, paths: dict[str,
     if expected["grad_norm"] is None:
         return
     assert grad_norm[:n] == pytest.approx(
-        expected["grad_norm"][:n], rel=_DEFAULT_RTOL, abs=_DEFAULT_ATOL
+        expected["grad_norm"][:n], rel=spec.loss_rtol, abs=spec.loss_atol
     ), (
         f"{spec.key} ({arch}): global grad-norm[:{n}] does not match goldens\n"
         f"  got     : {grad_norm[:n]}\n"
@@ -565,12 +575,12 @@ def test_launch_regression_8gpu(spec_key: str, tmp_path: Path, h100_inputs: dict
         # Recaptured 2026-06-03 with deterministic mode off (both ``--deterministic``
         # and ``model.config.deterministic`` are False — the Hopper FMHA
         # backward refuses to run under PyTorch deterministic mode on H100, see
-        # ``LaunchSpec.deterministic`` and the spec's hydra override). The full
-        # 10-iter series is captured for reference, but only ``deterministic_iters=1``
-        # loss is asserted; iter 1+ drifts because the backward isn't bit-exact,
-        # and even iter-0 grad-norm drifts (so grad_norm is skipped via ``None``).
+        # ``LaunchSpec.deterministic`` and the spec's hydra override). These are
+        # H200 values (iter-0 is bit-exact H100==H200). All 10 iters are asserted
+        # but against the spec's loose tolerance (loss_rtol/loss_atol=0.01) since
+        # iters 1+ drift run-to-run; grad-norm is non-det too, so skipped (None).
         "llava_ov_datapacker": {
-            "loss": [0.88798, 1.01583, 1.06096, 1.05566, 1.00613, 0.91551, 1.10534, 1.03794, 0.94166, 0.69613],
+            "loss": [0.88798, 1.01444, 1.0565, 1.04765, 0.99979, 0.92324, 1.1051, 1.03238, 0.93775, 0.69643],
             "grad_norm": None,
         },
         # Recaptured 2026-06-03 after the TOML-config rewrite shifted some

From 84b853a6fc64a00591779ebd312e681e34d7fd8f Mon Sep 17 00:00:00 2001
From: "liang.feng" <liangf@nvidia.com>
Date: Wed, 3 Jun 2026 21:25:55 -0700
Subject: [PATCH 8/8] Rename for order

---
 .../{gpu-smoke-training.yml => 0-gpu-smoke-training.yml}          | 0
 ...pu-regression-generator.yml => 1-gpu-regression-generator.yml} | 0
 .../{gpu-smoke-inference.yml => 2-gpu-smoke-inference.yml}        | 0
 ...{gpu-regression-reasoner.yml => 3-gpu-regression-reasoner.yml} | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename .github/workflows/{gpu-smoke-training.yml => 0-gpu-smoke-training.yml} (100%)
 rename .github/workflows/{gpu-regression-generator.yml => 1-gpu-regression-generator.yml} (100%)
 rename .github/workflows/{gpu-smoke-inference.yml => 2-gpu-smoke-inference.yml} (100%)
 rename .github/workflows/{gpu-regression-reasoner.yml => 3-gpu-regression-reasoner.yml} (100%)

diff --git a/.github/workflows/gpu-smoke-training.yml b/.github/workflows/0-gpu-smoke-training.yml
similarity index 100%
rename from .github/workflows/gpu-smoke-training.yml
rename to .github/workflows/0-gpu-smoke-training.yml
diff --git a/.github/workflows/gpu-regression-generator.yml b/.github/workflows/1-gpu-regression-generator.yml
similarity index 100%
rename from .github/workflows/gpu-regression-generator.yml
rename to .github/workflows/1-gpu-regression-generator.yml
diff --git a/.github/workflows/gpu-smoke-inference.yml b/.github/workflows/2-gpu-smoke-inference.yml
similarity index 100%
rename from .github/workflows/gpu-smoke-inference.yml
rename to .github/workflows/2-gpu-smoke-inference.yml
diff --git a/.github/workflows/gpu-regression-reasoner.yml b/.github/workflows/3-gpu-regression-reasoner.yml
similarity index 100%
rename from .github/workflows/gpu-regression-reasoner.yml
rename to .github/workflows/3-gpu-regression-reasoner.yml