Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions .github/workflows/0-gpu-smoke-training.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Cosmos3-Nano 8-GPU SFT pipeline smoke test (convert -> train 5 -> export ->
# t2i infer) on a self-hosted 8×H200 runner.
#
# Requires:
# * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs and
# NVIDIA drivers (`uv` itself is installed by the setup-uv step);
# * an `HF_TOKEN` repository secret (gated dataset/model downloads).
#
# Inputs (Cosmos3-Nano -> DCP, bridge dataset, Wan VAE) are downloaded /
# converted in-test. The first run is slow (~30 GB Nano + DCP convert + 5-step
# train + export + a t2i generation). Later runs reuse the HF cache and the
# examples/data dataset; examples/checkpoints (the DCP + Wan VAE) is deleted by
# the cleanup step, so that part is rebuilt on every run.
name: GPU Smoke (Training)

# Run on pushes to main and on pull requests that target main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# One active run per ref for this workflow: a newer run on the same ref
# cancels the run that is still in progress.
concurrency:
  group: gpu-smoke-training-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Runs the Nano SFT pipeline smoke test, then removes the heavy outputs.
  training-smoke:
    # NOTE(review): the pull_request trigger runs PR code on this self-hosted
    # runner; confirm that fork PRs need maintainer approval before they run.
    runs-on: [self-hosted, gpu, h200]
    timeout-minutes: 90
    # Least privilege: the job only checks out the repository and runs tests,
    # so GITHUB_TOKEN is limited to read access on contents.
    permissions:
      contents: read
    env:
      # Token for the gated Hugging Face dataset/model downloads.
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      HF_HUB_DISABLE_XET: "1"
    steps:
      - uses: actions/checkout@v6
        with:
          # No visible step needs git credentials after checkout, so do not
          # leave the token configured on the runner.
          persist-credentials: false

      - uses: astral-sh/setup-uv@v7

      - name: Sync environment (cu128-train)
        run: uv sync --all-extras --group=cu128-train

      # Full SFT pipeline: download + convert Nano->DCP, train 5 steps (loss
      # trend), export to HF safetensors, then a t2i generation from the export.
      # MAX_GPUS defaults to 8. -s streams the live process log.
      # LD_LIBRARY_PATH is emptied before pytest starts (presumably so the uv
      # environment's own libraries are used -- confirm).
      - name: Nano SFT pipeline smoke (convert -> train 5 -> export -> t2i, 8 GPU)
        run: |
          export LD_LIBRARY_PATH=
          uv run --all-extras --group=cu128-train python -m pytest -v -s \
            tests/nano_training_smoke_test.py --num-gpus=8 --levels=2 -o addopts=

      # Clear the heavy artifacts (even on failure): examples/checkpoints (the
      # Cosmos3-Nano DCP + Wan VAE, ~30 GB) and the pytest tmp dirs (the SFT
      # checkpoint + logs). The small examples/data dataset and the HF cache are
      # intentionally kept so subsequent runs reuse them.
      - name: Clean up run outputs
        if: always()
        run: |
          rm -rf examples/checkpoints || true
          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
56 changes: 56 additions & 0 deletions .github/workflows/1-gpu-regression-generator.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Generator (VFM) SFT loss regression on a self-hosted 8×H200 runner (4-GPU
# subset). Runs the single ``vision_sft_nano`` spec of
# tests/launch_regression_test.py.
#
# Requires:
# * a self-hosted runner labelled [self-hosted, gpu, h200] with >=4 GPUs and
# NVIDIA drivers (`uv` itself is installed by the setup-uv step);
# * an `HF_TOKEN` repository secret (gated dataset/model downloads).
#
# The h100 goldens are reused on H200 (see _detect_arch).
name: GPU Regression (Generator)

# Run on pushes to main and on pull requests that target main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# One active run per ref for this workflow: a newer run on the same ref
# cancels the run that is still in progress.
concurrency:
  group: gpu-regression-generator-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Runs the vision_sft_nano loss regression, then removes the pytest tmp dirs.
  generator-regression:
    # NOTE(review): the pull_request trigger runs PR code on this self-hosted
    # runner; confirm that fork PRs need maintainer approval before they run.
    runs-on: [self-hosted, gpu, h200]
    timeout-minutes: 60
    # Least privilege: the job only checks out the repository and runs tests,
    # so GITHUB_TOKEN is limited to read access on contents.
    permissions:
      contents: read
    env:
      # Token for the gated Hugging Face dataset/model downloads.
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      HF_HUB_DISABLE_XET: "1"
      # Select the 4-GPU regression test variant (uses 4 of the 8 GPUs).
      TEST_MAX_GPUS: "4"
    steps:
      - uses: actions/checkout@v6
        with:
          # No visible step needs git credentials after checkout, so do not
          # leave the token configured on the runner.
          persist-credentials: false

      - uses: astral-sh/setup-uv@v7

      - name: Sync environment (cu128-train)
        run: uv sync --all-extras --group=cu128-train

      # Generator (vision_sft_nano) loss vs the h100 goldens. -s streams the
      # live log. LD_LIBRARY_PATH is emptied before pytest starts (presumably
      # so the uv environment's own libraries are used -- confirm).
      - name: Generator regression (vision_sft_nano, 4-GPU subset)
        run: |
          export LD_LIBRARY_PATH=
          uv run --all-extras --group=cu128-train python -m pytest -v -s \
            tests/launch_regression_test.py -k vision_sft_nano \
            --num-gpus=4 --levels=2 -o addopts=

      # The h100_inputs fixture removes its DCP stage on teardown; clear the
      # pytest tmp dirs too (logs + any run output). The HF cache is kept.
      - name: Clean up run outputs
        if: always()
        run: |
          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
54 changes: 54 additions & 0 deletions .github/workflows/2-gpu-smoke-inference.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Cosmos3-Nano 8-GPU multi-modality inference smoke (t2vs + policy + forward_dynamics) on a
# self-hosted 8×H200 runner.
#
# Requires:
# * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs and
# NVIDIA drivers (`uv` itself is installed by the setup-uv step);
# * an `HF_TOKEN` repository secret (gated model downloads).
#
# The Cosmos3-Nano checkpoint (and its sound tokenizer) download to the runner's
# HF cache; later runs reuse it.
name: GPU Smoke (Inference)

# Run on pushes to main and on pull requests that target main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# One active run per ref for this workflow: a newer run on the same ref
# cancels the run that is still in progress.
concurrency:
  group: gpu-smoke-inference-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Runs the Nano inference smoke test, then removes the pytest tmp dirs.
  inference-smoke:
    # NOTE(review): the pull_request trigger runs PR code on this self-hosted
    # runner; confirm that fork PRs need maintainer approval before they run.
    runs-on: [self-hosted, gpu, h200]
    timeout-minutes: 60
    # Least privilege: the job only checks out the repository and runs tests,
    # so GITHUB_TOKEN is limited to read access on contents.
    permissions:
      contents: read
    env:
      # Token for the gated Hugging Face model downloads.
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      HF_HUB_DISABLE_XET: "1"
    steps:
      - uses: actions/checkout@v6
        with:
          # No visible step needs git credentials after checkout, so do not
          # leave the token configured on the runner.
          persist-credentials: false

      - uses: astral-sh/setup-uv@v7

      - name: Sync environment (cu128-train)
        run: uv sync --all-extras --group=cu128-train

      # One inference call over t2vs (+sound), action policy, and
      # forward_dynamics; checks each output.
      # MAX_GPUS defaults to 8. -s streams the live process log.
      # LD_LIBRARY_PATH is emptied before pytest starts (presumably so the uv
      # environment's own libraries are used -- confirm).
      - name: Nano inference smoke (t2vs + action policy + forward_dynamics, 8 GPU)
        run: |
          export LD_LIBRARY_PATH=
          uv run --all-extras --group=cu128-train python -m pytest -v -s \
            tests/nano_inference_smoke_test.py --num-gpus=8 --levels=2 -o addopts=

      # Inference writes only the pytest tmp dir (the t2vs video + logs); the
      # checkpoint download stays in the HF cache (kept). No examples/ artifacts.
      - name: Clean up run outputs
        if: always()
        run: |
          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
58 changes: 58 additions & 0 deletions .github/workflows/3-gpu-regression-reasoner.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: OpenMDW-1.1

# Reasoner (VLM) SFT loss regression on a self-hosted 8×H200 runner (4-GPU
# subset). Runs the single ``llava_ov_datapacker`` spec of
# tests/launch_regression_test.py.
#
# Requires:
# * a self-hosted runner labelled [self-hosted, gpu, h200] with >=4 GPUs and
# NVIDIA drivers (`uv` itself is installed by the setup-uv step);
# * an `HF_TOKEN` repository secret (gated dataset/model downloads, incl. the
# streamed LLaVA-OneVision-Data dataset).
#
# The h100 goldens are reused on H200 (see _detect_arch).
name: GPU Regression (Reasoner)

# Run on pushes to main and on pull requests that target main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# One active run per ref for this workflow: a newer run on the same ref
# cancels the run that is still in progress.
concurrency:
  group: gpu-regression-reasoner-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Runs the llava_ov_datapacker loss regression, then removes the pytest tmp
  # dirs.
  reasoner-regression:
    # NOTE(review): the pull_request trigger runs PR code on this self-hosted
    # runner; confirm that fork PRs need maintainer approval before they run.
    runs-on: [self-hosted, gpu, h200]
    timeout-minutes: 60
    # Least privilege: the job only checks out the repository and runs tests,
    # so GITHUB_TOKEN is limited to read access on contents.
    permissions:
      contents: read
    env:
      # Token for the gated Hugging Face dataset/model downloads.
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      HF_HUB_DISABLE_XET: "1"
      # Select the 4-GPU regression test variant (uses 4 of the 8 GPUs).
      TEST_MAX_GPUS: "4"
    steps:
      - uses: actions/checkout@v6
        with:
          # No visible step needs git credentials after checkout, so do not
          # leave the token configured on the runner.
          persist-credentials: false

      - uses: astral-sh/setup-uv@v7

      - name: Sync environment (cu128-train)
        run: uv sync --all-extras --group=cu128-train

      # Reasoner (llava_ov_datapacker) iter-0 loss vs the h100 goldens. -s
      # streams the live log. LD_LIBRARY_PATH is emptied before pytest starts
      # (presumably so the uv environment's own libraries are used -- confirm).
      - name: Reasoner regression (llava_ov_datapacker, 4-GPU subset)
        run: |
          export LD_LIBRARY_PATH=
          uv run --all-extras --group=cu128-train python -m pytest -v -s \
            tests/launch_regression_test.py -k llava_ov_datapacker \
            --num-gpus=4 --levels=2 -o addopts=

      # The h100_inputs fixture removes its DCP stage on teardown; clear the
      # pytest tmp dirs too (logs + any run output). The HF cache is kept.
      - name: Clean up run outputs
        if: always()
        run: |
          rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true
Loading