2 changes: 1 addition & 1 deletion .github/ci-config.env
@@ -124,7 +124,7 @@ BFB_RUN_TIMEOUT=10
# ── Nsight Systems CLI (profile-gpu-nsight workflow) ─
# Bump NSYS_CLI_CACHE_VERSION to invalidate GitHub Actions cache of downloaded RPMs
# when NVIDIA updates packages in the devtools repo.
NSYS_CLI_CACHE_VERSION=1
NSYS_CLI_CACHE_VERSION=2


# ── KNOWN ISSUES ─────────────────────────────────
32 changes: 32 additions & 0 deletions .github/scripts/pin-gpu.sh
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Per-rank GPU pinning shim. Wrap your MPI binary like:
# mpirun -n N <flags> bash pin-gpu.sh ./atmosphere_model
# Sets CUDA_VISIBLE_DEVICES to (local_rank % visible_gpu_count) per rank.
# Does nothing when no GPUs are detected (lets the model run as-is).
#
# Override knobs:
# PIN_GPU_NGPU force visible GPU count (skip detection)
# PIN_GPU_DEBUG set to 1 for verbose detection output
set -u

LOCAL_RANK="${MPI_LOCALRANKID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${PMI_LOCAL_RANK:-${SLURM_LOCALID:-0}}}}"

NGPU="${PIN_GPU_NGPU:-}"
if [ -z "${NGPU}" ]; then
  if command -v nvidia-smi >/dev/null 2>&1; then
    NGPU=$(nvidia-smi -L 2>/dev/null | wc -l)
  fi
  if [ -z "${NGPU}" ] || [ "${NGPU}" -eq 0 ]; then
    NGPU=$(ls /dev/nvidia[0-9]* 2>/dev/null | wc -l)
  fi
fi

# Honor the PIN_GPU_DEBUG knob documented above.
if [ "${PIN_GPU_DEBUG:-0}" = "1" ]; then
  echo "[pin-gpu] debug: rank=${LOCAL_RANK} PIN_GPU_NGPU='${PIN_GPU_NGPU:-}' detected NGPU=${NGPU}" >&2
fi

if [ -n "${NGPU}" ] && [ "${NGPU}" -gt 0 ]; then
  GPU_ID=$((LOCAL_RANK % NGPU))
  export CUDA_VISIBLE_DEVICES="${GPU_ID}"
  echo "[pin-gpu] rank=${LOCAL_RANK} -> GPU ${GPU_ID} (of ${NGPU})" >&2
else
  echo "[pin-gpu] no GPUs detected; not pinning (rank=${LOCAL_RANK})" >&2
fi

exec "$@"
37 changes: 30 additions & 7 deletions .github/scripts/run-nsys-profile.sh
@@ -49,24 +49,47 @@ if [ "${MPI_IMPL}" = "openmpi" ]; then
  MPI_FLAGS="${OPENMPI_RUN_FLAGS:---allow-run-as-root --oversubscribe}"
fi

# Opt-in CUDA-aware MPI: passes device pointers to MPI without host staging.
# Requires the container's MPI library to be built with GPU support; if it
# isn't, MPI will either silently fall back to host staging or abort.
CUDA_AWARE_MPI="${CUDA_AWARE_MPI:-false}"
if [ "${CUDA_AWARE_MPI}" = "true" ]; then
case "${MPI_IMPL}" in
mpich)
export MPICH_GPU_SUPPORT_ENABLED=1
;;
openmpi)
export OMPI_MCA_pml=ucx
export OMPI_MCA_osc=ucx
export UCX_TLS=cuda,cuda_copy,cuda_ipc,sm,self
MPI_FLAGS="${MPI_FLAGS} --mca pml ucx --mca osc ucx"
;;
esac
fi
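Whether the container's MPI actually has GPU support can be probed before opting in. A sketch, not part of this script, using `ompi_info`'s build-time flag for Open MPI; the MPICH check is a heuristic, since `mpichversion` output varies by build:

```bash
# Pre-flight: was this MPI library built with GPU support?
if [ "${MPI_IMPL}" = "openmpi" ]; then
  # Open MPI records CUDA support as a build-time parameter.
  if ompi_info --parsable --all | grep -q 'mpi_built_with_cuda_support:value:true'; then
    echo "Open MPI: CUDA-aware build"
  else
    echo "Open MPI: not CUDA-aware; CUDA_AWARE_MPI=true would host-stage or abort"
  fi
else
  # MPICH builds with GPU support usually mention CUDA in their configure info.
  mpichversion 2>/dev/null | grep -i cuda || echo "MPICH: no CUDA mention in build info"
fi
```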

ulimit -s unlimited 2>/dev/null || true

cd "${WORKDIR}"

OUT_ABS="${PWD}/${NSYS_BASENAME}"
echo "=== Nsight profile ==="
echo " workdir: ${WORKDIR}"
echo " ranks: ${NUM_PROCS}"
echo " mpi: ${MPI_IMPL}"
echo " output: ${OUT_ABS}"
echo " timeout: ${TIMEOUT}m"
echo " workdir: ${WORKDIR}"
echo " ranks: ${NUM_PROCS}"
echo " mpi: ${MPI_IMPL}"
echo " cuda-aware mpi: ${CUDA_AWARE_MPI}"
echo " output: ${OUT_ABS}"
echo " timeout: ${TIMEOUT}m"

# Trace MPI alongside CUDA so halo exchanges show up in the timeline and we
# can tell device-to-device transfers from host-staged ones.
# pin-gpu.sh sets CUDA_VISIBLE_DEVICES per rank so multi-rank runs spread
# across the node's GPUs instead of stacking on device 0.
set +e
timeout "${TIMEOUT}"m "${NSYS_BIN}" profile \
--trace=cuda,nvtx,osrt \
--trace=cuda,nvtx,osrt,mpi \
--stats=true \
-o "${OUT_ABS}" \
mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} ./atmosphere_model
mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} bash "${SCRIPT_DIR}/pin-gpu.sh" ./atmosphere_model
RUN_STATUS=$?
set -e
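With MPI in the trace list, the saved report can also be summarized offline. A sketch, assuming a recent Nsight Systems where the `mpi_event_sum` report exists (report names vary by version; `nsys stats --help-reports` lists what's available):

```bash
# Summarize MPI calls alongside GPU memory-transfer time from the report.
"${NSYS_BIN}" stats \
  --report mpi_event_sum,cuda_gpu_mem_time_sum \
  "${OUT_ABS}.nsys-rep"
```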

56 changes: 53 additions & 3 deletions .github/workflows/profile-gpu-nsight.yml
@@ -37,6 +37,11 @@ on:
required: false
default: '40'
type: string
cuda_aware_mpi:
description: End-to-end CUDA-aware MPI. Sets config_gpu_aware_mpi=.true. in &development (so MPAS hands device pointers to MPI) and sets MPICH_GPU_SUPPORT_ENABLED=1 / OpenMPI UCX-CUDA flags. Requires the container's MPI to be built with GPU support.
required: false
default: false
type: boolean
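For reference, a manual dispatch that flips the new input on might look like this with the GitHub CLI (the ref is a placeholder; the other inputs keep their defaults):

```bash
gh workflow run profile-gpu-nsight.yml \
  --ref main \
  -f cuda_aware_mpi=true
```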

jobs:
config:
@@ -99,9 +104,21 @@ jobs:
- name: GPU check
shell: bash
run: |
echo "## GPU" >> "$GITHUB_STEP_SUMMARY"
nvidia-smi || echo "::warning::nvidia-smi failed"
nvidia-smi || true
{
  echo "## GPU diagnostics"
  echo ""
  echo "### /dev/nvidia*"
  ls -la /dev/nvidia* 2>&1 || echo "(none; container not GPU-mapped?)"
  echo ""
  echo "### nvidia-smi -L (uses /dev/nvidia*, bypasses libnvidia-ml stub)"
  nvidia-smi -L 2>&1 || echo "(nvidia-smi -L failed)"
  echo ""
  echo "### nvidia-smi --query-gpu (CSV)"
  nvidia-smi --query-gpu=index,name,uuid,driver_version,memory.total --format=csv 2>&1 || echo "(query failed)"
  echo ""
  echo "### CUDA / OpenACC env"
  env | grep -E '^(CUDA|ACC|NVIDIA|MPICH_GPU|OMPI|UCX)_' | sort || true
} | tee -a "$GITHUB_STEP_SUMMARY"
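The step deliberately only warns. If a run should instead fail fast when the runner exposes no GPU, a guard along these lines could follow the diagnostics (a sketch, reusing the same /dev/nvidia* check as pin-gpu.sh):

```bash
# Optional hard gate: abort early when no GPU devices are mapped in.
if ! ls /dev/nvidia[0-9]* >/dev/null 2>&1; then
  echo "::error::no /dev/nvidia* devices visible in the container"
  exit 1
fi
```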

- name: Setup Nsight Systems CLI (install + RPM cache)
uses: ./.github/actions/setup-nsight-systems
@@ -127,8 +144,41 @@
echo "config_run_duration -> ${DURATION} (config_dt left as in test case)"
grep -E 'config_dt|config_run_duration' nsight-case/namelist.atmosphere | head -n 5 || true

# MPAS host-stages halos itself unless config_gpu_aware_mpi=.true. is set in
# &development. Without it, the MPI library never sees a device pointer and
# MPICH_GPU_SUPPORT_ENABLED has nothing to act on.
- name: Patch namelist (config_gpu_aware_mpi = .true.)
if: ${{ inputs.cuda_aware_mpi }}
shell: bash
working-directory: nsight-case
run: |
python3 - <<'PY'
import re
from pathlib import Path

p = Path("namelist.atmosphere")
text = p.read_text()
if re.search(r"^\s*&development\b", text, flags=re.MULTILINE):
    def edit(m):
        block = m.group(0)
        if re.search(r"config_gpu_aware_mpi\s*=", block):
            return re.sub(r"config_gpu_aware_mpi\s*=\s*\.\w+\.",
                          "config_gpu_aware_mpi = .true.", block)
        return re.sub(r"(\n\s*/\s*)$",
                      r"\n    config_gpu_aware_mpi = .true.\1",
                      block, count=1)
    text = re.sub(r"^\s*&development\b[\s\S]*?\n\s*/\s*",
                  edit, text, count=1, flags=re.MULTILINE)
else:
    text = text.rstrip() + "\n&development\n    config_gpu_aware_mpi = .true.\n/\n"
p.write_text(text)
PY
echo "&development block now reads:"
awk '/^[[:space:]]*&development/,/^[[:space:]]*\//' namelist.atmosphere
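A quick way to see the patcher's append path on a case with no &development block at all (illustrative; `patch_namelist.py` is a hypothetical file holding the heredoc body above):

```bash
cd "$(mktemp -d)"
printf '&nhyd_model\n    config_dt = 60.0\n/\n' > namelist.atmosphere
python3 patch_namelist.py
awk '/^[[:space:]]*&development/,/^[[:space:]]*\//' namelist.atmosphere
# Prints:
#   &development
#       config_gpu_aware_mpi = .true.
#   /
```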

- name: Run nsys profile
shell: bash
env:
CUDA_AWARE_MPI: ${{ inputs.cuda_aware_mpi }}
run: |
chmod +x .github/scripts/run-nsys-profile.sh
bash .github/scripts/run-nsys-profile.sh \