From bdafcb39b37a3654d8bf541761e719200f62a8fd Mon Sep 17 00:00:00 2001 From: Cena Date: Mon, 27 Apr 2026 16:31:08 -0600 Subject: [PATCH 1/3] ci: add CUDA-aware MPI opt-in to GPU Nsight profiling The Nsight workflow always ran with host-staged MPI, even though the MPAS-A OpenACC build ships with `acc host_data use_device(...)` around halo exchanges. Without an explicit opt-in, MPI silently `cudaMemcpy`s device buffers through host memory, so the profile shows a lot of H<->D copy traffic and the MPI calls themselves are missing from the timeline (nsys was not tracing MPI either). Two changes: 1. Add `cuda_aware_mpi` workflow input (default false, preserves prior behaviour). When true, run-nsys-profile.sh sets the right env vars per MPI implementation: - MPICH: MPICH_GPU_SUPPORT_ENABLED=1 - OpenMPI: OMPI_MCA_pml=ucx, OMPI_MCA_osc=ucx, UCX_TLS includes cuda transports; also passes `--mca pml ucx --mca osc ucx` on the mpirun line. These do nothing useful unless the container's MPI is built with GPU support, but the failure mode is loud (MPI abort) rather than silent. 2. Add `mpi` to the nsys trace targets so halo exchanges show up in the timeline regardless of the cuda-aware setting. This lets us compare host-staged vs cuda-aware runs by dispatching the workflow twice. Made-with: Cursor --- .github/ci-config.env | 2 +- .github/scripts/run-nsys-profile.sh | 33 +++++++++++++++++++----- .github/workflows/profile-gpu-nsight.yml | 7 +++++ 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/.github/ci-config.env b/.github/ci-config.env index 05a4dee124..18d3ad5840 100644 --- a/.github/ci-config.env +++ b/.github/ci-config.env @@ -124,7 +124,7 @@ BFB_RUN_TIMEOUT=10 # ── Nsight Systems CLI (profile-gpu-nsight workflow) ─ # Bump NSYS_CLI_CACHE_VERSION to invalidate GitHub Actions cache of downloaded RPMs # when NVIDIA updates packages in the devtools repo. -NSYS_CLI_CACHE_VERSION=1 +NSYS_CLI_CACHE_VERSION=2 # ── KNOWN ISSUES ───────────────────────────────── diff --git a/.github/scripts/run-nsys-profile.sh b/.github/scripts/run-nsys-profile.sh index ce20625455..d459a16496 100644 --- a/.github/scripts/run-nsys-profile.sh +++ b/.github/scripts/run-nsys-profile.sh @@ -49,21 +49,42 @@ if [ "${MPI_IMPL}" = "openmpi" ]; then MPI_FLAGS="${OPENMPI_RUN_FLAGS:---allow-run-as-root --oversubscribe}" fi +# Opt-in CUDA-aware MPI: passes device pointers to MPI without host staging. +# Requires the container's MPI library to be built with GPU support; if it +# isn't, MPI will either silently fall back to host staging or abort. +CUDA_AWARE_MPI="${CUDA_AWARE_MPI:-false}" +if [ "${CUDA_AWARE_MPI}" = "true" ]; then + case "${MPI_IMPL}" in + mpich) + export MPICH_GPU_SUPPORT_ENABLED=1 + ;; + openmpi) + export OMPI_MCA_pml=ucx + export OMPI_MCA_osc=ucx + export UCX_TLS=cuda,cuda_copy,cuda_ipc,sm,self + MPI_FLAGS="${MPI_FLAGS} --mca pml ucx --mca osc ucx" + ;; + esac +fi + ulimit -s unlimited 2>/dev/null || true cd "${WORKDIR}" OUT_ABS="${PWD}/${NSYS_BASENAME}" echo "=== Nsight profile ===" -echo " workdir: ${WORKDIR}" -echo " ranks: ${NUM_PROCS}" -echo " mpi: ${MPI_IMPL}" -echo " output: ${OUT_ABS}" -echo " timeout: ${TIMEOUT}m" +echo " workdir: ${WORKDIR}" +echo " ranks: ${NUM_PROCS}" +echo " mpi: ${MPI_IMPL}" +echo " cuda-aware mpi: ${CUDA_AWARE_MPI}" +echo " output: ${OUT_ABS}" +echo " timeout: ${TIMEOUT}m" +# Trace MPI alongside CUDA so halo exchanges show up in the timeline and we +# can tell device-to-device transfers from host-staged ones. 
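# Optional sanity check (a sketch, not run by this script): whether the
# container's MPI was really built with GPU support can be probed up front,
# assuming the usual MPI introspection tools are present in the image:
#   ompi_info --parsable --all | grep mpi_built_with_cuda_support:value   # Open MPI
#   mpichversion | grep -i cuda                                           # MPICH built --with-cuda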
set +e timeout "${TIMEOUT}"m "${NSYS_BIN}" profile \ - --trace=cuda,nvtx,osrt \ + --trace=cuda,nvtx,osrt,mpi \ --stats=true \ -o "${OUT_ABS}" \ mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} ./atmosphere_model diff --git a/.github/workflows/profile-gpu-nsight.yml b/.github/workflows/profile-gpu-nsight.yml index 92ed3b7969..1f06149f6e 100644 --- a/.github/workflows/profile-gpu-nsight.yml +++ b/.github/workflows/profile-gpu-nsight.yml @@ -37,6 +37,11 @@ on: required: false default: '40' type: string + cuda_aware_mpi: + description: Enable CUDA-aware MPI (passes device pointers directly to MPI). Requires MPI in the container to be built with GPU support. + required: false + default: false + type: boolean jobs: config: @@ -129,6 +134,8 @@ jobs: - name: Run nsys profile shell: bash + env: + CUDA_AWARE_MPI: ${{ inputs.cuda_aware_mpi }} run: | chmod +x .github/scripts/run-nsys-profile.sh bash .github/scripts/run-nsys-profile.sh \ From 401fe554bd2b6b1e0b850effd9e7c2c16a554b9c Mon Sep 17 00:00:00 2001 From: Cena Date: Tue, 28 Apr 2026 04:04:05 -0600 Subject: [PATCH 2/3] ci: per-rank GPU pinning + richer GPU diagnostics for Nsight workflow Both motivated by debugging cuda-aware MPI on PR #47. Without per-rank pinning, all ranks default to GPU 0 on the CIRRUS-4x8-gpu node, which masks any cuda-aware MPI win because there is no cross-GPU traffic. - pin-gpu.sh: tiny shim that sets CUDA_VISIBLE_DEVICES to (local_rank % visible_gpu_count) using whichever local-rank var the MPI runtime exports (MPI_LOCALRANKID / OMPI_COMM_WORLD_LOCAL_RANK / PMI_LOCAL_RANK / SLURM_LOCALID). No-op if no GPUs detected, so safe when the container has no GPU mapping. Round-robin only; not yet wired into _test-gpu / _test-bfb (those can adopt it next pass). - run-nsys-profile.sh: launch the model through pin-gpu.sh inside mpirun so the pin happens per child process, not once for the whole job. Comment now flags why the wrapper is here. - profile-gpu-nsight.yml: replace the old single nvidia-smi probe (which fails inside containers shipping the GDK stub libnvidia-ml.so) with a structured diagnostic block that lists /dev/nvidia*, runs nvidia-smi -L and --query-gpu (both bypass libnvidia-ml), and dumps CUDA / OpenACC / MPI env vars. Goes into the workflow log and step summary so we can tell whether the runner exposed >1 GPU and what the model actually launched with. Made-with: Cursor --- .github/scripts/pin-gpu.sh | 32 ++++++++++++++++++++++++ .github/scripts/run-nsys-profile.sh | 4 ++- .github/workflows/profile-gpu-nsight.yml | 18 ++++++++++--- 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 .github/scripts/pin-gpu.sh diff --git a/.github/scripts/pin-gpu.sh b/.github/scripts/pin-gpu.sh new file mode 100644 index 0000000000..882b6d2f7f --- /dev/null +++ b/.github/scripts/pin-gpu.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Per-rank GPU pinning shim. Wrap your MPI binary like: +# mpirun -n N bash pin-gpu.sh ./atmosphere_model +# Sets CUDA_VISIBLE_DEVICES to (local_rank % visible_gpu_count) per rank. +# Does nothing when no GPUs are detected (lets the model run as-is). 
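# Example (assuming a node exposing 4 GPUs): local ranks 0-3 get
# CUDA_VISIBLE_DEVICES=0,1,2,3 respectively; ranks 4-7 wrap back to 0-3.
# Plain round-robin only -- no NUMA or NVLink affinity awareness.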
+# +# Override knobs: +# PIN_GPU_NGPU force visible GPU count (skip detection) +# PIN_GPU_DEBUG set to 1 for verbose detection output +set -u + +LOCAL_RANK="${MPI_LOCALRANKID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${PMI_LOCAL_RANK:-${SLURM_LOCALID:-0}}}}" + +NGPU="${PIN_GPU_NGPU:-}" +if [ -z "${NGPU}" ]; then + if command -v nvidia-smi >/dev/null 2>&1; then + NGPU=$(nvidia-smi -L 2>/dev/null | wc -l) + fi + if [ -z "${NGPU}" ] || [ "${NGPU}" -eq 0 ]; then + NGPU=$(ls /dev/nvidia[0-9]* 2>/dev/null | wc -l) + fi +fi + +if [ -n "${NGPU}" ] && [ "${NGPU}" -gt 0 ]; then + GPU_ID=$((LOCAL_RANK % NGPU)) + export CUDA_VISIBLE_DEVICES="${GPU_ID}" + echo "[pin-gpu] rank=${LOCAL_RANK} -> GPU ${GPU_ID} (of ${NGPU})" >&2 +else + echo "[pin-gpu] no GPUs detected; not pinning (rank=${LOCAL_RANK})" >&2 +fi + +exec "$@" diff --git a/.github/scripts/run-nsys-profile.sh b/.github/scripts/run-nsys-profile.sh index d459a16496..b73d61c732 100644 --- a/.github/scripts/run-nsys-profile.sh +++ b/.github/scripts/run-nsys-profile.sh @@ -82,12 +82,14 @@ echo " timeout: ${TIMEOUT}m" # Trace MPI alongside CUDA so halo exchanges show up in the timeline and we # can tell device-to-device transfers from host-staged ones. +# pin-gpu.sh sets CUDA_VISIBLE_DEVICES per rank so multi-rank runs spread +# across the node's GPUs instead of stacking on device 0. set +e timeout "${TIMEOUT}"m "${NSYS_BIN}" profile \ --trace=cuda,nvtx,osrt,mpi \ --stats=true \ -o "${OUT_ABS}" \ - mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} ./atmosphere_model + mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} bash "${SCRIPT_DIR}/pin-gpu.sh" ./atmosphere_model RUN_STATUS=$? set -e diff --git a/.github/workflows/profile-gpu-nsight.yml b/.github/workflows/profile-gpu-nsight.yml index 1f06149f6e..88bf622522 100644 --- a/.github/workflows/profile-gpu-nsight.yml +++ b/.github/workflows/profile-gpu-nsight.yml @@ -104,9 +104,21 @@ jobs: - name: GPU check shell: bash run: | - echo "## GPU" >> "$GITHUB_STEP_SUMMARY" - nvidia-smi || echo "::warning::nvidia-smi failed" - nvidia-smi || true + { + echo "## GPU diagnostics" + echo "" + echo "### /dev/nvidia*" + ls -la /dev/nvidia* 2>&1 || echo "(none — container not GPU-mapped?)" + echo "" + echo "### nvidia-smi -L (uses /dev/nvidia*, bypasses libnvidia-ml stub)" + nvidia-smi -L 2>&1 || echo "(nvidia-smi -L failed)" + echo "" + echo "### nvidia-smi --query-gpu (CSV)" + nvidia-smi --query-gpu=index,name,uuid,driver_version,memory.total --format=csv 2>&1 || echo "(query failed)" + echo "" + echo "### CUDA / OpenACC env" + env | grep -E '^(CUDA|ACC|NVIDIA|MPICH_GPU|OMPI|UCX)_' | sort || true + } | tee -a "$GITHUB_STEP_SUMMARY" - name: Setup Nsight Systems CLI (install + RPM cache) uses: ./.github/actions/setup-nsight-systems From c2bec761a7e54a03aa49c9327a051d72380c45a8 Mon Sep 17 00:00:00 2001 From: Cena Date: Tue, 28 Apr 2026 05:28:44 -0600 Subject: [PATCH 3/3] ci: also patch config_gpu_aware_mpi in namelist when cuda_aware_mpi=true The MPI-side env vars are only half the cuda-aware path. MPAS host-stages halo exchanges itself unless config_gpu_aware_mpi=.true. is set in &development; without it the MPI library never sees a device pointer and MPICH_GPU_SUPPORT_ENABLED has nothing to do. Make cuda_aware_mpi a single combined knob: when true, it now both sets the MPI env vars (existing run-nsys-profile.sh logic) and patches nsight-case/namelist.atmosphere so MPAS uses acc host_data use_device around halo sends. Idempotent: appends a &development block if missing, inserts the key if the block exists without it, or flips an existing .false. 
to .true. Made-with: Cursor --- .github/workflows/profile-gpu-nsight.yml | 33 +++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/.github/workflows/profile-gpu-nsight.yml b/.github/workflows/profile-gpu-nsight.yml index 88bf622522..b95306bd36 100644 --- a/.github/workflows/profile-gpu-nsight.yml +++ b/.github/workflows/profile-gpu-nsight.yml @@ -38,7 +38,7 @@ on: default: '40' type: string cuda_aware_mpi: - description: Enable CUDA-aware MPI (passes device pointers directly to MPI). Requires MPI in the container to be built with GPU support. + description: End-to-end CUDA-aware MPI - sets config_gpu_aware_mpi=.true. in &development (MPAS hands device pointers to MPI) AND MPICH_GPU_SUPPORT_ENABLED=1 / OpenMPI UCX-CUDA flags. Requires the container's MPI to be built with GPU support. required: false default: false type: boolean @@ -144,6 +144,37 @@ jobs: echo "config_run_duration -> ${DURATION} (config_dt left as in test case)" grep -E 'config_dt|config_run_duration' nsight-case/namelist.atmosphere | head -n 5 || true + # MPAS host-stages halos itself unless config_gpu_aware_mpi=.true. is set in + # &development. Without this, the MPI library never sees a device pointer and + # MPICH_GPU_SUPPORT_ENABLED has nothing to do. + - name: Patch namelist (config_gpu_aware_mpi = .true.) + if: ${{ inputs.cuda_aware_mpi }} + shell: bash + working-directory: nsight-case + run: | + python3 - <<'PY' + import re + from pathlib import Path + p = Path("namelist.atmosphere") + text = p.read_text() + if re.search(r"^\s*&development\b", text, flags=re.MULTILINE): + def edit(m): + block = m.group(0) + if re.search(r"config_gpu_aware_mpi\s*=", block): + return re.sub(r"config_gpu_aware_mpi\s*=\s*\.\w+\.", + "config_gpu_aware_mpi = .true.", block) + return re.sub(r"(\n\s*/\s*)$", + r"\n config_gpu_aware_mpi = .true.\1", + block, count=1) + text = re.sub(r"^\s*&development\b[\s\S]*?\n\s*/\s*", + edit, text, count=1, flags=re.MULTILINE) + else: + text = text.rstrip() + "\n&development\n config_gpu_aware_mpi = .true.\n/\n" + p.write_text(text) + PY + echo "&development block now reads:" + awk '/^[[:space:]]*&development/,/^[[:space:]]*\//' namelist.atmosphere + - name: Run nsys profile shell: bash env:
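
Dispatching the comparison described in PATCH 1/3 (host-staged vs cuda-aware)
comes down to running the workflow twice with the new input flipped. A minimal
sketch, assuming the `gh` CLI is authenticated for this repo and the workflow
file exists on the branch being dispatched; every other input keeps its default:

    gh workflow run profile-gpu-nsight.yml -f cuda_aware_mpi=false   # baseline: host-staged MPI
    gh workflow run profile-gpu-nsight.yml -f cuda_aware_mpi=true    # MPI env vars + config_gpu_aware_mpi=.true.

The two resulting profiles can then be compared side by side: H<->D memcpy
traffic around halo exchanges should drop in the cuda-aware run, provided the
container's MPI library really is GPU-capable.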