diff --git a/.github/ci-config.env b/.github/ci-config.env
index 05a4dee124..18d3ad5840 100644
--- a/.github/ci-config.env
+++ b/.github/ci-config.env
@@ -124,7 +124,7 @@ BFB_RUN_TIMEOUT=10
 
 # ── Nsight Systems CLI (profile-gpu-nsight workflow) ─
 # Bump NSYS_CLI_CACHE_VERSION to invalidate GitHub Actions cache of downloaded RPMs
 # when NVIDIA updates packages in the devtools repo.
-NSYS_CLI_CACHE_VERSION=1
+NSYS_CLI_CACHE_VERSION=2
 
 # ── KNOWN ISSUES ─────────────────────────────────
diff --git a/.github/scripts/pin-gpu.sh b/.github/scripts/pin-gpu.sh
new file mode 100644
index 0000000000..882b6d2f7f
--- /dev/null
+++ b/.github/scripts/pin-gpu.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Per-rank GPU pinning shim. Wrap your MPI binary like:
+#   mpirun -n N bash pin-gpu.sh ./atmosphere_model
+# Sets CUDA_VISIBLE_DEVICES to (local_rank % visible_gpu_count) per rank.
+# Does nothing when no GPUs are detected (lets the model run as-is).
+#
+# Override knobs:
+#   PIN_GPU_NGPU   force visible GPU count (skip detection)
+#   PIN_GPU_DEBUG  set to 1 for verbose detection output
+set -u
+
+LOCAL_RANK="${MPI_LOCALRANKID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${PMI_LOCAL_RANK:-${SLURM_LOCALID:-0}}}}"
+
+NGPU="${PIN_GPU_NGPU:-}"
+if [ -z "${NGPU}" ]; then
+  if command -v nvidia-smi >/dev/null 2>&1; then
+    NGPU=$(nvidia-smi -L 2>/dev/null | wc -l)
+  fi
+  if [ -z "${NGPU}" ] || [ "${NGPU}" -eq 0 ]; then
+    NGPU=$(ls /dev/nvidia[0-9]* 2>/dev/null | wc -l)
+  fi
+fi
+
+if [ -n "${NGPU}" ] && [ "${NGPU}" -gt 0 ]; then
+  GPU_ID=$((LOCAL_RANK % NGPU))
+  export CUDA_VISIBLE_DEVICES="${GPU_ID}"
+  echo "[pin-gpu] rank=${LOCAL_RANK} -> GPU ${GPU_ID} (of ${NGPU})" >&2
+else
+  echo "[pin-gpu] no GPUs detected; not pinning (rank=${LOCAL_RANK})" >&2
+fi
+
+exec "$@"
diff --git a/.github/scripts/run-nsys-profile.sh b/.github/scripts/run-nsys-profile.sh
index ce20625455..b73d61c732 100644
--- a/.github/scripts/run-nsys-profile.sh
+++ b/.github/scripts/run-nsys-profile.sh
@@ -49,24 +49,47 @@
 if [ "${MPI_IMPL}" = "openmpi" ]; then
   MPI_FLAGS="${OPENMPI_RUN_FLAGS:---allow-run-as-root --oversubscribe}"
 fi
+# Opt-in CUDA-aware MPI: passes device pointers to MPI without host staging.
+# Requires the container's MPI library to be built with GPU support; if it
+# isn't, MPI will either silently fall back to host staging or abort.
+CUDA_AWARE_MPI="${CUDA_AWARE_MPI:-false}"
+if [ "${CUDA_AWARE_MPI}" = "true" ]; then
+  case "${MPI_IMPL}" in
+    mpich)
+      export MPICH_GPU_SUPPORT_ENABLED=1
+      ;;
+    openmpi)
+      export OMPI_MCA_pml=ucx
+      export OMPI_MCA_osc=ucx
+      export UCX_TLS=cuda,cuda_copy,cuda_ipc,sm,self
+      MPI_FLAGS="${MPI_FLAGS} --mca pml ucx --mca osc ucx"
+      ;;
+  esac
+fi
+
 ulimit -s unlimited 2>/dev/null || true
 
 cd "${WORKDIR}"
 OUT_ABS="${PWD}/${NSYS_BASENAME}"
 
 echo "=== Nsight profile ==="
-echo "  workdir: ${WORKDIR}"
-echo "  ranks:   ${NUM_PROCS}"
-echo "  mpi:     ${MPI_IMPL}"
-echo "  output:  ${OUT_ABS}"
-echo "  timeout: ${TIMEOUT}m"
+echo "  workdir:        ${WORKDIR}"
+echo "  ranks:          ${NUM_PROCS}"
+echo "  mpi:            ${MPI_IMPL}"
+echo "  cuda-aware mpi: ${CUDA_AWARE_MPI}"
+echo "  output:         ${OUT_ABS}"
+echo "  timeout:        ${TIMEOUT}m"
+# Trace MPI alongside CUDA so halo exchanges show up in the timeline and we
+# can tell device-to-device transfers from host-staged ones.
+# pin-gpu.sh sets CUDA_VISIBLE_DEVICES per rank so multi-rank runs spread
+# across the node's GPUs instead of stacking on device 0.
 set +e
 timeout "${TIMEOUT}"m "${NSYS_BIN}" profile \
-  --trace=cuda,nvtx,osrt \
+  --trace=cuda,nvtx,osrt,mpi \
   --stats=true \
   -o "${OUT_ABS}" \
-  mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} ./atmosphere_model
+  mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} bash "${SCRIPT_DIR}/pin-gpu.sh" ./atmosphere_model
 RUN_STATUS=$?
 set -e
 
diff --git a/.github/workflows/profile-gpu-nsight.yml b/.github/workflows/profile-gpu-nsight.yml
index 92ed3b7969..b95306bd36 100644
--- a/.github/workflows/profile-gpu-nsight.yml
+++ b/.github/workflows/profile-gpu-nsight.yml
@@ -37,6 +37,11 @@ on:
       required: false
       default: '40'
       type: string
+    cuda_aware_mpi:
+      description: End-to-end CUDA-aware MPI - sets config_gpu_aware_mpi=.true. in &development (MPAS hands device pointers to MPI) AND MPICH_GPU_SUPPORT_ENABLED=1 / OpenMPI UCX-CUDA flags. Requires the container's MPI to be built with GPU support.
+      required: false
+      default: false
+      type: boolean
 
 jobs:
   config:
@@ -99,9 +104,21 @@
     - name: GPU check
       shell: bash
      run: |
-        echo "## GPU" >> "$GITHUB_STEP_SUMMARY"
-        nvidia-smi || echo "::warning::nvidia-smi failed"
-        nvidia-smi || true
+        {
+          echo "## GPU diagnostics"
+          echo ""
+          echo "### /dev/nvidia*"
+          ls -la /dev/nvidia* 2>&1 || echo "(none — container not GPU-mapped?)"
+          echo ""
+          echo "### nvidia-smi -L (uses /dev/nvidia*, bypasses libnvidia-ml stub)"
+          nvidia-smi -L 2>&1 || echo "(nvidia-smi -L failed)"
+          echo ""
+          echo "### nvidia-smi --query-gpu (CSV)"
+          nvidia-smi --query-gpu=index,name,uuid,driver_version,memory.total --format=csv 2>&1 || echo "(query failed)"
+          echo ""
+          echo "### CUDA / OpenACC env"
+          env | grep -E '^(CUDA|ACC|NVIDIA|MPICH_GPU|OMPI|UCX)_' | sort || true
+        } | tee -a "$GITHUB_STEP_SUMMARY"
 
     - name: Setup Nsight Systems CLI (install + RPM cache)
       uses: ./.github/actions/setup-nsight-systems
@@ -127,8 +144,41 @@
         echo "config_run_duration -> ${DURATION} (config_dt left as in test case)"
         grep -E 'config_dt|config_run_duration' nsight-case/namelist.atmosphere | head -n 5 || true
 
+      # MPAS host-stages halos itself unless config_gpu_aware_mpi=.true. is set in
+      # &development. Without this, the MPI library never sees a device pointer and
+      # MPICH_GPU_SUPPORT_ENABLED has nothing to do.
+    - name: Patch namelist (config_gpu_aware_mpi = .true.)
+      if: ${{ inputs.cuda_aware_mpi }}
+      shell: bash
+      working-directory: nsight-case
+      run: |
+        python3 - <<'PY'
+        import re
+        from pathlib import Path
+        p = Path("namelist.atmosphere")
+        text = p.read_text()
+        if re.search(r"^\s*&development\b", text, flags=re.MULTILINE):
+            def edit(m):
+                block = m.group(0)
+                if re.search(r"config_gpu_aware_mpi\s*=", block):
+                    return re.sub(r"config_gpu_aware_mpi\s*=\s*\.\w+\.",
+                                  "config_gpu_aware_mpi = .true.", block)
+                return re.sub(r"(\n\s*/\s*)$",
+                              r"\n    config_gpu_aware_mpi = .true.\1",
+                              block, count=1)
+            text = re.sub(r"^\s*&development\b[\s\S]*?\n\s*/\s*",
+                          edit, text, count=1, flags=re.MULTILINE)
+        else:
+            text = text.rstrip() + "\n&development\n    config_gpu_aware_mpi = .true.\n/\n"
+        p.write_text(text)
+        PY
+        echo "&development block now reads:"
+        awk '/^[[:space:]]*&development/,/^[[:space:]]*\//' namelist.atmosphere
+
     - name: Run nsys profile
       shell: bash
+      env:
+        CUDA_AWARE_MPI: ${{ inputs.cuda_aware_mpi }}
       run: |
         chmod +x .github/scripts/run-nsys-profile.sh
         bash .github/scripts/run-nsys-profile.sh \