From bdafcb39b37a3654d8bf541761e719200f62a8fd Mon Sep 17 00:00:00 2001 From: Cena Date: Mon, 27 Apr 2026 16:31:08 -0600 Subject: [PATCH 1/3] ci: add CUDA-aware MPI opt-in to GPU Nsight profiling The Nsight workflow always ran with host-staged MPI, even though the MPAS-A OpenACC build ships with `acc host_data use_device(...)` around halo exchanges. Without an explicit opt-in, MPI silently `cudaMemcpy`s device buffers through host memory, so the profile shows a lot of H<->D copy traffic and the MPI calls themselves are missing from the timeline (nsys was not tracing MPI either). Two changes: 1. Add `cuda_aware_mpi` workflow input (default false, preserves prior behaviour). When true, run-nsys-profile.sh sets the right env vars per MPI implementation: - MPICH: MPICH_GPU_SUPPORT_ENABLED=1 - OpenMPI: OMPI_MCA_pml=ucx, OMPI_MCA_osc=ucx, UCX_TLS includes cuda transports; also passes `--mca pml ucx --mca osc ucx` on the mpirun line. These do nothing useful unless the container's MPI is built with GPU support, but the failure mode is loud (MPI abort) rather than silent. 2. Add `mpi` to the nsys trace targets so halo exchanges show up in the timeline regardless of the cuda-aware setting. This lets us compare host-staged vs cuda-aware runs by dispatching the workflow twice. Made-with: Cursor --- .github/ci-config.env | 2 +- .github/scripts/run-nsys-profile.sh | 33 +++++++++++++++++++----- .github/workflows/profile-gpu-nsight.yml | 7 +++++ 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/.github/ci-config.env b/.github/ci-config.env index 05a4dee124..18d3ad5840 100644 --- a/.github/ci-config.env +++ b/.github/ci-config.env @@ -124,7 +124,7 @@ BFB_RUN_TIMEOUT=10 # ── Nsight Systems CLI (profile-gpu-nsight workflow) ─ # Bump NSYS_CLI_CACHE_VERSION to invalidate GitHub Actions cache of downloaded RPMs # when NVIDIA updates packages in the devtools repo. -NSYS_CLI_CACHE_VERSION=1 +NSYS_CLI_CACHE_VERSION=2 # ── KNOWN ISSUES ───────────────────────────────── diff --git a/.github/scripts/run-nsys-profile.sh b/.github/scripts/run-nsys-profile.sh index ce20625455..d459a16496 100644 --- a/.github/scripts/run-nsys-profile.sh +++ b/.github/scripts/run-nsys-profile.sh @@ -49,21 +49,42 @@ if [ "${MPI_IMPL}" = "openmpi" ]; then MPI_FLAGS="${OPENMPI_RUN_FLAGS:---allow-run-as-root --oversubscribe}" fi +# Opt-in CUDA-aware MPI: passes device pointers to MPI without host staging. +# Requires the container's MPI library to be built with GPU support; if it +# isn't, MPI will either silently fall back to host staging or abort. +CUDA_AWARE_MPI="${CUDA_AWARE_MPI:-false}" +if [ "${CUDA_AWARE_MPI}" = "true" ]; then + case "${MPI_IMPL}" in + mpich) + export MPICH_GPU_SUPPORT_ENABLED=1 + ;; + openmpi) + export OMPI_MCA_pml=ucx + export OMPI_MCA_osc=ucx + export UCX_TLS=cuda,cuda_copy,cuda_ipc,sm,self + MPI_FLAGS="${MPI_FLAGS} --mca pml ucx --mca osc ucx" + ;; + esac +fi + ulimit -s unlimited 2>/dev/null || true cd "${WORKDIR}" OUT_ABS="${PWD}/${NSYS_BASENAME}" echo "=== Nsight profile ===" -echo " workdir: ${WORKDIR}" -echo " ranks: ${NUM_PROCS}" -echo " mpi: ${MPI_IMPL}" -echo " output: ${OUT_ABS}" -echo " timeout: ${TIMEOUT}m" +echo " workdir: ${WORKDIR}" +echo " ranks: ${NUM_PROCS}" +echo " mpi: ${MPI_IMPL}" +echo " cuda-aware mpi: ${CUDA_AWARE_MPI}" +echo " output: ${OUT_ABS}" +echo " timeout: ${TIMEOUT}m" +# Trace MPI alongside CUDA so halo exchanges show up in the timeline and we +# can tell device-to-device transfers from host-staged ones. 
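# Optional sanity check (a sketch, not run by this script): whether the
# container's MPI was really built with GPU support can be probed up front,
# assuming the usual MPI introspection tools are present in the image:
#   ompi_info --parsable --all | grep mpi_built_with_cuda_support:value   # Open MPI
#   mpichversion | grep -i cuda                                           # MPICH built --with-cuda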
set +e timeout "${TIMEOUT}"m "${NSYS_BIN}" profile \ - --trace=cuda,nvtx,osrt \ + --trace=cuda,nvtx,osrt,mpi \ --stats=true \ -o "${OUT_ABS}" \ mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} ./atmosphere_model diff --git a/.github/workflows/profile-gpu-nsight.yml b/.github/workflows/profile-gpu-nsight.yml index 92ed3b7969..1f06149f6e 100644 --- a/.github/workflows/profile-gpu-nsight.yml +++ b/.github/workflows/profile-gpu-nsight.yml @@ -37,6 +37,11 @@ on: required: false default: '40' type: string + cuda_aware_mpi: + description: Enable CUDA-aware MPI (passes device pointers directly to MPI). Requires MPI in the container to be built with GPU support. + required: false + default: false + type: boolean jobs: config: @@ -129,6 +134,8 @@ jobs: - name: Run nsys profile shell: bash + env: + CUDA_AWARE_MPI: ${{ inputs.cuda_aware_mpi }} run: | chmod +x .github/scripts/run-nsys-profile.sh bash .github/scripts/run-nsys-profile.sh \ From 401fe554bd2b6b1e0b850effd9e7c2c16a554b9c Mon Sep 17 00:00:00 2001 From: Cena Date: Tue, 28 Apr 2026 04:04:05 -0600 Subject: [PATCH 2/3] ci: per-rank GPU pinning + richer GPU diagnostics for Nsight workflow Both motivated by debugging cuda-aware MPI on PR #47. Without per-rank pinning, all ranks default to GPU 0 on the CIRRUS-4x8-gpu node, which masks any cuda-aware MPI win because there is no cross-GPU traffic. - pin-gpu.sh: tiny shim that sets CUDA_VISIBLE_DEVICES to (local_rank % visible_gpu_count) using whichever local-rank var the MPI runtime exports (MPI_LOCALRANKID / OMPI_COMM_WORLD_LOCAL_RANK / PMI_LOCAL_RANK / SLURM_LOCALID). No-op if no GPUs detected, so safe when the container has no GPU mapping. Round-robin only; not yet wired into _test-gpu / _test-bfb (those can adopt it next pass). - run-nsys-profile.sh: launch the model through pin-gpu.sh inside mpirun so the pin happens per child process, not once for the whole job. Comment now flags why the wrapper is here. - profile-gpu-nsight.yml: replace the old single nvidia-smi probe (which fails inside containers shipping the GDK stub libnvidia-ml.so) with a structured diagnostic block that lists /dev/nvidia*, runs nvidia-smi -L and --query-gpu (both bypass libnvidia-ml), and dumps CUDA / OpenACC / MPI env vars. Goes into the workflow log and step summary so we can tell whether the runner exposed >1 GPU and what the model actually launched with. Made-with: Cursor --- .github/scripts/pin-gpu.sh | 32 ++++++++++++++++++++++++ .github/scripts/run-nsys-profile.sh | 4 ++- .github/workflows/profile-gpu-nsight.yml | 18 ++++++++++--- 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 .github/scripts/pin-gpu.sh diff --git a/.github/scripts/pin-gpu.sh b/.github/scripts/pin-gpu.sh new file mode 100644 index 0000000000..882b6d2f7f --- /dev/null +++ b/.github/scripts/pin-gpu.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Per-rank GPU pinning shim. Wrap your MPI binary like: +# mpirun -n N bash pin-gpu.sh ./atmosphere_model +# Sets CUDA_VISIBLE_DEVICES to (local_rank % visible_gpu_count) per rank. +# Does nothing when no GPUs are detected (lets the model run as-is). 
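# Example (assuming a node exposing 4 GPUs): local ranks 0-3 get
# CUDA_VISIBLE_DEVICES=0,1,2,3 respectively; ranks 4-7 wrap back to 0-3.
# Plain round-robin only -- no NUMA or NVLink affinity awareness.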
+# +# Override knobs: +# PIN_GPU_NGPU force visible GPU count (skip detection) +# PIN_GPU_DEBUG set to 1 for verbose detection output +set -u + +LOCAL_RANK="${MPI_LOCALRANKID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${PMI_LOCAL_RANK:-${SLURM_LOCALID:-0}}}}" + +NGPU="${PIN_GPU_NGPU:-}" +if [ -z "${NGPU}" ]; then + if command -v nvidia-smi >/dev/null 2>&1; then + NGPU=$(nvidia-smi -L 2>/dev/null | wc -l) + fi + if [ -z "${NGPU}" ] || [ "${NGPU}" -eq 0 ]; then + NGPU=$(ls /dev/nvidia[0-9]* 2>/dev/null | wc -l) + fi +fi + +if [ -n "${NGPU}" ] && [ "${NGPU}" -gt 0 ]; then + GPU_ID=$((LOCAL_RANK % NGPU)) + export CUDA_VISIBLE_DEVICES="${GPU_ID}" + echo "[pin-gpu] rank=${LOCAL_RANK} -> GPU ${GPU_ID} (of ${NGPU})" >&2 +else + echo "[pin-gpu] no GPUs detected; not pinning (rank=${LOCAL_RANK})" >&2 +fi + +exec "$@" diff --git a/.github/scripts/run-nsys-profile.sh b/.github/scripts/run-nsys-profile.sh index d459a16496..b73d61c732 100644 --- a/.github/scripts/run-nsys-profile.sh +++ b/.github/scripts/run-nsys-profile.sh @@ -82,12 +82,14 @@ echo " timeout: ${TIMEOUT}m" # Trace MPI alongside CUDA so halo exchanges show up in the timeline and we # can tell device-to-device transfers from host-staged ones. +# pin-gpu.sh sets CUDA_VISIBLE_DEVICES per rank so multi-rank runs spread +# across the node's GPUs instead of stacking on device 0. set +e timeout "${TIMEOUT}"m "${NSYS_BIN}" profile \ --trace=cuda,nvtx,osrt,mpi \ --stats=true \ -o "${OUT_ABS}" \ - mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} ./atmosphere_model + mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} bash "${SCRIPT_DIR}/pin-gpu.sh" ./atmosphere_model RUN_STATUS=$? set -e diff --git a/.github/workflows/profile-gpu-nsight.yml b/.github/workflows/profile-gpu-nsight.yml index 1f06149f6e..88bf622522 100644 --- a/.github/workflows/profile-gpu-nsight.yml +++ b/.github/workflows/profile-gpu-nsight.yml @@ -104,9 +104,21 @@ jobs: - name: GPU check shell: bash run: | - echo "## GPU" >> "$GITHUB_STEP_SUMMARY" - nvidia-smi || echo "::warning::nvidia-smi failed" - nvidia-smi || true + { + echo "## GPU diagnostics" + echo "" + echo "### /dev/nvidia*" + ls -la /dev/nvidia* 2>&1 || echo "(none — container not GPU-mapped?)" + echo "" + echo "### nvidia-smi -L (uses /dev/nvidia*, bypasses libnvidia-ml stub)" + nvidia-smi -L 2>&1 || echo "(nvidia-smi -L failed)" + echo "" + echo "### nvidia-smi --query-gpu (CSV)" + nvidia-smi --query-gpu=index,name,uuid,driver_version,memory.total --format=csv 2>&1 || echo "(query failed)" + echo "" + echo "### CUDA / OpenACC env" + env | grep -E '^(CUDA|ACC|NVIDIA|MPICH_GPU|OMPI|UCX)_' | sort || true + } | tee -a "$GITHUB_STEP_SUMMARY" - name: Setup Nsight Systems CLI (install + RPM cache) uses: ./.github/actions/setup-nsight-systems From c2bec761a7e54a03aa49c9327a051d72380c45a8 Mon Sep 17 00:00:00 2001 From: Cena Date: Tue, 28 Apr 2026 05:28:44 -0600 Subject: [PATCH 3/3] ci: also patch config_gpu_aware_mpi in namelist when cuda_aware_mpi=true The MPI-side env vars are only half the cuda-aware path. MPAS host-stages halo exchanges itself unless config_gpu_aware_mpi=.true. is set in &development; without it the MPI library never sees a device pointer and MPICH_GPU_SUPPORT_ENABLED has nothing to do. Make cuda_aware_mpi a single combined knob: when true, it now both sets the MPI env vars (existing run-nsys-profile.sh logic) and patches nsight-case/namelist.atmosphere so MPAS uses acc host_data use_device around halo sends. Idempotent: appends a &development block if missing, inserts the key if the block exists without it, or flips an existing .false. 
to .true. Made-with: Cursor --- .github/workflows/profile-gpu-nsight.yml | 33 +++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/.github/workflows/profile-gpu-nsight.yml b/.github/workflows/profile-gpu-nsight.yml index 88bf622522..b95306bd36 100644 --- a/.github/workflows/profile-gpu-nsight.yml +++ b/.github/workflows/profile-gpu-nsight.yml @@ -38,7 +38,7 @@ on: default: '40' type: string cuda_aware_mpi: - description: Enable CUDA-aware MPI (passes device pointers directly to MPI). Requires MPI in the container to be built with GPU support. + description: End-to-end CUDA-aware MPI - sets config_gpu_aware_mpi=.true. in &development (MPAS hands device pointers to MPI) AND MPICH_GPU_SUPPORT_ENABLED=1 / OpenMPI UCX-CUDA flags. Requires the container's MPI to be built with GPU support. required: false default: false type: boolean @@ -144,6 +144,37 @@ jobs: echo "config_run_duration -> ${DURATION} (config_dt left as in test case)" grep -E 'config_dt|config_run_duration' nsight-case/namelist.atmosphere | head -n 5 || true + # MPAS host-stages halos itself unless config_gpu_aware_mpi=.true. is set in + # &development. Without this, the MPI library never sees a device pointer and + # MPICH_GPU_SUPPORT_ENABLED has nothing to do. + - name: Patch namelist (config_gpu_aware_mpi = .true.) + if: ${{ inputs.cuda_aware_mpi }} + shell: bash + working-directory: nsight-case + run: | + python3 - <<'PY' + import re + from pathlib import Path + p = Path("namelist.atmosphere") + text = p.read_text() + if re.search(r"^\s*&development\b", text, flags=re.MULTILINE): + def edit(m): + block = m.group(0) + if re.search(r"config_gpu_aware_mpi\s*=", block): + return re.sub(r"config_gpu_aware_mpi\s*=\s*\.\w+\.", + "config_gpu_aware_mpi = .true.", block) + return re.sub(r"(\n\s*/\s*)$", + r"\n config_gpu_aware_mpi = .true.\1", + block, count=1) + text = re.sub(r"^\s*&development\b[\s\S]*?\n\s*/\s*", + edit, text, count=1, flags=re.MULTILINE) + else: + text = text.rstrip() + "\n&development\n config_gpu_aware_mpi = .true.\n/\n" + p.write_text(text) + PY + echo "&development block now reads:" + awk '/^[[:space:]]*&development/,/^[[:space:]]*\//' namelist.atmosphere + - name: Run nsys profile shell: bash env:
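
Dispatching the comparison described in PATCH 1/3 (host-staged vs cuda-aware)
comes down to running the workflow twice with the new input flipped. A minimal
sketch, assuming the `gh` CLI is authenticated for this repo and the workflow
file exists on the branch being dispatched; every other input keeps its default:

    gh workflow run profile-gpu-nsight.yml -f cuda_aware_mpi=false   # baseline: host-staged MPI
    gh workflow run profile-gpu-nsight.yml -f cuda_aware_mpi=true    # MPI env vars + config_gpu_aware_mpi=.true.

The two resulting profiles can then be compared side by side: H<->D memcpy
traffic around halo exchanges should drop in the cuda-aware run, provided the
container's MPI library really is GPU-capable.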