diff --git a/.github/ci-config.env b/.github/ci-config.env
index 05a4dee124..18d3ad5840 100644
--- a/.github/ci-config.env
+++ b/.github/ci-config.env
@@ -124,7 +124,7 @@ BFB_RUN_TIMEOUT=10
 
 # ── Nsight Systems CLI (profile-gpu-nsight workflow) ─
 # Bump NSYS_CLI_CACHE_VERSION to invalidate GitHub Actions cache of downloaded RPMs
 # when NVIDIA updates packages in the devtools repo.
-NSYS_CLI_CACHE_VERSION=1
+NSYS_CLI_CACHE_VERSION=2
 
 # ── KNOWN ISSUES ─────────────────────────────────
diff --git a/.github/scripts/pin-gpu.sh b/.github/scripts/pin-gpu.sh
new file mode 100644
index 0000000000..882b6d2f7f
--- /dev/null
+++ b/.github/scripts/pin-gpu.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Per-rank GPU pinning shim. Wrap your MPI binary like:
+#   mpirun -n N bash pin-gpu.sh ./atmosphere_model
+# Sets CUDA_VISIBLE_DEVICES to (local_rank % visible_gpu_count) per rank.
+# Does nothing when no GPUs are detected (lets the model run as-is).
+#
+# Override knobs:
+#   PIN_GPU_NGPU   force visible GPU count (skip detection)
+#   PIN_GPU_DEBUG  set to 1 for verbose detection output
+set -u
+
+LOCAL_RANK="${MPI_LOCALRANKID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${PMI_LOCAL_RANK:-${SLURM_LOCALID:-0}}}}"
+
+NGPU="${PIN_GPU_NGPU:-}"
+if [ -z "${NGPU}" ]; then
+  if command -v nvidia-smi >/dev/null 2>&1; then
+    NGPU=$(nvidia-smi -L 2>/dev/null | wc -l)
+  fi
+  if [ -z "${NGPU}" ] || [ "${NGPU}" -eq 0 ]; then
+    NGPU=$(ls /dev/nvidia[0-9]* 2>/dev/null | wc -l)
+  fi
+fi
+
+if [ -n "${NGPU}" ] && [ "${NGPU}" -gt 0 ]; then
+  GPU_ID=$((LOCAL_RANK % NGPU))
+  export CUDA_VISIBLE_DEVICES="${GPU_ID}"
+  echo "[pin-gpu] rank=${LOCAL_RANK} -> GPU ${GPU_ID} (of ${NGPU})" >&2
+else
+  echo "[pin-gpu] no GPUs detected; not pinning (rank=${LOCAL_RANK})" >&2
+fi
+
+exec "$@"
diff --git a/.github/scripts/run-nsys-profile.sh b/.github/scripts/run-nsys-profile.sh
index ce20625455..b73d61c732 100644
--- a/.github/scripts/run-nsys-profile.sh
+++ b/.github/scripts/run-nsys-profile.sh
@@ -49,24 +49,47 @@
 if [ "${MPI_IMPL}" = "openmpi" ]; then
   MPI_FLAGS="${OPENMPI_RUN_FLAGS:---allow-run-as-root --oversubscribe}"
 fi
+# Opt-in CUDA-aware MPI: passes device pointers to MPI without host staging.
+# Requires the container's MPI library to be built with GPU support; if it
+# isn't, MPI will either silently fall back to host staging or abort.
+CUDA_AWARE_MPI="${CUDA_AWARE_MPI:-false}"
+if [ "${CUDA_AWARE_MPI}" = "true" ]; then
+  case "${MPI_IMPL}" in
+    mpich)
+      export MPICH_GPU_SUPPORT_ENABLED=1
+      ;;
+    openmpi)
+      export OMPI_MCA_pml=ucx
+      export OMPI_MCA_osc=ucx
+      export UCX_TLS=cuda,cuda_copy,cuda_ipc,sm,self
+      MPI_FLAGS="${MPI_FLAGS} --mca pml ucx --mca osc ucx"
+      ;;
+  esac
+fi
+
 ulimit -s unlimited 2>/dev/null || true
 
 cd "${WORKDIR}"
 OUT_ABS="${PWD}/${NSYS_BASENAME}"
 
 echo "=== Nsight profile ==="
-echo "  workdir: ${WORKDIR}"
-echo "  ranks:   ${NUM_PROCS}"
-echo "  mpi:     ${MPI_IMPL}"
-echo "  output:  ${OUT_ABS}"
-echo "  timeout: ${TIMEOUT}m"
+echo "  workdir:        ${WORKDIR}"
+echo "  ranks:          ${NUM_PROCS}"
+echo "  mpi:            ${MPI_IMPL}"
+echo "  cuda-aware mpi: ${CUDA_AWARE_MPI}"
+echo "  output:         ${OUT_ABS}"
+echo "  timeout:        ${TIMEOUT}m"
+# Trace MPI alongside CUDA so halo exchanges show up in the timeline and we
+# can tell device-to-device transfers from host-staged ones.
+# pin-gpu.sh sets CUDA_VISIBLE_DEVICES per rank so multi-rank runs spread
+# across the node's GPUs instead of stacking on device 0.
 set +e
 timeout "${TIMEOUT}"m "${NSYS_BIN}" profile \
-  --trace=cuda,nvtx,osrt \
+  --trace=cuda,nvtx,osrt,mpi \
   --stats=true \
   -o "${OUT_ABS}" \
-  mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} ./atmosphere_model
+  mpirun -n "${NUM_PROCS}" ${MPI_FLAGS} bash "${SCRIPT_DIR}/pin-gpu.sh" ./atmosphere_model
 RUN_STATUS=$?
 set -e
 
diff --git a/.github/workflows/profile-gpu-nsight.yml b/.github/workflows/profile-gpu-nsight.yml
index 92ed3b7969..b95306bd36 100644
--- a/.github/workflows/profile-gpu-nsight.yml
+++ b/.github/workflows/profile-gpu-nsight.yml
@@ -37,6 +37,11 @@ on:
       required: false
       default: '40'
       type: string
+    cuda_aware_mpi:
+      description: End-to-end CUDA-aware MPI - sets config_gpu_aware_mpi=.true. in &development (MPAS hands device pointers to MPI) AND MPICH_GPU_SUPPORT_ENABLED=1 / OpenMPI UCX-CUDA flags. Requires the container's MPI to be built with GPU support.
+      required: false
+      default: false
+      type: boolean
 
 jobs:
   config:
@@ -99,9 +104,21 @@
     - name: GPU check
       shell: bash
      run: |
-        echo "## GPU" >> "$GITHUB_STEP_SUMMARY"
-        nvidia-smi || echo "::warning::nvidia-smi failed"
-        nvidia-smi || true
+        {
+          echo "## GPU diagnostics"
+          echo ""
+          echo "### /dev/nvidia*"
+          ls -la /dev/nvidia* 2>&1 || echo "(none — container not GPU-mapped?)"
+          echo ""
+          echo "### nvidia-smi -L (uses /dev/nvidia*, bypasses libnvidia-ml stub)"
+          nvidia-smi -L 2>&1 || echo "(nvidia-smi -L failed)"
+          echo ""
+          echo "### nvidia-smi --query-gpu (CSV)"
+          nvidia-smi --query-gpu=index,name,uuid,driver_version,memory.total --format=csv 2>&1 || echo "(query failed)"
+          echo ""
+          echo "### CUDA / OpenACC env"
+          env | grep -E '^(CUDA|ACC|NVIDIA|MPICH_GPU|OMPI|UCX)_' | sort || true
+        } | tee -a "$GITHUB_STEP_SUMMARY"
 
     - name: Setup Nsight Systems CLI (install + RPM cache)
       uses: ./.github/actions/setup-nsight-systems
@@ -127,8 +144,41 @@
         echo "config_run_duration -> ${DURATION} (config_dt left as in test case)"
         grep -E 'config_dt|config_run_duration' nsight-case/namelist.atmosphere | head -n 5 || true
 
+      # MPAS host-stages halos itself unless config_gpu_aware_mpi=.true. is set in
+      # &development. Without this, the MPI library never sees a device pointer and
+      # MPICH_GPU_SUPPORT_ENABLED has nothing to do.
+    - name: Patch namelist (config_gpu_aware_mpi = .true.)
+      if: ${{ inputs.cuda_aware_mpi }}
+      shell: bash
+      working-directory: nsight-case
+      run: |
+        python3 - <<'PY'
+        import re
+        from pathlib import Path
+        p = Path("namelist.atmosphere")
+        text = p.read_text()
+        if re.search(r"^\s*&development\b", text, flags=re.MULTILINE):
+            def edit(m):
+                block = m.group(0)
+                if re.search(r"config_gpu_aware_mpi\s*=", block):
+                    return re.sub(r"config_gpu_aware_mpi\s*=\s*\.\w+\.",
+                                  "config_gpu_aware_mpi = .true.", block)
+                return re.sub(r"(\n\s*/\s*)$",
+                              r"\n    config_gpu_aware_mpi = .true.\1",
+                              block, count=1)
+            text = re.sub(r"^\s*&development\b[\s\S]*?\n\s*/\s*",
+                          edit, text, count=1, flags=re.MULTILINE)
+        else:
+            text = text.rstrip() + "\n&development\n    config_gpu_aware_mpi = .true.\n/\n"
+        p.write_text(text)
+        PY
+        echo "&development block now reads:"
+        awk '/^[[:space:]]*&development/,/^[[:space:]]*\//' namelist.atmosphere
+
     - name: Run nsys profile
       shell: bash
+      env:
+        CUDA_AWARE_MPI: ${{ inputs.cuda_aware_mpi }}
       run: |
         chmod +x .github/scripts/run-nsys-profile.sh
         bash .github/scripts/run-nsys-profile.sh \