From c28ce2093a0c51e29f0896c552b3dd8c3c2f2336 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 10 Mar 2026 11:51:51 -0500
Subject: [PATCH 1/4] Revert "Revert "[NVIDIA] Update NVIDIA GPT-OSS vLLM image
 from v0.15.1 to v0.16.0 (#800)" (#903) [skip-sweep]"

This reverts commit cad1169b9a1467e17530a1a50aceb1f60ab14c51.
---
 .github/configs/nvidia-master.yaml | 6 +++---
 perf-changelog.yaml                | 7 +++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index fc837704c..b58f3780e 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -3076,7 +3076,7 @@ gptoss-fp4-b200-trt:
     - { tp: 8, conc-start:   4, conc-end:   4}
 
 gptoss-fp4-b200-vllm:
-  image: vllm/vllm-openai:v0.15.1
+  image: vllm/vllm-openai:v0.16.0-cu130
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: b200
@@ -3107,7 +3107,7 @@ gptoss-fp4-b200-vllm:
     - { tp: 8, conc-start: 4, conc-end: 4 }
 
 gptoss-fp4-h100-vllm:
-  image: vllm/vllm-openai:v0.15.1
+  image: vllm/vllm-openai:v0.16.0-cu130
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: h100
@@ -3386,7 +3386,7 @@ gptoss-fp4-h200-trt:
     - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 }
 
 gptoss-fp4-h200-vllm:
-  image: vllm/vllm-openai:v0.15.1
+  image: vllm/vllm-openai:v0.16.0-cu130
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: h200
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c19ddbd1a..7082c552f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -932,3 +932,10 @@
     - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867
   
+- config-keys:
+    - gptoss-fp4-b200-vllm
+    - gptoss-fp4-h100-vllm
+    - gptoss-fp4-h200-vllm
+  description:
+    - "Update vLLM image from v0.15.1 to v0.16.0-cu130 for NVIDIA GPT-OSS configs"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/800

From 1e991b00c5c860877883b2ab8226e0ccacc555e9 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 10 Mar 2026 11:59:01 -0500
Subject: [PATCH 2/4] Fix H200 DGXC slurm GRES spec to include GPU type

The H200 cluster nodes register their GPUs as gpu:nvidia_h200 rather
than as a generic gpu, so salloc requires the full type in --gres.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 runners/launch_h200-dgxc-slurm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 9b3b771a5..3cc777dd8 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -234,7 +234,7 @@ else
     DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
-    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
+    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:nvidia_h200:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
     # Use flock to serialize concurrent imports to the same squash file

From 6c9a3b6863979e35679b2a419c17aeb80a7b4111 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 10 Mar 2026 12:00:13 -0500
Subject: [PATCH 3/4] Revert "Fix H200 DGXC slurm GRES spec to include GPU
 type"

This reverts commit 1e991b00c5c860877883b2ab8226e0ccacc555e9.
---
 runners/launch_h200-dgxc-slurm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 3cc777dd8..9b3b771a5 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -234,7 +234,7 @@ else
     DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
-    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:nvidia_h200:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
+    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
     # Use flock to serialize concurrent imports to the same squash file

From 7bc772c71c7b72efe63c51bb96f75444e11f1028 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Wed, 11 Mar 2026 16:38:45 -0400
Subject: [PATCH 4/4] Add GPU monitoring to gptoss vLLM benchmark scripts

Add start_gpu_monitor and stop_gpu_monitor calls to
gptoss_fp4_b200.sh, gptoss_fp4_h100.sh, and gptoss_fp4_h200.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/gptoss_fp4_b200.sh | 6 ++++++
 benchmarks/single_node/gptoss_fp4_h100.sh | 6 ++++++
 benchmarks/single_node/gptoss_fp4_h200.sh | 6 ++++++
 3 files changed, 18 insertions(+)

diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh
index 1d9e727ce..46fccca6a 100644
--- a/benchmarks/single_node/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/gptoss_fp4_b200.sh
@@ -45,6 +45,9 @@ export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
 set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --config config.yaml \
@@ -77,4 +80,7 @@ if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
 set +x
diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh
index 7d9cec06f..314ec43c9 100644
--- a/benchmarks/single_node/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/gptoss_fp4_h100.sh
@@ -29,6 +29,9 @@ export VLLM_MXFP4_USE_MARLIN=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
 set -x
 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
 --config config.yaml \
@@ -61,4 +64,7 @@ if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
 set +x
diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh
index 3d945df42..251294a62 100644
--- a/benchmarks/single_node/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/gptoss_fp4_h200.sh
@@ -17,6 +17,9 @@ fi
 
 hf download "$MODEL"
 
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
 set -x
 pip install datasets pandas
 
@@ -72,4 +75,7 @@ if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
 set +x