From c28ce2093a0c51e29f0896c552b3dd8c3c2f2336 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 10 Mar 2026 11:51:51 -0500 Subject: [PATCH 1/4] Revert "Revert "[NVIDIA] Update NVIDIA GPT-OSS vLLM image from v0.15.1 to v0.16.0 (#800)" (#903) [skip-sweep]" This reverts commit cad1169b9a1467e17530a1a50aceb1f60ab14c51. --- .github/configs/nvidia-master.yaml | 6 +++--- perf-changelog.yaml | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fc837704c..b58f3780e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3076,7 +3076,7 @@ gptoss-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 4} gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.16.0-cu130 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -3107,7 +3107,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.16.0-cu130 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -3386,7 +3386,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.16.0-cu130 model: openai/gpt-oss-120b model-prefix: gptoss runner: h200 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c19ddbd1a..7082c552f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -932,3 +932,10 @@ - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 +- config-keys: + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm + description: + - "Update vLLM image from v0.15.1 to v0.16.0-cu130 for NVIDIA GPT-OSS configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/800 From 1e991b00c5c860877883b2ab8226e0ccacc555e9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 10 Mar 2026 11:59:01 -0500 Subject: [PATCH 2/4] Fix H200 DGXC slurm GRES spec to include GPU type The H200 cluster nodes register GPUs as gpu:nvidia_h200, not generic gpu, so salloc requires the full type in --gres. Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_h200-dgxc-slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..3cc777dd8 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -234,7 +234,7 @@ else DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g') LOCK_FILE="${SQUASH_FILE}.lock" - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:nvidia_h200:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) # Use flock to serialize concurrent imports to the same squash file From 6c9a3b6863979e35679b2a419c17aeb80a7b4111 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 10 Mar 2026 12:00:13 -0500 Subject: [PATCH 3/4] Revert "Fix H200 DGXC slurm GRES spec to include GPU type" This reverts commit 1e991b00c5c860877883b2ab8226e0ccacc555e9. --- runners/launch_h200-dgxc-slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 3cc777dd8..9b3b771a5 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -234,7 +234,7 @@ else DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g') LOCK_FILE="${SQUASH_FILE}.lock" - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:nvidia_h200:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) # Use flock to serialize concurrent imports to the same squash file From 7bc772c71c7b72efe63c51bb96f75444e11f1028 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 11 Mar 2026 16:38:45 -0400 Subject: [PATCH 4/4] Add GPU monitoring to gptoss vLLM benchmark scripts Add start_gpu_monitor and stop_gpu_monitor calls to gptoss_fp4_b200.sh, gptoss_fp4_h100.sh, and gptoss_fp4_h200.sh. Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/gptoss_fp4_b200.sh | 6 ++++++ benchmarks/single_node/gptoss_fp4_h100.sh | 6 ++++++ benchmarks/single_node/gptoss_fp4_h200.sh | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index 1d9e727ce..46fccca6a 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -45,6 +45,9 @@ export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ @@ -77,4 +80,7 @@ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC append_lm_eval_summary fi + +# Stop GPU monitoring +stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index 7d9cec06f..314ec43c9 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -29,6 +29,9 @@ export VLLM_MXFP4_USE_MARLIN=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ @@ -61,4 +64,7 @@ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC append_lm_eval_summary fi + +# Stop GPU monitoring +stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index 3d945df42..251294a62 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -17,6 +17,9 @@ fi hf download "$MODEL" +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + set -x pip install datasets pandas @@ -72,4 +75,7 @@ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC append_lm_eval_summary fi + +# Stop GPU monitoring +stop_gpu_monitor set +x