From 0c82808def910547d16e825ba2f35e81429191d3 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 16 Mar 2026 14:56:43 -0700
Subject: [PATCH 1/5] Add GLM-5 FP8 H200 SGLang single-node benchmark

---
 .github/configs/nvidia-master.yaml      | 22 +++++++
 benchmarks/single_node/glm5_fp8_h200.sh | 81 +++++++++++++++++++++++++
 perf-changelog.yaml                     |  9 +++
 3 files changed, 112 insertions(+)
 create mode 100644 benchmarks/single_node/glm5_fp8_h200.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f967b78b5..7ee10b3dc 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2020,6 +2020,28 @@ qwen3.5-fp8-h200-sglang:
     search-space:
     - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
+glm5-fp8-h200-sglang:
+  image: lmsysorg/sglang:glm5-hopper
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: h200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64 }
+  - isl: 1024
+    osl: 8192
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64 }
+
 dsr1-fp8-h200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
   model: deepseek-ai/DeepSeek-R1-0528
diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh
new file mode 100644
index 000000000..55ac06ec9
--- /dev/null
+++ b/benchmarks/single_node/glm5_fp8_h200.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+set -x
+python3 -m sglang.launch_server \
+  --model-path "$MODEL" \
+  --host 0.0.0.0 \
+  --port "$PORT" \
+  --tp-size "$TP" \
+  --tool-call-parser glm47 \
+  --reasoning-parser glm45 \
+  --mem-fraction-static 0.85 \
+  --served-model-name glm-5-fp8 \
+  > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# If your --model is not on Hugging Face (e.g. glm-5-fp8), set BENCH_TOKENIZER
+# to a local path or a public HF model id, e.g. export BENCH_TOKENIZER=THUDM/glm-4-9b-chat
+TOKENIZER_ARGS=""
+if [ -n "${BENCH_TOKENIZER:-}" ]; then
+  TOKENIZER_ARGS="--tokenizer $BENCH_TOKENIZER"
+fi
+
+num_prompts=$((CONC * 5))
+SGLANG_URL="http://0.0.0.0:$PORT"
+
+python3 utils/bench_serving/benchmark_serving.py \
+    --backend openai-chat \
+    --base-url "$SGLANG_URL" \
+    --endpoint /v1/chat/completions \
+    --model glm-5-fp8 \
+    $TOKENIZER_ARGS \
+    --dataset-name random \
+    --num-prompts "$num_prompts" \
+    --random-input-len "$ISL" \
+    --random-output-len "$OSL" \
+    --random-range-ratio "${RANDOM_RANGE_RATIO:-0.8}" \
+    --ignore-eos \
+    --percentile-metrics ttft,tpot,itl,e2el \
+    --max-concurrency "$CONC" \
+    --save-result \
+    --result-dir /workspace \
+    --result-filename "$RESULT_FILENAME.json"
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2090a1b25..182391e77 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -961,3 +961,12 @@
   description:
     - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869
+
+- config-keys:
+    - glm5-fp8-h200-sglang
+  description:
+    - "Add GLM-5 FP8 SGLang H200 single-node benchmark"
+    - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper"
+    - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
+    - "openai-chat backend, tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX

From 767a6e2199b5060b08f2ff6c5c18df4a41cb588f Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 16 Mar 2026 15:00:27 -0700
Subject: [PATCH 2/5] updating PR number

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 182391e77..8e4801eb6 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -969,4 +969,4 @@
     - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper"
     - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
     - "openai-chat backend, tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914

From 4de829f6b297d769c1d761f4ab932de111175331 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 16 Mar 2026 15:43:40 -0700
Subject: [PATCH 3/5] fix: switch benchmark client to run_benchmark_serving helper

---
 benchmarks/single_node/glm5_fp8_h200.sh | 38 ++++++++-----------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh
index 55ac06ec9..d464bf810 100644
--- a/benchmarks/single_node/glm5_fp8_h200.sh
+++ b/benchmarks/single_node/glm5_fp8_h200.sh
@@ -42,33 +42,19 @@ SERVER_PID=$!
 # Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# If your --model is not on Hugging Face (e.g. glm-5-fp8), set BENCH_TOKENIZER
-# to a local path or a public HF model id, e.g. export BENCH_TOKENIZER=THUDM/glm-4-9b-chat
-TOKENIZER_ARGS=""
-if [ -n "${BENCH_TOKENIZER:-}" ]; then
-  TOKENIZER_ARGS="--tokenizer $BENCH_TOKENIZER"
-fi
-
-num_prompts=$((CONC * 5))
-SGLANG_URL="http://0.0.0.0:$PORT"
-
-python3 utils/bench_serving/benchmark_serving.py \
-    --backend openai-chat \
-    --base-url "$SGLANG_URL" \
-    --endpoint /v1/chat/completions \
-    --model glm-5-fp8 \
-    $TOKENIZER_ARGS \
-    --dataset-name random \
-    --num-prompts "$num_prompts" \
-    --random-input-len "$ISL" \
-    --random-output-len "$OSL" \
-    --random-range-ratio "${RANDOM_RANGE_RATIO:-0.8}" \
-    --ignore-eos \
-    --percentile-metrics ttft,tpot,itl,e2el \
+# Server is SGLang; benchmark client uses OpenAI-compatible (vllm) backend to talk to it
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( CONC * 10 )) \
     --max-concurrency "$CONC" \
-    --save-result \
-    --result-dir /workspace \
-    --result-filename "$RESULT_FILENAME.json"
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then

From 8d7009e0c0ceeab877505500c989bb8fa8c81311 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 16 Mar 2026 16:04:48 -0700
Subject: [PATCH 4/5] fix eval: add --trust-remote-code to server, use served model name for lm-eval

---
 benchmarks/single_node/glm5_fp8_h200.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh
index d464bf810..9194bb870 100644
--- a/benchmarks/single_node/glm5_fp8_h200.sh
+++ b/benchmarks/single_node/glm5_fp8_h200.sh
@@ -35,6 +35,7 @@ python3 -m sglang.launch_server \
   --reasoning-parser glm45 \
   --mem-fraction-static 0.85 \
   --served-model-name glm-5-fp8 \
+  --trust-remote-code \
   > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
@@ -42,7 +43,6 @@ SERVER_PID=$!
 # Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# Server is SGLang; benchmark client uses OpenAI-compatible (vllm) backend to talk to it
 run_benchmark_serving \
     --model "$MODEL" \
     --port "$PORT" \
@@ -57,7 +57,9 @@ run_benchmark_serving \
     --trust-remote-code
 
 # After throughput, run evaluation only if RUN_EVAL is true
+# Server accepts glm-5-fp8 (--served-model-name); lm-eval must use that model name
 if [ "${RUN_EVAL}" = "true" ]; then
+    export MODEL_NAME=glm-5-fp8
     run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi

From 534a6b182871e0c034135e8f8fc24a17d8776ffb Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 16 Mar 2026 21:05:42 -0700
Subject: [PATCH 5/5] Fix perf-changelog description (drop stale openai-chat backend note)

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8e4801eb6..b9c02c483 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -968,5 +968,5 @@
     - "Add GLM-5 FP8 SGLang H200 single-node benchmark"
     - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper"
     - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
-    - "openai-chat backend, tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
+    - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914