From 0c82808def910547d16e825ba2f35e81429191d3 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 16 Mar 2026 14:56:43 -0700 Subject: [PATCH 1/5] Add GLM-5 h200 sglang --- .github/configs/nvidia-master.yaml | 22 +++++++ benchmarks/single_node/glm5_fp8_h200.sh | 81 +++++++++++++++++++++++++ perf-changelog.yaml | 9 +++ 3 files changed, 112 insertions(+) create mode 100644 benchmarks/single_node/glm5_fp8_h200.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f967b78b5..7ee10b3dc 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2020,6 +2020,28 @@ qwen3.5-fp8-h200-sglang: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } +glm5-fp8-h200-sglang: + image: lmsysorg/sglang:glm5-hopper + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: h200 + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + dsr1-fp8-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh new file mode 100644 index 000000000..55ac06ec9 --- /dev/null +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +python3 -m sglang.launch_server \ + --model-path "$MODEL" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --tp-size "$TP" \ + --tool-call-parser glm47 \ + --reasoning-parser glm45 \ + --mem-fraction-static 0.85 \ + --served-model-name glm-5-fp8 \ + > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# If your --model is not on Hugging Face (e.g. glm-5-fp8), set BENCH_TOKENIZER +# to a local path or a public HF model id, e.g. export BENCH_TOKENIZER=THUDM/glm-4-9b-chat +TOKENIZER_ARGS="" +if [ -n "${BENCH_TOKENIZER:-}" ]; then + TOKENIZER_ARGS="--tokenizer $BENCH_TOKENIZER" +fi + +num_prompts=$((CONC * 5)) +SGLANG_URL="http://0.0.0.0:$PORT" + +python3 utils/bench_serving/benchmark_serving.py \ + --backend openai-chat \ + --base-url "$SGLANG_URL" \ + --endpoint /v1/chat/completions \ + --model glm-5-fp8 \ + $TOKENIZER_ARGS \ + --dataset-name random \ + --num-prompts "$num_prompts" \ + --random-input-len "$ISL" \ + --random-output-len "$OSL" \ + --random-range-ratio "${RANDOM_RANGE_RATIO:-0.8}" \ + --ignore-eos \ + --percentile-metrics ttft,tpot,itl,e2el \ + --max-concurrency "$CONC" \ + --save-result \ + --result-dir /workspace \ + --result-filename "$RESULT_FILENAME.json" + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2090a1b25..182391e77 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -961,3 +961,12 @@ description: - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 + +- config-keys: + - glm5-fp8-h200-sglang + description: + - "Add GLM-5 FP8 SGLang H200 single-node benchmark" + - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" + - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" + - "openai-chat backend, tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From 767a6e2199b5060b08f2ff6c5c18df4a41cb588f Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 16 Mar 2026 15:00:27 -0700 Subject: [PATCH 2/5] updating PR number --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 182391e77..8e4801eb6 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -969,4 +969,4 @@ - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - "openai-chat backend, tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 From 4de829f6b297d769c1d761f4ab932de111175331 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 16 Mar 2026 15:43:40 -0700 Subject: [PATCH 3/5] fix: benchmark client --- benchmarks/single_node/glm5_fp8_h200.sh | 38 ++++++++----------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh index 55ac06ec9..d464bf810 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -42,33 +42,19 @@ SERVER_PID=$! # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# If your --model is not on Hugging Face (e.g. glm-5-fp8), set BENCH_TOKENIZER -# to a local path or a public HF model id, e.g. export BENCH_TOKENIZER=THUDM/glm-4-9b-chat -TOKENIZER_ARGS="" -if [ -n "${BENCH_TOKENIZER:-}" ]; then - TOKENIZER_ARGS="--tokenizer $BENCH_TOKENIZER" -fi - -num_prompts=$((CONC * 5)) -SGLANG_URL="http://0.0.0.0:$PORT" - -python3 utils/bench_serving/benchmark_serving.py \ - --backend openai-chat \ - --base-url "$SGLANG_URL" \ - --endpoint /v1/chat/completions \ - --model glm-5-fp8 \ - $TOKENIZER_ARGS \ - --dataset-name random \ - --num-prompts "$num_prompts" \ - --random-input-len "$ISL" \ - --random-output-len "$OSL" \ - --random-range-ratio "${RANDOM_RANGE_RATIO:-0.8}" \ - --ignore-eos \ - --percentile-metrics ttft,tpot,itl,e2el \ +# Server is SGLang; benchmark client uses OpenAI-compatible (vllm) backend to talk to it +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( CONC * 10 )) \ --max-concurrency "$CONC" \ - --save-result \ - --result-dir /workspace \ - --result-filename "$RESULT_FILENAME.json" + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then From 8d7009e0c0ceeab877505500c989bb8fa8c81311 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 16 Mar 2026 16:04:48 -0700 Subject: [PATCH 4/5] fix eval --- benchmarks/single_node/glm5_fp8_h200.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh index d464bf810..9194bb870 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -35,6 +35,7 @@ python3 -m sglang.launch_server \ --reasoning-parser glm45 \ --mem-fraction-static 0.85 \ --served-model-name glm-5-fp8 \ + --trust-remote-code \ > "$SERVER_LOG" 2>&1 & SERVER_PID=$! @@ -42,7 +43,6 @@ SERVER_PID=$! # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# Server is SGLang; benchmark client uses OpenAI-compatible (vllm) backend to talk to it run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ @@ -57,7 +57,9 @@ run_benchmark_serving \ --trust-remote-code # After throughput, run evaluation only if RUN_EVAL is true +# Server accepts glm-5-fp8 (--served-model-name); lm-eval must use that model name if [ "${RUN_EVAL}" = "true" ]; then + export MODEL_NAME=glm-5-fp8 run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC append_lm_eval_summary fi From 534a6b182871e0c034135e8f8fc24a17d8776ffb Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 16 Mar 2026 21:05:42 -0700 Subject: [PATCH 5/5] fixing perf --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8e4801eb6..b9c02c483 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -968,5 +968,5 @@ - "Add GLM-5 FP8 SGLang H200 single-node benchmark" - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - - "openai-chat backend, tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" + - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914