diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 2ef06d65d..1dbc62841 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2020,6 +2020,28 @@ qwen3.5-fp8-h200-sglang:
       search-space:
         - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
+glm5-fp8-h200-sglang:
+  image: lmsysorg/sglang:glm5-hopper
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: h200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, conc-start: 4, conc-end: 64 }
+
 dsr1-fp8-h200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
   model: deepseek-ai/DeepSeek-R1-0528
diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh
new file mode 100644
index 000000000..9194bb870
--- /dev/null
+++ b/benchmarks/single_node/glm5_fp8_h200.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# Single-node GLM-5 FP8 throughput benchmark on H200 using SGLang.
+# Required env vars are validated by check_env_vars below. Optional:
+#   PORT (default 8888), RUN_EVAL ("true" runs lm-eval after throughput).
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+  MODEL \
+  TP \
+  CONC \
+  ISL \
+  OSL \
+  RANDOM_RANGE_RATIO \
+  RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+set -x
+python3 -m sglang.launch_server \
+  --model-path "$MODEL" \
+  --host 0.0.0.0 \
+  --port "$PORT" \
+  --tp-size "$TP" \
+  --tool-call-parser glm47 \
+  --reasoning-parser glm45 \
+  --mem-fraction-static 0.85 \
+  --served-model-name glm-5-fp8 \
+  --trust-remote-code \
+  > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+  --model "$MODEL" \
+  --port "$PORT" \
+  --backend vllm \
+  --input-len "$ISL" \
+  --output-len "$OSL" \
+  --random-range-ratio "$RANDOM_RANGE_RATIO" \
+  --num-prompts $(( CONC * 10 )) \
+  --max-concurrency "$CONC" \
+  --result-filename "$RESULT_FILENAME" \
+  --result-dir /workspace/ \
+  --trust-remote-code
+
+# After throughput, run evaluation only if RUN_EVAL is true
+# Server accepts glm-5-fp8 (--served-model-name); lm-eval must use that model name
+if [ "${RUN_EVAL}" = "true" ]; then
+  export MODEL_NAME=glm-5-fp8
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$CONC"
+  append_lm_eval_summary
+fi
+
+# NOTE(review): SERVER_PID is never killed here — confirm benchmark_lib or job teardown stops the server.
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f9fff7fb5..c6a340e96 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -970,3 +970,12 @@
     - "Replace old per-file recipes with resolved variants from consolidated 8k1k.yaml"
     - "14 variants: STP/MTP x low-latency/max-throughput with updated concurrencies and scale points"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907
+
+- config-keys:
+    - glm5-fp8-h200-sglang
+  description:
+    - "Add GLM-5 FP8 SGLang H200 single-node benchmark"
+    - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper"
+    - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
+    - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914