diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2ef06d65d..51c86912c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1805,7 +1805,7 @@ qwen3.5-bf16-b200-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } qwen3.5-fp8-b200-sglang: - image: lmsysorg/sglang:v0.5.9-cu129-amd64 + image: lmsysorg/sglang:v0.5.9-cu130-amd64 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: b200 @@ -1816,18 +1816,15 @@ qwen3.5-fp8-b200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 1024 osl: 8192 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } - - { tp: 4, ep: 4, conc-start: 8, conc-end: 64} + - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4} - - { tp: 4, ep: 4, conc-start: 8, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } kimik2.5-int4-b200-vllm: image: vllm/vllm-openai:v0.15.1 diff --git a/benchmarks/single_node/qwen3.5_fp8_b200.sh b/benchmarks/single_node/qwen3.5_fp8_b200.sh index 682e07a2a..3328585b3 100755 --- a/benchmarks/single_node/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b200.sh @@ -57,7 +57,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --context-length $CONTEXT_LENGTH --disable-radix-cache \ --attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ ---scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ +--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --tokenizer-worker-num 6 --stream-interval 30 > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f9fff7fb5..ef9f3870f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -970,3 +970,10 @@ - "Replace old per-file recipes with resolved variants from consolidated 8k1k.yaml" - "14 variants: STP/MTP x low-latency/max-throughput with updated concurrencies and scale points" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907 + +- config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Replace FP8 TP4/EP4 with TP8 config (conc 4-128) for all ISL/OSL combos" + - "Add --enable-flashinfer-allreduce-fusion to FP8 benchmark script" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918