SemiAnalysisAI · ankursingh-nv · Mar 19, 2026 · Mar 19, 2026 · functionstackx · Mar 19, 2026
@@ -1805,7 +1805,7 @@ qwen3.5-bf16-b200-sglang:
     - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
 
 qwen3.5-fp8-b200-sglang:
-  image: lmsysorg/sglang:v0.5.9-cu129-amd64
+  image: lmsysorg/sglang:v0.5.9-cu130-amd64
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: b200
@@ -1816,18 +1816,15 @@ qwen3.5-fp8-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 4, ep: 4, conc-start: 4, conc-end: 16 }
-    - { tp: 4, ep: 4, conc-start: 64, conc-end: 64 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 }
   - isl: 1024
     osl: 8192
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 }
-    - { tp: 4, ep: 4, conc-start: 8, conc-end: 64}
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 4}
-    - { tp: 4, ep: 4, conc-start: 8, conc-end: 64 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 }
 
 kimik2.5-int4-b200-vllm:
   image: vllm/vllm-openai:v0.15.1

diff --git a/benchmarks/single_node/qwen3.5_fp8_b200.sh b/benchmarks/single_node/qwen3.5_fp8_b200.sh
@@ -57,7 +57,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
 --context-length $CONTEXT_LENGTH --disable-radix-cache \
 --attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \
---scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --tokenizer-worker-num 6 --stream-interval 30 > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -970,3 +970,10 @@
     - "Replace old per-file recipes with resolved variants from consolidated 8k1k.yaml"
     - "14 variants: STP/MTP x low-latency/max-throughput with updated concurrencies and scale points"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907
+
+- config-keys:
+    - qwen3.5-fp8-b200-sglang
+  description:
+    - "Replace FP8 mixed TP4/EP4 and TP8/EP1 search-space entries with a single TP8/EP1 config (conc 4-128) for all ISL/OSL combos"
+    - "Update image to lmsysorg/sglang:v0.5.9-cu130-amd64 and add --enable-flashinfer-allreduce-fusion to FP8 benchmark script"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918