From 7663ca82b28d7ab9166503a1b655d54f2db01be9 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 18 Mar 2026 04:20:35 -0400 Subject: [PATCH 1/2] Update models.yaml --- benchmarks/multi_node/amd_utils/models.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 2bbdd91d6..ebc185025 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -39,7 +39,7 @@ DeepSeek-V3: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -70,7 +70,7 @@ DeepSeek-V3: DeepSeek-V3-0324: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -101,7 +101,7 @@ DeepSeek-V3-0324: DeepSeek-R1: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin 
--kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -132,7 +132,7 @@ DeepSeek-R1: DeepSeek-R1-0528: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -163,7 +163,7 @@ DeepSeek-R1-0528: DeepSeek-R1-0528-MXFP4-Preview: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -194,7 +194,7 @@ DeepSeek-R1-0528-MXFP4-Preview: DeepSeek-R1-0528-MXFP4: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" 
- mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 From ac69ec5b190b8ac307cddba879bd70c4681850f3 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 18 Mar 2026 04:25:41 -0400 Subject: [PATCH 2/2] Update perf-changelog with MTP FP4/FP8 MI355X SGLang disagg entries Co-Authored-By: Claude Opus 4.6 (1M context) --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2090a1b25..0986a5c6d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -961,3 +961,10 @@ description: - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 + +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg-mtp + - dsr1-fp8-mi355x-sglang-disagg-mtp + description: + - "Set --speculative-num-draft-tokens 3 in MTP flags for DeepSeek SGLang disagg configs on MI355X" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX