diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 2bbdd91d6..ebc185025 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -39,7 +39,7 @@ DeepSeek-V3: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -70,7 +70,7 @@ DeepSeek-V3: DeepSeek-V3-0324: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -101,7 +101,7 @@ DeepSeek-V3-0324: DeepSeek-R1: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -132,7 +132,7 @@ DeepSeek-R1: DeepSeek-R1-0528: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -163,7 +163,7 @@ DeepSeek-R1-0528: DeepSeek-R1-0528-MXFP4-Preview: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 @@ -194,7 +194,7 @@ DeepSeek-R1-0528-MXFP4-Preview: DeepSeek-R1-0528-MXFP4: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-draft-tokens 3" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2090a1b25..0986a5c6d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -961,3 +961,10 @@ description: - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 + +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg-mtp + - dsr1-fp8-mi355x-sglang-disagg-mtp + description: + - "test" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX