Subject: [PATCH 01/25] [ascend] support deepseek eager_mode
+ """ device_properties = torch.cuda.get_device_properties(0) - if hasattr(device_properties, 'major') and device_properties.major >= 9: + major = getattr(device_properties, 'major', None) + if isinstance(major, int) and major >= 9: import flash_mla # noqa use_flash_mla = True except ImportError: From 1bb548b1eb2e6638d0f25c45707de2e0ffbc6032 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 9 Dec 2025 08:52:35 +0000 Subject: [PATCH 03/25] modify for dp_ep --- lmdeploy/pytorch/backends/dlinfer/moe.py | 13 +++++++++++-- lmdeploy/pytorch/engine/executor/ray_executor.py | 6 ++++++ lmdeploy/pytorch/models/deepseek_v2.py | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 70b134c786..12c8ed8b31 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -35,8 +35,9 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): class DlinferFusedMoEImpl(FusedMoEImpl): """Dlinfer fused moe implementation.""" - def __init__(self, top_k: int, renormalize: bool = False): + def __init__(self, top_k: int, num_experts: int, renormalize: bool = False): self.top_k = top_k + self.num_experts = num_experts self.renormalize = renormalize def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): @@ -46,6 +47,14 @@ def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tens return gate_up_weights.transpose(-1, -2).contiguous(), down_weights.transpose(-1, -2).contiguous() return gate_up_weights, down_weights + def ep_expert_list(self, world_size: int, rank: int): + """Experts list of current rank.""" + num_experts = self.num_experts + expert_per_rank = (num_experts + world_size - 1) // world_size + first_expert = rank * expert_per_rank + last_expert = min(first_expert + expert_per_rank, num_experts) + return list(range(first_expert, last_expert)) + def forward(self, 
hidden_states: torch.Tensor, topk_weights: torch.Tensor, @@ -76,4 +85,4 @@ def build(top_k: int, layer_idx: int = 0, out_dtype: torch.dtype = torch.bfloat16): """Build from mlp.""" - return DlinferFusedMoEImpl(top_k=top_k, renormalize=renormalize) + return DlinferFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize) diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py index e4b4fbac2a..7796e252c8 100644 --- a/lmdeploy/pytorch/engine/executor/ray_executor.py +++ b/lmdeploy/pytorch/engine/executor/ray_executor.py @@ -47,6 +47,7 @@ def _get_master_port(): def get_ascend_device_rank_mapping(master_addr): +# def get_ascend_device_rank_mapping(master_addr: str, workers: list, dp: int): rank_table_file = _envs.ascend_rank_table_file if not rank_table_file: raise ValueError('ASCEND_RANK_TABLE_FILE_PATH is not set') @@ -67,6 +68,8 @@ def get_ascend_device_rank_mapping(master_addr): logger.error(f'Parse rank table file({rank_table}) failed') raise e + # if dp > 1: + # worker_ips = ray.get([worker.get_node_ip.remote() for worker in workers]) envs = { 'ASCEND_RANK_TABLE_FILE_PATH': rank_table_file, } @@ -286,6 +289,9 @@ def __init__( self._prefetch_task: asyncio.Task = None self.remote_outs: asyncio.Queue = None + rank_offset = dist_config.dp_rank * attn_tp + self.rank_offset = rank_offset + logger.info('Init distributed environment by device.') self.rank_offset = dist_config.dp_rank * attn_tp self._init_distributed_environment_by_device(device_type) diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py index 4db550eb8d..5e781f8034 100644 --- a/lmdeploy/pytorch/models/deepseek_v2.py +++ b/lmdeploy/pytorch/models/deepseek_v2.py @@ -1185,10 +1185,14 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di if weight_name not in name: continue name = name.replace(weight_name, param_name) + if name not in params_dict.keys(): + continue 
param = params_dict[name] load_weight(param, loaded_weight, expert_id=expert_id, shard_id=shard_id) break else: + if name not in params_dict.keys(): + return param = params_dict[name] load_weight(param, loaded_weight) @@ -1219,6 +1223,8 @@ def __load_kcvc(name: str, weight: torch.Tensor): dim=1) w_vc = w_vc.transpose(1, 2).contiguous() kc_param_name = name.replace('.kv_b_proj', '.kc') + if kc_param_name not in params_dict.keys(): + return param_kc = params_dict[kc_param_name] load_weight(param_kc, w_kc) vc_param_name = name.replace('.kv_b_proj', '.vc') @@ -1265,6 +1271,8 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): else: loaded_weight = loaded_weight.to(device) weight = __update_pe(loaded_weight, head_dim, pe_dim_offset) + if name not in params_dict.keys(): + continue param = params_dict[name] load_weight(param, weight) break @@ -1282,6 +1290,8 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): else: __load_kcvc(name, loaded_weight) else: + if name not in params_dict.keys(): + return param = params_dict[name] load_weight(param, loaded_weight) @@ -1369,9 +1379,13 @@ def __skip_layers(): if weight_name not in name: continue name = name.replace(weight_name, param_name) + if name not in params_dict.keys(): + continue param = params_dict[name] load_weight(param, loaded_weight, shard_id=shard_id) break else: + if name not in params_dict.keys(): + continue param = params_dict[name] load_weight(param, loaded_weight) From 5f47d928c62a0c612f24c5aea635c556edb8ab93 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 11 Dec 2025 10:49:27 +0000 Subject: [PATCH 04/25] backup code --- lmdeploy/pytorch/backends/dlinfer/moe.py | 11 ++++++++--- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 7 ++++++- lmdeploy/pytorch/nn/moe/default.py | 3 +++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 12c8ed8b31..0db8d4b3f1 100644 --- 
a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -35,10 +35,12 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): class DlinferFusedMoEImpl(FusedMoEImpl): """Dlinfer fused moe implementation.""" - def __init__(self, top_k: int, num_experts: int, renormalize: bool = False): + def __init__(self, top_k: int, num_experts: int, renormalize: bool = False, ep_size: int = 1, ep_group: torch.distributed.ProcessGroup = None): self.top_k = top_k self.num_experts = num_experts self.renormalize = renormalize + self.ep_size = ep_size + self.ep_group = ep_group def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" @@ -68,8 +70,11 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None + # from lmdeploy.utils import get_logger + # logger = get_logger('lmdeploy') + # logger.error(f'###### {expert_list=}') return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize) + self.renormalize, self.ep_size, self.ep_group, expert_list) class DlinferFusedMoEBuilder(FusedMoEBuilder): @@ -85,4 +90,4 @@ def build(top_k: int, layer_idx: int = 0, out_dtype: torch.dtype = torch.bfloat16): """Build from mlp.""" - return DlinferFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize) + return DlinferFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize, ep_size=ep_size, ep_group=ep_group) diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 4bcfade78d..6e50b1a5be 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import torch import dlinfer.ops as ext_ops +from typing import List from torch import Tensor @@ -11,6 +13,9 @@ def fused_moe( topk_ids: Tensor, topk: int, renormalize: bool, + ep_size: int, + ep_group: torch.distributed.ProcessGroup = None, + expert_list: List[int] = None, ): """Dlinfer fused moe.""" - return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize) + return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, ep_size, ep_group, expert_list) diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 0633aa001a..9f0e825b97 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -136,6 +136,9 @@ def __init__(self, dist_ctx = get_dist_manager().current_context() self.ep_size, rank = get_ep_world_rank() impl_builder = get_backend().get_layer_impl_builder(OpType.FusedMoE) + # from lmdeploy.utils import get_logger + # logger = get_logger('lmdeploy') + # logger.error(f'FusedMoE ep_size: {self.ep_size}, rank: {rank}, {dist_ctx.ep_gpu_group.rank()=}') self.impl = impl_builder.build( top_k, num_experts, From cdeb30ca9f2c4e1d51198a967775a1cbc1afaa07 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 16 Dec 2025 11:42:51 +0000 Subject: [PATCH 05/25] run tp ep --- lmdeploy/pytorch/backends/dlinfer/moe.py | 5 +---- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 0db8d4b3f1..77a09394f2 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -70,11 +70,8 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None - # from lmdeploy.utils import get_logger - # logger = get_logger('lmdeploy') - # logger.error(f'###### {expert_list=}') return 
fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, self.ep_size, self.ep_group, expert_list) + self.renormalize, self.ep_size, self.ep_group) class DlinferFusedMoEBuilder(FusedMoEBuilder): diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 6e50b1a5be..7edfcc8631 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -15,7 +15,6 @@ def fused_moe( renormalize: bool, ep_size: int, ep_group: torch.distributed.ProcessGroup = None, - expert_list: List[int] = None, ): """Dlinfer fused moe.""" - return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, ep_size, ep_group, expert_list) + return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, ep_size, ep_group) From c91aea8080ad9bd865f41abfda21a85e6b90fb37 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 16 Dec 2025 11:56:51 +0000 Subject: [PATCH 06/25] format code --- lmdeploy/pytorch/backends/dlinfer/moe.py | 13 +++++++++++-- lmdeploy/pytorch/configurations/utils.py | 6 ++---- lmdeploy/pytorch/engine/executor/ray_executor.py | 3 --- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 6 +++--- lmdeploy/pytorch/nn/moe/default.py | 3 --- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 77a09394f2..5703ef0638 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -35,7 +35,12 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): class DlinferFusedMoEImpl(FusedMoEImpl): """Dlinfer fused moe implementation.""" - def __init__(self, top_k: int, num_experts: int, renormalize: bool = False, ep_size: int = 1, ep_group: torch.distributed.ProcessGroup = None): + def __init__(self, + 
top_k: int, + num_experts: int, + renormalize: bool = False, + ep_size: int = 1, + ep_group: torch.distributed.ProcessGroup = None): self.top_k = top_k self.num_experts = num_experts self.renormalize = renormalize @@ -87,4 +92,8 @@ def build(top_k: int, layer_idx: int = 0, out_dtype: torch.dtype = torch.bfloat16): """Build from mlp.""" - return DlinferFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize, ep_size=ep_size, ep_group=ep_group) + return DlinferFusedMoEImpl(top_k=top_k, + num_experts=num_experts, + renormalize=renormalize, + ep_size=ep_size, + ep_group=ep_group) diff --git a/lmdeploy/pytorch/configurations/utils.py b/lmdeploy/pytorch/configurations/utils.py index 305083a48f..2ea21364a7 100644 --- a/lmdeploy/pytorch/configurations/utils.py +++ b/lmdeploy/pytorch/configurations/utils.py @@ -11,10 +11,8 @@ def flash_mla_available(): # use flash_mla by default if it is installed use_flash_mla = False try: - """ - In some torch_npu versions, device_properties doesn't have 'major' attribute; - In other torch_npu versions, the value of major is None. 
- """ + """In some torch_npu versions, device_properties doesn't have 'major' + attribute; In other torch_npu versions, the value of major is None.""" device_properties = torch.cuda.get_device_properties(0) major = getattr(device_properties, 'major', None) if isinstance(major, int) and major >= 9: diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py index 7796e252c8..b0a4219a46 100644 --- a/lmdeploy/pytorch/engine/executor/ray_executor.py +++ b/lmdeploy/pytorch/engine/executor/ray_executor.py @@ -47,7 +47,6 @@ def _get_master_port(): def get_ascend_device_rank_mapping(master_addr): -# def get_ascend_device_rank_mapping(master_addr: str, workers: list, dp: int): rank_table_file = _envs.ascend_rank_table_file if not rank_table_file: raise ValueError('ASCEND_RANK_TABLE_FILE_PATH is not set') @@ -68,8 +67,6 @@ def get_ascend_device_rank_mapping(master_addr): logger.error(f'Parse rank table file({rank_table}) failed') raise e - # if dp > 1: - # worker_ips = ray.get([worker.get_node_ip.remote() for worker in workers]) envs = { 'ASCEND_RANK_TABLE_FILE_PATH': rank_table_file, } diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 7edfcc8631..2079f68831 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import torch import dlinfer.ops as ext_ops -from typing import List +import torch from torch import Tensor @@ -17,4 +16,5 @@ def fused_moe( ep_group: torch.distributed.ProcessGroup = None, ): """Dlinfer fused moe.""" - return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, ep_size, ep_group) + return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, + ep_size, ep_group) diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 9f0e825b97..0633aa001a 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -136,9 +136,6 @@ def __init__(self, dist_ctx = get_dist_manager().current_context() self.ep_size, rank = get_ep_world_rank() impl_builder = get_backend().get_layer_impl_builder(OpType.FusedMoE) - # from lmdeploy.utils import get_logger - # logger = get_logger('lmdeploy') - # logger.error(f'FusedMoE ep_size: {self.ep_size}, rank: {rank}, {dist_ctx.ep_gpu_group.rank()=}') self.impl = impl_builder.build( top_k, num_experts, From 0879001c2b2e9c0f74f4849387280f5c1d938e33 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Mon, 29 Dec 2025 06:27:04 +0000 Subject: [PATCH 07/25] add dp tp --- lmdeploy/pytorch/backends/dlinfer/__init__.py | 18 +++++++++++++++ .../backends/dlinfer/ascend/op_backend.py | 16 +++++++++++++ lmdeploy/pytorch/backends/dlinfer/moe.py | 23 +++++++++++++++++-- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 6 ++--- .../dlinfer/moe_gating_topk_softmax.py | 5 ++-- 5 files changed, 61 insertions(+), 7 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/__init__.py b/lmdeploy/pytorch/backends/dlinfer/__init__.py index ef101fec61..d06de6ac2e 100644 --- a/lmdeploy/pytorch/backends/dlinfer/__init__.py +++ b/lmdeploy/pytorch/backends/dlinfer/__init__.py @@ -1 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from dataclasses import dataclass + + +@dataclass +class DlinferDistContext: + dp_size: int = 1 + tp_size: int = 1 + ep_size: int = 1 + + dp_rank: int = 0 + tp_rank: int = 0 + ep_rank: int = 0 + + max_tokens_accros_dp: int = 1 + + tp_group: torch.distributed.ProcessGroup = None + ep_group: torch.distributed.ProcessGroup = None diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index b7ce2a6846..3aaa405ce0 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -7,9 +7,11 @@ from typing import Dict, Tuple import torch +import torch.distributed as dist from lmdeploy.pytorch import envs as _envs from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig +from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger from ..op_backend import DlinferOpsBackend @@ -92,6 +94,7 @@ class AscendOpsBackend(DlinferOpsBackend): half_negative_inf = torch.finfo(torch.float16).min total_slots = None max_batches = None + max_tokens_accros_dp = 0 @staticmethod def get_name() -> str: @@ -219,6 +222,18 @@ def get_kv_start_indices_and_attention_mask(is_decoding, is_unpaged_prefill, q_s return kv_start_indices, attention_mask + def get_max_tokens_across_dp(): + dist_ctx = get_dist_manager().current_context() + if dist_ctx.dist_config.dp > 1: + total_token_current_rank = torch.sum(step_context.q_seqlens).to(step_context.q_seqlens.dtype) + world_size = dist_ctx.dist_config.world_size + total_token_buffer = torch.zeros(world_size, dtype=step_context.q_seqlens.dtype, device=torch.npu.current_device()) + dist.all_gather_into_tensor(total_token_buffer, total_token_current_rank, dist_ctx.ep_gpu_group) + max_tokens_accros_dp = torch.max(total_token_buffer).item() + else: + max_tokens_accros_dp = 0 + return max_tokens_accros_dp + q_seqlens_cpu, kv_seqlens_cpu = 
get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, kv_seqlens_cpu) @@ -228,6 +243,7 @@ def get_kv_start_indices_and_attention_mask(is_decoding, is_unpaged_prefill, q_s is_unpaged_prefill, q_seqlens_list, kv_seqlens_list, max_q_seq_len, max_kv_seq_len) + cls.max_tokens_accros_dp = get_max_tokens_across_dp() if not cls.enable_graph and step_context.kv_quant_policy == 8: record_file = os.getenv('ASCEND_QUANT_RECORD_FILE') diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 5703ef0638..362acc9909 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,12 +1,29 @@ # Copyright (c) OpenMMLab. All rights reserved. +from dataclasses import dataclass from typing import Callable, List import torch +from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl +from . 
import DlinferDistContext + + +def get_dist_ctx(): + dist_ctx = get_dist_manager().current_context() + + return DlinferDistContext(dp_size = dist_ctx.dist_config.dp, + tp_size = dist_ctx.dist_config.tp, + ep_size = dist_ctx.dist_config.ep, + dp_rank = dist_ctx.dp_rank, + tp_rank = dist_ctx.attn_tp_group.rank, + ep_rank = dist_ctx.ep_rank, + max_tokens_accros_dp = 1, + tp_group = dist_ctx.attn_tp_group.gpu_group, + ep_group = dist_ctx.ep_gpu_group) class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): @@ -17,9 +34,10 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): self.dim = dim if n_groups != -1: raise NotImplementedError('Group router not supported') + self.dist_ctx = get_dist_ctx() def forward(self, x: torch.Tensor): - routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k) + routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, self.dist_ctx) return routing_weights, selected_experts @@ -46,6 +64,7 @@ def __init__(self, self.renormalize = renormalize self.ep_size = ep_size self.ep_group = ep_group + self.dist_ctx = get_dist_ctx() def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" @@ -76,7 +95,7 @@ def forward(self, assert gate_up_bias is None assert down_bias is None return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, self.ep_size, self.ep_group) + self.renormalize, self.dist_ctx) class DlinferFusedMoEBuilder(FusedMoEBuilder): diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 2079f68831..df291adbd8 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -2,6 +2,7 @@ import dlinfer.ops as ext_ops import torch from torch import Tensor +from lmdeploy.pytorch.backends.dlinfer import DlinferDistContext def fused_moe( @@ -12,9 +13,8 @@ def fused_moe( topk_ids: Tensor, topk: 
int, renormalize: bool, - ep_size: int, - ep_group: torch.distributed.ProcessGroup = None, + dist_ctx: DlinferDistContext, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, - ep_size, ep_group) + dist_ctx) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index ad2fe66056..fdeed7e81a 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops from torch import Tensor +from lmdeploy.pytorch.backends.dlinfer import DlinferDistContext -def moe_gating_topk_softmax(router_logits: Tensor, topk: int): - routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk) +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, dist_ctx: DlinferDistContext): + routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, dist_ctx) return routing_weights, selected_experts From f17f23159247e5f2fe824477180a60c63ac6eb64 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 30 Dec 2025 09:33:36 +0000 Subject: [PATCH 08/25] move DlinferDistContext into dlinfer --- lmdeploy/pytorch/backends/dlinfer/__init__.py | 18 ------------------ lmdeploy/pytorch/backends/dlinfer/moe.py | 3 +-- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 1 + lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 2 +- .../kernels/dlinfer/moe_gating_topk_softmax.py | 2 +- 5 files changed, 4 insertions(+), 22 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/__init__.py b/lmdeploy/pytorch/backends/dlinfer/__init__.py index d06de6ac2e..ef101fec61 100644 --- a/lmdeploy/pytorch/backends/dlinfer/__init__.py +++ b/lmdeploy/pytorch/backends/dlinfer/__init__.py @@ -1,19 +1 @@ # Copyright (c) OpenMMLab. 
All rights reserved. -import torch -from dataclasses import dataclass - - -@dataclass -class DlinferDistContext: - dp_size: int = 1 - tp_size: int = 1 - ep_size: int = 1 - - dp_rank: int = 0 - tp_rank: int = 0 - ep_rank: int = 0 - - max_tokens_accros_dp: int = 1 - - tp_group: torch.distributed.ProcessGroup = None - ep_group: torch.distributed.ProcessGroup = None diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 362acc9909..9a53392f1c 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -6,10 +6,9 @@ import torch from lmdeploy.pytorch.distributed import get_dist_manager -from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import DlinferDistContext, fused_moe, moe_gating_topk_softmax from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl -from . import DlinferDistContext def get_dist_ctx(): diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 7b226d7ff4..88cf884f55 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from dlinfer.utils.type_annotation import DlinferDistContext from ..default import multinomial_sampling, per_channel_quant from .apply_rotary_pos_emb import apply_rotary_pos_emb from .awq_kernels import awq_linear diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index df291adbd8..1f18af6880 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -2,7 +2,7 @@ import dlinfer.ops as ext_ops import torch from torch import Tensor -from lmdeploy.pytorch.backends.dlinfer import DlinferDistContext +from . 
import DlinferDistContext def fused_moe( diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index fdeed7e81a..90c3408b10 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops from torch import Tensor -from lmdeploy.pytorch.backends.dlinfer import DlinferDistContext +from . import DlinferDistContext def moe_gating_topk_softmax(router_logits: Tensor, topk: int, dist_ctx: DlinferDistContext): From 760c0dbceb0447b47c93c040fcde1ab30334a967 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 31 Dec 2025 01:42:18 +0000 Subject: [PATCH 09/25] fix get_max_tokens_across_dp in tp case --- lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 3aaa405ce0..566f7e032b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -231,7 +231,7 @@ def get_max_tokens_across_dp(): dist.all_gather_into_tensor(total_token_buffer, total_token_current_rank, dist_ctx.ep_gpu_group) max_tokens_accros_dp = torch.max(total_token_buffer).item() else: - max_tokens_accros_dp = 0 + max_tokens_accros_dp = torch.sum(step_context.q_seqlens).item() return max_tokens_accros_dp q_seqlens_cpu, kv_seqlens_cpu = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) From 73ebda1e1b30dd1341a429a25997083ca05e7dc6 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Sun, 4 Jan 2026 08:03:03 +0000 Subject: [PATCH 10/25] format code --- .../backends/dlinfer/ascend/op_backend.py | 4 +++- lmdeploy/pytorch/backends/dlinfer/moe.py | 22 +++++++++---------- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 
2 ++ lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 2 +- .../dlinfer/moe_gating_topk_softmax.py | 1 + 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 566f7e032b..ba5b39893b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -227,7 +227,9 @@ def get_max_tokens_across_dp(): if dist_ctx.dist_config.dp > 1: total_token_current_rank = torch.sum(step_context.q_seqlens).to(step_context.q_seqlens.dtype) world_size = dist_ctx.dist_config.world_size - total_token_buffer = torch.zeros(world_size, dtype=step_context.q_seqlens.dtype, device=torch.npu.current_device()) + total_token_buffer = torch.zeros(world_size, + dtype=step_context.q_seqlens.dtype, + device=torch.npu.current_device()) dist.all_gather_into_tensor(total_token_buffer, total_token_current_rank, dist_ctx.ep_gpu_group) max_tokens_accros_dp = torch.max(total_token_buffer).item() else: diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 9a53392f1c..0dc75f2d47 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. 
- -from dataclasses import dataclass from typing import Callable, List import torch @@ -13,16 +11,16 @@ def get_dist_ctx(): dist_ctx = get_dist_manager().current_context() - - return DlinferDistContext(dp_size = dist_ctx.dist_config.dp, - tp_size = dist_ctx.dist_config.tp, - ep_size = dist_ctx.dist_config.ep, - dp_rank = dist_ctx.dp_rank, - tp_rank = dist_ctx.attn_tp_group.rank, - ep_rank = dist_ctx.ep_rank, - max_tokens_accros_dp = 1, - tp_group = dist_ctx.attn_tp_group.gpu_group, - ep_group = dist_ctx.ep_gpu_group) + + return DlinferDistContext(dp_size=dist_ctx.dist_config.dp, + tp_size=dist_ctx.dist_config.tp, + ep_size=dist_ctx.dist_config.ep, + dp_rank=dist_ctx.dp_rank, + tp_rank=dist_ctx.attn_tp_group.rank, + ep_rank=dist_ctx.ep_rank, + max_tokens_accros_dp=1, + tp_group=dist_ctx.attn_tp_group.gpu_group, + ep_group=dist_ctx.ep_gpu_group) class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 88cf884f55..79790ff4d7 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from dlinfer.utils.type_annotation import DlinferDistContext + from ..default import multinomial_sampling, per_channel_quant from .apply_rotary_pos_emb import apply_rotary_pos_emb from .awq_kernels import awq_linear @@ -12,6 +13,7 @@ from .rms_norm import rms_norm __all__ = [ + 'DlinferDistContext', 'rms_norm', 'apply_rotary_pos_emb', 'awq_linear', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 1f18af6880..ef9ba53402 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops -import torch from torch import Tensor + from . 
import DlinferDistContext diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index 90c3408b10..b57e33afea 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops from torch import Tensor + from . import DlinferDistContext From 86d00ebb1e63acbb631bd799f1e641217eb2c299 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 6 Jan 2026 02:49:07 +0000 Subject: [PATCH 11/25] fix grpah_mode dp --- lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index ba5b39893b..f46408cd3d 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -226,6 +226,14 @@ def get_max_tokens_across_dp(): dist_ctx = get_dist_manager().current_context() if dist_ctx.dist_config.dp > 1: total_token_current_rank = torch.sum(step_context.q_seqlens).to(step_context.q_seqlens.dtype) + if cls.enable_graph and step_context.is_decoding: + from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size + total_token_current_rank_item = total_token_current_rank.item() + total_token_current_rank = torch.tensor( + [get_ascend_compatible_size(total_token_current_rank_item)], + dtype=total_token_current_rank.dtype, + device=total_token_current_rank.device, + ) world_size = dist_ctx.dist_config.world_size total_token_buffer = torch.zeros(world_size, dtype=step_context.q_seqlens.dtype, From 35a60da9ae7cac8a3c707ae8cf788e8c9b51d3f3 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Mon, 12 Jan 2026 11:54:56 +0000 Subject: [PATCH 12/25] add mlpmetada --- 
.../backends/dlinfer/ascend/op_backend.py | 146 +++++++++++++++--- lmdeploy/pytorch/backends/dlinfer/moe.py | 56 ++++--- .../pytorch/backends/dlinfer/op_backend.py | 5 + lmdeploy/pytorch/backends/moe.py | 20 ++- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 3 - lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 16 +- .../dlinfer/moe_gating_topk_softmax.py | 8 +- lmdeploy/pytorch/model_inputs.py | 1 + lmdeploy/pytorch/models/qwen3_moe.py | 13 +- lmdeploy/pytorch/models/qwen3_vl.py | 6 + lmdeploy/pytorch/models/qwen3_vl_moe.py | 2 + lmdeploy/pytorch/nn/moe/base.py | 17 +- lmdeploy/pytorch/nn/moe/default.py | 2 + 13 files changed, 227 insertions(+), 68 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 3f2f5e5911..df8e115b5b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -2,6 +2,7 @@ import itertools import os import re +from dataclasses import dataclass from functools import lru_cache from pathlib import Path from typing import Dict, Tuple @@ -14,6 +15,7 @@ from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger +from ..moe import MoeType from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') @@ -42,6 +44,30 @@ def is_Ascend310P(cls) -> bool: def is_Ascend910(cls) -> bool: return cls.device_name().startswith(cls.Ascend910) + @classmethod + @lru_cache(maxsize=1) + def soc_version(cls) -> str: + return torch.npu.get_soc_version() + + @classmethod + def is_A2(cls) -> bool: + return 220 <= cls.soc_version() <= 225 + + @classmethod + def is_A3(cls) -> bool: + return 250 <= cls.soc_version() <= 255 + + +@dataclass +class DistMeta: + dp_size: int + tp_size: int + ep_size: int + tp_rank: int + ep_rank: int + tp_group: torch.distributed.ProcessGroup + ep_group: torch.distributed.ProcessGroup + class AscendKVQuantMeta: has_set_value: bool = 
False @@ -90,10 +116,12 @@ def set_value(cls, device: str, dtype: torch.dtype, record_file: str, total_laye class AscendOpsBackend(DlinferOpsBackend): """Ascend layer backend.""" - enable_graph = False - half_negative_inf = torch.finfo(torch.float16).min + enable_graph: bool = False + half_negative_inf: float = torch.finfo(torch.float16).min total_slots = None max_batches = None + dist_meta: DistMeta = None + graph_capture_sizes = None max_tokens_accros_dp = 0 @staticmethod @@ -235,27 +263,83 @@ def get_kv_start_indices_and_attention_mask(is_decoding, is_unpaged_prefill, q_s return kv_start_indices, attention_mask - def get_max_tokens_across_dp(): + def get_dist_meta(): + if cls.dist_meta is not None: + return cls.dist_meta dist_ctx = get_dist_manager().current_context() - if dist_ctx.dist_config.dp > 1: - total_token_current_rank = torch.sum(step_context.q_seqlens).to(step_context.q_seqlens.dtype) - if cls.enable_graph and step_context.is_decoding: - from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size - total_token_current_rank_item = total_token_current_rank.item() - total_token_current_rank = torch.tensor( - [get_ascend_compatible_size(total_token_current_rank_item)], - dtype=total_token_current_rank.dtype, - device=total_token_current_rank.device, - ) - world_size = dist_ctx.dist_config.world_size - total_token_buffer = torch.zeros(world_size, + dp_size, tp_size, ep_size = dist_ctx.dist_config.dp, dist_ctx.dist_config.tp, dist_ctx.dist_config.ep + tp_rank, ep_rank = dist_ctx.attn_tp_group.rank, dist_ctx.ep_rank + tp_group = dist_ctx.attn_tp_group.gpu_group + ep_group = dist_ctx.ep_gpu_group + cls.dist_meta = DistMeta(dp_size=dp_size, + tp_size=tp_size, + ep_size=ep_size, + tp_rank=tp_rank, + ep_rank=ep_rank, + tp_group=tp_group, + ep_group=ep_group) + return cls.dist_meta + + def get_tokens_info(dp_size, tp_size, ep_size, ep_group): + if ep_size <= 1: + return 0, 0, 0, None + # get runtime num_tokens + is_graph = 
cls.enable_graph and step_context.is_decoding + if is_graph: + from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size + tokens_current_rank = step_context.q_seqlens.shape[0] + num_tokens = min(get_ascend_compatible_size(tokens_current_rank), cls.max_batches) + else: + tokens_current_rank = step_context.q_seqlens.sum().item() + num_tokens = tokens_current_rank + # get max_tokens_across_dp + if dp_size > 1: + num_tokens_tensor = torch.tensor([num_tokens], dtype=step_context.q_seqlens.dtype, device=torch.npu.current_device()) - dist.all_gather_into_tensor(total_token_buffer, total_token_current_rank, dist_ctx.ep_gpu_group) - max_tokens_accros_dp = torch.max(total_token_buffer).item() + world_size = dp_size * tp_size + num_tokens_buffer = torch.zeros([world_size], + dtype=step_context.q_seqlens.dtype, + device=torch.npu.current_device()) + dist.all_gather_into_tensor(num_tokens_buffer, num_tokens_tensor, ep_group) + max_tokens_across_dp = torch.max(num_tokens_buffer).item() + else: + max_tokens_across_dp = num_tokens + # get pad_size + paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size + pad_size = paded_size - num_tokens + # get x_active_mask + x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) + if pad_size > 0: + x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + return num_tokens, max_tokens_across_dp, pad_size, x_active_mask + + @lru_cache + def init_mc2_token_capacity(tp_size): + max_num_tokens = min(cls.max_batches, 512) + num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size + return num_tokens_per_tp_rank * tp_size + + def select_moe_type(num_tokens, dp_size, tp_size, ep_size): + if ep_size <= 1: + return MoeType.ALLGATHER + mc2_token_capacity = init_mc2_token_capacity(tp_size) + is_graph = cls.enable_graph and step_context.is_decoding + if is_graph: + import math + num_tokens = math.ceil(num_tokens / tp_size) * 
tp_size + if SocVersion.is_A2(): + if num_tokens <= mc2_token_capacity and dp_size * tp_size >= 16: + return MoeType.MC2 + else: + return MoeType.ALLGATHER + elif SocVersion.is_A3(): + if num_tokens <= mc2_token_capacity: + return MoeType.MC2 + else: + return MoeType.ALLTOALL else: - max_tokens_accros_dp = torch.sum(step_context.q_seqlens).item() - return max_tokens_accros_dp + raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) @@ -267,7 +351,6 @@ def get_max_tokens_across_dp(): is_unpaged_prefill, q_seqlens_list, kv_seqlens_list, max_q_seq_len, max_kv_seq_len) - cls.max_tokens_accros_dp = get_max_tokens_across_dp() if not cls.enable_graph and step_context.kv_quant_policy == 8: record_file = os.getenv('ASCEND_QUANT_RECORD_FILE') @@ -300,8 +383,29 @@ def get_max_tokens_across_dp(): quant_policy=step_context.kv_quant_policy, quant_meta=AscendKVQuantMeta.quant_meta, ) - step_context.attn_metadata = attn_metadata + + get_dist_meta() + num_tokens, max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, + cls.dist_meta.tp_size, + cls.dist_meta.ep_size, + cls.dist_meta.ep_group) + moe_type = select_moe_type(num_tokens, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) + mlp_meta_cls = cls.get_mlp_metadata_cls() + mlp_metadata = mlp_meta_cls( + max_tokens_across_dp=max_tokens_across_dp, + pad_size=pad_size, + dp_size=cls.dist_meta.dp_size, + tp_size=cls.dist_meta.tp_size, + ep_size=cls.dist_meta.ep_size, + tp_rank=cls.dist_meta.tp_rank, + ep_rank=cls.dist_meta.ep_rank, + tp_group=cls.dist_meta.tp_group, + ep_group=cls.dist_meta.ep_group, + moe_type=moe_type, + x_active_mask=x_active_mask, + ) + step_context.mlp_metadata = mlp_metadata return step_context @staticmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 
0dc75f2d47..3c552711e7 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,29 +1,32 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os +from dataclasses import dataclass from typing import Callable, List import torch +from dlinfer.utils.type_annotation import MoeType -from lmdeploy.pytorch.distributed import get_dist_manager -from lmdeploy.pytorch.kernels.dlinfer import DlinferDistContext, fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax -from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl +from ..moe import FusedMoEBuilder, FusedMoEImpl, MLPMetadata, SoftmaxTopKBuilder, SoftmaxTopKImpl -def get_dist_ctx(): - dist_ctx = get_dist_manager().current_context() +@dataclass +class DlinferMLPMetadata(MLPMetadata): + max_tokens_across_dp: int = 1 + pad_size: int = 0 + dp_size: int = 1 + tp_size: int = 1 + ep_size: int = 1 + tp_rank: int = 0 + ep_rank: int = 0 + tp_group: torch.distributed.ProcessGroup = None + ep_group: torch.distributed.ProcessGroup = None + moe_type: MoeType = MoeType.UNDEFINED + x_active_mask: torch.Tensor = None - return DlinferDistContext(dp_size=dist_ctx.dist_config.dp, - tp_size=dist_ctx.dist_config.tp, - ep_size=dist_ctx.dist_config.ep, - dp_rank=dist_ctx.dp_rank, - tp_rank=dist_ctx.attn_tp_group.rank, - ep_rank=dist_ctx.ep_rank, - max_tokens_accros_dp=1, - tp_group=dist_ctx.attn_tp_group.gpu_group, - ep_group=dist_ctx.ep_gpu_group) - -class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): +class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl[DlinferMLPMetadata]): """Dlinfer softmax topk implementation.""" def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): @@ -31,14 +34,15 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): self.dim = dim if n_groups != -1: raise NotImplementedError('Group router not supported') - self.dist_ctx = get_dist_ctx() - def forward(self, x: 
torch.Tensor): - routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, self.dist_ctx) + def forward(self, x: torch.Tensor, mlp_metada: DlinferMLPMetadata): + routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, mlp_metada.max_tokens_across_dp, + mlp_metada.pad_size, mlp_metada.tp_size, + mlp_metada.ep_size, mlp_metada.tp_rank) return routing_weights, selected_experts -class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder): +class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder[DlinferMLPMetadata]): """Dlinfer softmax topk implementation builder.""" @staticmethod @@ -47,7 +51,7 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): return DlinferSoftmaxTopKImpl(top_k, dim, n_groups) -class DlinferFusedMoEImpl(FusedMoEImpl): +class DlinferFusedMoEImpl(FusedMoEImpl[DlinferMLPMetadata]): """Dlinfer fused moe implementation.""" def __init__(self, @@ -61,12 +65,13 @@ def __init__(self, self.renormalize = renormalize self.ep_size = ep_size self.ep_group = ep_group - self.dist_ctx = get_dist_ctx() def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" device_type = gate_up_weights.device.type if device_type in ['npu']: + if os.getenv('DLINFER_RESET_MOE_UPDATE_WEIGHTS', '0') == '1': + return gate_up_weights, down_weights return gate_up_weights.transpose(-1, -2).contiguous(), down_weights.transpose(-1, -2).contiguous() return gate_up_weights, down_weights @@ -84,6 +89,7 @@ def forward(self, topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, down_weights: torch.Tensor, + mlp_metadata: DlinferMLPMetadata, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, expert_list: List[int] = None, @@ -92,10 +98,12 @@ def forward(self, assert gate_up_bias is None assert down_bias is None return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, self.dist_ctx) + self.renormalize, mlp_metadata.pad_size, 
mlp_metadata.tp_size, mlp_metadata.ep_size, + mlp_metadata.tp_rank, mlp_metadata.ep_rank, mlp_metadata.tp_group, mlp_metadata.ep_group, + mlp_metadata.moe_type, mlp_metadata.x_active_mask) -class DlinferFusedMoEBuilder(FusedMoEBuilder): +class DlinferFusedMoEBuilder(FusedMoEBuilder[DlinferMLPMetadata]): """Dlinfer fused moe builder.""" @staticmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 16eb604ccd..0e39907f85 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -67,6 +67,11 @@ def get_attention_metadata_cls(): from .attention import DlinferAttentionMetadata return DlinferAttentionMetadata + @staticmethod + def get_mlp_metadata_cls(): + from .moe import DlinferMLPMetadata + return DlinferMLPMetadata + @staticmethod def get_k_block_shape( block_size: int, diff --git a/lmdeploy/pytorch/backends/moe.py b/lmdeploy/pytorch/backends/moe.py index 5b33b97da7..ea945b7550 100644 --- a/lmdeploy/pytorch/backends/moe.py +++ b/lmdeploy/pytorch/backends/moe.py @@ -1,13 +1,23 @@ # Copyright (c) OpenMMLab. All rights reserved. import functools from abc import ABC, abstractmethod -from typing import Callable, List, Optional +from dataclasses import dataclass +from typing import Callable, Generic, List, Optional, TypeVar import torch import torch.distributed as dist -class SoftmaxTopKImpl(ABC): +@dataclass +class MLPMetadata: + """Base MLP metadata.""" + ... 
+ + +T = TypeVar('T', bound=MLPMetadata) + + +class SoftmaxTopKImpl(ABC, Generic[T]): """Softmax topk implementation api.""" @staticmethod @@ -22,7 +32,7 @@ def forward(self, x: torch.Tensor): raise NotImplementedError -class SoftmaxTopKBuilder(ABC): +class SoftmaxTopKBuilder(ABC, Generic[T]): """Softmax topk implementation builder.""" @staticmethod @@ -32,7 +42,7 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): raise NotImplementedError -class FusedMoEImpl(ABC): +class FusedMoEImpl(ABC, Generic[T]): """Fused moe implementation.""" def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): @@ -58,7 +68,7 @@ def forward(self, raise NotImplementedError -class FusedMoEBuilder(ABC): +class FusedMoEBuilder(ABC, Generic[T]): """Fused moe builder.""" @staticmethod diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 79790ff4d7..7b226d7ff4 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from dlinfer.utils.type_annotation import DlinferDistContext - from ..default import multinomial_sampling, per_channel_quant from .apply_rotary_pos_emb import apply_rotary_pos_emb from .awq_kernels import awq_linear @@ -13,7 +11,6 @@ from .rms_norm import rms_norm __all__ = [ - 'DlinferDistContext', 'rms_norm', 'apply_rotary_pos_emb', 'awq_linear', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index ef9ba53402..e0cea0b503 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,9 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops +import torch.distributed as dist +from dlinfer.utils.type_annotation import MoeType from torch import Tensor -from . 
import DlinferDistContext - def fused_moe( hidden_states: Tensor, @@ -13,8 +13,16 @@ def fused_moe( topk_ids: Tensor, topk: int, renormalize: bool, - dist_ctx: DlinferDistContext, + pad_size: int, + tp_size: int, + ep_size: int, + tp_rank: int, + ep_rank: int, + tp_group: dist.ProcessGroup, + ep_group: dist.ProcessGroup, + moe_type: MoeType, + x_active_mask: Tensor, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, - dist_ctx) + pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index b57e33afea..db71e87787 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -2,9 +2,9 @@ import dlinfer.ops as ext_ops from torch import Tensor -from . import DlinferDistContext - -def moe_gating_topk_softmax(router_logits: Tensor, topk: int, dist_ctx: DlinferDistContext): - routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, dist_ctx) +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, max_tokens_across_dp: int, pad_size: int, tp_size: int, + ep_size: int, tp_rank: int) -> tuple[Tensor, Tensor]: + routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, max_tokens_across_dp, + pad_size, tp_size, ep_size, tp_rank) return routing_weights, selected_experts diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 80e0540e6d..8e351428c1 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -350,6 +350,7 @@ class StepContext: input_multimodals: List[MultiModalTensor] = None vision_inputs: VisionModelInputs = None attn_metadata: Any = None + mlp_metadata: Any = None cross_seqlens: torch.LongTensor = 
None cross_kv_seqlens: torch.LongTensor = None cross_attn_metadata: Any = None diff --git a/lmdeploy/pytorch/models/qwen3_moe.py b/lmdeploy/pytorch/models/qwen3_moe.py index 9d50cbb86b..66076de6dd 100644 --- a/lmdeploy/pytorch/models/qwen3_moe.py +++ b/lmdeploy/pytorch/models/qwen3_moe.py @@ -225,12 +225,13 @@ def forward( self, hidden_states: torch.Tensor, all_routed_experts: torch.Tensor = None, + mlp_metadata: Any = None, ): """forward.""" batch_size, sequence_length, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) router_logits = self.gate(hidden_states) - topk_weights, topk_ids = self.softmax_topk(router_logits) + topk_weights, topk_ids = self.softmax_topk(router_logits, mlp_metadata) if all_routed_experts is not None: all_routed_experts[:, self.layer_idx, :] = topk_ids if get_dist_manager().current_context().dist_config.enable_eplb: @@ -239,6 +240,7 @@ def forward( hidden_states, topk_weights, topk_ids, + mlp_metadata, ) out_states = out_states.reshape(batch_size, sequence_length, -1) @@ -284,6 +286,7 @@ def forward( past_key_value: Optional[List[torch.FloatTensor]], residual: Optional[torch.Tensor] = None, attn_metadata: Any = None, + mlp_metadata: Any = None, all_routed_experts: torch.Tensor = None, ): @@ -303,7 +306,7 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states, all_routed_experts=all_routed_experts) + hidden_states = self.mlp(hidden_states, all_routed_experts=all_routed_experts, mlp_metadata=mlp_metadata) outputs = (hidden_states, residual) return outputs @@ -349,6 +352,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, all_routed_experts: torch.Tensor = None, ): @@ -375,6 +379,7 @@ def forward( past_key_value=past_key_value, 
residual=residual, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, all_routed_experts=all_routed_experts, ) @@ -430,6 +435,7 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, ): @@ -450,6 +456,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, all_routed_experts=all_routed_experts, ) @@ -476,6 +483,7 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata + mlp_metadata = context.mlp_metadata # process vision embeddings vision_embeddings = context.input_embeddings @@ -491,6 +499,7 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, ) diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py index 60c3617ffe..5d81c8c327 100644 --- a/lmdeploy/pytorch/models/qwen3_vl.py +++ b/lmdeploy/pytorch/models/qwen3_vl.py @@ -102,6 +102,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack @@ -143,6 +144,7 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, ) # add visual features to the hidden states of first several layers @@ -497,6 +499,7 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, 
pixel_values: torch.Tensor = None, @@ -541,6 +544,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, # args for deepstack @@ -574,6 +578,7 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata + mlp_metadata = context.mlp_metadata pixel_values = None vis_cu_seqlens = None @@ -614,6 +619,7 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, pixel_values=pixel_values, diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py index 1dc7e32de9..7586db3d11 100644 --- a/lmdeploy/pytorch/models/qwen3_vl_moe.py +++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py @@ -33,6 +33,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack @@ -74,6 +75,7 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, ) # add visual features to the hidden states of first several layers diff --git a/lmdeploy/pytorch/nn/moe/base.py b/lmdeploy/pytorch/nn/moe/base.py index 484dbbe492..2ae9db24fd 100644 --- a/lmdeploy/pytorch/nn/moe/base.py +++ b/lmdeploy/pytorch/nn/moe/base.py @@ -8,6 +8,7 @@ import lmdeploy.pytorch.distributed as dist from lmdeploy.pytorch.backends import OpType, get_backend +from lmdeploy.pytorch.backends.moe import MLPMetadata from lmdeploy.pytorch.config import TPMode from lmdeploy.pytorch.distributed import get_dist_manager, 
get_tp_world_rank from lmdeploy.pytorch.model_inputs import get_step_ctx_manager @@ -29,9 +30,9 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): impl_builder = get_backend().get_layer_impl_builder(OpType.SoftmaxTopK) self.impl = impl_builder.build(top_k, dim, n_groups=n_groups) - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor, mlp_metadata: MLPMetadata): """forward.""" - return self.impl.forward(x) + return self.impl.forward(x, mlp_metadata) def update_dims(hidden_dim: int, ffn_dim: int): @@ -296,7 +297,8 @@ def forward_dptp(self) -> MoEForwardDPTP: """Forward dptp.""" return self._forward_dptp - def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor): + def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor, + mlp_metadata: MLPMetadata): """Default forward.""" state = { 'hidden_states': hidden_states, @@ -305,16 +307,21 @@ def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tenso 'moe_type': MoeType.Default, } recv_state = self.dispatch(state) + recv_state.update({'mlp_metadata': mlp_metadata}) gemm_state = self.gemm(recv_state) out_state = self.combine(gemm_state) return out_state['hidden_states'] - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor): + def forward(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_idx: torch.LongTensor, + mlp_metadata: MLPMetadata = None): """forward.""" if self.tp > 1 and self.tp_mode == TPMode.DP_TP: return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx) else: - return self.forward_default(hidden_states, topk_weights, topk_idx) + return self.forward_default(hidden_states, topk_weights, topk_idx, mlp_metadata) def renormalize(self, topk_weights): """renormalize.""" diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 
0633aa001a..358674a466 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -298,12 +298,14 @@ def gemm(self, state: Dict): hidden_states = state['hidden_states'] topk_weights = state['topk_weights'] topk_ids = state['topk_idx'] + mlp_metadata = state['mlp_metadata'] hidden_states = self.impl.forward(hidden_states, topk_weights, topk_ids, self.gate_up.weight, self.down.weight, + mlp_metadata, self.gate_up.bias, self.down.bias, self.expert_list, From 8980ba9ca75d0d2bbb9b2d785f5cf49bff57ac5a Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 14 Jan 2026 09:21:55 +0000 Subject: [PATCH 13/25] good eager mode --- .../backends/dlinfer/ascend/op_backend.py | 37 ++++++++++++++----- lmdeploy/pytorch/backends/dlinfer/moe.py | 3 +- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 3 +- .../kernels/dlinfer/flash_attention.py | 2 +- .../pytorch/kernels/dlinfer/pagedattention.py | 9 +++-- 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index df8e115b5b..85733e35b6 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -123,6 +123,7 @@ class AscendOpsBackend(DlinferOpsBackend): dist_meta: DistMeta = None graph_capture_sizes = None max_tokens_accros_dp = 0 + max_tokens_accros_dp = 0 @staticmethod def get_name() -> str: @@ -308,10 +309,11 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group): # get pad_size paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size pad_size = paded_size - num_tokens - # get x_active_mask - x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) - if pad_size > 0: - x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + # # get x_active_mask + # x_active_mask = torch.ones(num_tokens, dtype=torch.bool, 
device=torch.npu.current_device()) + # if pad_size > 0: + # x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + x_active_mask = torch.ones(tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) return num_tokens, max_tokens_across_dp, pad_size, x_active_mask @lru_cache @@ -320,27 +322,39 @@ def init_mc2_token_capacity(tp_size): num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size return num_tokens_per_tp_rank * tp_size - def select_moe_type(num_tokens, dp_size, tp_size, ep_size): + def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): if ep_size <= 1: return MoeType.ALLGATHER mc2_token_capacity = init_mc2_token_capacity(tp_size) is_graph = cls.enable_graph and step_context.is_decoding if is_graph: import math - num_tokens = math.ceil(num_tokens / tp_size) * tp_size + max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size if SocVersion.is_A2(): - if num_tokens <= mc2_token_capacity and dp_size * tp_size >= 16: + if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16: return MoeType.MC2 else: return MoeType.ALLGATHER elif SocVersion.is_A3(): - if num_tokens <= mc2_token_capacity: + if max_tokens_across_dp <= mc2_token_capacity: return MoeType.MC2 else: return MoeType.ALLTOALL else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') + def get_x_active_mask(num_tokens, pad_size, tp_size, tp_rank, moe_type): + if moe_type in {MoeType.MC2, MoeType.ALLTOALL}: + x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) + else: + return None + # if pad_size > 0: + # x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + # if tp_size > 1: + # split_x_active_mask = torch.tensor_split(x_active_mask, tp_size, dim=0) + # x_active_mask = split_x_active_mask[tp_rank] + return x_active_mask + q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = 
get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, @@ -385,13 +399,15 @@ def select_moe_type(num_tokens, dp_size, tp_size, ep_size): ) step_context.attn_metadata = attn_metadata - get_dist_meta() + cls.dist_meta = get_dist_meta() num_tokens, max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size, cls.dist_meta.ep_group) - moe_type = select_moe_type(num_tokens, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) + moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, + cls.dist_meta.ep_size) mlp_meta_cls = cls.get_mlp_metadata_cls() + cls.max_tokens_accros_dp = max_tokens_across_dp mlp_metadata = mlp_meta_cls( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, @@ -441,6 +457,7 @@ def device_count(): @staticmethod def support_ray(): """Support ray.""" + # return False if not _envs.ascend_set_rt_visable_devices_by_ray: os.environ['RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES'] = '1' return True diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 3c552711e7..10aab3c213 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -4,9 +4,8 @@ from typing import Callable, List import torch -from dlinfer.utils.type_annotation import MoeType -from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import MoeType, fused_moe, moe_gating_topk_softmax from ..moe import FusedMoEBuilder, FusedMoEImpl, MLPMetadata, SoftmaxTopKBuilder, SoftmaxTopKImpl diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 7b226d7ff4..f9ea874ae5 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ 
b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -4,7 +4,7 @@ from .awq_kernels import awq_linear from .fill_kv_cache import fill_kv_cache from .flash_attention import flash_attention_fwd -from .fused_moe import fused_moe +from .fused_moe import MoeType, fused_moe from .linear import linear from .moe_gating_topk_softmax import moe_gating_topk_softmax from .pagedattention import paged_attention_fwd @@ -15,6 +15,7 @@ 'apply_rotary_pos_emb', 'awq_linear', 'fill_kv_cache', + 'MoeType', 'fused_moe', 'paged_attention_fwd', 'flash_attention_fwd', diff --git a/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py b/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py index a1b4c659d1..7f3037b247 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops -from dlinfer.utils.type_annotation import Tensor +from torch import Tensor def flash_attention_fwd( diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py index 13f4e12a58..8996508aff 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Sequence + import dlinfer.ops as ext_ops -import torch -from dlinfer.utils.type_annotation import Optional, Sequence, Tensor +from torch import Tensor def prefill_attention( @@ -111,8 +112,8 @@ def paged_token_attention( def paged_attention_fwd( query_states: Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, + key_states: Tensor, + value_states: Tensor, attn_output: Tensor, key_cache: Tensor, value_cache: Tensor, From a4f003ba342a6a0a34da2cff0a716da139d83cbd Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 14 Jan 2026 10:48:34 +0000 Subject: [PATCH 14/25] good graph mode --- .../backends/dlinfer/ascend/op_backend.py | 28 +++---------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 85733e35b6..ab57dc97bb 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -122,8 +122,6 @@ class AscendOpsBackend(DlinferOpsBackend): max_batches = None dist_meta: DistMeta = None graph_capture_sizes = None - max_tokens_accros_dp = 0 - max_tokens_accros_dp = 0 @staticmethod def get_name() -> str: @@ -309,12 +307,9 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group): # get pad_size paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size pad_size = paded_size - num_tokens - # # get x_active_mask - # x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) - # if pad_size > 0: - # x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + # get x_active_mask x_active_mask = torch.ones(tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - return num_tokens, max_tokens_across_dp, pad_size, x_active_mask + return max_tokens_across_dp, pad_size, x_active_mask @lru_cache def init_mc2_token_capacity(tp_size): @@ -343,18 +338,6 
@@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') - def get_x_active_mask(num_tokens, pad_size, tp_size, tp_rank, moe_type): - if moe_type in {MoeType.MC2, MoeType.ALLTOALL}: - x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) - else: - return None - # if pad_size > 0: - # x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) - # if tp_size > 1: - # split_x_active_mask = torch.tensor_split(x_active_mask, tp_size, dim=0) - # x_active_mask = split_x_active_mask[tp_rank] - return x_active_mask - q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, @@ -400,14 +383,11 @@ def get_x_active_mask(num_tokens, pad_size, tp_size, tp_rank, moe_type): step_context.attn_metadata = attn_metadata cls.dist_meta = get_dist_meta() - num_tokens, max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, - cls.dist_meta.tp_size, - cls.dist_meta.ep_size, - cls.dist_meta.ep_group) + max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, + cls.dist_meta.ep_size, cls.dist_meta.ep_group) moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) mlp_meta_cls = cls.get_mlp_metadata_cls() - cls.max_tokens_accros_dp = max_tokens_across_dp mlp_metadata = mlp_meta_cls( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, From 74997d3d11d5450e131ce2d040ca43ecf160bc0f Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 15 Jan 2026 02:52:01 +0000 Subject: [PATCH 15/25] good dp*tp+ep feature --- lmdeploy/pytorch/backends/dlinfer/moe.py | 8 +++++++- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 4 +++- 2 files 
changed, 10 insertions(+), 2 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 10aab3c213..173f955c56 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -64,6 +64,11 @@ def __init__(self, self.renormalize = renormalize self.ep_size = ep_size self.ep_group = ep_group + self.expert_ids_per_ep_rank = torch.tensor( + [i % (self.num_experts // self.ep_size) for i in range(num_experts)], + dtype=torch.int32, + device=torch.npu.current_device(), + ) def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" @@ -96,10 +101,11 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None + return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, self.renormalize, mlp_metadata.pad_size, mlp_metadata.tp_size, mlp_metadata.ep_size, mlp_metadata.tp_rank, mlp_metadata.ep_rank, mlp_metadata.tp_group, mlp_metadata.ep_group, - mlp_metadata.moe_type, mlp_metadata.x_active_mask) + mlp_metadata.moe_type, mlp_metadata.x_active_mask, self.expert_ids_per_ep_rank) class DlinferFusedMoEBuilder(FusedMoEBuilder[DlinferMLPMetadata]): diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index e0cea0b503..ea94ebd671 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -22,7 +22,9 @@ def fused_moe( ep_group: dist.ProcessGroup, moe_type: MoeType, x_active_mask: Tensor, + expert_ids_per_ep_rank: Tensor, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, - pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask) + pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask, + expert_ids_per_ep_rank) 
From 40044668d747879f6b3ed8a797478f93d6d066a4 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 16 Jan 2026 03:39:20 +0000 Subject: [PATCH 16/25] fix tp err --- lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index ab57dc97bb..3b85ab1cc9 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -281,7 +281,7 @@ def get_dist_meta(): def get_tokens_info(dp_size, tp_size, ep_size, ep_group): if ep_size <= 1: - return 0, 0, 0, None + return 0, 0, 0 # get runtime num_tokens is_graph = cls.enable_graph and step_context.is_decoding if is_graph: From 1166d22ae4d0146bb27d9912811504ea4f654da3 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 22 Jan 2026 11:39:31 +0000 Subject: [PATCH 17/25] update pad_size --- .../backends/dlinfer/ascend/op_backend.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 3b85ab1cc9..fb4cf03026 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import itertools +import math import os import re from dataclasses import dataclass @@ -304,12 +305,24 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group): max_tokens_across_dp = torch.max(num_tokens_buffer).item() else: max_tokens_across_dp = num_tokens - # get pad_size - paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size - pad_size = paded_size - num_tokens + # # get pad_size + # paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size + # pad_size = paded_size - num_tokens # get x_active_mask x_active_mask = torch.ones(tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - return max_tokens_across_dp, pad_size, x_active_mask + return num_tokens, max_tokens_across_dp, x_active_mask + + def get_pad_size(num_tokens, max_tokens_across_dp, tp_size, moe_type): + if moe_type == MoeType.MC2: + paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size + pad_size = paded_size - num_tokens + elif moe_type == MoeType.ALLTOALL: + pad_size = tp_size - num_tokens + elif moe_type == MoeType.ALLGATHER: + pad_size = max_tokens_across_dp - num_tokens + else: + pad_size = 0 + return pad_size @lru_cache def init_mc2_token_capacity(tp_size): @@ -383,10 +396,11 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): step_context.attn_metadata = attn_metadata cls.dist_meta = get_dist_meta() - max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, - cls.dist_meta.ep_size, cls.dist_meta.ep_group) + num_tokens, max_tokens_across_dp, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, + cls.dist_meta.ep_size, cls.dist_meta.ep_group) moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) + pad_size = get_pad_size(num_tokens, max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) mlp_meta_cls = cls.get_mlp_metadata_cls() mlp_metadata = mlp_meta_cls( 
max_tokens_across_dp=max_tokens_across_dp, From fec438f36379159bc9b2a17010156224ddc3ccf5 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 23 Jan 2026 10:40:57 +0000 Subject: [PATCH 18/25] optimize ep moe --- .../backends/dlinfer/ascend/op_backend.py | 72 ++++++++++--------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index fb4cf03026..9ec6a5f74f 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -283,46 +283,30 @@ def get_dist_meta(): def get_tokens_info(dp_size, tp_size, ep_size, ep_group): if ep_size <= 1: return 0, 0, 0 - # get runtime num_tokens + # get runtime_tokens_current_rank is_graph = cls.enable_graph and step_context.is_decoding if is_graph: from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size - tokens_current_rank = step_context.q_seqlens.shape[0] - num_tokens = min(get_ascend_compatible_size(tokens_current_rank), cls.max_batches) + actual_tokens_current_rank = step_context.q_seqlens.shape[0] + runtime_tokens_current_rank = min(get_ascend_compatible_size(actual_tokens_current_rank), + cls.max_batches) else: - tokens_current_rank = step_context.q_seqlens.sum().item() - num_tokens = tokens_current_rank + actual_tokens_current_rank = step_context.q_seqlens.sum().item() + runtime_tokens_current_rank = actual_tokens_current_rank # get max_tokens_across_dp if dp_size > 1: - num_tokens_tensor = torch.tensor([num_tokens], - dtype=step_context.q_seqlens.dtype, - device=torch.npu.current_device()) + runtime_tokens_tensor = torch.tensor([runtime_tokens_current_rank], + dtype=step_context.q_seqlens.dtype, + device=torch.npu.current_device()) world_size = dp_size * tp_size - num_tokens_buffer = torch.zeros([world_size], - dtype=step_context.q_seqlens.dtype, - device=torch.npu.current_device()) - 
dist.all_gather_into_tensor(num_tokens_buffer, num_tokens_tensor, ep_group) - max_tokens_across_dp = torch.max(num_tokens_buffer).item() + runtime_tokens_buffer = torch.zeros([world_size], + dtype=step_context.q_seqlens.dtype, + device=torch.npu.current_device()) + dist.all_gather_into_tensor(runtime_tokens_buffer, runtime_tokens_tensor, ep_group) + max_tokens_across_dp = torch.max(runtime_tokens_buffer).item() else: - max_tokens_across_dp = num_tokens - # # get pad_size - # paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size - # pad_size = paded_size - num_tokens - # get x_active_mask - x_active_mask = torch.ones(tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - return num_tokens, max_tokens_across_dp, x_active_mask - - def get_pad_size(num_tokens, max_tokens_across_dp, tp_size, moe_type): - if moe_type == MoeType.MC2: - paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size - pad_size = paded_size - num_tokens - elif moe_type == MoeType.ALLTOALL: - pad_size = tp_size - num_tokens - elif moe_type == MoeType.ALLGATHER: - pad_size = max_tokens_across_dp - num_tokens - else: - pad_size = 0 - return pad_size + max_tokens_across_dp = runtime_tokens_current_rank + return actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp @lru_cache def init_mc2_token_capacity(tp_size): @@ -351,6 +335,23 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') + def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, tp_size, + moe_type): + x_active_mask = None + if moe_type == MoeType.MC2: + paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size + pad_size = paded_size - runtime_tokens_current_rank + x_active_mask = torch.ones(actual_tokens_current_rank, + dtype=torch.bool, + device=torch.npu.current_device()) + elif moe_type == MoeType.ALLTOALL: + pad_size = 
tp_size - runtime_tokens_current_rank + elif moe_type == MoeType.ALLGATHER: + pad_size = max_tokens_across_dp - runtime_tokens_current_rank + else: + pad_size = 0 + return pad_size, x_active_mask + q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, @@ -396,11 +397,12 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): step_context.attn_metadata = attn_metadata cls.dist_meta = get_dist_meta() - num_tokens, max_tokens_across_dp, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, - cls.dist_meta.ep_size, cls.dist_meta.ep_group) + actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp = get_tokens_info( + cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size, cls.dist_meta.ep_group) moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) - pad_size = get_pad_size(num_tokens, max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) + pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, + max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) mlp_meta_cls = cls.get_mlp_metadata_cls() mlp_metadata = mlp_meta_cls( max_tokens_across_dp=max_tokens_across_dp, From d7177a105c55e0896e240304c11fa385a9ae40fc Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 28 Jan 2026 02:33:06 +0000 Subject: [PATCH 19/25] opt ep moe --- .../pytorch/backends/dlinfer/ascend/op_backend.py | 11 +++++++++++ lmdeploy/pytorch/backends/dlinfer/moe.py | 4 +++- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 9ec6a5f74f..26dbf5b172 100644 --- 
a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -352,6 +352,15 @@ def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_to pad_size = 0 return pad_size, x_active_mask + @lru_cache(maxsize=1) + def get_moe_group_name(group): + if group is None: + return None + local_rank = torch.distributed.get_rank(group=group) + backend = group._get_backend(torch.device('npu')) + group_name = backend.get_hccl_comm_name(local_rank) + return group_name + q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, @@ -403,6 +412,7 @@ def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_to cls.dist_meta.ep_size) pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) + moe_group_name = get_moe_group_name(cls.dist_meta.ep_group) mlp_meta_cls = cls.get_mlp_metadata_cls() mlp_metadata = mlp_meta_cls( max_tokens_across_dp=max_tokens_across_dp, @@ -416,6 +426,7 @@ def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_to ep_group=cls.dist_meta.ep_group, moe_type=moe_type, x_active_mask=x_active_mask, + moe_group_name=moe_group_name, ) step_context.mlp_metadata = mlp_metadata return step_context diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 173f955c56..fc12c1679a 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -23,6 +23,7 @@ class DlinferMLPMetadata(MLPMetadata): ep_group: torch.distributed.ProcessGroup = None moe_type: MoeType = MoeType.UNDEFINED x_active_mask: torch.Tensor = None + moe_group_name: str = None class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl[DlinferMLPMetadata]): @@ 
-105,7 +106,8 @@ def forward(self, return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, self.renormalize, mlp_metadata.pad_size, mlp_metadata.tp_size, mlp_metadata.ep_size, mlp_metadata.tp_rank, mlp_metadata.ep_rank, mlp_metadata.tp_group, mlp_metadata.ep_group, - mlp_metadata.moe_type, mlp_metadata.x_active_mask, self.expert_ids_per_ep_rank) + mlp_metadata.moe_type, mlp_metadata.x_active_mask, mlp_metadata.moe_group_name, + self.expert_ids_per_ep_rank) class DlinferFusedMoEBuilder(FusedMoEBuilder[DlinferMLPMetadata]): diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index ea94ebd671..decce941ac 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -22,9 +22,10 @@ def fused_moe( ep_group: dist.ProcessGroup, moe_type: MoeType, x_active_mask: Tensor, + moe_group_name: str, expert_ids_per_ep_rank: Tensor, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask, - expert_ids_per_ep_rank) + moe_group_name, expert_ids_per_ep_rank) From 1d3325e206c09378e2efdefbbfc0dfa3be2a9f20 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 28 Jan 2026 07:56:14 +0000 Subject: [PATCH 20/25] fix ascend dptp --- lmdeploy/pytorch/nn/moe/base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/nn/moe/base.py b/lmdeploy/pytorch/nn/moe/base.py index 2ae9db24fd..dc7d7ab8a0 100644 --- a/lmdeploy/pytorch/nn/moe/base.py +++ b/lmdeploy/pytorch/nn/moe/base.py @@ -135,7 +135,8 @@ def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, topk_weights: to cur_out = self.gemm_func(hidden_states, topk_weights, topk_ids) return self.reduce_scatter(cur_out, output_states, tp_sizes) - def forward(self, hidden_states: 
torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor): + def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + mlp_metadata: MLPMetadata): """forward.""" def __slice_tensor(tensor: torch.Tensor, slice_size: int): @@ -177,6 +178,7 @@ def __slice_and_gather(): # pre cur_inputs = __slice_and_gather() + cur_inputs.update(dict(mlp_metadata=mlp_metadata)) out_handles = [] # main loop @@ -185,6 +187,7 @@ def __slice_and_gather(): _, handle = self._gemm_and_reduce_scatter(**cur_inputs) out_handles.append(handle) cur_inputs = next_inputs + cur_inputs.update(dict(mlp_metadata=mlp_metadata)) # post _, handle = self._gemm_and_reduce_scatter(**cur_inputs) @@ -259,12 +262,13 @@ def init_dist_args(self, all_reduce: bool): if self.tp > 1 and self.tp_mode == TPMode.DP_TP: - def __gemm_func(hidden_states, topk_weights, topk_ids): + def __gemm_func(hidden_states, topk_weights, topk_ids, mlp_metadata): return self.gemm( dict( hidden_states=hidden_states, topk_weights=topk_weights, topk_idx=topk_ids, + mlp_metadata=mlp_metadata, moe_type=MoeType.Default, ))['hidden_states'] @@ -319,7 +323,7 @@ def forward(self, mlp_metadata: MLPMetadata = None): """forward.""" if self.tp > 1 and self.tp_mode == TPMode.DP_TP: - return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx) + return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx, mlp_metadata) else: return self.forward_default(hidden_states, topk_weights, topk_idx, mlp_metadata) From b1f94e43673257fb9f237f0481d9208c12765118 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 30 Jan 2026 03:48:41 +0000 Subject: [PATCH 21/25] refactor code --- .../backends/dlinfer/ascend/op_backend.py | 8 ++--- lmdeploy/pytorch/backends/dlinfer/moe.py | 30 ++++++++++--------- .../pytorch/backends/dlinfer/op_backend.py | 5 ---- lmdeploy/pytorch/backends/moe.py | 20 ++++--------- lmdeploy/pytorch/model_inputs.py | 1 - lmdeploy/pytorch/models/qwen3_moe.py | 13 ++------ 
lmdeploy/pytorch/models/qwen3_vl.py | 6 ---- lmdeploy/pytorch/models/qwen3_vl_moe.py | 2 -- lmdeploy/pytorch/nn/moe/base.py | 27 +++++------------ lmdeploy/pytorch/nn/moe/default.py | 2 -- 10 files changed, 35 insertions(+), 79 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 26dbf5b172..f303e7a63d 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -16,7 +16,7 @@ from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger -from ..moe import MoeType +from ..moe import MOEMetadata, MoeType from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') @@ -413,8 +413,8 @@ def get_moe_group_name(group): pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) moe_group_name = get_moe_group_name(cls.dist_meta.ep_group) - mlp_meta_cls = cls.get_mlp_metadata_cls() - mlp_metadata = mlp_meta_cls( + + moe_metadata = MOEMetadata( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, dp_size=cls.dist_meta.dp_size, @@ -428,7 +428,7 @@ def get_moe_group_name(group): x_active_mask=x_active_mask, moe_group_name=moe_group_name, ) - step_context.mlp_metadata = mlp_metadata + step_context.moe_metadata = moe_metadata return step_context @staticmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index fc12c1679a..aa4331a529 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -6,12 +6,13 @@ import torch from lmdeploy.pytorch.kernels.dlinfer import MoeType, fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.model_inputs import get_step_ctx_manager -from ..moe import FusedMoEBuilder, FusedMoEImpl, MLPMetadata, SoftmaxTopKBuilder, SoftmaxTopKImpl +from ..moe 
import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl @dataclass -class DlinferMLPMetadata(MLPMetadata): +class MOEMetadata: max_tokens_across_dp: int = 1 pad_size: int = 0 dp_size: int = 1 @@ -26,7 +27,7 @@ class DlinferMLPMetadata(MLPMetadata): moe_group_name: str = None -class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl[DlinferMLPMetadata]): +class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): """Dlinfer softmax topk implementation.""" def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): @@ -35,14 +36,15 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): if n_groups != -1: raise NotImplementedError('Group router not supported') - def forward(self, x: torch.Tensor, mlp_metada: DlinferMLPMetadata): - routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, mlp_metada.max_tokens_across_dp, - mlp_metada.pad_size, mlp_metada.tp_size, - mlp_metada.ep_size, mlp_metada.tp_rank) + def forward(self, x: torch.Tensor): + moe_metadata = get_step_ctx_manager().current_context().moe_metadata + routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, moe_metadata.max_tokens_across_dp, + moe_metadata.pad_size, moe_metadata.tp_size, + moe_metadata.ep_size, moe_metadata.tp_rank) return routing_weights, selected_experts -class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder[DlinferMLPMetadata]): +class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder): """Dlinfer softmax topk implementation builder.""" @staticmethod @@ -51,7 +53,7 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): return DlinferSoftmaxTopKImpl(top_k, dim, n_groups) -class DlinferFusedMoEImpl(FusedMoEImpl[DlinferMLPMetadata]): +class DlinferFusedMoEImpl(FusedMoEImpl): """Dlinfer fused moe implementation.""" def __init__(self, @@ -94,7 +96,6 @@ def forward(self, topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, down_weights: torch.Tensor, - mlp_metadata: DlinferMLPMetadata, gate_up_bias: torch.Tensor = None, down_bias: 
torch.Tensor = None, expert_list: List[int] = None, @@ -102,15 +103,16 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None + moe_metadata = get_step_ctx_manager().current_context().moe_metadata return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, mlp_metadata.pad_size, mlp_metadata.tp_size, mlp_metadata.ep_size, - mlp_metadata.tp_rank, mlp_metadata.ep_rank, mlp_metadata.tp_group, mlp_metadata.ep_group, - mlp_metadata.moe_type, mlp_metadata.x_active_mask, mlp_metadata.moe_group_name, + self.renormalize, moe_metadata.pad_size, moe_metadata.tp_size, moe_metadata.ep_size, + moe_metadata.tp_rank, moe_metadata.ep_rank, moe_metadata.tp_group, moe_metadata.ep_group, + moe_metadata.moe_type, moe_metadata.x_active_mask, moe_metadata.moe_group_name, self.expert_ids_per_ep_rank) -class DlinferFusedMoEBuilder(FusedMoEBuilder[DlinferMLPMetadata]): +class DlinferFusedMoEBuilder(FusedMoEBuilder): """Dlinfer fused moe builder.""" @staticmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 0e39907f85..16eb604ccd 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -67,11 +67,6 @@ def get_attention_metadata_cls(): from .attention import DlinferAttentionMetadata return DlinferAttentionMetadata - @staticmethod - def get_mlp_metadata_cls(): - from .moe import DlinferMLPMetadata - return DlinferMLPMetadata - @staticmethod def get_k_block_shape( block_size: int, diff --git a/lmdeploy/pytorch/backends/moe.py b/lmdeploy/pytorch/backends/moe.py index ea945b7550..5b33b97da7 100644 --- a/lmdeploy/pytorch/backends/moe.py +++ b/lmdeploy/pytorch/backends/moe.py @@ -1,23 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import functools from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import Callable, Generic, List, Optional, TypeVar +from typing import Callable, List, Optional import torch import torch.distributed as dist -@dataclass -class MLPMetadata: - """Base MLP metadata.""" - ... - - -T = TypeVar('T', bound=MLPMetadata) - - -class SoftmaxTopKImpl(ABC, Generic[T]): +class SoftmaxTopKImpl(ABC): """Softmax topk implementation api.""" @staticmethod @@ -32,7 +22,7 @@ def forward(self, x: torch.Tensor): raise NotImplementedError -class SoftmaxTopKBuilder(ABC, Generic[T]): +class SoftmaxTopKBuilder(ABC): """Softmax topk implementation builder.""" @staticmethod @@ -42,7 +32,7 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): raise NotImplementedError -class FusedMoEImpl(ABC, Generic[T]): +class FusedMoEImpl(ABC): """Fused moe implementation.""" def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): @@ -68,7 +58,7 @@ def forward(self, raise NotImplementedError -class FusedMoEBuilder(ABC, Generic[T]): +class FusedMoEBuilder(ABC): """Fused moe builder.""" @staticmethod diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 8e351428c1..80e0540e6d 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -350,7 +350,6 @@ class StepContext: input_multimodals: List[MultiModalTensor] = None vision_inputs: VisionModelInputs = None attn_metadata: Any = None - mlp_metadata: Any = None cross_seqlens: torch.LongTensor = None cross_kv_seqlens: torch.LongTensor = None cross_attn_metadata: Any = None diff --git a/lmdeploy/pytorch/models/qwen3_moe.py b/lmdeploy/pytorch/models/qwen3_moe.py index 66076de6dd..9d50cbb86b 100644 --- a/lmdeploy/pytorch/models/qwen3_moe.py +++ b/lmdeploy/pytorch/models/qwen3_moe.py @@ -225,13 +225,12 @@ def forward( self, hidden_states: torch.Tensor, all_routed_experts: torch.Tensor = None, - mlp_metadata: Any = None, ): """forward.""" 
batch_size, sequence_length, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) router_logits = self.gate(hidden_states) - topk_weights, topk_ids = self.softmax_topk(router_logits, mlp_metadata) + topk_weights, topk_ids = self.softmax_topk(router_logits) if all_routed_experts is not None: all_routed_experts[:, self.layer_idx, :] = topk_ids if get_dist_manager().current_context().dist_config.enable_eplb: @@ -240,7 +239,6 @@ def forward( hidden_states, topk_weights, topk_ids, - mlp_metadata, ) out_states = out_states.reshape(batch_size, sequence_length, -1) @@ -286,7 +284,6 @@ def forward( past_key_value: Optional[List[torch.FloatTensor]], residual: Optional[torch.Tensor] = None, attn_metadata: Any = None, - mlp_metadata: Any = None, all_routed_experts: torch.Tensor = None, ): @@ -306,7 +303,7 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states, all_routed_experts=all_routed_experts, mlp_metadata=mlp_metadata) + hidden_states = self.mlp(hidden_states, all_routed_experts=all_routed_experts) outputs = (hidden_states, residual) return outputs @@ -352,7 +349,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, all_routed_experts: torch.Tensor = None, ): @@ -379,7 +375,6 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, all_routed_experts=all_routed_experts, ) @@ -435,7 +430,6 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, ): @@ -456,7 +450,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, - 
mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, all_routed_experts=all_routed_experts, ) @@ -483,7 +476,6 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata - mlp_metadata = context.mlp_metadata # process vision embeddings vision_embeddings = context.input_embeddings @@ -499,7 +491,6 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, ) diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py index 5d81c8c327..60c3617ffe 100644 --- a/lmdeploy/pytorch/models/qwen3_vl.py +++ b/lmdeploy/pytorch/models/qwen3_vl.py @@ -102,7 +102,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack @@ -144,7 +143,6 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, ) # add visual features to the hidden states of first several layers @@ -499,7 +497,6 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, pixel_values: torch.Tensor = None, @@ -544,7 +541,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, # args for deepstack @@ -578,7 +574,6 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata - mlp_metadata = 
context.mlp_metadata pixel_values = None vis_cu_seqlens = None @@ -619,7 +614,6 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, pixel_values=pixel_values, diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py index 7586db3d11..1dc7e32de9 100644 --- a/lmdeploy/pytorch/models/qwen3_vl_moe.py +++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py @@ -33,7 +33,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack @@ -75,7 +74,6 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, ) # add visual features to the hidden states of first several layers diff --git a/lmdeploy/pytorch/nn/moe/base.py b/lmdeploy/pytorch/nn/moe/base.py index dc7d7ab8a0..484dbbe492 100644 --- a/lmdeploy/pytorch/nn/moe/base.py +++ b/lmdeploy/pytorch/nn/moe/base.py @@ -8,7 +8,6 @@ import lmdeploy.pytorch.distributed as dist from lmdeploy.pytorch.backends import OpType, get_backend -from lmdeploy.pytorch.backends.moe import MLPMetadata from lmdeploy.pytorch.config import TPMode from lmdeploy.pytorch.distributed import get_dist_manager, get_tp_world_rank from lmdeploy.pytorch.model_inputs import get_step_ctx_manager @@ -30,9 +29,9 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): impl_builder = get_backend().get_layer_impl_builder(OpType.SoftmaxTopK) self.impl = impl_builder.build(top_k, dim, n_groups=n_groups) - def forward(self, x: torch.Tensor, mlp_metadata: MLPMetadata): + def forward(self, x: torch.Tensor): """forward.""" - return self.impl.forward(x, 
mlp_metadata) + return self.impl.forward(x) def update_dims(hidden_dim: int, ffn_dim: int): @@ -135,8 +134,7 @@ def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, topk_weights: to cur_out = self.gemm_func(hidden_states, topk_weights, topk_ids) return self.reduce_scatter(cur_out, output_states, tp_sizes) - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - mlp_metadata: MLPMetadata): + def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor): """forward.""" def __slice_tensor(tensor: torch.Tensor, slice_size: int): @@ -178,7 +176,6 @@ def __slice_and_gather(): # pre cur_inputs = __slice_and_gather() - cur_inputs.update(dict(mlp_metadata=mlp_metadata)) out_handles = [] # main loop @@ -187,7 +184,6 @@ def __slice_and_gather(): _, handle = self._gemm_and_reduce_scatter(**cur_inputs) out_handles.append(handle) cur_inputs = next_inputs - cur_inputs.update(dict(mlp_metadata=mlp_metadata)) # post _, handle = self._gemm_and_reduce_scatter(**cur_inputs) @@ -262,13 +258,12 @@ def init_dist_args(self, all_reduce: bool): if self.tp > 1 and self.tp_mode == TPMode.DP_TP: - def __gemm_func(hidden_states, topk_weights, topk_ids, mlp_metadata): + def __gemm_func(hidden_states, topk_weights, topk_ids): return self.gemm( dict( hidden_states=hidden_states, topk_weights=topk_weights, topk_idx=topk_ids, - mlp_metadata=mlp_metadata, moe_type=MoeType.Default, ))['hidden_states'] @@ -301,8 +296,7 @@ def forward_dptp(self) -> MoEForwardDPTP: """Forward dptp.""" return self._forward_dptp - def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor, - mlp_metadata: MLPMetadata): + def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor): """Default forward.""" state = { 'hidden_states': hidden_states, @@ -311,21 +305,16 @@ def forward_default(self, hidden_states: torch.Tensor, 
topk_weights: torch.Tenso 'moe_type': MoeType.Default, } recv_state = self.dispatch(state) - recv_state.update({'mlp_metadata': mlp_metadata}) gemm_state = self.gemm(recv_state) out_state = self.combine(gemm_state) return out_state['hidden_states'] - def forward(self, - hidden_states: torch.Tensor, - topk_weights: torch.Tensor, - topk_idx: torch.LongTensor, - mlp_metadata: MLPMetadata = None): + def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor): """forward.""" if self.tp > 1 and self.tp_mode == TPMode.DP_TP: - return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx, mlp_metadata) + return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx) else: - return self.forward_default(hidden_states, topk_weights, topk_idx, mlp_metadata) + return self.forward_default(hidden_states, topk_weights, topk_idx) def renormalize(self, topk_weights): """renormalize.""" diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 358674a466..0633aa001a 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -298,14 +298,12 @@ def gemm(self, state: Dict): hidden_states = state['hidden_states'] topk_weights = state['topk_weights'] topk_ids = state['topk_idx'] - mlp_metadata = state['mlp_metadata'] hidden_states = self.impl.forward(hidden_states, topk_weights, topk_ids, self.gate_up.weight, self.down.weight, - mlp_metadata, self.gate_up.bias, self.down.bias, self.expert_list, From 2f7710807c1e775e138af62d2fc5e38cb0984ae5 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 5 Feb 2026 09:10:29 +0000 Subject: [PATCH 22/25] refactor code --- docker/Dockerfile_ascend_a3 | 3 +- .../backends/dlinfer/ascend/op_backend.py | 21 ++++---- lmdeploy/pytorch/backends/dlinfer/moe.py | 50 +++++++------------ lmdeploy/pytorch/kernels/dlinfer/__init__.py | 5 +- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 19 ++----- .../dlinfer/moe_gating_topk_softmax.py | 8 
+-- requirements/runtime_ascend.txt | 6 +-- 7 files changed, 43 insertions(+), 69 deletions(-) diff --git a/docker/Dockerfile_ascend_a3 b/docker/Dockerfile_ascend_a3 index d8fc152ed1..1d8064a129 100644 --- a/docker/Dockerfile_ascend_a3 +++ b/docker/Dockerfile_ascend_a3 @@ -4,7 +4,7 @@ ARG ASCEND_DEVICE_TYPE=ascend_a3 ARG ASCEND_HUB=swr.cn-south-1.myhuaweicloud.com/ascendhub -FROM ${ASCEND_HUB}/cann:8.3.rc1-a3-openeuler24.03-py3.11 AS ascend_a3_base +FROM ${ASCEND_HUB}/cann:8.5.0-a3-openeuler24.03-py3.11 AS ascend_a3_base FROM ${ASCEND_DEVICE_TYPE}_base AS builder ENV DEBIAN_FRONTEND=noninteractive @@ -22,6 +22,5 @@ ARG LMDEPLOY_TAG=main RUN --mount=type=cache,target=/root/.cache \ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \ - pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0 torchvision==0.23.0 && \ TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \ LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG} diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index f303e7a63d..ffa3da6cb4 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -16,7 +16,7 @@ from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger -from ..moe import MOEMetadata, MoeType +from ..moe import DlinferMoeMetada, DlinferMoeType from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') @@ -316,7 +316,7 @@ def init_mc2_token_capacity(tp_size): def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): if ep_size <= 1: - return MoeType.ALLGATHER + return DlinferMoeType.ALLGATHER mc2_token_capacity = init_mc2_token_capacity(tp_size) is_graph = cls.enable_graph 
and step_context.is_decoding if is_graph: @@ -324,29 +324,29 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size if SocVersion.is_A2(): if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16: - return MoeType.MC2 + return DlinferMoeType.MC2 else: - return MoeType.ALLGATHER + return DlinferMoeType.ALLGATHER elif SocVersion.is_A3(): if max_tokens_across_dp <= mc2_token_capacity: - return MoeType.MC2 + return DlinferMoeType.MC2 else: - return MoeType.ALLTOALL + return DlinferMoeType.ALLTOALL else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, tp_size, moe_type): x_active_mask = None - if moe_type == MoeType.MC2: + if moe_type == DlinferMoeType.MC2: paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size pad_size = paded_size - runtime_tokens_current_rank x_active_mask = torch.ones(actual_tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - elif moe_type == MoeType.ALLTOALL: + elif moe_type == DlinferMoeType.ALLTOALL: pad_size = tp_size - runtime_tokens_current_rank - elif moe_type == MoeType.ALLGATHER: + elif moe_type == DlinferMoeType.ALLGATHER: pad_size = max_tokens_across_dp - runtime_tokens_current_rank else: pad_size = 0 @@ -414,7 +414,7 @@ def get_moe_group_name(group): max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) moe_group_name = get_moe_group_name(cls.dist_meta.ep_group) - moe_metadata = MOEMetadata( + moe_metadata = DlinferMoeMetada( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, dp_size=cls.dist_meta.dp_size, @@ -464,7 +464,6 @@ def device_count(): @staticmethod def support_ray(): """Support ray.""" - # return False if not _envs.ascend_set_rt_visable_devices_by_ray: os.environ['RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES'] = '1' return True diff --git 
a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index aa4331a529..a347dba27b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,32 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -from dataclasses import dataclass from typing import Callable, List import torch -from lmdeploy.pytorch.kernels.dlinfer import MoeType, fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetada # noqa: F401 +from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeType # noqa: F401 +from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax from lmdeploy.pytorch.model_inputs import get_step_ctx_manager from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl -@dataclass -class MOEMetadata: - max_tokens_across_dp: int = 1 - pad_size: int = 0 - dp_size: int = 1 - tp_size: int = 1 - ep_size: int = 1 - tp_rank: int = 0 - ep_rank: int = 0 - tp_group: torch.distributed.ProcessGroup = None - ep_group: torch.distributed.ProcessGroup = None - moe_type: MoeType = MoeType.UNDEFINED - x_active_mask: torch.Tensor = None - moe_group_name: str = None - - class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): """Dlinfer softmax topk implementation.""" @@ -37,10 +22,9 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): raise NotImplementedError('Group router not supported') def forward(self, x: torch.Tensor): - moe_metadata = get_step_ctx_manager().current_context().moe_metadata - routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, moe_metadata.max_tokens_across_dp, - moe_metadata.pad_size, moe_metadata.tp_size, - moe_metadata.ep_size, moe_metadata.tp_rank) + step_context = get_step_ctx_manager().current_context() + moe_metadata = getattr(step_context, 'moe_metadata', None) + routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, moe_metadata) return 
routing_weights, selected_experts @@ -67,11 +51,13 @@ def __init__(self, self.renormalize = renormalize self.ep_size = ep_size self.ep_group = ep_group - self.expert_ids_per_ep_rank = torch.tensor( - [i % (self.num_experts // self.ep_size) for i in range(num_experts)], - dtype=torch.int32, - device=torch.npu.current_device(), - ) + self.expert_ids_per_ep_rank = None + if self.ep_size > 1: + self.expert_ids_per_ep_rank = torch.tensor( + [i % (self.num_experts // self.ep_size) for i in range(num_experts)], + dtype=torch.int32, + device=torch.cuda.current_device(), + ) def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" @@ -103,13 +89,13 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None - moe_metadata = get_step_ctx_manager().current_context().moe_metadata + step_context = get_step_ctx_manager().current_context() + moe_metadata = getattr(step_context, 'moe_metadata', None) + if moe_metadata is not None: + moe_metadata.expert_ids_per_ep_rank = self.expert_ids_per_ep_rank return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, moe_metadata.pad_size, moe_metadata.tp_size, moe_metadata.ep_size, - moe_metadata.tp_rank, moe_metadata.ep_rank, moe_metadata.tp_group, moe_metadata.ep_group, - moe_metadata.moe_type, moe_metadata.x_active_mask, moe_metadata.moe_group_name, - self.expert_ids_per_ep_rank) + self.renormalize, moe_metadata) class DlinferFusedMoEBuilder(FusedMoEBuilder): diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index f9ea874ae5..834de084df 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -4,7 +4,7 @@ from .awq_kernels import awq_linear from .fill_kv_cache import fill_kv_cache from .flash_attention import flash_attention_fwd -from .fused_moe import MoeType, fused_moe +from .fused_moe import 
DlinferMoeMetada, DlinferMoeType, fused_moe from .linear import linear from .moe_gating_topk_softmax import moe_gating_topk_softmax from .pagedattention import paged_attention_fwd @@ -15,7 +15,8 @@ 'apply_rotary_pos_emb', 'awq_linear', 'fill_kv_cache', - 'MoeType', + 'DlinferMoeType', + 'DlinferMoeMetada', 'fused_moe', 'paged_attention_fwd', 'flash_attention_fwd', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index decce941ac..7ff6bbccf0 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops -import torch.distributed as dist -from dlinfer.utils.type_annotation import MoeType +from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada +from dlinfer.utils.type_annotation import MoeType as DlinferMoeType # noqa: F401 from torch import Tensor @@ -13,19 +13,8 @@ def fused_moe( topk_ids: Tensor, topk: int, renormalize: bool, - pad_size: int, - tp_size: int, - ep_size: int, - tp_rank: int, - ep_rank: int, - tp_group: dist.ProcessGroup, - ep_group: dist.ProcessGroup, - moe_type: MoeType, - x_active_mask: Tensor, - moe_group_name: str, - expert_ids_per_ep_rank: Tensor, + moe_metadata: DlinferMoeMetada, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, - pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask, - moe_group_name, expert_ids_per_ep_rank) + moe_metadata) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index db71e87787..c991595041 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Any + import dlinfer.ops as ext_ops from torch import Tensor -def moe_gating_topk_softmax(router_logits: Tensor, topk: int, max_tokens_across_dp: int, pad_size: int, tp_size: int, - ep_size: int, tp_rank: int) -> tuple[Tensor, Tensor]: - routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, max_tokens_across_dp, - pad_size, tp_size, ep_size, tp_rank) +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: Any) -> tuple[Tensor, Tensor]: + routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt index d94a38d0bf..22d1ca8418 100644 --- a/requirements/runtime_ascend.txt +++ b/requirements/runtime_ascend.txt @@ -22,9 +22,9 @@ safetensors sentencepiece shortuuid tiktoken -torch>=2.3.1,<2.9.0 -torch-npu>=2.3.1,<2.9.0 -torchvision>=0.18.1,<0.24.0 +torch>=2.3.1,<2.10.0 +torch-npu>=2.3.1,<2.10.0 +torchvision>=0.18.1,<0.25.0 transformers uvicorn xgrammar From 07eaf3f71c633e762914ecb2f2e71a90bc9b45b6 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 5 Feb 2026 10:58:41 +0000 Subject: [PATCH 23/25] remove useless code --- .../pytorch/backends/dlinfer/ascend/op_backend.py | 2 -- lmdeploy/pytorch/engine/executor/ray_executor.py | 3 --- .../kernels/dlinfer/moe_gating_topk_softmax.py | 5 ++--- lmdeploy/pytorch/models/deepseek_v2.py | 14 -------------- 4 files changed, 2 insertions(+), 22 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index ffa3da6cb4..0455bf50da 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -118,11 +118,9 @@ def set_value(cls, device: str, dtype: torch.dtype, record_file: str, total_laye class AscendOpsBackend(DlinferOpsBackend): """Ascend layer 
backend.""" enable_graph: bool = False - half_negative_inf: float = torch.finfo(torch.float16).min total_slots = None max_batches = None dist_meta: DistMeta = None - graph_capture_sizes = None @staticmethod def get_name() -> str: diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py index b0a4219a46..e4b4fbac2a 100644 --- a/lmdeploy/pytorch/engine/executor/ray_executor.py +++ b/lmdeploy/pytorch/engine/executor/ray_executor.py @@ -286,9 +286,6 @@ def __init__( self._prefetch_task: asyncio.Task = None self.remote_outs: asyncio.Queue = None - rank_offset = dist_config.dp_rank * attn_tp - self.rank_offset = rank_offset - logger.info('Init distributed environment by device.') self.rank_offset = dist_config.dp_rank * attn_tp self._init_distributed_environment_by_device(device_type) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index c991595041..68d7de7fe2 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,10 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Any - import dlinfer.ops as ext_ops +from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada from torch import Tensor -def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: Any) -> tuple[Tensor, Tensor]: +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: DlinferMoeMetada) -> tuple[Tensor, Tensor]: routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py index 5e781f8034..4db550eb8d 100644 --- a/lmdeploy/pytorch/models/deepseek_v2.py +++ b/lmdeploy/pytorch/models/deepseek_v2.py @@ -1185,14 +1185,10 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di if weight_name not in name: continue name = name.replace(weight_name, param_name) - if name not in params_dict.keys(): - continue param = params_dict[name] load_weight(param, loaded_weight, expert_id=expert_id, shard_id=shard_id) break else: - if name not in params_dict.keys(): - return param = params_dict[name] load_weight(param, loaded_weight) @@ -1223,8 +1219,6 @@ def __load_kcvc(name: str, weight: torch.Tensor): dim=1) w_vc = w_vc.transpose(1, 2).contiguous() kc_param_name = name.replace('.kv_b_proj', '.kc') - if kc_param_name not in params_dict.keys(): - return param_kc = params_dict[kc_param_name] load_weight(param_kc, w_kc) vc_param_name = name.replace('.kv_b_proj', '.vc') @@ -1271,8 +1265,6 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): else: loaded_weight = loaded_weight.to(device) weight = __update_pe(loaded_weight, head_dim, pe_dim_offset) - if name not in params_dict.keys(): - continue param = params_dict[name] load_weight(param, weight) break @@ -1290,8 +1282,6 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): else: __load_kcvc(name, loaded_weight) else: - if name not 
in params_dict.keys(): - return param = params_dict[name] load_weight(param, loaded_weight) @@ -1379,13 +1369,9 @@ def __skip_layers(): if weight_name not in name: continue name = name.replace(weight_name, param_name) - if name not in params_dict.keys(): - continue param = params_dict[name] load_weight(param, loaded_weight, shard_id=shard_id) break else: - if name not in params_dict.keys(): - continue param = params_dict[name] load_weight(param, loaded_weight) From 700db7de264fe545aece99bf4ea5466d8df7d3cb Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 6 Feb 2026 09:32:38 +0000 Subject: [PATCH 24/25] update code --- docker/Dockerfile_ascend_a3 | 1 + .../supported_models/supported_models.md | 2 +- .../backends/dlinfer/ascend/op_backend.py | 58 +++++++++---------- lmdeploy/pytorch/backends/dlinfer/moe.py | 4 +- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 6 +- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 6 +- .../dlinfer/moe_gating_topk_softmax.py | 5 +- 7 files changed, 42 insertions(+), 40 deletions(-) diff --git a/docker/Dockerfile_ascend_a3 b/docker/Dockerfile_ascend_a3 index 1d8064a129..aa975d3b9c 100644 --- a/docker/Dockerfile_ascend_a3 +++ b/docker/Dockerfile_ascend_a3 @@ -22,5 +22,6 @@ ARG LMDEPLOY_TAG=main RUN --mount=type=cache,target=/root/.cache \ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \ + pip install --no-cache-dir torch==2.9.0 torch-npu==2.9.0 torchvision==0.24.0 && \ TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \ LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG} diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 317ab78e71..399594af91 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ 
b/docs/zh_cn/supported_models/supported_models.md @@ -150,7 +150,7 @@ | QWen2.5-VL | 3B - 72B | MLLM | Yes | Yes | - | - | Yes | - | Yes | No | | QWen2-MoE | A14.57B | LLM | Yes | - | No | No | - | - | Yes | - | | QWen3 | 0.6B-235B | LLM | Yes | Yes | No | No | Yes | Yes | Yes | Yes | -| DeepSeek-V2 | 16B | LLM | Yes | Yes | No | No | - | - | - | - | +| DeepSeek-V2 | 16B | LLM | No | Yes | No | No | - | - | - | - | | InternVL(v1.5) | 2B-26B | MLLM | Yes | - | Yes | Yes | - | - | Yes | - | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | Yes | Yes | - | Yes | Yes | | InternVL2.5 | 1B-78B | MLLM | Yes | Yes | Yes | Yes | Yes | - | Yes | Yes | diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 0455bf50da..843ba6e987 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -16,7 +16,7 @@ from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger -from ..moe import DlinferMoeMetada, DlinferMoeType +from ..moe import DlinferMoECommType, DlinferMoeMetadata from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') @@ -281,19 +281,19 @@ def get_dist_meta(): def get_tokens_info(dp_size, tp_size, ep_size, ep_group): if ep_size <= 1: return 0, 0, 0 - # get runtime_tokens_current_rank + # get padded_tokens_current_rank is_graph = cls.enable_graph and step_context.is_decoding if is_graph: from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size actual_tokens_current_rank = step_context.q_seqlens.shape[0] - runtime_tokens_current_rank = min(get_ascend_compatible_size(actual_tokens_current_rank), - cls.max_batches) + padded_tokens_current_rank = min(get_ascend_compatible_size(actual_tokens_current_rank), + cls.max_batches) else: actual_tokens_current_rank = step_context.q_seqlens.sum().item() - runtime_tokens_current_rank = 
actual_tokens_current_rank + padded_tokens_current_rank = actual_tokens_current_rank # get max_tokens_across_dp if dp_size > 1: - runtime_tokens_tensor = torch.tensor([runtime_tokens_current_rank], + runtime_tokens_tensor = torch.tensor([padded_tokens_current_rank], dtype=step_context.q_seqlens.dtype, device=torch.npu.current_device()) world_size = dp_size * tp_size @@ -303,8 +303,8 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group): dist.all_gather_into_tensor(runtime_tokens_buffer, runtime_tokens_tensor, ep_group) max_tokens_across_dp = torch.max(runtime_tokens_buffer).item() else: - max_tokens_across_dp = runtime_tokens_current_rank - return actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp + max_tokens_across_dp = padded_tokens_current_rank + return actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp @lru_cache def init_mc2_token_capacity(tp_size): @@ -312,9 +312,9 @@ def init_mc2_token_capacity(tp_size): num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size return num_tokens_per_tp_rank * tp_size - def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): + def select_moe_comm_type(max_tokens_across_dp, dp_size, tp_size, ep_size): if ep_size <= 1: - return DlinferMoeType.ALLGATHER + return DlinferMoECommType.ALLGATHER mc2_token_capacity = init_mc2_token_capacity(tp_size) is_graph = cls.enable_graph and step_context.is_decoding if is_graph: @@ -322,30 +322,30 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size if SocVersion.is_A2(): if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16: - return DlinferMoeType.MC2 + return DlinferMoECommType.MC2 else: - return DlinferMoeType.ALLGATHER + return DlinferMoECommType.ALLGATHER elif SocVersion.is_A3(): if max_tokens_across_dp <= mc2_token_capacity: - return DlinferMoeType.MC2 + return DlinferMoECommType.MC2 else: - 
return DlinferMoeType.ALLTOALL + return DlinferMoECommType.ALLTOALL else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') - def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, tp_size, - moe_type): + def get_pad_info(actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp, tp_size, + moe_comm_type): x_active_mask = None - if moe_type == DlinferMoeType.MC2: + if moe_comm_type == DlinferMoECommType.MC2: paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size - pad_size = paded_size - runtime_tokens_current_rank + pad_size = paded_size - padded_tokens_current_rank x_active_mask = torch.ones(actual_tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - elif moe_type == DlinferMoeType.ALLTOALL: - pad_size = tp_size - runtime_tokens_current_rank - elif moe_type == DlinferMoeType.ALLGATHER: - pad_size = max_tokens_across_dp - runtime_tokens_current_rank + elif moe_comm_type == DlinferMoECommType.ALLTOALL: + pad_size = tp_size - padded_tokens_current_rank + elif moe_comm_type == DlinferMoECommType.ALLGATHER: + pad_size = max_tokens_across_dp - padded_tokens_current_rank else: pad_size = 0 return pad_size, x_active_mask @@ -404,15 +404,15 @@ def get_moe_group_name(group): step_context.attn_metadata = attn_metadata cls.dist_meta = get_dist_meta() - actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp = get_tokens_info( + actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp = get_tokens_info( cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size, cls.dist_meta.ep_group) - moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, - cls.dist_meta.ep_size) - pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, - max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) + moe_comm_type = 
select_moe_comm_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, + cls.dist_meta.ep_size) + pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, padded_tokens_current_rank, + max_tokens_across_dp, cls.dist_meta.tp_size, moe_comm_type) moe_group_name = get_moe_group_name(cls.dist_meta.ep_group) - moe_metadata = DlinferMoeMetada( + moe_metadata = DlinferMoeMetadata( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, dp_size=cls.dist_meta.dp_size, @@ -422,7 +422,7 @@ def get_moe_group_name(group): ep_rank=cls.dist_meta.ep_rank, tp_group=cls.dist_meta.tp_group, ep_group=cls.dist_meta.ep_group, - moe_type=moe_type, + moe_comm_type=moe_comm_type, x_active_mask=x_active_mask, moe_group_name=moe_group_name, ) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index a347dba27b..f034a0bb07 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -4,8 +4,8 @@ import torch -from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetada # noqa: F401 -from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeType # noqa: F401 +from lmdeploy.pytorch.kernels.dlinfer import DlinferMoECommType # noqa: F401 +from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetadata # noqa: F401 from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax from lmdeploy.pytorch.model_inputs import get_step_ctx_manager diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 834de084df..660368ba23 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -4,7 +4,7 @@ from .awq_kernels import awq_linear from .fill_kv_cache import fill_kv_cache from .flash_attention import flash_attention_fwd -from .fused_moe import DlinferMoeMetada, DlinferMoeType, fused_moe +from .fused_moe import DlinferMoECommType, DlinferMoeMetadata, fused_moe from 
.linear import linear from .moe_gating_topk_softmax import moe_gating_topk_softmax from .pagedattention import paged_attention_fwd @@ -15,8 +15,8 @@ 'apply_rotary_pos_emb', 'awq_linear', 'fill_kv_cache', - 'DlinferMoeType', - 'DlinferMoeMetada', + 'DlinferMoECommType', + 'DlinferMoeMetadata', 'fused_moe', 'paged_attention_fwd', 'flash_attention_fwd', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 7ff6bbccf0..4624e0c199 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops -from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada -from dlinfer.utils.type_annotation import MoeType as DlinferMoeType # noqa: F401 +from dlinfer.utils.type_annotation import MoECommType as DlinferMoECommType # noqa: F401 +from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata from torch import Tensor @@ -13,7 +13,7 @@ def fused_moe( topk_ids: Tensor, topk: int, renormalize: bool, - moe_metadata: DlinferMoeMetada, + moe_metadata: DlinferMoeMetadata, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index 68d7de7fe2..c72f5f2324 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import dlinfer.ops as ext_ops -from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada +from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata from torch import Tensor -def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: DlinferMoeMetada) -> tuple[Tensor, Tensor]: +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, + moe_metadata: DlinferMoeMetadata) -> tuple[Tensor, Tensor]: routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts From 6a957ff0853ffa590614a40e4257485d353897df Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 6 Feb 2026 11:17:16 +0000 Subject: [PATCH 25/25] update code --- lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py | 7 +++---- .../pytorch/kernels/dlinfer/moe_gating_topk_softmax.py | 4 +++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 843ba6e987..484cbd1b72 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -47,7 +47,7 @@ def is_Ascend910(cls) -> bool: @classmethod @lru_cache(maxsize=1) - def soc_version(cls) -> str: + def soc_version(cls) -> int: return torch.npu.get_soc_version() @classmethod @@ -318,7 +318,6 @@ def select_moe_comm_type(max_tokens_across_dp, dp_size, tp_size, ep_size): mc2_token_capacity = init_mc2_token_capacity(tp_size) is_graph = cls.enable_graph and step_context.is_decoding if is_graph: - import math max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size if SocVersion.is_A2(): if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16: @@ -337,8 +336,8 @@ def get_pad_info(actual_tokens_current_rank, padded_tokens_current_rank, max_tok moe_comm_type): x_active_mask = None if moe_comm_type == 
DlinferMoECommType.MC2: - paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size - pad_size = paded_size - padded_tokens_current_rank + padded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size + pad_size = padded_size - padded_tokens_current_rank x_active_mask = torch.ones(actual_tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index c72f5f2324..cc1a324bf4 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import dlinfer.ops as ext_ops from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata from torch import Tensor def moe_gating_topk_softmax(router_logits: Tensor, topk: int, - moe_metadata: DlinferMoeMetadata) -> tuple[Tensor, Tensor]: + moe_metadata: DlinferMoeMetadata) -> Tuple[Tensor, Tensor]: routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts