Subject: [PATCH 01/25] [ascend] support deepseek eager_mode
+ """ device_properties = torch.cuda.get_device_properties(0) - if hasattr(device_properties, 'major') and device_properties.major >= 9: + major = getattr(device_properties, 'major', None) + if isinstance(major, int) and major >= 9: import flash_mla # noqa use_flash_mla = True except ImportError: From 1bb548b1eb2e6638d0f25c45707de2e0ffbc6032 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 9 Dec 2025 08:52:35 +0000 Subject: [PATCH 03/25] modify for dp_ep --- lmdeploy/pytorch/backends/dlinfer/moe.py | 13 +++++++++++-- lmdeploy/pytorch/engine/executor/ray_executor.py | 6 ++++++ lmdeploy/pytorch/models/deepseek_v2.py | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 70b134c786..12c8ed8b31 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -35,8 +35,9 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): class DlinferFusedMoEImpl(FusedMoEImpl): """Dlinfer fused moe implementation.""" - def __init__(self, top_k: int, renormalize: bool = False): + def __init__(self, top_k: int, num_experts: int, renormalize: bool = False): self.top_k = top_k + self.num_experts = num_experts self.renormalize = renormalize def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): @@ -46,6 +47,14 @@ def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tens return gate_up_weights.transpose(-1, -2).contiguous(), down_weights.transpose(-1, -2).contiguous() return gate_up_weights, down_weights + def ep_expert_list(self, world_size: int, rank: int): + """Experts list of current rank.""" + num_experts = self.num_experts + expert_per_rank = (num_experts + world_size - 1) // world_size + first_expert = rank * expert_per_rank + last_expert = min(first_expert + expert_per_rank, num_experts) + return list(range(first_expert, last_expert)) + def forward(self, 
hidden_states: torch.Tensor, topk_weights: torch.Tensor, @@ -76,4 +85,4 @@ def build(top_k: int, layer_idx: int = 0, out_dtype: torch.dtype = torch.bfloat16): """Build from mlp.""" - return DlinferFusedMoEImpl(top_k=top_k, renormalize=renormalize) + return DlinferFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize) diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py index e4b4fbac2a..7796e252c8 100644 --- a/lmdeploy/pytorch/engine/executor/ray_executor.py +++ b/lmdeploy/pytorch/engine/executor/ray_executor.py @@ -47,6 +47,7 @@ def _get_master_port(): def get_ascend_device_rank_mapping(master_addr): +# def get_ascend_device_rank_mapping(master_addr: str, workers: list, dp: int): rank_table_file = _envs.ascend_rank_table_file if not rank_table_file: raise ValueError('ASCEND_RANK_TABLE_FILE_PATH is not set') @@ -67,6 +68,8 @@ def get_ascend_device_rank_mapping(master_addr): logger.error(f'Parse rank table file({rank_table}) failed') raise e + # if dp > 1: + # worker_ips = ray.get([worker.get_node_ip.remote() for worker in workers]) envs = { 'ASCEND_RANK_TABLE_FILE_PATH': rank_table_file, } @@ -286,6 +289,9 @@ def __init__( self._prefetch_task: asyncio.Task = None self.remote_outs: asyncio.Queue = None + rank_offset = dist_config.dp_rank * attn_tp + self.rank_offset = rank_offset + logger.info('Init distributed environment by device.') self.rank_offset = dist_config.dp_rank * attn_tp self._init_distributed_environment_by_device(device_type) diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py index 4db550eb8d..5e781f8034 100644 --- a/lmdeploy/pytorch/models/deepseek_v2.py +++ b/lmdeploy/pytorch/models/deepseek_v2.py @@ -1185,10 +1185,14 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di if weight_name not in name: continue name = name.replace(weight_name, param_name) + if name not in params_dict.keys(): + continue 
param = params_dict[name] load_weight(param, loaded_weight, expert_id=expert_id, shard_id=shard_id) break else: + if name not in params_dict.keys(): + return param = params_dict[name] load_weight(param, loaded_weight) @@ -1219,6 +1223,8 @@ def __load_kcvc(name: str, weight: torch.Tensor): dim=1) w_vc = w_vc.transpose(1, 2).contiguous() kc_param_name = name.replace('.kv_b_proj', '.kc') + if kc_param_name not in params_dict.keys(): + return param_kc = params_dict[kc_param_name] load_weight(param_kc, w_kc) vc_param_name = name.replace('.kv_b_proj', '.vc') @@ -1265,6 +1271,8 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): else: loaded_weight = loaded_weight.to(device) weight = __update_pe(loaded_weight, head_dim, pe_dim_offset) + if name not in params_dict.keys(): + continue param = params_dict[name] load_weight(param, weight) break @@ -1282,6 +1290,8 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): else: __load_kcvc(name, loaded_weight) else: + if name not in params_dict.keys(): + return param = params_dict[name] load_weight(param, loaded_weight) @@ -1369,9 +1379,13 @@ def __skip_layers(): if weight_name not in name: continue name = name.replace(weight_name, param_name) + if name not in params_dict.keys(): + continue param = params_dict[name] load_weight(param, loaded_weight, shard_id=shard_id) break else: + if name not in params_dict.keys(): + continue param = params_dict[name] load_weight(param, loaded_weight) From 5f47d928c62a0c612f24c5aea635c556edb8ab93 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 11 Dec 2025 10:49:27 +0000 Subject: [PATCH 04/25] backup code --- lmdeploy/pytorch/backends/dlinfer/moe.py | 11 ++++++++--- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 7 ++++++- lmdeploy/pytorch/nn/moe/default.py | 3 +++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 12c8ed8b31..0db8d4b3f1 100644 --- 
a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -35,10 +35,12 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): class DlinferFusedMoEImpl(FusedMoEImpl): """Dlinfer fused moe implementation.""" - def __init__(self, top_k: int, num_experts: int, renormalize: bool = False): + def __init__(self, top_k: int, num_experts: int, renormalize: bool = False, ep_size: int = 1, ep_group: torch.distributed.ProcessGroup = None): self.top_k = top_k self.num_experts = num_experts self.renormalize = renormalize + self.ep_size = ep_size + self.ep_group = ep_group def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" @@ -68,8 +70,11 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None + # from lmdeploy.utils import get_logger + # logger = get_logger('lmdeploy') + # logger.error(f'###### {expert_list=}') return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize) + self.renormalize, self.ep_size, self.ep_group, expert_list) class DlinferFusedMoEBuilder(FusedMoEBuilder): @@ -85,4 +90,4 @@ def build(top_k: int, layer_idx: int = 0, out_dtype: torch.dtype = torch.bfloat16): """Build from mlp.""" - return DlinferFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize) + return DlinferFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize, ep_size=ep_size, ep_group=ep_group) diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 4bcfade78d..6e50b1a5be 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import torch import dlinfer.ops as ext_ops +from typing import List from torch import Tensor @@ -11,6 +13,9 @@ def fused_moe( topk_ids: Tensor, topk: int, renormalize: bool, + ep_size: int, + ep_group: torch.distributed.ProcessGroup = None, + expert_list: List[int] = None, ): """Dlinfer fused moe.""" - return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize) + return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, ep_size, ep_group, expert_list) diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 0633aa001a..9f0e825b97 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -136,6 +136,9 @@ def __init__(self, dist_ctx = get_dist_manager().current_context() self.ep_size, rank = get_ep_world_rank() impl_builder = get_backend().get_layer_impl_builder(OpType.FusedMoE) + # from lmdeploy.utils import get_logger + # logger = get_logger('lmdeploy') + # logger.error(f'FusedMoE ep_size: {self.ep_size}, rank: {rank}, {dist_ctx.ep_gpu_group.rank()=}') self.impl = impl_builder.build( top_k, num_experts, From cdeb30ca9f2c4e1d51198a967775a1cbc1afaa07 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 16 Dec 2025 11:42:51 +0000 Subject: [PATCH 05/25] run tp ep --- lmdeploy/pytorch/backends/dlinfer/moe.py | 5 +---- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 0db8d4b3f1..77a09394f2 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -70,11 +70,8 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None - # from lmdeploy.utils import get_logger - # logger = get_logger('lmdeploy') - # logger.error(f'###### {expert_list=}') return 
fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, self.ep_size, self.ep_group, expert_list) + self.renormalize, self.ep_size, self.ep_group) class DlinferFusedMoEBuilder(FusedMoEBuilder): diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 6e50b1a5be..7edfcc8631 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -15,7 +15,6 @@ def fused_moe( renormalize: bool, ep_size: int, ep_group: torch.distributed.ProcessGroup = None, - expert_list: List[int] = None, ): """Dlinfer fused moe.""" - return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, ep_size, ep_group, expert_list) + return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, ep_size, ep_group) From c91aea8080ad9bd865f41abfda21a85e6b90fb37 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 16 Dec 2025 11:56:51 +0000 Subject: [PATCH 06/25] format code --- lmdeploy/pytorch/backends/dlinfer/moe.py | 13 +++++++++++-- lmdeploy/pytorch/configurations/utils.py | 6 ++---- lmdeploy/pytorch/engine/executor/ray_executor.py | 3 --- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 6 +++--- lmdeploy/pytorch/nn/moe/default.py | 3 --- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 77a09394f2..5703ef0638 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -35,7 +35,12 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): class DlinferFusedMoEImpl(FusedMoEImpl): """Dlinfer fused moe implementation.""" - def __init__(self, top_k: int, num_experts: int, renormalize: bool = False, ep_size: int = 1, ep_group: torch.distributed.ProcessGroup = None): + def __init__(self, + 
top_k: int, + num_experts: int, + renormalize: bool = False, + ep_size: int = 1, + ep_group: torch.distributed.ProcessGroup = None): self.top_k = top_k self.num_experts = num_experts self.renormalize = renormalize @@ -87,4 +92,8 @@ def build(top_k: int, layer_idx: int = 0, out_dtype: torch.dtype = torch.bfloat16): """Build from mlp.""" - return DlinferFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize, ep_size=ep_size, ep_group=ep_group) + return DlinferFusedMoEImpl(top_k=top_k, + num_experts=num_experts, + renormalize=renormalize, + ep_size=ep_size, + ep_group=ep_group) diff --git a/lmdeploy/pytorch/configurations/utils.py b/lmdeploy/pytorch/configurations/utils.py index 305083a48f..2ea21364a7 100644 --- a/lmdeploy/pytorch/configurations/utils.py +++ b/lmdeploy/pytorch/configurations/utils.py @@ -11,10 +11,8 @@ def flash_mla_available(): # use flash_mla by default if it is installed use_flash_mla = False try: - """ - In some torch_npu versions, device_properties doesn't have 'major' attribute; - In other torch_npu versions, the value of major is None. 
- """ + """In some torch_npu versions, device_properties doesn't have 'major' + attribute; In other torch_npu versions, the value of major is None.""" device_properties = torch.cuda.get_device_properties(0) major = getattr(device_properties, 'major', None) if isinstance(major, int) and major >= 9: diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py index 7796e252c8..b0a4219a46 100644 --- a/lmdeploy/pytorch/engine/executor/ray_executor.py +++ b/lmdeploy/pytorch/engine/executor/ray_executor.py @@ -47,7 +47,6 @@ def _get_master_port(): def get_ascend_device_rank_mapping(master_addr): -# def get_ascend_device_rank_mapping(master_addr: str, workers: list, dp: int): rank_table_file = _envs.ascend_rank_table_file if not rank_table_file: raise ValueError('ASCEND_RANK_TABLE_FILE_PATH is not set') @@ -68,8 +67,6 @@ def get_ascend_device_rank_mapping(master_addr): logger.error(f'Parse rank table file({rank_table}) failed') raise e - # if dp > 1: - # worker_ips = ray.get([worker.get_node_ip.remote() for worker in workers]) envs = { 'ASCEND_RANK_TABLE_FILE_PATH': rank_table_file, } diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 7edfcc8631..2079f68831 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import torch import dlinfer.ops as ext_ops -from typing import List +import torch from torch import Tensor @@ -17,4 +16,5 @@ def fused_moe( ep_group: torch.distributed.ProcessGroup = None, ): """Dlinfer fused moe.""" - return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, ep_size, ep_group) + return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, + ep_size, ep_group) diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 9f0e825b97..0633aa001a 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -136,9 +136,6 @@ def __init__(self, dist_ctx = get_dist_manager().current_context() self.ep_size, rank = get_ep_world_rank() impl_builder = get_backend().get_layer_impl_builder(OpType.FusedMoE) - # from lmdeploy.utils import get_logger - # logger = get_logger('lmdeploy') - # logger.error(f'FusedMoE ep_size: {self.ep_size}, rank: {rank}, {dist_ctx.ep_gpu_group.rank()=}') self.impl = impl_builder.build( top_k, num_experts, From 0879001c2b2e9c0f74f4849387280f5c1d938e33 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Mon, 29 Dec 2025 06:27:04 +0000 Subject: [PATCH 07/25] add dp tp --- lmdeploy/pytorch/backends/dlinfer/__init__.py | 18 +++++++++++++++ .../backends/dlinfer/ascend/op_backend.py | 16 +++++++++++++ lmdeploy/pytorch/backends/dlinfer/moe.py | 23 +++++++++++++++++-- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 6 ++--- .../dlinfer/moe_gating_topk_softmax.py | 5 ++-- 5 files changed, 61 insertions(+), 7 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/__init__.py b/lmdeploy/pytorch/backends/dlinfer/__init__.py index ef101fec61..d06de6ac2e 100644 --- a/lmdeploy/pytorch/backends/dlinfer/__init__.py +++ b/lmdeploy/pytorch/backends/dlinfer/__init__.py @@ -1 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from dataclasses import dataclass + + +@dataclass +class DlinferDistContext: + dp_size: int = 1 + tp_size: int = 1 + ep_size: int = 1 + + dp_rank: int = 0 + tp_rank: int = 0 + ep_rank: int = 0 + + max_tokens_accros_dp: int = 1 + + tp_group: torch.distributed.ProcessGroup = None + ep_group: torch.distributed.ProcessGroup = None diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index b7ce2a6846..3aaa405ce0 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -7,9 +7,11 @@ from typing import Dict, Tuple import torch +import torch.distributed as dist from lmdeploy.pytorch import envs as _envs from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig +from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger from ..op_backend import DlinferOpsBackend @@ -92,6 +94,7 @@ class AscendOpsBackend(DlinferOpsBackend): half_negative_inf = torch.finfo(torch.float16).min total_slots = None max_batches = None + max_tokens_accros_dp = 0 @staticmethod def get_name() -> str: @@ -219,6 +222,18 @@ def get_kv_start_indices_and_attention_mask(is_decoding, is_unpaged_prefill, q_s return kv_start_indices, attention_mask + def get_max_tokens_across_dp(): + dist_ctx = get_dist_manager().current_context() + if dist_ctx.dist_config.dp > 1: + total_token_current_rank = torch.sum(step_context.q_seqlens).to(step_context.q_seqlens.dtype) + world_size = dist_ctx.dist_config.world_size + total_token_buffer = torch.zeros(world_size, dtype=step_context.q_seqlens.dtype, device=torch.npu.current_device()) + dist.all_gather_into_tensor(total_token_buffer, total_token_current_rank, dist_ctx.ep_gpu_group) + max_tokens_accros_dp = torch.max(total_token_buffer).item() + else: + max_tokens_accros_dp = 0 + return max_tokens_accros_dp + q_seqlens_cpu, kv_seqlens_cpu = 
get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, kv_seqlens_cpu) @@ -228,6 +243,7 @@ def get_kv_start_indices_and_attention_mask(is_decoding, is_unpaged_prefill, q_s is_unpaged_prefill, q_seqlens_list, kv_seqlens_list, max_q_seq_len, max_kv_seq_len) + cls.max_tokens_accros_dp = get_max_tokens_across_dp() if not cls.enable_graph and step_context.kv_quant_policy == 8: record_file = os.getenv('ASCEND_QUANT_RECORD_FILE') diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 5703ef0638..362acc9909 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,12 +1,29 @@ # Copyright (c) OpenMMLab. All rights reserved. +from dataclasses import dataclass from typing import Callable, List import torch +from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl +from . 
import DlinferDistContext + + +def get_dist_ctx(): + dist_ctx = get_dist_manager().current_context() + + return DlinferDistContext(dp_size = dist_ctx.dist_config.dp, + tp_size = dist_ctx.dist_config.tp, + ep_size = dist_ctx.dist_config.ep, + dp_rank = dist_ctx.dp_rank, + tp_rank = dist_ctx.attn_tp_group.rank, + ep_rank = dist_ctx.ep_rank, + max_tokens_accros_dp = 1, + tp_group = dist_ctx.attn_tp_group.gpu_group, + ep_group = dist_ctx.ep_gpu_group) class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): @@ -17,9 +34,10 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): self.dim = dim if n_groups != -1: raise NotImplementedError('Group router not supported') + self.dist_ctx = get_dist_ctx() def forward(self, x: torch.Tensor): - routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k) + routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, self.dist_ctx) return routing_weights, selected_experts @@ -46,6 +64,7 @@ def __init__(self, self.renormalize = renormalize self.ep_size = ep_size self.ep_group = ep_group + self.dist_ctx = get_dist_ctx() def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" @@ -76,7 +95,7 @@ def forward(self, assert gate_up_bias is None assert down_bias is None return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, self.ep_size, self.ep_group) + self.renormalize, self.dist_ctx) class DlinferFusedMoEBuilder(FusedMoEBuilder): diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 2079f68831..df291adbd8 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -2,6 +2,7 @@ import dlinfer.ops as ext_ops import torch from torch import Tensor +from lmdeploy.pytorch.backends.dlinfer import DlinferDistContext def fused_moe( @@ -12,9 +13,8 @@ def fused_moe( topk_ids: Tensor, topk: 
int, renormalize: bool, - ep_size: int, - ep_group: torch.distributed.ProcessGroup = None, + dist_ctx: DlinferDistContext, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, - ep_size, ep_group) + dist_ctx) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index ad2fe66056..fdeed7e81a 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops from torch import Tensor +from lmdeploy.pytorch.backends.dlinfer import DlinferDistContext -def moe_gating_topk_softmax(router_logits: Tensor, topk: int): - routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk) +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, dist_ctx: DlinferDistContext): + routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, dist_ctx) return routing_weights, selected_experts From f17f23159247e5f2fe824477180a60c63ac6eb64 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 30 Dec 2025 09:33:36 +0000 Subject: [PATCH 08/25] move DlinferDistContext into dlinfer --- lmdeploy/pytorch/backends/dlinfer/__init__.py | 18 ------------------ lmdeploy/pytorch/backends/dlinfer/moe.py | 3 +-- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 1 + lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 2 +- .../kernels/dlinfer/moe_gating_topk_softmax.py | 2 +- 5 files changed, 4 insertions(+), 22 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/__init__.py b/lmdeploy/pytorch/backends/dlinfer/__init__.py index d06de6ac2e..ef101fec61 100644 --- a/lmdeploy/pytorch/backends/dlinfer/__init__.py +++ b/lmdeploy/pytorch/backends/dlinfer/__init__.py @@ -1,19 +1 @@ # Copyright (c) OpenMMLab. 
All rights reserved. -import torch -from dataclasses import dataclass - - -@dataclass -class DlinferDistContext: - dp_size: int = 1 - tp_size: int = 1 - ep_size: int = 1 - - dp_rank: int = 0 - tp_rank: int = 0 - ep_rank: int = 0 - - max_tokens_accros_dp: int = 1 - - tp_group: torch.distributed.ProcessGroup = None - ep_group: torch.distributed.ProcessGroup = None diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 362acc9909..9a53392f1c 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -6,10 +6,9 @@ import torch from lmdeploy.pytorch.distributed import get_dist_manager -from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import DlinferDistContext, fused_moe, moe_gating_topk_softmax from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl -from . import DlinferDistContext def get_dist_ctx(): diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 7b226d7ff4..88cf884f55 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from dlinfer.utils.type_annotation import DlinferDistContext from ..default import multinomial_sampling, per_channel_quant from .apply_rotary_pos_emb import apply_rotary_pos_emb from .awq_kernels import awq_linear diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index df291adbd8..1f18af6880 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -2,7 +2,7 @@ import dlinfer.ops as ext_ops import torch from torch import Tensor -from lmdeploy.pytorch.backends.dlinfer import DlinferDistContext +from . 
import DlinferDistContext def fused_moe( diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index fdeed7e81a..90c3408b10 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops from torch import Tensor -from lmdeploy.pytorch.backends.dlinfer import DlinferDistContext +from . import DlinferDistContext def moe_gating_topk_softmax(router_logits: Tensor, topk: int, dist_ctx: DlinferDistContext): From 760c0dbceb0447b47c93c040fcde1ab30334a967 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 31 Dec 2025 01:42:18 +0000 Subject: [PATCH 09/25] fix get_max_tokens_across_dp in tp case --- lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 3aaa405ce0..566f7e032b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -231,7 +231,7 @@ def get_max_tokens_across_dp(): dist.all_gather_into_tensor(total_token_buffer, total_token_current_rank, dist_ctx.ep_gpu_group) max_tokens_accros_dp = torch.max(total_token_buffer).item() else: - max_tokens_accros_dp = 0 + max_tokens_accros_dp = torch.sum(step_context.q_seqlens).item() return max_tokens_accros_dp q_seqlens_cpu, kv_seqlens_cpu = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) From 73ebda1e1b30dd1341a429a25997083ca05e7dc6 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Sun, 4 Jan 2026 08:03:03 +0000 Subject: [PATCH 10/25] format code --- .../backends/dlinfer/ascend/op_backend.py | 4 +++- lmdeploy/pytorch/backends/dlinfer/moe.py | 22 +++++++++---------- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 
2 ++ lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 2 +- .../dlinfer/moe_gating_topk_softmax.py | 1 + 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 566f7e032b..ba5b39893b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -227,7 +227,9 @@ def get_max_tokens_across_dp(): if dist_ctx.dist_config.dp > 1: total_token_current_rank = torch.sum(step_context.q_seqlens).to(step_context.q_seqlens.dtype) world_size = dist_ctx.dist_config.world_size - total_token_buffer = torch.zeros(world_size, dtype=step_context.q_seqlens.dtype, device=torch.npu.current_device()) + total_token_buffer = torch.zeros(world_size, + dtype=step_context.q_seqlens.dtype, + device=torch.npu.current_device()) dist.all_gather_into_tensor(total_token_buffer, total_token_current_rank, dist_ctx.ep_gpu_group) max_tokens_accros_dp = torch.max(total_token_buffer).item() else: diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 9a53392f1c..0dc75f2d47 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. 
- -from dataclasses import dataclass from typing import Callable, List import torch @@ -13,16 +11,16 @@ def get_dist_ctx(): dist_ctx = get_dist_manager().current_context() - - return DlinferDistContext(dp_size = dist_ctx.dist_config.dp, - tp_size = dist_ctx.dist_config.tp, - ep_size = dist_ctx.dist_config.ep, - dp_rank = dist_ctx.dp_rank, - tp_rank = dist_ctx.attn_tp_group.rank, - ep_rank = dist_ctx.ep_rank, - max_tokens_accros_dp = 1, - tp_group = dist_ctx.attn_tp_group.gpu_group, - ep_group = dist_ctx.ep_gpu_group) + + return DlinferDistContext(dp_size=dist_ctx.dist_config.dp, + tp_size=dist_ctx.dist_config.tp, + ep_size=dist_ctx.dist_config.ep, + dp_rank=dist_ctx.dp_rank, + tp_rank=dist_ctx.attn_tp_group.rank, + ep_rank=dist_ctx.ep_rank, + max_tokens_accros_dp=1, + tp_group=dist_ctx.attn_tp_group.gpu_group, + ep_group=dist_ctx.ep_gpu_group) class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 88cf884f55..79790ff4d7 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from dlinfer.utils.type_annotation import DlinferDistContext + from ..default import multinomial_sampling, per_channel_quant from .apply_rotary_pos_emb import apply_rotary_pos_emb from .awq_kernels import awq_linear @@ -12,6 +13,7 @@ from .rms_norm import rms_norm __all__ = [ + 'DlinferDistContext', 'rms_norm', 'apply_rotary_pos_emb', 'awq_linear', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 1f18af6880..ef9ba53402 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops -import torch from torch import Tensor + from . 
import DlinferDistContext diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index 90c3408b10..b57e33afea 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops from torch import Tensor + from . import DlinferDistContext From 86d00ebb1e63acbb631bd799f1e641217eb2c299 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Tue, 6 Jan 2026 02:49:07 +0000 Subject: [PATCH 11/25] fix grpah_mode dp --- lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index ba5b39893b..f46408cd3d 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -226,6 +226,14 @@ def get_max_tokens_across_dp(): dist_ctx = get_dist_manager().current_context() if dist_ctx.dist_config.dp > 1: total_token_current_rank = torch.sum(step_context.q_seqlens).to(step_context.q_seqlens.dtype) + if cls.enable_graph and step_context.is_decoding: + from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size + total_token_current_rank_item = total_token_current_rank.item() + total_token_current_rank = torch.tensor( + [get_ascend_compatible_size(total_token_current_rank_item)], + dtype=total_token_current_rank.dtype, + device=total_token_current_rank.device, + ) world_size = dist_ctx.dist_config.world_size total_token_buffer = torch.zeros(world_size, dtype=step_context.q_seqlens.dtype, From 35a60da9ae7cac8a3c707ae8cf788e8c9b51d3f3 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Mon, 12 Jan 2026 11:54:56 +0000 Subject: [PATCH 12/25] add mlpmetada --- 
.../backends/dlinfer/ascend/op_backend.py | 146 +++++++++++++++--- lmdeploy/pytorch/backends/dlinfer/moe.py | 56 ++++--- .../pytorch/backends/dlinfer/op_backend.py | 5 + lmdeploy/pytorch/backends/moe.py | 20 ++- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 3 - lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 16 +- .../dlinfer/moe_gating_topk_softmax.py | 8 +- lmdeploy/pytorch/model_inputs.py | 1 + lmdeploy/pytorch/models/qwen3_moe.py | 13 +- lmdeploy/pytorch/models/qwen3_vl.py | 6 + lmdeploy/pytorch/models/qwen3_vl_moe.py | 2 + lmdeploy/pytorch/nn/moe/base.py | 17 +- lmdeploy/pytorch/nn/moe/default.py | 2 + 13 files changed, 227 insertions(+), 68 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 3f2f5e5911..df8e115b5b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -2,6 +2,7 @@ import itertools import os import re +from dataclasses import dataclass from functools import lru_cache from pathlib import Path from typing import Dict, Tuple @@ -14,6 +15,7 @@ from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger +from ..moe import MoeType from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') @@ -42,6 +44,30 @@ def is_Ascend310P(cls) -> bool: def is_Ascend910(cls) -> bool: return cls.device_name().startswith(cls.Ascend910) + @classmethod + @lru_cache(maxsize=1) + def soc_version(cls) -> str: + return torch.npu.get_soc_version() + + @classmethod + def is_A2(cls) -> bool: + return 220 <= cls.soc_version() <= 225 + + @classmethod + def is_A3(cls) -> bool: + return 250 <= cls.soc_version() <= 255 + + +@dataclass +class DistMeta: + dp_size: int + tp_size: int + ep_size: int + tp_rank: int + ep_rank: int + tp_group: torch.distributed.ProcessGroup + ep_group: torch.distributed.ProcessGroup + class AscendKVQuantMeta: has_set_value: bool = 
False @@ -90,10 +116,12 @@ def set_value(cls, device: str, dtype: torch.dtype, record_file: str, total_laye class AscendOpsBackend(DlinferOpsBackend): """Ascend layer backend.""" - enable_graph = False - half_negative_inf = torch.finfo(torch.float16).min + enable_graph: bool = False + half_negative_inf: float = torch.finfo(torch.float16).min total_slots = None max_batches = None + dist_meta: DistMeta = None + graph_capture_sizes = None max_tokens_accros_dp = 0 @staticmethod @@ -235,27 +263,83 @@ def get_kv_start_indices_and_attention_mask(is_decoding, is_unpaged_prefill, q_s return kv_start_indices, attention_mask - def get_max_tokens_across_dp(): + def get_dist_meta(): + if cls.dist_meta is not None: + return cls.dist_meta dist_ctx = get_dist_manager().current_context() - if dist_ctx.dist_config.dp > 1: - total_token_current_rank = torch.sum(step_context.q_seqlens).to(step_context.q_seqlens.dtype) - if cls.enable_graph and step_context.is_decoding: - from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size - total_token_current_rank_item = total_token_current_rank.item() - total_token_current_rank = torch.tensor( - [get_ascend_compatible_size(total_token_current_rank_item)], - dtype=total_token_current_rank.dtype, - device=total_token_current_rank.device, - ) - world_size = dist_ctx.dist_config.world_size - total_token_buffer = torch.zeros(world_size, + dp_size, tp_size, ep_size = dist_ctx.dist_config.dp, dist_ctx.dist_config.tp, dist_ctx.dist_config.ep + tp_rank, ep_rank = dist_ctx.attn_tp_group.rank, dist_ctx.ep_rank + tp_group = dist_ctx.attn_tp_group.gpu_group + ep_group = dist_ctx.ep_gpu_group + cls.dist_meta = DistMeta(dp_size=dp_size, + tp_size=tp_size, + ep_size=ep_size, + tp_rank=tp_rank, + ep_rank=ep_rank, + tp_group=tp_group, + ep_group=ep_group) + return cls.dist_meta + + def get_tokens_info(dp_size, tp_size, ep_size, ep_group): + if ep_size <= 1: + return 0, 0, 0, None + # get runtime num_tokens + is_graph = 
cls.enable_graph and step_context.is_decoding + if is_graph: + from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size + tokens_current_rank = step_context.q_seqlens.shape[0] + num_tokens = min(get_ascend_compatible_size(tokens_current_rank), cls.max_batches) + else: + tokens_current_rank = step_context.q_seqlens.sum().item() + num_tokens = tokens_current_rank + # get max_tokens_across_dp + if dp_size > 1: + num_tokens_tensor = torch.tensor([num_tokens], dtype=step_context.q_seqlens.dtype, device=torch.npu.current_device()) - dist.all_gather_into_tensor(total_token_buffer, total_token_current_rank, dist_ctx.ep_gpu_group) - max_tokens_accros_dp = torch.max(total_token_buffer).item() + world_size = dp_size * tp_size + num_tokens_buffer = torch.zeros([world_size], + dtype=step_context.q_seqlens.dtype, + device=torch.npu.current_device()) + dist.all_gather_into_tensor(num_tokens_buffer, num_tokens_tensor, ep_group) + max_tokens_across_dp = torch.max(num_tokens_buffer).item() + else: + max_tokens_across_dp = num_tokens + # get pad_size + paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size + pad_size = paded_size - num_tokens + # get x_active_mask + x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) + if pad_size > 0: + x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + return num_tokens, max_tokens_across_dp, pad_size, x_active_mask + + @lru_cache + def init_mc2_token_capacity(tp_size): + max_num_tokens = min(cls.max_batches, 512) + num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size + return num_tokens_per_tp_rank * tp_size + + def select_moe_type(num_tokens, dp_size, tp_size, ep_size): + if ep_size <= 1: + return MoeType.ALLGATHER + mc2_token_capacity = init_mc2_token_capacity(tp_size) + is_graph = cls.enable_graph and step_context.is_decoding + if is_graph: + import math + num_tokens = math.ceil(num_tokens / tp_size) * 
tp_size + if SocVersion.is_A2(): + if num_tokens <= mc2_token_capacity and dp_size * tp_size >= 16: + return MoeType.MC2 + else: + return MoeType.ALLGATHER + elif SocVersion.is_A3(): + if num_tokens <= mc2_token_capacity: + return MoeType.MC2 + else: + return MoeType.ALLTOALL else: - max_tokens_accros_dp = torch.sum(step_context.q_seqlens).item() - return max_tokens_accros_dp + raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) @@ -267,7 +351,6 @@ def get_max_tokens_across_dp(): is_unpaged_prefill, q_seqlens_list, kv_seqlens_list, max_q_seq_len, max_kv_seq_len) - cls.max_tokens_accros_dp = get_max_tokens_across_dp() if not cls.enable_graph and step_context.kv_quant_policy == 8: record_file = os.getenv('ASCEND_QUANT_RECORD_FILE') @@ -300,8 +383,29 @@ def get_max_tokens_across_dp(): quant_policy=step_context.kv_quant_policy, quant_meta=AscendKVQuantMeta.quant_meta, ) - step_context.attn_metadata = attn_metadata + + get_dist_meta() + num_tokens, max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, + cls.dist_meta.tp_size, + cls.dist_meta.ep_size, + cls.dist_meta.ep_group) + moe_type = select_moe_type(num_tokens, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) + mlp_meta_cls = cls.get_mlp_metadata_cls() + mlp_metadata = mlp_meta_cls( + max_tokens_across_dp=max_tokens_across_dp, + pad_size=pad_size, + dp_size=cls.dist_meta.dp_size, + tp_size=cls.dist_meta.tp_size, + ep_size=cls.dist_meta.ep_size, + tp_rank=cls.dist_meta.tp_rank, + ep_rank=cls.dist_meta.ep_rank, + tp_group=cls.dist_meta.tp_group, + ep_group=cls.dist_meta.ep_group, + moe_type=moe_type, + x_active_mask=x_active_mask, + ) + step_context.mlp_metadata = mlp_metadata return step_context @staticmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 
0dc75f2d47..3c552711e7 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,29 +1,32 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os +from dataclasses import dataclass from typing import Callable, List import torch +from dlinfer.utils.type_annotation import MoeType -from lmdeploy.pytorch.distributed import get_dist_manager -from lmdeploy.pytorch.kernels.dlinfer import DlinferDistContext, fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax -from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl +from ..moe import FusedMoEBuilder, FusedMoEImpl, MLPMetadata, SoftmaxTopKBuilder, SoftmaxTopKImpl -def get_dist_ctx(): - dist_ctx = get_dist_manager().current_context() +@dataclass +class DlinferMLPMetadata(MLPMetadata): + max_tokens_across_dp: int = 1 + pad_size: int = 0 + dp_size: int = 1 + tp_size: int = 1 + ep_size: int = 1 + tp_rank: int = 0 + ep_rank: int = 0 + tp_group: torch.distributed.ProcessGroup = None + ep_group: torch.distributed.ProcessGroup = None + moe_type: MoeType = MoeType.UNDEFINED + x_active_mask: torch.Tensor = None - return DlinferDistContext(dp_size=dist_ctx.dist_config.dp, - tp_size=dist_ctx.dist_config.tp, - ep_size=dist_ctx.dist_config.ep, - dp_rank=dist_ctx.dp_rank, - tp_rank=dist_ctx.attn_tp_group.rank, - ep_rank=dist_ctx.ep_rank, - max_tokens_accros_dp=1, - tp_group=dist_ctx.attn_tp_group.gpu_group, - ep_group=dist_ctx.ep_gpu_group) - -class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): +class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl[DlinferMLPMetadata]): """Dlinfer softmax topk implementation.""" def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): @@ -31,14 +34,15 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): self.dim = dim if n_groups != -1: raise NotImplementedError('Group router not supported') - self.dist_ctx = get_dist_ctx() - def forward(self, x: 
torch.Tensor): - routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, self.dist_ctx) + def forward(self, x: torch.Tensor, mlp_metada: DlinferMLPMetadata): + routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, mlp_metada.max_tokens_across_dp, + mlp_metada.pad_size, mlp_metada.tp_size, + mlp_metada.ep_size, mlp_metada.tp_rank) return routing_weights, selected_experts -class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder): +class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder[DlinferMLPMetadata]): """Dlinfer softmax topk implementation builder.""" @staticmethod @@ -47,7 +51,7 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): return DlinferSoftmaxTopKImpl(top_k, dim, n_groups) -class DlinferFusedMoEImpl(FusedMoEImpl): +class DlinferFusedMoEImpl(FusedMoEImpl[DlinferMLPMetadata]): """Dlinfer fused moe implementation.""" def __init__(self, @@ -61,12 +65,13 @@ def __init__(self, self.renormalize = renormalize self.ep_size = ep_size self.ep_group = ep_group - self.dist_ctx = get_dist_ctx() def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" device_type = gate_up_weights.device.type if device_type in ['npu']: + if os.getenv('DLINFER_RESET_MOE_UPDATE_WEIGHTS', '0') == '1': + return gate_up_weights, down_weights return gate_up_weights.transpose(-1, -2).contiguous(), down_weights.transpose(-1, -2).contiguous() return gate_up_weights, down_weights @@ -84,6 +89,7 @@ def forward(self, topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, down_weights: torch.Tensor, + mlp_metadata: DlinferMLPMetadata, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, expert_list: List[int] = None, @@ -92,10 +98,12 @@ def forward(self, assert gate_up_bias is None assert down_bias is None return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, self.dist_ctx) + self.renormalize, mlp_metadata.pad_size, 
mlp_metadata.tp_size, mlp_metadata.ep_size, + mlp_metadata.tp_rank, mlp_metadata.ep_rank, mlp_metadata.tp_group, mlp_metadata.ep_group, + mlp_metadata.moe_type, mlp_metadata.x_active_mask) -class DlinferFusedMoEBuilder(FusedMoEBuilder): +class DlinferFusedMoEBuilder(FusedMoEBuilder[DlinferMLPMetadata]): """Dlinfer fused moe builder.""" @staticmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 16eb604ccd..0e39907f85 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -67,6 +67,11 @@ def get_attention_metadata_cls(): from .attention import DlinferAttentionMetadata return DlinferAttentionMetadata + @staticmethod + def get_mlp_metadata_cls(): + from .moe import DlinferMLPMetadata + return DlinferMLPMetadata + @staticmethod def get_k_block_shape( block_size: int, diff --git a/lmdeploy/pytorch/backends/moe.py b/lmdeploy/pytorch/backends/moe.py index 5b33b97da7..ea945b7550 100644 --- a/lmdeploy/pytorch/backends/moe.py +++ b/lmdeploy/pytorch/backends/moe.py @@ -1,13 +1,23 @@ # Copyright (c) OpenMMLab. All rights reserved. import functools from abc import ABC, abstractmethod -from typing import Callable, List, Optional +from dataclasses import dataclass +from typing import Callable, Generic, List, Optional, TypeVar import torch import torch.distributed as dist -class SoftmaxTopKImpl(ABC): +@dataclass +class MLPMetadata: + """Base MLP metadata.""" + ... 
+ + +T = TypeVar('T', bound=MLPMetadata) + + +class SoftmaxTopKImpl(ABC, Generic[T]): """Softmax topk implementation api.""" @staticmethod @@ -22,7 +32,7 @@ def forward(self, x: torch.Tensor): raise NotImplementedError -class SoftmaxTopKBuilder(ABC): +class SoftmaxTopKBuilder(ABC, Generic[T]): """Softmax topk implementation builder.""" @staticmethod @@ -32,7 +42,7 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): raise NotImplementedError -class FusedMoEImpl(ABC): +class FusedMoEImpl(ABC, Generic[T]): """Fused moe implementation.""" def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): @@ -58,7 +68,7 @@ def forward(self, raise NotImplementedError -class FusedMoEBuilder(ABC): +class FusedMoEBuilder(ABC, Generic[T]): """Fused moe builder.""" @staticmethod diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 79790ff4d7..7b226d7ff4 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from dlinfer.utils.type_annotation import DlinferDistContext - from ..default import multinomial_sampling, per_channel_quant from .apply_rotary_pos_emb import apply_rotary_pos_emb from .awq_kernels import awq_linear @@ -13,7 +11,6 @@ from .rms_norm import rms_norm __all__ = [ - 'DlinferDistContext', 'rms_norm', 'apply_rotary_pos_emb', 'awq_linear', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index ef9ba53402..e0cea0b503 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,9 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops +import torch.distributed as dist +from dlinfer.utils.type_annotation import MoeType from torch import Tensor -from . 
import DlinferDistContext - def fused_moe( hidden_states: Tensor, @@ -13,8 +13,16 @@ def fused_moe( topk_ids: Tensor, topk: int, renormalize: bool, - dist_ctx: DlinferDistContext, + pad_size: int, + tp_size: int, + ep_size: int, + tp_rank: int, + ep_rank: int, + tp_group: dist.ProcessGroup, + ep_group: dist.ProcessGroup, + moe_type: MoeType, + x_active_mask: Tensor, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, - dist_ctx) + pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index b57e33afea..db71e87787 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -2,9 +2,9 @@ import dlinfer.ops as ext_ops from torch import Tensor -from . import DlinferDistContext - -def moe_gating_topk_softmax(router_logits: Tensor, topk: int, dist_ctx: DlinferDistContext): - routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, dist_ctx) +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, max_tokens_across_dp: int, pad_size: int, tp_size: int, + ep_size: int, tp_rank: int) -> tuple[Tensor, Tensor]: + routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, max_tokens_across_dp, + pad_size, tp_size, ep_size, tp_rank) return routing_weights, selected_experts diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 80e0540e6d..8e351428c1 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -350,6 +350,7 @@ class StepContext: input_multimodals: List[MultiModalTensor] = None vision_inputs: VisionModelInputs = None attn_metadata: Any = None + mlp_metadata: Any = None cross_seqlens: torch.LongTensor = 
None cross_kv_seqlens: torch.LongTensor = None cross_attn_metadata: Any = None diff --git a/lmdeploy/pytorch/models/qwen3_moe.py b/lmdeploy/pytorch/models/qwen3_moe.py index 9d50cbb86b..66076de6dd 100644 --- a/lmdeploy/pytorch/models/qwen3_moe.py +++ b/lmdeploy/pytorch/models/qwen3_moe.py @@ -225,12 +225,13 @@ def forward( self, hidden_states: torch.Tensor, all_routed_experts: torch.Tensor = None, + mlp_metadata: Any = None, ): """forward.""" batch_size, sequence_length, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) router_logits = self.gate(hidden_states) - topk_weights, topk_ids = self.softmax_topk(router_logits) + topk_weights, topk_ids = self.softmax_topk(router_logits, mlp_metadata) if all_routed_experts is not None: all_routed_experts[:, self.layer_idx, :] = topk_ids if get_dist_manager().current_context().dist_config.enable_eplb: @@ -239,6 +240,7 @@ def forward( hidden_states, topk_weights, topk_ids, + mlp_metadata, ) out_states = out_states.reshape(batch_size, sequence_length, -1) @@ -284,6 +286,7 @@ def forward( past_key_value: Optional[List[torch.FloatTensor]], residual: Optional[torch.Tensor] = None, attn_metadata: Any = None, + mlp_metadata: Any = None, all_routed_experts: torch.Tensor = None, ): @@ -303,7 +306,7 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states, all_routed_experts=all_routed_experts) + hidden_states = self.mlp(hidden_states, all_routed_experts=all_routed_experts, mlp_metadata=mlp_metadata) outputs = (hidden_states, residual) return outputs @@ -349,6 +352,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, all_routed_experts: torch.Tensor = None, ): @@ -375,6 +379,7 @@ def forward( past_key_value=past_key_value, 
residual=residual, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, all_routed_experts=all_routed_experts, ) @@ -430,6 +435,7 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, ): @@ -450,6 +456,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, all_routed_experts=all_routed_experts, ) @@ -476,6 +483,7 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata + mlp_metadata = context.mlp_metadata # process vision embeddings vision_embeddings = context.input_embeddings @@ -491,6 +499,7 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, ) diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py index 60c3617ffe..5d81c8c327 100644 --- a/lmdeploy/pytorch/models/qwen3_vl.py +++ b/lmdeploy/pytorch/models/qwen3_vl.py @@ -102,6 +102,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack @@ -143,6 +144,7 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, ) # add visual features to the hidden states of first several layers @@ -497,6 +499,7 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, 
pixel_values: torch.Tensor = None, @@ -541,6 +544,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, # args for deepstack @@ -574,6 +578,7 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata + mlp_metadata = context.mlp_metadata pixel_values = None vis_cu_seqlens = None @@ -614,6 +619,7 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, pixel_values=pixel_values, diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py index 1dc7e32de9..7586db3d11 100644 --- a/lmdeploy/pytorch/models/qwen3_vl_moe.py +++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py @@ -33,6 +33,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, + mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack @@ -74,6 +75,7 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, + mlp_metadata=mlp_metadata, ) # add visual features to the hidden states of first several layers diff --git a/lmdeploy/pytorch/nn/moe/base.py b/lmdeploy/pytorch/nn/moe/base.py index 484dbbe492..2ae9db24fd 100644 --- a/lmdeploy/pytorch/nn/moe/base.py +++ b/lmdeploy/pytorch/nn/moe/base.py @@ -8,6 +8,7 @@ import lmdeploy.pytorch.distributed as dist from lmdeploy.pytorch.backends import OpType, get_backend +from lmdeploy.pytorch.backends.moe import MLPMetadata from lmdeploy.pytorch.config import TPMode from lmdeploy.pytorch.distributed import get_dist_manager, 
get_tp_world_rank from lmdeploy.pytorch.model_inputs import get_step_ctx_manager @@ -29,9 +30,9 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): impl_builder = get_backend().get_layer_impl_builder(OpType.SoftmaxTopK) self.impl = impl_builder.build(top_k, dim, n_groups=n_groups) - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor, mlp_metadata: MLPMetadata): """forward.""" - return self.impl.forward(x) + return self.impl.forward(x, mlp_metadata) def update_dims(hidden_dim: int, ffn_dim: int): @@ -296,7 +297,8 @@ def forward_dptp(self) -> MoEForwardDPTP: """Forward dptp.""" return self._forward_dptp - def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor): + def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor, + mlp_metadata: MLPMetadata): """Default forward.""" state = { 'hidden_states': hidden_states, @@ -305,16 +307,21 @@ def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tenso 'moe_type': MoeType.Default, } recv_state = self.dispatch(state) + recv_state.update({'mlp_metadata': mlp_metadata}) gemm_state = self.gemm(recv_state) out_state = self.combine(gemm_state) return out_state['hidden_states'] - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor): + def forward(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_idx: torch.LongTensor, + mlp_metadata: MLPMetadata = None): """forward.""" if self.tp > 1 and self.tp_mode == TPMode.DP_TP: return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx) else: - return self.forward_default(hidden_states, topk_weights, topk_idx) + return self.forward_default(hidden_states, topk_weights, topk_idx, mlp_metadata) def renormalize(self, topk_weights): """renormalize.""" diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 
0633aa001a..358674a466 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -298,12 +298,14 @@ def gemm(self, state: Dict): hidden_states = state['hidden_states'] topk_weights = state['topk_weights'] topk_ids = state['topk_idx'] + mlp_metadata = state['mlp_metadata'] hidden_states = self.impl.forward(hidden_states, topk_weights, topk_ids, self.gate_up.weight, self.down.weight, + mlp_metadata, self.gate_up.bias, self.down.bias, self.expert_list, From 8980ba9ca75d0d2bbb9b2d785f5cf49bff57ac5a Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 14 Jan 2026 09:21:55 +0000 Subject: [PATCH 13/25] good eager mode --- .../backends/dlinfer/ascend/op_backend.py | 37 ++++++++++++++----- lmdeploy/pytorch/backends/dlinfer/moe.py | 3 +- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 3 +- .../kernels/dlinfer/flash_attention.py | 2 +- .../pytorch/kernels/dlinfer/pagedattention.py | 9 +++-- 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index df8e115b5b..85733e35b6 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -123,6 +123,7 @@ class AscendOpsBackend(DlinferOpsBackend): dist_meta: DistMeta = None graph_capture_sizes = None max_tokens_accros_dp = 0 + max_tokens_accros_dp = 0 @staticmethod def get_name() -> str: @@ -308,10 +309,11 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group): # get pad_size paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size pad_size = paded_size - num_tokens - # get x_active_mask - x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) - if pad_size > 0: - x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + # # get x_active_mask + # x_active_mask = torch.ones(num_tokens, dtype=torch.bool, 
device=torch.npu.current_device()) + # if pad_size > 0: + # x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + x_active_mask = torch.ones(tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) return num_tokens, max_tokens_across_dp, pad_size, x_active_mask @lru_cache @@ -320,27 +322,39 @@ def init_mc2_token_capacity(tp_size): num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size return num_tokens_per_tp_rank * tp_size - def select_moe_type(num_tokens, dp_size, tp_size, ep_size): + def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): if ep_size <= 1: return MoeType.ALLGATHER mc2_token_capacity = init_mc2_token_capacity(tp_size) is_graph = cls.enable_graph and step_context.is_decoding if is_graph: import math - num_tokens = math.ceil(num_tokens / tp_size) * tp_size + max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size if SocVersion.is_A2(): - if num_tokens <= mc2_token_capacity and dp_size * tp_size >= 16: + if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16: return MoeType.MC2 else: return MoeType.ALLGATHER elif SocVersion.is_A3(): - if num_tokens <= mc2_token_capacity: + if max_tokens_across_dp <= mc2_token_capacity: return MoeType.MC2 else: return MoeType.ALLTOALL else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') + def get_x_active_mask(num_tokens, pad_size, tp_size, tp_rank, moe_type): + if moe_type in {MoeType.MC2, MoeType.ALLTOALL}: + x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) + else: + return None + # if pad_size > 0: + # x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + # if tp_size > 1: + # split_x_active_mask = torch.tensor_split(x_active_mask, tp_size, dim=0) + # x_active_mask = split_x_active_mask[tp_rank] + return x_active_mask + q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = 
get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, @@ -385,13 +399,15 @@ def select_moe_type(num_tokens, dp_size, tp_size, ep_size): ) step_context.attn_metadata = attn_metadata - get_dist_meta() + cls.dist_meta = get_dist_meta() num_tokens, max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size, cls.dist_meta.ep_group) - moe_type = select_moe_type(num_tokens, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) + moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, + cls.dist_meta.ep_size) mlp_meta_cls = cls.get_mlp_metadata_cls() + cls.max_tokens_accros_dp = max_tokens_across_dp mlp_metadata = mlp_meta_cls( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, @@ -441,6 +457,7 @@ def device_count(): @staticmethod def support_ray(): """Support ray.""" + # return False if not _envs.ascend_set_rt_visable_devices_by_ray: os.environ['RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES'] = '1' return True diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 3c552711e7..10aab3c213 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -4,9 +4,8 @@ from typing import Callable, List import torch -from dlinfer.utils.type_annotation import MoeType -from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import MoeType, fused_moe, moe_gating_topk_softmax from ..moe import FusedMoEBuilder, FusedMoEImpl, MLPMetadata, SoftmaxTopKBuilder, SoftmaxTopKImpl diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 7b226d7ff4..f9ea874ae5 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ 
b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -4,7 +4,7 @@ from .awq_kernels import awq_linear from .fill_kv_cache import fill_kv_cache from .flash_attention import flash_attention_fwd -from .fused_moe import fused_moe +from .fused_moe import MoeType, fused_moe from .linear import linear from .moe_gating_topk_softmax import moe_gating_topk_softmax from .pagedattention import paged_attention_fwd @@ -15,6 +15,7 @@ 'apply_rotary_pos_emb', 'awq_linear', 'fill_kv_cache', + 'MoeType', 'fused_moe', 'paged_attention_fwd', 'flash_attention_fwd', diff --git a/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py b/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py index a1b4c659d1..7f3037b247 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops -from dlinfer.utils.type_annotation import Tensor +from torch import Tensor def flash_attention_fwd( diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py index 13f4e12a58..8996508aff 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Sequence + import dlinfer.ops as ext_ops -import torch -from dlinfer.utils.type_annotation import Optional, Sequence, Tensor +from torch import Tensor def prefill_attention( @@ -111,8 +112,8 @@ def paged_token_attention( def paged_attention_fwd( query_states: Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, + key_states: Tensor, + value_states: Tensor, attn_output: Tensor, key_cache: Tensor, value_cache: Tensor, From a4f003ba342a6a0a34da2cff0a716da139d83cbd Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 14 Jan 2026 10:48:34 +0000 Subject: [PATCH 14/25] good graph mode --- .../backends/dlinfer/ascend/op_backend.py | 28 +++---------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 85733e35b6..ab57dc97bb 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -122,8 +122,6 @@ class AscendOpsBackend(DlinferOpsBackend): max_batches = None dist_meta: DistMeta = None graph_capture_sizes = None - max_tokens_accros_dp = 0 - max_tokens_accros_dp = 0 @staticmethod def get_name() -> str: @@ -309,12 +307,9 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group): # get pad_size paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size pad_size = paded_size - num_tokens - # # get x_active_mask - # x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) - # if pad_size > 0: - # x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) + # get x_active_mask x_active_mask = torch.ones(tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - return num_tokens, max_tokens_across_dp, pad_size, x_active_mask + return max_tokens_across_dp, pad_size, x_active_mask @lru_cache def init_mc2_token_capacity(tp_size): @@ -343,18 +338,6 
@@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') - def get_x_active_mask(num_tokens, pad_size, tp_size, tp_rank, moe_type): - if moe_type in {MoeType.MC2, MoeType.ALLTOALL}: - x_active_mask = torch.ones(num_tokens, dtype=torch.bool, device=torch.npu.current_device()) - else: - return None - # if pad_size > 0: - # x_active_mask = torch.nn.functional.pad(x_active_mask, (0, pad_size), value=False) - # if tp_size > 1: - # split_x_active_mask = torch.tensor_split(x_active_mask, tp_size, dim=0) - # x_active_mask = split_x_active_mask[tp_rank] - return x_active_mask - q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, @@ -400,14 +383,11 @@ def get_x_active_mask(num_tokens, pad_size, tp_size, tp_rank, moe_type): step_context.attn_metadata = attn_metadata cls.dist_meta = get_dist_meta() - num_tokens, max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, - cls.dist_meta.tp_size, - cls.dist_meta.ep_size, - cls.dist_meta.ep_group) + max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, + cls.dist_meta.ep_size, cls.dist_meta.ep_group) moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) mlp_meta_cls = cls.get_mlp_metadata_cls() - cls.max_tokens_accros_dp = max_tokens_across_dp mlp_metadata = mlp_meta_cls( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, From 74997d3d11d5450e131ce2d040ca43ecf160bc0f Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 15 Jan 2026 02:52:01 +0000 Subject: [PATCH 15/25] good dp*tp+ep feature --- lmdeploy/pytorch/backends/dlinfer/moe.py | 8 +++++++- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 4 +++- 2 files 
changed, 10 insertions(+), 2 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 10aab3c213..173f955c56 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -64,6 +64,11 @@ def __init__(self, self.renormalize = renormalize self.ep_size = ep_size self.ep_group = ep_group + self.expert_ids_per_ep_rank = torch.tensor( + [i % (self.num_experts // self.ep_size) for i in range(num_experts)], + dtype=torch.int32, + device=torch.npu.current_device(), + ) def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" @@ -96,10 +101,11 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None + return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, self.renormalize, mlp_metadata.pad_size, mlp_metadata.tp_size, mlp_metadata.ep_size, mlp_metadata.tp_rank, mlp_metadata.ep_rank, mlp_metadata.tp_group, mlp_metadata.ep_group, - mlp_metadata.moe_type, mlp_metadata.x_active_mask) + mlp_metadata.moe_type, mlp_metadata.x_active_mask, self.expert_ids_per_ep_rank) class DlinferFusedMoEBuilder(FusedMoEBuilder[DlinferMLPMetadata]): diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index e0cea0b503..ea94ebd671 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -22,7 +22,9 @@ def fused_moe( ep_group: dist.ProcessGroup, moe_type: MoeType, x_active_mask: Tensor, + expert_ids_per_ep_rank: Tensor, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, - pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask) + pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask, + expert_ids_per_ep_rank) 
From 40044668d747879f6b3ed8a797478f93d6d066a4 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 16 Jan 2026 03:39:20 +0000 Subject: [PATCH 16/25] fix tp err --- lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index ab57dc97bb..3b85ab1cc9 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -281,7 +281,7 @@ def get_dist_meta(): def get_tokens_info(dp_size, tp_size, ep_size, ep_group): if ep_size <= 1: - return 0, 0, 0, None + return 0, 0, 0 # get runtime num_tokens is_graph = cls.enable_graph and step_context.is_decoding if is_graph: From 1166d22ae4d0146bb27d9912811504ea4f654da3 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 22 Jan 2026 11:39:31 +0000 Subject: [PATCH 17/25] update pad_size --- .../backends/dlinfer/ascend/op_backend.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 3b85ab1cc9..fb4cf03026 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import itertools +import math import os import re from dataclasses import dataclass @@ -304,12 +305,24 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group): max_tokens_across_dp = torch.max(num_tokens_buffer).item() else: max_tokens_across_dp = num_tokens - # get pad_size - paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size - pad_size = paded_size - num_tokens + # # get pad_size + # paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size + # pad_size = paded_size - num_tokens # get x_active_mask x_active_mask = torch.ones(tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - return max_tokens_across_dp, pad_size, x_active_mask + return num_tokens, max_tokens_across_dp, x_active_mask + + def get_pad_size(num_tokens, max_tokens_across_dp, tp_size, moe_type): + if moe_type == MoeType.MC2: + paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size + pad_size = paded_size - num_tokens + elif moe_type == MoeType.ALLTOALL: + pad_size = tp_size - num_tokens + elif moe_type == MoeType.ALLGATHER: + pad_size = max_tokens_across_dp - num_tokens + else: + pad_size = 0 + return pad_size @lru_cache def init_mc2_token_capacity(tp_size): @@ -383,10 +396,11 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): step_context.attn_metadata = attn_metadata cls.dist_meta = get_dist_meta() - max_tokens_across_dp, pad_size, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, - cls.dist_meta.ep_size, cls.dist_meta.ep_group) + num_tokens, max_tokens_across_dp, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, + cls.dist_meta.ep_size, cls.dist_meta.ep_group) moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) + pad_size = get_pad_size(num_tokens, max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) mlp_meta_cls = cls.get_mlp_metadata_cls() mlp_metadata = mlp_meta_cls( 
max_tokens_across_dp=max_tokens_across_dp, From fec438f36379159bc9b2a17010156224ddc3ccf5 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 23 Jan 2026 10:40:57 +0000 Subject: [PATCH 18/25] optimize ep moe --- .../backends/dlinfer/ascend/op_backend.py | 72 ++++++++++--------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index fb4cf03026..9ec6a5f74f 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -283,46 +283,30 @@ def get_dist_meta(): def get_tokens_info(dp_size, tp_size, ep_size, ep_group): if ep_size <= 1: return 0, 0, 0 - # get runtime num_tokens + # get runtime_tokens_current_rank is_graph = cls.enable_graph and step_context.is_decoding if is_graph: from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size - tokens_current_rank = step_context.q_seqlens.shape[0] - num_tokens = min(get_ascend_compatible_size(tokens_current_rank), cls.max_batches) + actual_tokens_current_rank = step_context.q_seqlens.shape[0] + runtime_tokens_current_rank = min(get_ascend_compatible_size(actual_tokens_current_rank), + cls.max_batches) else: - tokens_current_rank = step_context.q_seqlens.sum().item() - num_tokens = tokens_current_rank + actual_tokens_current_rank = step_context.q_seqlens.sum().item() + runtime_tokens_current_rank = actual_tokens_current_rank # get max_tokens_across_dp if dp_size > 1: - num_tokens_tensor = torch.tensor([num_tokens], - dtype=step_context.q_seqlens.dtype, - device=torch.npu.current_device()) + runtime_tokens_tensor = torch.tensor([runtime_tokens_current_rank], + dtype=step_context.q_seqlens.dtype, + device=torch.npu.current_device()) world_size = dp_size * tp_size - num_tokens_buffer = torch.zeros([world_size], - dtype=step_context.q_seqlens.dtype, - device=torch.npu.current_device()) - 
dist.all_gather_into_tensor(num_tokens_buffer, num_tokens_tensor, ep_group) - max_tokens_across_dp = torch.max(num_tokens_buffer).item() + runtime_tokens_buffer = torch.zeros([world_size], + dtype=step_context.q_seqlens.dtype, + device=torch.npu.current_device()) + dist.all_gather_into_tensor(runtime_tokens_buffer, runtime_tokens_tensor, ep_group) + max_tokens_across_dp = torch.max(runtime_tokens_buffer).item() else: - max_tokens_across_dp = num_tokens - # # get pad_size - # paded_size = (max_tokens_across_dp + tp_size - 1) // tp_size * tp_size - # pad_size = paded_size - num_tokens - # get x_active_mask - x_active_mask = torch.ones(tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - return num_tokens, max_tokens_across_dp, x_active_mask - - def get_pad_size(num_tokens, max_tokens_across_dp, tp_size, moe_type): - if moe_type == MoeType.MC2: - paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size - pad_size = paded_size - num_tokens - elif moe_type == MoeType.ALLTOALL: - pad_size = tp_size - num_tokens - elif moe_type == MoeType.ALLGATHER: - pad_size = max_tokens_across_dp - num_tokens - else: - pad_size = 0 - return pad_size + max_tokens_across_dp = runtime_tokens_current_rank + return actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp @lru_cache def init_mc2_token_capacity(tp_size): @@ -351,6 +335,23 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') + def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, tp_size, + moe_type): + x_active_mask = None + if moe_type == MoeType.MC2: + paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size + pad_size = paded_size - runtime_tokens_current_rank + x_active_mask = torch.ones(actual_tokens_current_rank, + dtype=torch.bool, + device=torch.npu.current_device()) + elif moe_type == MoeType.ALLTOALL: + pad_size = 
tp_size - runtime_tokens_current_rank + elif moe_type == MoeType.ALLGATHER: + pad_size = max_tokens_across_dp - runtime_tokens_current_rank + else: + pad_size = 0 + return pad_size, x_active_mask + q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, @@ -396,11 +397,12 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): step_context.attn_metadata = attn_metadata cls.dist_meta = get_dist_meta() - num_tokens, max_tokens_across_dp, x_active_mask = get_tokens_info(cls.dist_meta.dp_size, cls.dist_meta.tp_size, - cls.dist_meta.ep_size, cls.dist_meta.ep_group) + actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp = get_tokens_info( + cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size, cls.dist_meta.ep_group) moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size) - pad_size = get_pad_size(num_tokens, max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) + pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, + max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) mlp_meta_cls = cls.get_mlp_metadata_cls() mlp_metadata = mlp_meta_cls( max_tokens_across_dp=max_tokens_across_dp, From d7177a105c55e0896e240304c11fa385a9ae40fc Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 28 Jan 2026 02:33:06 +0000 Subject: [PATCH 19/25] opt ep moe --- .../pytorch/backends/dlinfer/ascend/op_backend.py | 11 +++++++++++ lmdeploy/pytorch/backends/dlinfer/moe.py | 4 +++- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 9ec6a5f74f..26dbf5b172 100644 --- 
a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -352,6 +352,15 @@ def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_to pad_size = 0 return pad_size, x_active_mask + @lru_cache(maxsize=1) + def get_moe_group_name(group): + if group is None: + return None + local_rank = torch.distributed.get_rank(group=group) + backend = group._get_backend(torch.device('npu')) + group_name = backend.get_hccl_comm_name(local_rank) + return group_name + q_seqlens_cpu, kv_seqlens_cpu, kv_seqlens_expanded = get_cpu_seqlens(step_context.is_decoding, is_unpaged_prefill) q_seqlens_list, kv_seqlens_list = get_list_seqlens(step_context.is_decoding, is_unpaged_prefill, q_seqlens_cpu, @@ -403,6 +412,7 @@ def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_to cls.dist_meta.ep_size) pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) + moe_group_name = get_moe_group_name(cls.dist_meta.ep_group) mlp_meta_cls = cls.get_mlp_metadata_cls() mlp_metadata = mlp_meta_cls( max_tokens_across_dp=max_tokens_across_dp, @@ -416,6 +426,7 @@ def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_to ep_group=cls.dist_meta.ep_group, moe_type=moe_type, x_active_mask=x_active_mask, + moe_group_name=moe_group_name, ) step_context.mlp_metadata = mlp_metadata return step_context diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 173f955c56..fc12c1679a 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -23,6 +23,7 @@ class DlinferMLPMetadata(MLPMetadata): ep_group: torch.distributed.ProcessGroup = None moe_type: MoeType = MoeType.UNDEFINED x_active_mask: torch.Tensor = None + moe_group_name: str = None class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl[DlinferMLPMetadata]): @@ 
-105,7 +106,8 @@ def forward(self, return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, self.renormalize, mlp_metadata.pad_size, mlp_metadata.tp_size, mlp_metadata.ep_size, mlp_metadata.tp_rank, mlp_metadata.ep_rank, mlp_metadata.tp_group, mlp_metadata.ep_group, - mlp_metadata.moe_type, mlp_metadata.x_active_mask, self.expert_ids_per_ep_rank) + mlp_metadata.moe_type, mlp_metadata.x_active_mask, mlp_metadata.moe_group_name, + self.expert_ids_per_ep_rank) class DlinferFusedMoEBuilder(FusedMoEBuilder[DlinferMLPMetadata]): diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index ea94ebd671..decce941ac 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -22,9 +22,10 @@ def fused_moe( ep_group: dist.ProcessGroup, moe_type: MoeType, x_active_mask: Tensor, + moe_group_name: str, expert_ids_per_ep_rank: Tensor, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask, - expert_ids_per_ep_rank) + moe_group_name, expert_ids_per_ep_rank) From 1d3325e206c09378e2efdefbbfc0dfa3be2a9f20 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Wed, 28 Jan 2026 07:56:14 +0000 Subject: [PATCH 20/25] fix ascend dptp --- lmdeploy/pytorch/nn/moe/base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/nn/moe/base.py b/lmdeploy/pytorch/nn/moe/base.py index 2ae9db24fd..dc7d7ab8a0 100644 --- a/lmdeploy/pytorch/nn/moe/base.py +++ b/lmdeploy/pytorch/nn/moe/base.py @@ -135,7 +135,8 @@ def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, topk_weights: to cur_out = self.gemm_func(hidden_states, topk_weights, topk_ids) return self.reduce_scatter(cur_out, output_states, tp_sizes) - def forward(self, hidden_states: 
torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor): + def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + mlp_metadata: MLPMetadata): """forward.""" def __slice_tensor(tensor: torch.Tensor, slice_size: int): @@ -177,6 +178,7 @@ def __slice_and_gather(): # pre cur_inputs = __slice_and_gather() + cur_inputs.update(dict(mlp_metadata=mlp_metadata)) out_handles = [] # main loop @@ -185,6 +187,7 @@ def __slice_and_gather(): _, handle = self._gemm_and_reduce_scatter(**cur_inputs) out_handles.append(handle) cur_inputs = next_inputs + cur_inputs.update(dict(mlp_metadata=mlp_metadata)) # post _, handle = self._gemm_and_reduce_scatter(**cur_inputs) @@ -259,12 +262,13 @@ def init_dist_args(self, all_reduce: bool): if self.tp > 1 and self.tp_mode == TPMode.DP_TP: - def __gemm_func(hidden_states, topk_weights, topk_ids): + def __gemm_func(hidden_states, topk_weights, topk_ids, mlp_metadata): return self.gemm( dict( hidden_states=hidden_states, topk_weights=topk_weights, topk_idx=topk_ids, + mlp_metadata=mlp_metadata, moe_type=MoeType.Default, ))['hidden_states'] @@ -319,7 +323,7 @@ def forward(self, mlp_metadata: MLPMetadata = None): """forward.""" if self.tp > 1 and self.tp_mode == TPMode.DP_TP: - return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx) + return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx, mlp_metadata) else: return self.forward_default(hidden_states, topk_weights, topk_idx, mlp_metadata) From b1f94e43673257fb9f237f0481d9208c12765118 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 30 Jan 2026 03:48:41 +0000 Subject: [PATCH 21/25] refactor code --- .../backends/dlinfer/ascend/op_backend.py | 8 ++--- lmdeploy/pytorch/backends/dlinfer/moe.py | 30 ++++++++++--------- .../pytorch/backends/dlinfer/op_backend.py | 5 ---- lmdeploy/pytorch/backends/moe.py | 20 ++++--------- lmdeploy/pytorch/model_inputs.py | 1 - lmdeploy/pytorch/models/qwen3_moe.py | 13 ++------ 
lmdeploy/pytorch/models/qwen3_vl.py | 6 ---- lmdeploy/pytorch/models/qwen3_vl_moe.py | 2 -- lmdeploy/pytorch/nn/moe/base.py | 27 +++++------------ lmdeploy/pytorch/nn/moe/default.py | 2 -- 10 files changed, 35 insertions(+), 79 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 26dbf5b172..f303e7a63d 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -16,7 +16,7 @@ from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger -from ..moe import MoeType +from ..moe import MOEMetadata, MoeType from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') @@ -413,8 +413,8 @@ def get_moe_group_name(group): pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) moe_group_name = get_moe_group_name(cls.dist_meta.ep_group) - mlp_meta_cls = cls.get_mlp_metadata_cls() - mlp_metadata = mlp_meta_cls( + + moe_metadata = MOEMetadata( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, dp_size=cls.dist_meta.dp_size, @@ -428,7 +428,7 @@ def get_moe_group_name(group): x_active_mask=x_active_mask, moe_group_name=moe_group_name, ) - step_context.mlp_metadata = mlp_metadata + step_context.moe_metadata = moe_metadata return step_context @staticmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index fc12c1679a..aa4331a529 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -6,12 +6,13 @@ import torch from lmdeploy.pytorch.kernels.dlinfer import MoeType, fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.model_inputs import get_step_ctx_manager -from ..moe import FusedMoEBuilder, FusedMoEImpl, MLPMetadata, SoftmaxTopKBuilder, SoftmaxTopKImpl +from ..moe 
import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl @dataclass -class DlinferMLPMetadata(MLPMetadata): +class MOEMetadata: max_tokens_across_dp: int = 1 pad_size: int = 0 dp_size: int = 1 @@ -26,7 +27,7 @@ class DlinferMLPMetadata(MLPMetadata): moe_group_name: str = None -class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl[DlinferMLPMetadata]): +class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): """Dlinfer softmax topk implementation.""" def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): @@ -35,14 +36,15 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): if n_groups != -1: raise NotImplementedError('Group router not supported') - def forward(self, x: torch.Tensor, mlp_metada: DlinferMLPMetadata): - routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, mlp_metada.max_tokens_across_dp, - mlp_metada.pad_size, mlp_metada.tp_size, - mlp_metada.ep_size, mlp_metada.tp_rank) + def forward(self, x: torch.Tensor): + moe_metadata = get_step_ctx_manager().current_context().moe_metadata + routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, moe_metadata.max_tokens_across_dp, + moe_metadata.pad_size, moe_metadata.tp_size, + moe_metadata.ep_size, moe_metadata.tp_rank) return routing_weights, selected_experts -class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder[DlinferMLPMetadata]): +class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder): """Dlinfer softmax topk implementation builder.""" @staticmethod @@ -51,7 +53,7 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): return DlinferSoftmaxTopKImpl(top_k, dim, n_groups) -class DlinferFusedMoEImpl(FusedMoEImpl[DlinferMLPMetadata]): +class DlinferFusedMoEImpl(FusedMoEImpl): """Dlinfer fused moe implementation.""" def __init__(self, @@ -94,7 +96,6 @@ def forward(self, topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, down_weights: torch.Tensor, - mlp_metadata: DlinferMLPMetadata, gate_up_bias: torch.Tensor = None, down_bias: 
torch.Tensor = None, expert_list: List[int] = None, @@ -102,15 +103,16 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None + moe_metadata = get_step_ctx_manager().current_context().moe_metadata return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, mlp_metadata.pad_size, mlp_metadata.tp_size, mlp_metadata.ep_size, - mlp_metadata.tp_rank, mlp_metadata.ep_rank, mlp_metadata.tp_group, mlp_metadata.ep_group, - mlp_metadata.moe_type, mlp_metadata.x_active_mask, mlp_metadata.moe_group_name, + self.renormalize, moe_metadata.pad_size, moe_metadata.tp_size, moe_metadata.ep_size, + moe_metadata.tp_rank, moe_metadata.ep_rank, moe_metadata.tp_group, moe_metadata.ep_group, + moe_metadata.moe_type, moe_metadata.x_active_mask, moe_metadata.moe_group_name, self.expert_ids_per_ep_rank) -class DlinferFusedMoEBuilder(FusedMoEBuilder[DlinferMLPMetadata]): +class DlinferFusedMoEBuilder(FusedMoEBuilder): """Dlinfer fused moe builder.""" @staticmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 0e39907f85..16eb604ccd 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -67,11 +67,6 @@ def get_attention_metadata_cls(): from .attention import DlinferAttentionMetadata return DlinferAttentionMetadata - @staticmethod - def get_mlp_metadata_cls(): - from .moe import DlinferMLPMetadata - return DlinferMLPMetadata - @staticmethod def get_k_block_shape( block_size: int, diff --git a/lmdeploy/pytorch/backends/moe.py b/lmdeploy/pytorch/backends/moe.py index ea945b7550..5b33b97da7 100644 --- a/lmdeploy/pytorch/backends/moe.py +++ b/lmdeploy/pytorch/backends/moe.py @@ -1,23 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import functools from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import Callable, Generic, List, Optional, TypeVar +from typing import Callable, List, Optional import torch import torch.distributed as dist -@dataclass -class MLPMetadata: - """Base MLP metadata.""" - ... - - -T = TypeVar('T', bound=MLPMetadata) - - -class SoftmaxTopKImpl(ABC, Generic[T]): +class SoftmaxTopKImpl(ABC): """Softmax topk implementation api.""" @staticmethod @@ -32,7 +22,7 @@ def forward(self, x: torch.Tensor): raise NotImplementedError -class SoftmaxTopKBuilder(ABC, Generic[T]): +class SoftmaxTopKBuilder(ABC): """Softmax topk implementation builder.""" @staticmethod @@ -42,7 +32,7 @@ def build(top_k: int, dim: int = -1, n_groups: int = -1): raise NotImplementedError -class FusedMoEImpl(ABC, Generic[T]): +class FusedMoEImpl(ABC): """Fused moe implementation.""" def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): @@ -68,7 +58,7 @@ def forward(self, raise NotImplementedError -class FusedMoEBuilder(ABC, Generic[T]): +class FusedMoEBuilder(ABC): """Fused moe builder.""" @staticmethod diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 8e351428c1..80e0540e6d 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -350,7 +350,6 @@ class StepContext: input_multimodals: List[MultiModalTensor] = None vision_inputs: VisionModelInputs = None attn_metadata: Any = None - mlp_metadata: Any = None cross_seqlens: torch.LongTensor = None cross_kv_seqlens: torch.LongTensor = None cross_attn_metadata: Any = None diff --git a/lmdeploy/pytorch/models/qwen3_moe.py b/lmdeploy/pytorch/models/qwen3_moe.py index 66076de6dd..9d50cbb86b 100644 --- a/lmdeploy/pytorch/models/qwen3_moe.py +++ b/lmdeploy/pytorch/models/qwen3_moe.py @@ -225,13 +225,12 @@ def forward( self, hidden_states: torch.Tensor, all_routed_experts: torch.Tensor = None, - mlp_metadata: Any = None, ): """forward.""" 
batch_size, sequence_length, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) router_logits = self.gate(hidden_states) - topk_weights, topk_ids = self.softmax_topk(router_logits, mlp_metadata) + topk_weights, topk_ids = self.softmax_topk(router_logits) if all_routed_experts is not None: all_routed_experts[:, self.layer_idx, :] = topk_ids if get_dist_manager().current_context().dist_config.enable_eplb: @@ -240,7 +239,6 @@ def forward( hidden_states, topk_weights, topk_ids, - mlp_metadata, ) out_states = out_states.reshape(batch_size, sequence_length, -1) @@ -286,7 +284,6 @@ def forward( past_key_value: Optional[List[torch.FloatTensor]], residual: Optional[torch.Tensor] = None, attn_metadata: Any = None, - mlp_metadata: Any = None, all_routed_experts: torch.Tensor = None, ): @@ -306,7 +303,7 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states, all_routed_experts=all_routed_experts, mlp_metadata=mlp_metadata) + hidden_states = self.mlp(hidden_states, all_routed_experts=all_routed_experts) outputs = (hidden_states, residual) return outputs @@ -352,7 +349,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, all_routed_experts: torch.Tensor = None, ): @@ -379,7 +375,6 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, all_routed_experts=all_routed_experts, ) @@ -435,7 +430,6 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, ): @@ -456,7 +450,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, - 
mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, all_routed_experts=all_routed_experts, ) @@ -483,7 +476,6 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata - mlp_metadata = context.mlp_metadata # process vision embeddings vision_embeddings = context.input_embeddings @@ -499,7 +491,6 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, ) diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py index 5d81c8c327..60c3617ffe 100644 --- a/lmdeploy/pytorch/models/qwen3_vl.py +++ b/lmdeploy/pytorch/models/qwen3_vl.py @@ -102,7 +102,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack @@ -144,7 +143,6 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, ) # add visual features to the hidden states of first several layers @@ -499,7 +497,6 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, pixel_values: torch.Tensor = None, @@ -544,7 +541,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, # args for deepstack @@ -578,7 +574,6 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata - mlp_metadata = 
context.mlp_metadata pixel_values = None vis_cu_seqlens = None @@ -619,7 +614,6 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, pixel_values=pixel_values, diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py index 7586db3d11..1dc7e32de9 100644 --- a/lmdeploy/pytorch/models/qwen3_vl_moe.py +++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py @@ -33,7 +33,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, - mlp_metadata: Any = None, inputs_embeds: Optional[torch.FloatTensor] = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack @@ -75,7 +74,6 @@ def forward( past_key_value=past_key_value, residual=residual, attn_metadata=attn_metadata, - mlp_metadata=mlp_metadata, ) # add visual features to the hidden states of first several layers diff --git a/lmdeploy/pytorch/nn/moe/base.py b/lmdeploy/pytorch/nn/moe/base.py index dc7d7ab8a0..484dbbe492 100644 --- a/lmdeploy/pytorch/nn/moe/base.py +++ b/lmdeploy/pytorch/nn/moe/base.py @@ -8,7 +8,6 @@ import lmdeploy.pytorch.distributed as dist from lmdeploy.pytorch.backends import OpType, get_backend -from lmdeploy.pytorch.backends.moe import MLPMetadata from lmdeploy.pytorch.config import TPMode from lmdeploy.pytorch.distributed import get_dist_manager, get_tp_world_rank from lmdeploy.pytorch.model_inputs import get_step_ctx_manager @@ -30,9 +29,9 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): impl_builder = get_backend().get_layer_impl_builder(OpType.SoftmaxTopK) self.impl = impl_builder.build(top_k, dim, n_groups=n_groups) - def forward(self, x: torch.Tensor, mlp_metadata: MLPMetadata): + def forward(self, x: torch.Tensor): """forward.""" - return self.impl.forward(x, 
mlp_metadata) + return self.impl.forward(x) def update_dims(hidden_dim: int, ffn_dim: int): @@ -135,8 +134,7 @@ def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, topk_weights: to cur_out = self.gemm_func(hidden_states, topk_weights, topk_ids) return self.reduce_scatter(cur_out, output_states, tp_sizes) - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - mlp_metadata: MLPMetadata): + def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor): """forward.""" def __slice_tensor(tensor: torch.Tensor, slice_size: int): @@ -178,7 +176,6 @@ def __slice_and_gather(): # pre cur_inputs = __slice_and_gather() - cur_inputs.update(dict(mlp_metadata=mlp_metadata)) out_handles = [] # main loop @@ -187,7 +184,6 @@ def __slice_and_gather(): _, handle = self._gemm_and_reduce_scatter(**cur_inputs) out_handles.append(handle) cur_inputs = next_inputs - cur_inputs.update(dict(mlp_metadata=mlp_metadata)) # post _, handle = self._gemm_and_reduce_scatter(**cur_inputs) @@ -262,13 +258,12 @@ def init_dist_args(self, all_reduce: bool): if self.tp > 1 and self.tp_mode == TPMode.DP_TP: - def __gemm_func(hidden_states, topk_weights, topk_ids, mlp_metadata): + def __gemm_func(hidden_states, topk_weights, topk_ids): return self.gemm( dict( hidden_states=hidden_states, topk_weights=topk_weights, topk_idx=topk_ids, - mlp_metadata=mlp_metadata, moe_type=MoeType.Default, ))['hidden_states'] @@ -301,8 +296,7 @@ def forward_dptp(self) -> MoEForwardDPTP: """Forward dptp.""" return self._forward_dptp - def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor, - mlp_metadata: MLPMetadata): + def forward_default(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor): """Default forward.""" state = { 'hidden_states': hidden_states, @@ -311,21 +305,16 @@ def forward_default(self, hidden_states: torch.Tensor, 
topk_weights: torch.Tenso 'moe_type': MoeType.Default, } recv_state = self.dispatch(state) - recv_state.update({'mlp_metadata': mlp_metadata}) gemm_state = self.gemm(recv_state) out_state = self.combine(gemm_state) return out_state['hidden_states'] - def forward(self, - hidden_states: torch.Tensor, - topk_weights: torch.Tensor, - topk_idx: torch.LongTensor, - mlp_metadata: MLPMetadata = None): + def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_idx: torch.LongTensor): """forward.""" if self.tp > 1 and self.tp_mode == TPMode.DP_TP: - return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx, mlp_metadata) + return self.forward_dptp.forward(hidden_states, topk_weights, topk_idx) else: - return self.forward_default(hidden_states, topk_weights, topk_idx, mlp_metadata) + return self.forward_default(hidden_states, topk_weights, topk_idx) def renormalize(self, topk_weights): """renormalize.""" diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 358674a466..0633aa001a 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -298,14 +298,12 @@ def gemm(self, state: Dict): hidden_states = state['hidden_states'] topk_weights = state['topk_weights'] topk_ids = state['topk_idx'] - mlp_metadata = state['mlp_metadata'] hidden_states = self.impl.forward(hidden_states, topk_weights, topk_ids, self.gate_up.weight, self.down.weight, - mlp_metadata, self.gate_up.bias, self.down.bias, self.expert_list, From 2f7710807c1e775e138af62d2fc5e38cb0984ae5 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 5 Feb 2026 09:10:29 +0000 Subject: [PATCH 22/25] refactor code --- docker/Dockerfile_ascend_a3 | 3 +- .../backends/dlinfer/ascend/op_backend.py | 21 ++++---- lmdeploy/pytorch/backends/dlinfer/moe.py | 50 +++++++------------ lmdeploy/pytorch/kernels/dlinfer/__init__.py | 5 +- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 19 ++----- .../dlinfer/moe_gating_topk_softmax.py | 8 
+-- requirements/runtime_ascend.txt | 6 +-- 7 files changed, 43 insertions(+), 69 deletions(-) diff --git a/docker/Dockerfile_ascend_a3 b/docker/Dockerfile_ascend_a3 index d8fc152ed1..1d8064a129 100644 --- a/docker/Dockerfile_ascend_a3 +++ b/docker/Dockerfile_ascend_a3 @@ -4,7 +4,7 @@ ARG ASCEND_DEVICE_TYPE=ascend_a3 ARG ASCEND_HUB=swr.cn-south-1.myhuaweicloud.com/ascendhub -FROM ${ASCEND_HUB}/cann:8.3.rc1-a3-openeuler24.03-py3.11 AS ascend_a3_base +FROM ${ASCEND_HUB}/cann:8.5.0-a3-openeuler24.03-py3.11 AS ascend_a3_base FROM ${ASCEND_DEVICE_TYPE}_base AS builder ENV DEBIAN_FRONTEND=noninteractive @@ -22,6 +22,5 @@ ARG LMDEPLOY_TAG=main RUN --mount=type=cache,target=/root/.cache \ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \ - pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0 torchvision==0.23.0 && \ TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \ LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG} diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index f303e7a63d..ffa3da6cb4 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -16,7 +16,7 @@ from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger -from ..moe import MOEMetadata, MoeType +from ..moe import DlinferMoeMetada, DlinferMoeType from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') @@ -316,7 +316,7 @@ def init_mc2_token_capacity(tp_size): def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): if ep_size <= 1: - return MoeType.ALLGATHER + return DlinferMoeType.ALLGATHER mc2_token_capacity = init_mc2_token_capacity(tp_size) is_graph = cls.enable_graph 
and step_context.is_decoding if is_graph: @@ -324,29 +324,29 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size if SocVersion.is_A2(): if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16: - return MoeType.MC2 + return DlinferMoeType.MC2 else: - return MoeType.ALLGATHER + return DlinferMoeType.ALLGATHER elif SocVersion.is_A3(): if max_tokens_across_dp <= mc2_token_capacity: - return MoeType.MC2 + return DlinferMoeType.MC2 else: - return MoeType.ALLTOALL + return DlinferMoeType.ALLTOALL else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, tp_size, moe_type): x_active_mask = None - if moe_type == MoeType.MC2: + if moe_type == DlinferMoeType.MC2: paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size pad_size = paded_size - runtime_tokens_current_rank x_active_mask = torch.ones(actual_tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - elif moe_type == MoeType.ALLTOALL: + elif moe_type == DlinferMoeType.ALLTOALL: pad_size = tp_size - runtime_tokens_current_rank - elif moe_type == MoeType.ALLGATHER: + elif moe_type == DlinferMoeType.ALLGATHER: pad_size = max_tokens_across_dp - runtime_tokens_current_rank else: pad_size = 0 @@ -414,7 +414,7 @@ def get_moe_group_name(group): max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) moe_group_name = get_moe_group_name(cls.dist_meta.ep_group) - moe_metadata = MOEMetadata( + moe_metadata = DlinferMoeMetada( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, dp_size=cls.dist_meta.dp_size, @@ -464,7 +464,6 @@ def device_count(): @staticmethod def support_ray(): """Support ray.""" - # return False if not _envs.ascend_set_rt_visable_devices_by_ray: os.environ['RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES'] = '1' return True diff --git 
a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index aa4331a529..a347dba27b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,32 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -from dataclasses import dataclass from typing import Callable, List import torch -from lmdeploy.pytorch.kernels.dlinfer import MoeType, fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetada # noqa: F401 +from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeType # noqa: F401 +from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax from lmdeploy.pytorch.model_inputs import get_step_ctx_manager from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl -@dataclass -class MOEMetadata: - max_tokens_across_dp: int = 1 - pad_size: int = 0 - dp_size: int = 1 - tp_size: int = 1 - ep_size: int = 1 - tp_rank: int = 0 - ep_rank: int = 0 - tp_group: torch.distributed.ProcessGroup = None - ep_group: torch.distributed.ProcessGroup = None - moe_type: MoeType = MoeType.UNDEFINED - x_active_mask: torch.Tensor = None - moe_group_name: str = None - - class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): """Dlinfer softmax topk implementation.""" @@ -37,10 +22,9 @@ def __init__(self, top_k: int, dim: int = -1, n_groups: int = -1): raise NotImplementedError('Group router not supported') def forward(self, x: torch.Tensor): - moe_metadata = get_step_ctx_manager().current_context().moe_metadata - routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, moe_metadata.max_tokens_across_dp, - moe_metadata.pad_size, moe_metadata.tp_size, - moe_metadata.ep_size, moe_metadata.tp_rank) + step_context = get_step_ctx_manager().current_context() + moe_metadata = getattr(step_context, 'moe_metadata', None) + routing_weights, selected_experts = moe_gating_topk_softmax(x, self.top_k, moe_metadata) return 
routing_weights, selected_experts @@ -67,11 +51,13 @@ def __init__(self, self.renormalize = renormalize self.ep_size = ep_size self.ep_group = ep_group - self.expert_ids_per_ep_rank = torch.tensor( - [i % (self.num_experts // self.ep_size) for i in range(num_experts)], - dtype=torch.int32, - device=torch.npu.current_device(), - ) + self.expert_ids_per_ep_rank = None + if self.ep_size > 1: + self.expert_ids_per_ep_rank = torch.tensor( + [i % (self.num_experts // self.ep_size) for i in range(num_experts)], + dtype=torch.int32, + device=torch.cuda.current_device(), + ) def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """Update weights.""" @@ -103,13 +89,13 @@ def forward(self, """forward.""" assert gate_up_bias is None assert down_bias is None - moe_metadata = get_step_ctx_manager().current_context().moe_metadata + step_context = get_step_ctx_manager().current_context() + moe_metadata = getattr(step_context, 'moe_metadata', None) + if moe_metadata is not None: + moe_metadata.expert_ids_per_ep_rank = self.expert_ids_per_ep_rank return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, self.top_k, - self.renormalize, moe_metadata.pad_size, moe_metadata.tp_size, moe_metadata.ep_size, - moe_metadata.tp_rank, moe_metadata.ep_rank, moe_metadata.tp_group, moe_metadata.ep_group, - moe_metadata.moe_type, moe_metadata.x_active_mask, moe_metadata.moe_group_name, - self.expert_ids_per_ep_rank) + self.renormalize, moe_metadata) class DlinferFusedMoEBuilder(FusedMoEBuilder): diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index f9ea874ae5..834de084df 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -4,7 +4,7 @@ from .awq_kernels import awq_linear from .fill_kv_cache import fill_kv_cache from .flash_attention import flash_attention_fwd -from .fused_moe import MoeType, fused_moe +from .fused_moe import 
DlinferMoeMetada, DlinferMoeType, fused_moe from .linear import linear from .moe_gating_topk_softmax import moe_gating_topk_softmax from .pagedattention import paged_attention_fwd @@ -15,7 +15,8 @@ 'apply_rotary_pos_emb', 'awq_linear', 'fill_kv_cache', - 'MoeType', + 'DlinferMoeType', + 'DlinferMoeMetada', 'fused_moe', 'paged_attention_fwd', 'flash_attention_fwd', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index decce941ac..7ff6bbccf0 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops -import torch.distributed as dist -from dlinfer.utils.type_annotation import MoeType +from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada +from dlinfer.utils.type_annotation import MoeType as DlinferMoeType # noqa: F401 from torch import Tensor @@ -13,19 +13,8 @@ def fused_moe( topk_ids: Tensor, topk: int, renormalize: bool, - pad_size: int, - tp_size: int, - ep_size: int, - tp_rank: int, - ep_rank: int, - tp_group: dist.ProcessGroup, - ep_group: dist.ProcessGroup, - moe_type: MoeType, - x_active_mask: Tensor, - moe_group_name: str, - expert_ids_per_ep_rank: Tensor, + moe_metadata: DlinferMoeMetada, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, - pad_size, tp_size, ep_size, tp_rank, ep_rank, tp_group, ep_group, moe_type, x_active_mask, - moe_group_name, expert_ids_per_ep_rank) + moe_metadata) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index db71e87787..c991595041 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Any + import dlinfer.ops as ext_ops from torch import Tensor -def moe_gating_topk_softmax(router_logits: Tensor, topk: int, max_tokens_across_dp: int, pad_size: int, tp_size: int, - ep_size: int, tp_rank: int) -> tuple[Tensor, Tensor]: - routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, max_tokens_across_dp, - pad_size, tp_size, ep_size, tp_rank) +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: Any) -> tuple[Tensor, Tensor]: + routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt index d94a38d0bf..22d1ca8418 100644 --- a/requirements/runtime_ascend.txt +++ b/requirements/runtime_ascend.txt @@ -22,9 +22,9 @@ safetensors sentencepiece shortuuid tiktoken -torch>=2.3.1,<2.9.0 -torch-npu>=2.3.1,<2.9.0 -torchvision>=0.18.1,<0.24.0 +torch>=2.3.1,<2.10.0 +torch-npu>=2.3.1,<2.10.0 +torchvision>=0.18.1,<0.25.0 transformers uvicorn xgrammar From 07eaf3f71c633e762914ecb2f2e71a90bc9b45b6 Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Thu, 5 Feb 2026 10:58:41 +0000 Subject: [PATCH 23/25] remove useless code --- .../pytorch/backends/dlinfer/ascend/op_backend.py | 2 -- lmdeploy/pytorch/engine/executor/ray_executor.py | 3 --- .../kernels/dlinfer/moe_gating_topk_softmax.py | 5 ++--- lmdeploy/pytorch/models/deepseek_v2.py | 14 -------------- 4 files changed, 2 insertions(+), 22 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index ffa3da6cb4..0455bf50da 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -118,11 +118,9 @@ def set_value(cls, device: str, dtype: torch.dtype, record_file: str, total_laye class AscendOpsBackend(DlinferOpsBackend): """Ascend layer 
backend.""" enable_graph: bool = False - half_negative_inf: float = torch.finfo(torch.float16).min total_slots = None max_batches = None dist_meta: DistMeta = None - graph_capture_sizes = None @staticmethod def get_name() -> str: diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py index b0a4219a46..e4b4fbac2a 100644 --- a/lmdeploy/pytorch/engine/executor/ray_executor.py +++ b/lmdeploy/pytorch/engine/executor/ray_executor.py @@ -286,9 +286,6 @@ def __init__( self._prefetch_task: asyncio.Task = None self.remote_outs: asyncio.Queue = None - rank_offset = dist_config.dp_rank * attn_tp - self.rank_offset = rank_offset - logger.info('Init distributed environment by device.') self.rank_offset = dist_config.dp_rank * attn_tp self._init_distributed_environment_by_device(device_type) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index c991595041..68d7de7fe2 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,10 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Any - import dlinfer.ops as ext_ops +from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada from torch import Tensor -def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: Any) -> tuple[Tensor, Tensor]: +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: DlinferMoeMetada) -> tuple[Tensor, Tensor]: routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py index 5e781f8034..4db550eb8d 100644 --- a/lmdeploy/pytorch/models/deepseek_v2.py +++ b/lmdeploy/pytorch/models/deepseek_v2.py @@ -1185,14 +1185,10 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di if weight_name not in name: continue name = name.replace(weight_name, param_name) - if name not in params_dict.keys(): - continue param = params_dict[name] load_weight(param, loaded_weight, expert_id=expert_id, shard_id=shard_id) break else: - if name not in params_dict.keys(): - return param = params_dict[name] load_weight(param, loaded_weight) @@ -1223,8 +1219,6 @@ def __load_kcvc(name: str, weight: torch.Tensor): dim=1) w_vc = w_vc.transpose(1, 2).contiguous() kc_param_name = name.replace('.kv_b_proj', '.kc') - if kc_param_name not in params_dict.keys(): - return param_kc = params_dict[kc_param_name] load_weight(param_kc, w_kc) vc_param_name = name.replace('.kv_b_proj', '.vc') @@ -1271,8 +1265,6 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): else: loaded_weight = loaded_weight.to(device) weight = __update_pe(loaded_weight, head_dim, pe_dim_offset) - if name not in params_dict.keys(): - continue param = params_dict[name] load_weight(param, weight) break @@ -1290,8 +1282,6 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): else: __load_kcvc(name, loaded_weight) else: - if name not 
in params_dict.keys(): - return param = params_dict[name] load_weight(param, loaded_weight) @@ -1379,13 +1369,9 @@ def __skip_layers(): if weight_name not in name: continue name = name.replace(weight_name, param_name) - if name not in params_dict.keys(): - continue param = params_dict[name] load_weight(param, loaded_weight, shard_id=shard_id) break else: - if name not in params_dict.keys(): - continue param = params_dict[name] load_weight(param, loaded_weight) From 700db7de264fe545aece99bf4ea5466d8df7d3cb Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 6 Feb 2026 09:32:38 +0000 Subject: [PATCH 24/25] update code --- docker/Dockerfile_ascend_a3 | 1 + .../supported_models/supported_models.md | 2 +- .../backends/dlinfer/ascend/op_backend.py | 58 +++++++++---------- lmdeploy/pytorch/backends/dlinfer/moe.py | 4 +- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 6 +- lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 6 +- .../dlinfer/moe_gating_topk_softmax.py | 5 +- 7 files changed, 42 insertions(+), 40 deletions(-) diff --git a/docker/Dockerfile_ascend_a3 b/docker/Dockerfile_ascend_a3 index 1d8064a129..aa975d3b9c 100644 --- a/docker/Dockerfile_ascend_a3 +++ b/docker/Dockerfile_ascend_a3 @@ -22,5 +22,6 @@ ARG LMDEPLOY_TAG=main RUN --mount=type=cache,target=/root/.cache \ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \ + pip install --no-cache-dir torch==2.9.0 torch-npu==2.9.0 torchvision==0.24.0 && \ TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \ LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG} diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 317ab78e71..399594af91 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ 
b/docs/zh_cn/supported_models/supported_models.md @@ -150,7 +150,7 @@ | QWen2.5-VL | 3B - 72B | MLLM | Yes | Yes | - | - | Yes | - | Yes | No | | QWen2-MoE | A14.57B | LLM | Yes | - | No | No | - | - | Yes | - | | QWen3 | 0.6B-235B | LLM | Yes | Yes | No | No | Yes | Yes | Yes | Yes | -| DeepSeek-V2 | 16B | LLM | Yes | Yes | No | No | - | - | - | - | +| DeepSeek-V2 | 16B | LLM | No | Yes | No | No | - | - | - | - | | InternVL(v1.5) | 2B-26B | MLLM | Yes | - | Yes | Yes | - | - | Yes | - | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | Yes | Yes | - | Yes | Yes | | InternVL2.5 | 1B-78B | MLLM | Yes | Yes | Yes | Yes | Yes | - | Yes | Yes | diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 0455bf50da..843ba6e987 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -16,7 +16,7 @@ from lmdeploy.pytorch.distributed import get_dist_manager from lmdeploy.utils import get_logger -from ..moe import DlinferMoeMetada, DlinferMoeType +from ..moe import DlinferMoECommType, DlinferMoeMetadata from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') @@ -281,19 +281,19 @@ def get_dist_meta(): def get_tokens_info(dp_size, tp_size, ep_size, ep_group): if ep_size <= 1: return 0, 0, 0 - # get runtime_tokens_current_rank + # get padded_tokens_current_rank is_graph = cls.enable_graph and step_context.is_decoding if is_graph: from dlinfer.framework.lmdeploy_ext.cudagraph.ascend_cudagraph import get_ascend_compatible_size actual_tokens_current_rank = step_context.q_seqlens.shape[0] - runtime_tokens_current_rank = min(get_ascend_compatible_size(actual_tokens_current_rank), - cls.max_batches) + padded_tokens_current_rank = min(get_ascend_compatible_size(actual_tokens_current_rank), + cls.max_batches) else: actual_tokens_current_rank = step_context.q_seqlens.sum().item() - runtime_tokens_current_rank = 
actual_tokens_current_rank + padded_tokens_current_rank = actual_tokens_current_rank # get max_tokens_across_dp if dp_size > 1: - runtime_tokens_tensor = torch.tensor([runtime_tokens_current_rank], + runtime_tokens_tensor = torch.tensor([padded_tokens_current_rank], dtype=step_context.q_seqlens.dtype, device=torch.npu.current_device()) world_size = dp_size * tp_size @@ -303,8 +303,8 @@ def get_tokens_info(dp_size, tp_size, ep_size, ep_group): dist.all_gather_into_tensor(runtime_tokens_buffer, runtime_tokens_tensor, ep_group) max_tokens_across_dp = torch.max(runtime_tokens_buffer).item() else: - max_tokens_across_dp = runtime_tokens_current_rank - return actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp + max_tokens_across_dp = padded_tokens_current_rank + return actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp @lru_cache def init_mc2_token_capacity(tp_size): @@ -312,9 +312,9 @@ def init_mc2_token_capacity(tp_size): num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size return num_tokens_per_tp_rank * tp_size - def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): + def select_moe_comm_type(max_tokens_across_dp, dp_size, tp_size, ep_size): if ep_size <= 1: - return DlinferMoeType.ALLGATHER + return DlinferMoECommType.ALLGATHER mc2_token_capacity = init_mc2_token_capacity(tp_size) is_graph = cls.enable_graph and step_context.is_decoding if is_graph: @@ -322,30 +322,30 @@ def select_moe_type(max_tokens_across_dp, dp_size, tp_size, ep_size): max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size if SocVersion.is_A2(): if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16: - return DlinferMoeType.MC2 + return DlinferMoECommType.MC2 else: - return DlinferMoeType.ALLGATHER + return DlinferMoECommType.ALLGATHER elif SocVersion.is_A3(): if max_tokens_across_dp <= mc2_token_capacity: - return DlinferMoeType.MC2 + return DlinferMoECommType.MC2 else: - 
return DlinferMoeType.ALLTOALL + return DlinferMoECommType.ALLTOALL else: raise ValueError(f'Unsupported soc_version: {SocVersion.soc_version()}') - def get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp, tp_size, - moe_type): + def get_pad_info(actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp, tp_size, + moe_comm_type): x_active_mask = None - if moe_type == DlinferMoeType.MC2: + if moe_comm_type == DlinferMoECommType.MC2: paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size - pad_size = paded_size - runtime_tokens_current_rank + pad_size = paded_size - padded_tokens_current_rank x_active_mask = torch.ones(actual_tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) - elif moe_type == DlinferMoeType.ALLTOALL: - pad_size = tp_size - runtime_tokens_current_rank - elif moe_type == DlinferMoeType.ALLGATHER: - pad_size = max_tokens_across_dp - runtime_tokens_current_rank + elif moe_comm_type == DlinferMoECommType.ALLTOALL: + pad_size = tp_size - padded_tokens_current_rank + elif moe_comm_type == DlinferMoECommType.ALLGATHER: + pad_size = max_tokens_across_dp - padded_tokens_current_rank else: pad_size = 0 return pad_size, x_active_mask @@ -404,15 +404,15 @@ def get_moe_group_name(group): step_context.attn_metadata = attn_metadata cls.dist_meta = get_dist_meta() - actual_tokens_current_rank, runtime_tokens_current_rank, max_tokens_across_dp = get_tokens_info( + actual_tokens_current_rank, padded_tokens_current_rank, max_tokens_across_dp = get_tokens_info( cls.dist_meta.dp_size, cls.dist_meta.tp_size, cls.dist_meta.ep_size, cls.dist_meta.ep_group) - moe_type = select_moe_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, - cls.dist_meta.ep_size) - pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, runtime_tokens_current_rank, - max_tokens_across_dp, cls.dist_meta.tp_size, moe_type) + moe_comm_type = 
select_moe_comm_type(max_tokens_across_dp, cls.dist_meta.dp_size, cls.dist_meta.tp_size, + cls.dist_meta.ep_size) + pad_size, x_active_mask = get_pad_info(actual_tokens_current_rank, padded_tokens_current_rank, + max_tokens_across_dp, cls.dist_meta.tp_size, moe_comm_type) moe_group_name = get_moe_group_name(cls.dist_meta.ep_group) - moe_metadata = DlinferMoeMetada( + moe_metadata = DlinferMoeMetadata( max_tokens_across_dp=max_tokens_across_dp, pad_size=pad_size, dp_size=cls.dist_meta.dp_size, @@ -422,7 +422,7 @@ def get_moe_group_name(group): ep_rank=cls.dist_meta.ep_rank, tp_group=cls.dist_meta.tp_group, ep_group=cls.dist_meta.ep_group, - moe_type=moe_type, + moe_comm_type=moe_comm_type, x_active_mask=x_active_mask, moe_group_name=moe_group_name, ) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index a347dba27b..f034a0bb07 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -4,8 +4,8 @@ import torch -from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetada # noqa: F401 -from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeType # noqa: F401 +from lmdeploy.pytorch.kernels.dlinfer import DlinferMoECommType # noqa: F401 +from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetadata # noqa: F401 from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax from lmdeploy.pytorch.model_inputs import get_step_ctx_manager diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 834de084df..660368ba23 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -4,7 +4,7 @@ from .awq_kernels import awq_linear from .fill_kv_cache import fill_kv_cache from .flash_attention import flash_attention_fwd -from .fused_moe import DlinferMoeMetada, DlinferMoeType, fused_moe +from .fused_moe import DlinferMoECommType, DlinferMoeMetadata, fused_moe from 
.linear import linear from .moe_gating_topk_softmax import moe_gating_topk_softmax from .pagedattention import paged_attention_fwd @@ -15,8 +15,8 @@ 'apply_rotary_pos_emb', 'awq_linear', 'fill_kv_cache', - 'DlinferMoeType', - 'DlinferMoeMetada', + 'DlinferMoECommType', + 'DlinferMoeMetadata', 'fused_moe', 'paged_attention_fwd', 'flash_attention_fwd', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py index 7ff6bbccf0..4624e0c199 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import dlinfer.ops as ext_ops -from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada -from dlinfer.utils.type_annotation import MoeType as DlinferMoeType # noqa: F401 +from dlinfer.utils.type_annotation import MoECommType as DlinferMoECommType # noqa: F401 +from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata from torch import Tensor @@ -13,7 +13,7 @@ def fused_moe( topk_ids: Tensor, topk: int, renormalize: bool, - moe_metadata: DlinferMoeMetada, + moe_metadata: DlinferMoeMetadata, ): """Dlinfer fused moe.""" return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize, diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index 68d7de7fe2..c72f5f2324 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import dlinfer.ops as ext_ops -from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetada +from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata from torch import Tensor -def moe_gating_topk_softmax(router_logits: Tensor, topk: int, moe_metadata: DlinferMoeMetada) -> tuple[Tensor, Tensor]: +def moe_gating_topk_softmax(router_logits: Tensor, topk: int, + moe_metadata: DlinferMoeMetadata) -> tuple[Tensor, Tensor]: routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts From 6a957ff0853ffa590614a40e4257485d353897df Mon Sep 17 00:00:00 2001 From: yaofengchen Date: Fri, 6 Feb 2026 11:17:16 +0000 Subject: [PATCH 25/25] update code --- lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py | 7 +++---- .../pytorch/kernels/dlinfer/moe_gating_topk_softmax.py | 4 +++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 843ba6e987..484cbd1b72 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -47,7 +47,7 @@ def is_Ascend910(cls) -> bool: @classmethod @lru_cache(maxsize=1) - def soc_version(cls) -> str: + def soc_version(cls) -> int: return torch.npu.get_soc_version() @classmethod @@ -318,7 +318,6 @@ def select_moe_comm_type(max_tokens_across_dp, dp_size, tp_size, ep_size): mc2_token_capacity = init_mc2_token_capacity(tp_size) is_graph = cls.enable_graph and step_context.is_decoding if is_graph: - import math max_tokens_across_dp = math.ceil(max_tokens_across_dp / tp_size) * tp_size if SocVersion.is_A2(): if max_tokens_across_dp <= mc2_token_capacity and dp_size * tp_size >= 16: @@ -337,8 +336,8 @@ def get_pad_info(actual_tokens_current_rank, padded_tokens_current_rank, max_tok moe_comm_type): x_active_mask = None if moe_comm_type == 
DlinferMoECommType.MC2: - paded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size - pad_size = paded_size - padded_tokens_current_rank + padded_size = math.ceil(max_tokens_across_dp / tp_size) * tp_size + pad_size = padded_size - padded_tokens_current_rank x_active_mask = torch.ones(actual_tokens_current_rank, dtype=torch.bool, device=torch.npu.current_device()) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index c72f5f2324..cc1a324bf4 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import dlinfer.ops as ext_ops from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata from torch import Tensor def moe_gating_topk_softmax(router_logits: Tensor, topk: int, - moe_metadata: DlinferMoeMetadata) -> tuple[Tensor, Tensor]: + moe_metadata: DlinferMoeMetadata) -> Tuple[Tensor, Tensor]: routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts