From 6918c37c1bcb50a7808a50e628c7009b6a7e0112 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Tue, 28 Apr 2026 06:56:34 +0000
Subject: [PATCH 01/25] init xtuner ep doc

---
 .dev_scripts/run_validate_xtuner_ep_md.sh |  31 ++
 .dev_scripts/validate_xtuner_ep_md.py     | 386 ++++++++++++++++++++++
 xtuner_ep.md                              | 374 +++++++++++++++++++++
 3 files changed, 791 insertions(+)
 create mode 100755 .dev_scripts/run_validate_xtuner_ep_md.sh
 create mode 100644 .dev_scripts/validate_xtuner_ep_md.py
 create mode 100644 xtuner_ep.md

diff --git a/.dev_scripts/run_validate_xtuner_ep_md.sh b/.dev_scripts/run_validate_xtuner_ep_md.sh
new file mode 100755
index 000000000..b00ac00f4
--- /dev/null
+++ b/.dev_scripts/run_validate_xtuner_ep_md.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# 默认使用用户指定的 fla 环境；需要切换时可在命令前覆盖 CONDA_ENV。
+CONDA_ENV="${CONDA_ENV:-fla}"
+CONDA_SH="${CONDA_SH:-~/miniconda3/etc/profile.d/conda.sh}"
+
+# xtuner_ep.md 的示例固定为 EP=2；默认额外验证 4 份 DP replica。
+EP_SIZE="${EP_SIZE:-2}"
+DP_SIZE="${DP_SIZE:-4}"
+NPROC_PER_NODE="${NPROC_PER_NODE:-$((EP_SIZE * DP_SIZE))}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
+MASTER_PORT="${MASTER_PORT:-29531}"
+
+source "${CONDA_SH}"
+conda activate "${CONDA_ENV}"
+
+# 显式使用当前仓库代码，避免导入 conda 环境或其他目录下安装的 xtuner。
+export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
+export CUDA_VISIBLE_DEVICES
+export EP_SIZE
+export DP_SIZE
+
+cd "${REPO_ROOT}"
+torchrun \
+  --nproc-per-node="${NPROC_PER_NODE}" \
+  --master-port="${MASTER_PORT}" \
+  .dev_scripts/validate_xtuner_ep_md.py
diff --git a/.dev_scripts/validate_xtuner_ep_md.py b/.dev_scripts/validate_xtuner_ep_md.py
new file mode 100644
index 000000000..465842ef8
--- /dev/null
+++ b/.dev_scripts/validate_xtuner_ep_md.py
@@ -0,0 +1,386 @@
+"""验证 xtuner_ep.md 中 EP all2all 示例的中间顺序。
+
+运行方式：
+    EP_SIZE=2 DP_SIZE=4 torchrun --nproc-per-node=8 .dev_scripts/validate_xtuner_ep_md.py
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import init_device_mesh
+
+# 只从 xtuner 引入被验证的 dispatcher，避免依赖无关的模型/训练类。
+from xtuner.v1.module.dispatcher.torch_all2all import TorchAll2AllDispatcher
+
+
+EP_SIZE = 2
+DEFAULT_DP_SIZE = 4
+N_ROUTED_EXPERTS = 6
+EXPERTS_PER_RANK = 3
+EXPERT_OUTPUT_SCALE = 100.0
+HIDDEN_SIZE = 128
+
+
+@dataclass(frozen=True)
+class RankCase:
+    token_values: tuple[float, ...]
+    topk_ids: tuple[tuple[int, int], ...]
+    topk_weights: tuple[tuple[float, float], ...]
+
+
+@dataclass(frozen=True)
+class RankExpected:
+    input_hidden: tuple[float, ...]
+    topk_ids: tuple[tuple[int, int], ...]
+    pre_hidden: tuple[float, ...]
+    pre_row_id_map: tuple[int, ...]
+    dispatch_hidden: tuple[float, ...]
+    input_splits: tuple[int, ...]
+    output_splits: tuple[int, ...]
+    tokens_per_expert_group: tuple[float, ...]
+    post_hidden: tuple[float, ...]
+    post_row_ids_map: tuple[int, ...]
+    tokens_per_expert: tuple[float, ...]
+    pre_combine_hidden: tuple[float, ...]
+    combine_hidden: tuple[float, ...]
+    post_combine_hidden: tuple[float, ...]
+
+
+@dataclass(frozen=True)
+class ParallelInfo:
+    global_rank: int
+    dp_rank: int
+    ep_rank: int
+    device: torch.device
+    ep_group: dist.ProcessGroup
+
+
+CASES: dict[int, RankCase] = {
+    0: RankCase(
+        token_values=(10.0, 11.0, 12.0, 13.0),
+        topk_ids=((0, 4), (3, 1), (2, 5), (4, 0)),
+        topk_weights=((0.25, 0.75), (0.4, 0.6), (0.7, 0.3), (0.8, 0.2)),
+    ),
+    1: RankCase(
+        token_values=(20.0, 21.0, 22.0, 23.0),
+        topk_ids=((1, 3), (4, 2), (5, 0), (3, 1)),
+        topk_weights=((0.2, 0.8), (0.5, 0.5), (0.9, 0.1), (0.35, 0.65)),
+    ),
+}
+
+
+EXPECTED: dict[int, RankExpected] = {
+    0: RankExpected(
+        input_hidden=(10.0, 11.0, 12.0, 13.0),
+        topk_ids=((0, 4), (3, 1), (2, 5), (4, 0)),
+        pre_hidden=(10.0, 13.0, 11.0, 12.0, 11.0, 10.0, 13.0, 12.0),
+        pre_row_id_map=(0, 4, 3, 6, 5, 2, 7, 1),
+        dispatch_hidden=(10.0, 13.0, 11.0, 12.0, 22.0, 20.0, 23.0, 21.0),
+        input_splits=(4, 4),
+        output_splits=(4, 4),
+        tokens_per_expert_group=(2.0, 1.0, 1.0, 1.0, 2.0, 1.0),
+        post_hidden=(10.0, 13.0, 22.0, 11.0, 20.0, 23.0, 12.0, 21.0),
+        post_row_ids_map=(0, 1, 3, 6, 2, 4, 5, 7),
+        tokens_per_expert=(3.0, 3.0, 2.0),
+        pre_combine_hidden=(10.0, 13.0, 111.0, 212.0, 22.0, 120.0, 123.0, 221.0),
+        combine_hidden=(10.0, 13.0, 111.0, 212.0, 311.0, 410.0, 413.0, 512.0),
+        post_combine_hidden=(310.0, 191.0, 302.0, 333.0),
+    ),
+    1: RankExpected(
+        input_hidden=(20.0, 21.0, 22.0, 23.0),
+        topk_ids=((1, 3), (4, 2), (5, 0), (3, 1)),
+        pre_hidden=(22.0, 20.0, 23.0, 21.0, 20.0, 23.0, 21.0, 22.0),
+        pre_row_id_map=(1, 6, 7, 5, 4, 3, 0, 2),
+        dispatch_hidden=(11.0, 10.0, 13.0, 12.0, 20.0, 23.0, 21.0, 22.0),
+        input_splits=(4, 4),
+        output_splits=(4, 4),
+        tokens_per_expert_group=(1.0, 2.0, 1.0, 2.0, 1.0, 1.0),
+        post_hidden=(11.0, 20.0, 23.0, 10.0, 13.0, 21.0, 12.0, 22.0),
+        post_row_ids_map=(0, 3, 4, 6, 1, 2, 5, 7),
+        tokens_per_expert=(3.0, 3.0, 2.0),
+        pre_combine_hidden=(311.0, 410.0, 413.0, 512.0, 320.0, 323.0, 421.0, 522.0),
+        combine_hidden=(22.0, 120.0, 123.0, 221.0, 320.0, 323.0, 421.0, 522.0),
+        post_combine_hidden=(280.0, 321.0, 472.0, 193.0),
+    ),
+}
+
+
+def main() -> None:
+    try:
+        parallel_info = _init_distributed()
+        snapshots = _run_xtuner_ep_case(parallel_info)
+        _validate(parallel_info, snapshots)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+def _init_distributed() -> ParallelInfo:
+    if not torch.cuda.is_available():
+        raise RuntimeError("TorchAll2AllDispatcher 当前依赖 CUDA，请在 GPU 上用 torchrun 运行。")
+
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group(backend="nccl")
+
+    ep_size = _get_env_int("EP_SIZE", EP_SIZE)
+    dp_size = _get_env_int("DP_SIZE", DEFAULT_DP_SIZE)
+    world_size = dist.get_world_size()
+    if ep_size != EP_SIZE:
+        raise RuntimeError("xtuner_ep.md 的示例固定为 EP=2。")
+    if world_size != ep_size * dp_size:
+        raise RuntimeError(
+            f"当前配置要求 world_size = EP_SIZE * DP_SIZE = {ep_size * dp_size}，实际为 {world_size}。"
+        )
+
+    # 与 MoE 初始化保持一致：mesh_shape=(dp, ep)，EP 组为连续 rank 对。
+    ep_mesh = init_device_mesh(
+        "cuda",
+        (dp_size, ep_size),
+        mesh_dim_names=("dp", "ep"),
+    )["ep"]
+
+    global_rank = dist.get_rank()
+    return ParallelInfo(
+        global_rank=global_rank,
+        dp_rank=global_rank // ep_size,
+        ep_rank=ep_mesh.get_local_rank(),
+        device=torch.device("cuda", local_rank),
+        ep_group=ep_mesh.get_group(),
+    )
+
+
+@torch.no_grad()
+def _run_xtuner_ep_case(parallel_info: ParallelInfo) -> dict[str, Any]:
+    case = CASES[parallel_info.ep_rank]
+    hidden_states = torch.zeros((len(case.token_values), HIDDEN_SIZE), dtype=torch.float32, device=parallel_info.device)
+    hidden_states[:, 0] = torch.tensor(case.token_values, dtype=torch.float32, device=parallel_info.device)
+    topk_ids = torch.tensor(case.topk_ids, dtype=torch.long, device=parallel_info.device)
+    topk_weights = torch.tensor(case.topk_weights, dtype=torch.float32, device=parallel_info.device)
+
+    dispatcher = TorchAll2AllDispatcher(
+        n_routed_experts=N_ROUTED_EXPERTS,
+        training_dtype="bf16",
+        process_group=parallel_info.ep_group,
+    )
+
+    # 对应文档 1：source rank 内按 global expert 排序。
+    pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden_states, topk_ids=topk_ids)
+
+    # 对应文档 2：第一次 all2all，发往目标 EP rank。
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        decoding=False,
+    )
+
+    # 对应文档 3：destination rank 内按 local expert 重新分组。
+    post_dispatched = dispatcher.dispatch_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+    )
+
+    # 用 expert id 改写输出，确保最后的 topK 加权还原也被验证。
+    experts_out = _mock_local_experts(
+        hidden_states=post_dispatched["hidden_states"],
+        tokens_per_expert=post_dispatched["tokens_per_expert"],
+        ep_rank=parallel_info.ep_rank,
+    )
+
+    # 对应文档 5：恢复 all2all receive 顺序。
+    pre_combined = dispatcher.combine_preprocess(
+        hidden_states=experts_out,
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        decoding=False,
+    )
+
+    # 对应文档 6：第二次 all2all，把 expert 输出送回 source rank。
+    combined = dispatcher.combine(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        decoding=False,
+    )
+
+    # 对应文档 7：用第一次 row_id_map 加权合并 topK。
+    post_combined = dispatcher.combine_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        combined=combined,
+    )
+
+    return {
+        "input_hidden": hidden_states,
+        "topk_ids": topk_ids,
+        "pre_hidden": pre_dispatched["hidden_states"],
+        "pre_row_id_map": pre_dispatched["row_id_map"],
+        "dispatch_hidden": dispatched["hidden_states"],
+        "input_splits": dispatched["input_splits"],
+        "output_splits": dispatched["output_splits"],
+        "tokens_per_expert_group": dispatched["tokens_per_expert_group"],
+        "post_hidden": post_dispatched["hidden_states"],
+        "post_row_ids_map": post_dispatched["row_ids_map"],
+        "tokens_per_expert": post_dispatched["tokens_per_expert"],
+        "pre_combine_hidden": pre_combined["hidden_states"],
+        "combine_hidden": combined["hidden_states"],
+        "post_combine_hidden": post_combined["hidden_states"],
+    }
+
+
+def _mock_local_experts(
+    *,
+    hidden_states: torch.Tensor,
+    tokens_per_expert: torch.Tensor,
+    ep_rank: int,
+) -> torch.Tensor:
+    local_expert_ids = torch.arange(EXPERTS_PER_RANK, dtype=torch.float32, device=hidden_states.device)
+    local_expert_ids = torch.repeat_interleave(local_expert_ids, tokens_per_expert.to(torch.long))
+    global_expert_ids = ep_rank * EXPERTS_PER_RANK + local_expert_ids
+    return hidden_states + global_expert_ids.view(-1, 1) * EXPERT_OUTPUT_SCALE
+
+
+def _validate(parallel_info: ParallelInfo, snapshots: dict[str, Any]) -> None:
+    expected = EXPECTED[parallel_info.ep_rank]
+    error: AssertionError | None = None
+
+    try:
+        if os.getenv("XTUNER_EP_DEBUG", "0") == "1":
+            _print_snapshots(parallel_info, snapshots)
+        _assert_tensor_close(parallel_info, "pre_hidden", snapshots["pre_hidden"], expected.pre_hidden, first_col=True)
+        _assert_tensor_close(parallel_info, "pre_row_id_map", snapshots["pre_row_id_map"], expected.pre_row_id_map)
+        _assert_tensor_close(
+            parallel_info,
+            "dispatch_hidden",
+            snapshots["dispatch_hidden"],
+            expected.dispatch_hidden,
+            first_col=True,
+        )
+        _assert_list_equal(parallel_info, "input_splits", snapshots["input_splits"], expected.input_splits)
+        _assert_list_equal(parallel_info, "output_splits", snapshots["output_splits"], expected.output_splits)
+        _assert_tensor_close(
+            parallel_info,
+            "tokens_per_expert_group",
+            snapshots["tokens_per_expert_group"],
+            expected.tokens_per_expert_group,
+        )
+        _assert_tensor_close(parallel_info, "post_hidden", snapshots["post_hidden"], expected.post_hidden, first_col=True)
+        _assert_tensor_close(parallel_info, "post_row_ids_map", snapshots["post_row_ids_map"], expected.post_row_ids_map)
+        _assert_tensor_close(parallel_info, "tokens_per_expert", snapshots["tokens_per_expert"], expected.tokens_per_expert)
+        _assert_tensor_close(
+            parallel_info,
+            "pre_combine_hidden",
+            snapshots["pre_combine_hidden"],
+            expected.pre_combine_hidden,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "combine_hidden",
+            snapshots["combine_hidden"],
+            expected.combine_hidden,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "post_combine_hidden",
+            snapshots["post_combine_hidden"],
+            expected.post_combine_hidden,
+            atol=1e-4,
+            first_col=True,
+        )
+    except AssertionError as exc:
+        error = exc
+
+    failed = torch.tensor([int(error is not None)], dtype=torch.int32, device=parallel_info.device)
+    dist.all_reduce(failed, op=dist.ReduceOp.SUM)
+
+    if failed.item() != 0:
+        if error is not None:
+            raise error
+        raise AssertionError("其他 rank 的 xtuner_ep.md 校验失败。")
+
+    if parallel_info.global_rank == 0:
+        print("xtuner_ep.md EP=2 DP=4 all2all 示例校验通过。")
+
+
+def _assert_tensor_close(
+    parallel_info: ParallelInfo,
+    name: str,
+    actual: torch.Tensor,
+    expected: tuple[float, ...] | tuple[int, ...],
+    *,
+    atol: float = 0.0,
+    first_col: bool = False,
+) -> None:
+    # 文档只跟踪 activation 行来源，不展开 D_h；脚本用第一列承载 token 标识。
+    actual_1d = actual.detach()
+    if first_col and actual_1d.dim() > 1:
+        actual_1d = actual_1d[:, 0]
+    actual_1d = actual_1d.reshape(-1).to(torch.float32)
+    expected_tensor = torch.tensor(expected, dtype=torch.float32, device=actual.device)
+    try:
+        torch.testing.assert_close(actual_1d, expected_tensor, rtol=0.0, atol=atol)
+    except AssertionError as exc:
+        raise AssertionError(
+            f"global_rank={parallel_info.global_rank}, dp_rank={parallel_info.dp_rank}, "
+            f"ep_rank={parallel_info.ep_rank} 的 {name} 不符合 xtuner_ep.md 示例："
+            f"actual={actual_1d.cpu().tolist()}, expected={expected_tensor.cpu().tolist()}"
+        ) from exc
+
+
+def _assert_list_equal(parallel_info: ParallelInfo, name: str, actual: list[int], expected: tuple[int, ...]) -> None:
+    if actual != list(expected):
+        raise AssertionError(
+            f"global_rank={parallel_info.global_rank}, dp_rank={parallel_info.dp_rank}, "
+            f"ep_rank={parallel_info.ep_rank} 的 {name} 不符合 xtuner_ep.md 示例："
+            f"actual={actual}, expected={expected}"
+        )
+
+
+def _get_env_int(name: str, default: int) -> int:
+    value = os.getenv(name)
+    if value is None:
+        return default
+    return int(value)
+
+
+def _print_snapshots(parallel_info: ParallelInfo, snapshots: dict[str, Any]) -> None:
+    hidden_names = {
+        "input_hidden",
+        "pre_hidden",
+        "dispatch_hidden",
+        "post_hidden",
+        "pre_combine_hidden",
+        "combine_hidden",
+        "post_combine_hidden",
+    }
+    for name, value in snapshots.items():
+        if isinstance(value, torch.Tensor):
+            tensor = value.detach()
+            if name in hidden_names and tensor.dim() > 1:
+                tensor = tensor[:, 0]
+            print(
+                f"[global_rank={parallel_info.global_rank} dp_rank={parallel_info.dp_rank} "
+                f"ep_rank={parallel_info.ep_rank}] {name}: {tensor.reshape(-1).cpu().tolist()}",
+                flush=True,
+            )
+        else:
+            print(
+                f"[global_rank={parallel_info.global_rank} dp_rank={parallel_info.dp_rank} "
+                f"ep_rank={parallel_info.ep_rank}] {name}: {value}",
+                flush=True,
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xtuner_ep.md b/xtuner_ep.md
new file mode 100644
index 000000000..1d71cc3cb
--- /dev/null
+++ b/xtuner_ep.md
@@ -0,0 +1,374 @@
+# MoEDecoderLayer._forward 中 TorchAll2AllDispatcher 的 EP 流程
+
+下面用一个缩小版一致例子，把 `MoEDecoderLayer._forward` 里的 EP all2all 流程从头串起来。真实 Qwen3MoE30BA3 是 `E=128, K=8, EP=4`；示例改成：
+
+```text
+EP = 2
+E_local = 3
+E = 6
+K = 2
+每个 EP rank 本地 N = B*S = 4 个 token
+```
+
+专家归属：
+
+```text
+ep0 owns global expert 0,1,2  -> local expert 0,1,2
+ep1 owns global expert 3,4,5  -> local expert 0,1,2
+```
+
+示例 token：
+
+```text
+ep0 source tokens: A0 A1 A2 A3
+ep1 source tokens: B0 B1 B2 B3
+```
+
+为方便阅读，下面主要跟踪 activation 行的来源，不展开 `D_h` 维。
+
+## 0. `_pre_moe_forward` 后
+
+对任意一个 EP rank，本地输入：
+
+```text
+hidden_states: [N, D_h] = [4, D_h]
+logits:        [N, E]   = [4, 6]
+topk_ids:      [N, K]   = [4, 2]
+topk_weights:  [N, K]   = [4, 2]
+```
+
+设两个 source rank 的 routing 结果如下：
+
+```text
+ep0 topk_ids:
+A0 -> [0, 4]
+A1 -> [3, 1]
+A2 -> [2, 5]
+A3 -> [4, 0]
+
+ep1 topk_ids:
+B0 -> [1, 3]
+B1 -> [4, 2]
+B2 -> [5, 0]
+B3 -> [3, 1]
+```
+
+## 1. `dispatch_preprocess`: 本地 token 按 global expert 排序
+
+先把每个 token 复制 `K=2` 份，所以每个 source rank 都从 `[4, D_h]` 变成 `[8, D_h]`。
+
+对 `ep0`，flatten 后的 copy 是：
+
+```text
+flat row:          0   1   2   3   4   5   6   7
+token copy:        A0  A0  A1  A1  A2  A2  A3  A3
+global expert id:  0   4   3   1   2   5   4   0
+```
+
+按 global expert id 稳定排序后：
+
+```text
+pre row:           0   1   2   3   4   5   6   7
+token copy:        A0  A3  A1  A2  A1  A0  A3  A2
+global expert id:  0   0   1   2   3   4   4   5
+row_id_map:        0   4   3   6   5   2   7   1
+```
+
+所以：
+
+```text
+pre_dispatched["hidden_states"]: [N*K, D_h] = [8, D_h]
+pre_dispatched["row_id_map"]:    [N*K]      = [8]
+```
+
+这里的 `row_id_map` 是 `permute` 返回、后续 `unpermute` 消费的还原 map。当前 `grouped_gemm`
+backend 下它不是简单的 “pre row j 对应原始 topK flatten 空间里的哪个位置”，不要把它当成普通
+`index_put` 的下标表来手算。
+
+对 `ep1` 同理：
+
+```text
+flat row:          0   1   2   3   4   5   6   7
+token copy:        B0  B0  B1  B1  B2  B2  B3  B3
+global expert id:  1   3   4   2   5   0   3   1
+
+pre row:           0   1   2   3   4   5   6   7
+token copy:        B2  B0  B3  B1  B0  B3  B1  B2
+global expert id:  0   1   1   2   3   3   4   5
+row_id_map:        1   6   7   5   4   3   0   2
+```
+
+## 2. `dispatch`: 第一次 all2all
+
+每个 source rank 根据 global expert 所属 EP rank 切分。
+
+`ep0` 的 pre rows：
+
+```text
+pre row:           0  1  2  3 | 4  5  6  7
+token copy:        A0 A3 A1 A2| A1 A0 A3 A2
+global expert id:  0  0  1  2 | 3  4  4  5
+target ep rank:    0  0  0  0 | 1  1  1  1
+```
+
+所以：
+
+```text
+ep0 input_splits = [4, 4]
+```
+
+`ep1` 的 pre rows：
+
+```text
+pre row:           0  1  2  3 | 4  5  6  7
+token copy:        B2 B0 B3 B1| B0 B3 B1 B2
+global expert id:  0  1  1  2 | 3  3  4  5
+target ep rank:    0  0  0  0 | 1  1  1  1
+```
+
+所以：
+
+```text
+ep1 input_splits = [4, 4]
+```
+
+all2all 后，`ep0` 收到所有发给 experts `0,1,2` 的 token copy：
+
+```text
+dispatched row:    0  1  2  3 | 4  5  6  7
+source ep rank:    0  0  0  0 | 1  1  1  1
+token copy:        A0 A3 A1 A2| B2 B0 B3 B1
+global expert id:  0  0  1  2 | 0  1  1  2
+local expert id:   0  0  1  2 | 0  1  1  2
+```
+
+`ep1` 收到所有发给 experts `3,4,5` 的 token copy：
+
+```text
+dispatched row:    0  1  2  3 | 4  5  6  7
+source ep rank:    0  0  0  0 | 1  1  1  1
+token copy:        A1 A0 A3 A2| B0 B3 B1 B2
+global expert id:  3  4  4  5 | 3  3  4  5
+local expert id:   0  1  1  2 | 0  0  1  2
+```
+
+形状：
+
+```text
+dispatched["hidden_states"]:              [M_recv, D_h]
+dispatched["tokens_per_expert_group"]:    [EP, E_local] = [2, 3]
+```
+
+在这个例子里两个 rank 都是 `M_recv=8`，但真实训练里不保证均匀。
+
+## 3. `dispatch_postprocess`: destination rank 内按 local expert 再排序
+
+all2all 后的顺序是：
+
+```text
+source ep0 block | source ep1 block
+```
+
+并且每个 source 块内部已经按当前 destination rank 的 local expert id 排好。但 grouped GEMM 要的是整个 `M_recv` 范围内按 local expert 连续分组，所以还要再 permute 一次。
+
+对 `ep0`：
+
+```text
+dispatch 后:
+dispatched row:    0  1  2  3 | 4  5  6  7
+source ep rank:    0  0  0  0 | 1  1  1  1
+token copy:        A0 A3 A1 A2| B2 B0 B3 B1
+local expert id:   0  0  1  2 | 0  1  1  2
+```
+
+按 local expert id 全局排序后：
+
+```text
+post row:          0  1  2 | 3  4  5 | 6  7
+token copy:        A0 A3 B2| A1 B0 B3| A2 B1
+local expert id:   0  0  0 | 1  1  1 | 2  2
+row_ids_map:       0  1  3 | 6  2  4 | 5  7
+```
+
+所以：
+
+```text
+post_dispatched["hidden_states"]:     [8, D_h]
+post_dispatched["row_ids_map"]:       [8]
+post_dispatched["tokens_per_expert"]: [3] = [3, 3, 2]
+```
+
+对 `ep1`：
+
+```text
+dispatch 后:
+dispatched row:    0  1  2  3 | 4  5  6  7
+source ep rank:    0  0  0  0 | 1  1  1  1
+token copy:        A1 A0 A3 A2| B0 B3 B1 B2
+local expert id:   0  1  1  2 | 0  0  1  2
+```
+
+按 local expert id 全局排序后：
+
+```text
+post row:          0  1  2 | 3  4  5 | 6  7
+token copy:        A1 B0 B3| A0 A3 B1| A2 B2
+local expert id:   0  0  0 | 1  1  1 | 2  2
+row_ids_map:       0  3  4 | 6  1  2 | 5  7
+```
+
+形状仍然：
+
+```text
+post_dispatched["hidden_states"]:     [8, D_h]
+post_dispatched["tokens_per_expert"]: [3] = [3, 3, 2]
+```
+
+## 4. local experts grouped GEMM
+
+每个 EP rank 只计算自己本地 3 个 experts。
+
+对 `ep0`，grouped GEMM 分段是：
+
+```text
+post row:          0  1  2 | 3  4  5 | 6  7
+token copy:        A0 A3 B2| A1 B0 B3| A2 B1
+local expert id:   0  0  0 | 1  1  1 | 2  2
+tokens_per_expert: 3        | 3        | 2
+```
+
+输出：
+
+```text
+experts_out: [M_recv, D_h] = [8, D_h]
+```
+
+`ep1` 也是同理：
+
+```text
+post row:          0  1  2 | 3  4  5 | 6  7
+token copy:        A1 B0 B3| A0 A3 B1| A2 B2
+local expert id:   0  0  0 | 1  1  1 | 2  2
+tokens_per_expert: 3        | 3        | 2
+```
+
+## 5. `combine_preprocess`: 恢复 all2all receive 顺序
+
+专家输出现在是 local expert grouped 顺序，必须先恢复成 dispatch 后的 source-block 顺序，才能反向 all2all。
+
+对 `ep0`，用：
+
+```text
+row_ids_map = [0, 1, 3, 6, 2, 4, 5, 7]
+```
+
+做 `unpermute(experts_out, row_ids_map)` 后：
+
+```text
+pre_combined row:  0  1  2  3 | 4  5  6  7
+source ep rank:    0  0  0  0 | 1  1  1  1
+token copy:        A0 A3 A1 A2| B2 B0 B3 B1
+local expert id:   0  0  1  2 | 0  1  1  2
+```
+
+形状：
+
+```text
+pre_combined["hidden_states"]: [M_recv, D_h] = [8, D_h]
+```
+
+## 6. `combine`: 第二次 all2all，把 expert 输出送回 source rank
+
+`combine` 用的是第一次 dispatch 的反向 split：
+
+```text
+input_split_sizes  = dispatched["output_splits"]
+output_split_sizes = dispatched["input_splits"]
+```
+
+对 source `ep0` 来说，它会收回自己原来发出去的 8 个 token copy 输出：
+
+```text
+combined row on source ep0: 0  1  2  3 | 4  5  6  7
+from dest ep rank:          0  0  0  0 | 1  1  1  1
+token copy:                 A0 A3 A1 A2| A1 A0 A3 A2
+global expert id:           0  0  1  2 | 3  4  4  5
+```
+
+这个顺序正好对应 `ep0 dispatch_preprocess` 后的 sorted order。
+
+形状：
+
+```text
+combined["hidden_states"]: [N*K, D_h] = [8, D_h]
+```
+
+## 7. `combine_postprocess`: 用第一次 `row_id_map` 加权合并 topK
+
+回到 source `ep0` 后，用最开始的：
+
+```text
+pre_dispatched["row_id_map"] = [0, 4, 3, 6, 5, 2, 7, 1]
+topk_weights:                [N, K] = [4, 2]
+```
+
+把 sorted expert output 加权合并回原始 token 空间。概念上等价于先按原始 topK copy 分组：
+
+```text
+combined row:      0  1  2  3  4  5  6  7
+token copy:        A0 A3 A1 A2 A1 A0 A3 A2
+conceptual group:  A0 A0 | A1 A1 | A2 A2 | A3 A3
+topk slot:         0  1  | 0  1  | 0  1  | 0  1
+```
+
+然后 reshape：
+
+```text
+[N*K, D_h] -> [N, K, D_h] = [4, 2, D_h]
+```
+
+乘 `topk_weights [4, 2]` 并对 `K` 求和：
+
+```text
+A0 final = out(A0,e0) * w(A0,e0) + out(A0,e4) * w(A0,e4)
+A1 final = out(A1,e3) * w(A1,e3) + out(A1,e1) * w(A1,e1)
+A2 final = out(A2,e2) * w(A2,e2) + out(A2,e5) * w(A2,e5)
+A3 final = out(A3,e4) * w(A3,e4) + out(A3,e0) * w(A3,e0)
+```
+
+形状：
+
+```text
+post_combined["hidden_states"]: [N, D_h] = [4, D_h]
+```
+
+最后恢复原始 batch/seq：
+
+```text
+combined_hidden_states: [B, S, D_h]
+```
+
+## 8. `_post_moe_forward`
+
+前提是 `n_shared_experts=0`，所以没有 shared expert 分支：
+
+```text
+hidden_states = combined_hidden_states * hidden_factor + residual
+```
+
+输出：
+
+```text
+hidden_states:  [B, S, D_h]
+router_logits:  [N, E]
+router_weights: [N, E]
+```
+
+## 核心总结
+
+第一次 `row_id_map [N*K]` 是 source rank 上 `permute` 产生、最后由 `unpermute(..., probs=topk_weights)`
+消费的还原 map，负责加权合并回 `[N, D_h]`。
+
+第二次 `post_dispatched["row_ids_map"] [M_recv]` 是 destination EP rank 上第二次 `permute` 产生的还原 map，
+只负责 expert 计算后恢复 source-block 顺序，方便反向 all2all。两个 map 都应当按 backend opaque map 理解，
+不要按普通排序下标手算。

From b0445131f5a5bddc71b18d6ca157b8f926529c90 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Tue, 28 Apr 2026 08:54:45 +0000
Subject: [PATCH 02/25] fix row_id_map in dispatch_preprocess

---
 .dev_scripts/run_validate_xtuner_ep_md.sh |  8 ++--
 .dev_scripts/validate_xtuner_ep_md.py     |  6 +++
 xtuner_ep.md                              | 56 +++++++++++++++++------
 3 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/.dev_scripts/run_validate_xtuner_ep_md.sh b/.dev_scripts/run_validate_xtuner_ep_md.sh
index b00ac00f4..a32192acf 100755
--- a/.dev_scripts/run_validate_xtuner_ep_md.sh
+++ b/.dev_scripts/run_validate_xtuner_ep_md.sh
@@ -6,7 +6,10 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
 
 # 默认使用用户指定的 fla 环境；需要切换时可在命令前覆盖 CONDA_ENV。
 CONDA_ENV="${CONDA_ENV:-fla}"
-CONDA_SH="${CONDA_SH:-~/miniconda3/etc/profile.d/conda.sh}"
+source $(conda info --base)/etc/profile.d/conda.sh
+conda activate "${CONDA_ENV}"
+
+export XTUNER_EP_DEBUG=1
 
 # xtuner_ep.md 的示例固定为 EP=2；默认额外验证 4 份 DP replica。
 EP_SIZE="${EP_SIZE:-2}"
@@ -15,9 +18,6 @@ NPROC_PER_NODE="${NPROC_PER_NODE:-$((EP_SIZE * DP_SIZE))}"
 CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
 MASTER_PORT="${MASTER_PORT:-29531}"
 
-source "${CONDA_SH}"
-conda activate "${CONDA_ENV}"
-
 # 显式使用当前仓库代码，避免导入 conda 环境或其他目录下安装的 xtuner。
 export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
 export CUDA_VISIBLE_DEVICES
diff --git a/.dev_scripts/validate_xtuner_ep_md.py b/.dev_scripts/validate_xtuner_ep_md.py
index 465842ef8..f4414d3a4 100644
--- a/.dev_scripts/validate_xtuner_ep_md.py
+++ b/.dev_scripts/validate_xtuner_ep_md.py
@@ -46,6 +46,7 @@ class RankExpected:
     post_hidden: tuple[float, ...]
     post_row_ids_map: tuple[int, ...]
     tokens_per_expert: tuple[float, ...]
+    experts_out: tuple[float, ...]
     pre_combine_hidden: tuple[float, ...]
     combine_hidden: tuple[float, ...]
     post_combine_hidden: tuple[float, ...]
@@ -87,6 +88,7 @@ class ParallelInfo:
         post_hidden=(10.0, 13.0, 22.0, 11.0, 20.0, 23.0, 12.0, 21.0),
         post_row_ids_map=(0, 1, 3, 6, 2, 4, 5, 7),
         tokens_per_expert=(3.0, 3.0, 2.0),
+        experts_out=(10.0, 13.0, 22.0, 111.0, 120.0, 123.0, 212.0, 221.0),
         pre_combine_hidden=(10.0, 13.0, 111.0, 212.0, 22.0, 120.0, 123.0, 221.0),
         combine_hidden=(10.0, 13.0, 111.0, 212.0, 311.0, 410.0, 413.0, 512.0),
         post_combine_hidden=(310.0, 191.0, 302.0, 333.0),
@@ -103,6 +105,7 @@ class ParallelInfo:
         post_hidden=(11.0, 20.0, 23.0, 10.0, 13.0, 21.0, 12.0, 22.0),
         post_row_ids_map=(0, 3, 4, 6, 1, 2, 5, 7),
         tokens_per_expert=(3.0, 3.0, 2.0),
+        experts_out=(311.0, 320.0, 323.0, 410.0, 413.0, 421.0, 512.0, 522.0),
         pre_combine_hidden=(311.0, 410.0, 413.0, 512.0, 320.0, 323.0, 421.0, 522.0),
         combine_hidden=(22.0, 120.0, 123.0, 221.0, 320.0, 323.0, 421.0, 522.0),
         post_combine_hidden=(280.0, 321.0, 472.0, 193.0),
@@ -231,6 +234,7 @@ def _run_xtuner_ep_case(parallel_info: ParallelInfo) -> dict[str, Any]:
         "post_hidden": post_dispatched["hidden_states"],
         "post_row_ids_map": post_dispatched["row_ids_map"],
         "tokens_per_expert": post_dispatched["tokens_per_expert"],
+        "experts_out": experts_out,
         "pre_combine_hidden": pre_combined["hidden_states"],
         "combine_hidden": combined["hidden_states"],
         "post_combine_hidden": post_combined["hidden_states"],
@@ -276,6 +280,7 @@ def _validate(parallel_info: ParallelInfo, snapshots: dict[str, Any]) -> None:
         _assert_tensor_close(parallel_info, "post_hidden", snapshots["post_hidden"], expected.post_hidden, first_col=True)
         _assert_tensor_close(parallel_info, "post_row_ids_map", snapshots["post_row_ids_map"], expected.post_row_ids_map)
         _assert_tensor_close(parallel_info, "tokens_per_expert", snapshots["tokens_per_expert"], expected.tokens_per_expert)
+        _assert_tensor_close(parallel_info, "experts_out", snapshots["experts_out"], expected.experts_out, first_col=True)
         _assert_tensor_close(
             parallel_info,
             "pre_combine_hidden",
@@ -360,6 +365,7 @@ def _print_snapshots(parallel_info: ParallelInfo, snapshots: dict[str, Any]) ->
         "pre_hidden",
         "dispatch_hidden",
         "post_hidden",
+        "experts_out",
         "pre_combine_hidden",
         "combine_hidden",
         "post_combine_hidden",
diff --git a/xtuner_ep.md b/xtuner_ep.md
index 1d71cc3cb..f530237bd 100644
--- a/xtuner_ep.md
+++ b/xtuner_ep.md
@@ -57,15 +57,21 @@ B3 -> [3, 1]
 
 先把每个 token 复制 `K=2` 份，所以每个 source rank 都从 `[4, D_h]` 变成 `[8, D_h]`。
 
+`grouped_gemm.backend.permute` 内部使用 **topk-slot-first** 展开：先列出所有 N 个 token 的
+第 0 号 topk copy，再列出第 1 号 topk copy，依此类推。`row_id_map[i] = j` 表示源 flat 空间
+（topk-slot-first）第 `i` 个位置的 token copy 排序后落在第 `j` 个位置（scatter 语义）；
+同 expert 时按 token index 升序排列。
+
 对 `ep0`，flatten 后的 copy 是：
 
 ```text
-flat row:          0   1   2   3   4   5   6   7
-token copy:        A0  A0  A1  A1  A2  A2  A3  A3
-global expert id:  0   4   3   1   2   5   4   0
+flat pos:          0   1   2   3   4   5   6   7
+token copy:        A0  A1  A2  A3  A0  A1  A2  A3
+global expert id:  0   3   2   4   4   1   5   0
+topk slot:         0   0   0   0   1   1   1   1
 ```
 
-按 global expert id 稳定排序后：
+按 `(expert, token index)` 排序后：
 
 ```text
 pre row:           0   1   2   3   4   5   6   7
@@ -74,23 +80,37 @@ global expert id:  0   0   1   2   3   4   4   5
 row_id_map:        0   4   3   6   5   2   7   1
 ```
 
+将上面两组放到一起看`row_id_map`映射关系
+
+```text
+flat pos:          0   1   2   3   4   5   6   7
+token copy:        A0  A1  A2  A3  A0  A1  A2  A3
+row_id_map:        0   4   3   6   5   2   7   1
+
+pre row:           0   1   2   3   4   5   6   7
+token copy:        A0  A3  A1  A2  A1  A0  A3  A2
+global expert id:  0   0   1   2   3   4   4   5
+```
+
+
+
 所以：
 
 ```text
-pre_dispatched["hidden_states"]: [N*K, D_h] = [8, D_h]
-pre_dispatched["row_id_map"]:    [N*K]      = [8]
+pre_dispatched[“hidden_states”]: [N*K, D_h] = [8, D_h]
+pre_dispatched[“row_id_map”]:    [N*K]      = [8]
 ```
 
-这里的 `row_id_map` 是 `permute` 返回、后续 `unpermute` 消费的还原 map。当前 `grouped_gemm`
-backend 下它不是简单的 “pre row j 对应原始 topK flatten 空间里的哪个位置”，不要把它当成普通
-`index_put` 的下标表来手算。
+`backend.unpermute(combined, row_id_map, probs)` 对应的逆操作是 gather：
+`output[i] = combined[row_id_map[i]]`，输出按 topk-slot-first 排布后乘以 `probs` 再沿 K 方向求和。
 
 对 `ep1` 同理：
 
 ```text
-flat row:          0   1   2   3   4   5   6   7
-token copy:        B0  B0  B1  B1  B2  B2  B3  B3
-global expert id:  1   3   4   2   5   0   3   1
+flat pos:          0   1   2   3   4   5   6   7
+token copy:        B0  B1  B2  B3  B0  B1  B2  B3
+global expert id:  1   4   5   3   3   2   0   1
+topk slot:         0   0   0   0   1   1   1   1
 
 pre row:           0   1   2   3   4   5   6   7
 token copy:        B2  B0  B3  B1  B0  B3  B1  B2
@@ -367,8 +387,14 @@ router_weights: [N, E]
 ## 核心总结
 
 第一次 `row_id_map [N*K]` 是 source rank 上 `permute` 产生、最后由 `unpermute(..., probs=topk_weights)`
-消费的还原 map，负责加权合并回 `[N, D_h]`。
+消费的还原 map，负责加权合并回 `[N, D_h]`。其精确语义：
+
+- **scatter**：`row_id_map[i] = j` 表示 topk-slot-first 源 flat 空间第 `i` 个位置的 token copy
+  排序后落在 sorted 空间第 `j` 个位置。
+- **unpermute 逆操作**：gather，`output[i] = combined[row_id_map[i]]`，输出按 topk-slot-first
+  排布后乘 `probs` 再沿 K 求和，得到 `[N, D_h]`。
+- `grouped_gemm.backend.permute` 内部使用 topk-slot-first 展开，同 expert 时按 token index 升序；
+  手动从 token-first flat 展开推导会得到不同的值，两者不可混用。
 
 第二次 `post_dispatched["row_ids_map"] [M_recv]` 是 destination EP rank 上第二次 `permute` 产生的还原 map，
-只负责 expert 计算后恢复 source-block 顺序，方便反向 all2all。两个 map 都应当按 backend opaque map 理解，
-不要按普通排序下标手算。
+语义相同（scatter，1D indices 无 topk 展开），只负责 expert 计算后恢复 source-block 顺序，方便反向 all2all。

From 4c0147f5f8b44348a768720661da89f0f7875398 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Tue, 28 Apr 2026 10:18:27 +0000
Subject: [PATCH 03/25] feat(dispatcher): add torch all2all TP/EP dispatcher
 and TP+EP docs validation

- Add torch_all2all_tpep dispatcher and wire it in dispatcher __init__
- Add megatron_tp_ep.md and validate_xtuner_tpep_md script with shell runner
- Apply ruff formatting to validation script; fix mypy (ctx Any, combine_preprocess signature)

Made-with: Cursor
---
 .dev_scripts/run_validate_xtuner_tpep_md.sh   |  33 ++
 .dev_scripts/validate_xtuner_tpep_md.py       | 515 ++++++++++++++++++
 megatron_tp_ep.md                             | 207 +++++++
 xtuner/v1/module/dispatcher/__init__.py       |  13 +-
 .../module/dispatcher/torch_all2all_tpep.py   | 329 +++++++++++
 5 files changed, 1096 insertions(+), 1 deletion(-)
 create mode 100755 .dev_scripts/run_validate_xtuner_tpep_md.sh
 create mode 100644 .dev_scripts/validate_xtuner_tpep_md.py
 create mode 100644 megatron_tp_ep.md
 create mode 100644 xtuner/v1/module/dispatcher/torch_all2all_tpep.py

diff --git a/.dev_scripts/run_validate_xtuner_tpep_md.sh b/.dev_scripts/run_validate_xtuner_tpep_md.sh
new file mode 100755
index 000000000..51d9c0825
--- /dev/null
+++ b/.dev_scripts/run_validate_xtuner_tpep_md.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# 默认使用用户指定的 fla 环境；需要切换时可在命令前覆盖 CONDA_ENV。
+CONDA_ENV="${CONDA_ENV:-fla}"
+source $(conda info --base)/etc/profile.d/conda.sh
+conda activate "${CONDA_ENV}"
+
+export XTUNER_TPEP_DEBUG=1
+
+# xtuner_ep.md 的示例固定为 EP=2；默认额外验证 4 份 DP replica。
+EP_SIZE="${EP_SIZE:-2}"
+TP_SIZE="${TP_SIZE:-2}"
+DP_SIZE="${DP_SIZE:-1}"
+NPROC_PER_NODE="${NPROC_PER_NODE:-$((EP_SIZE * TP_SIZE * DP_SIZE))}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
+MASTER_PORT="${MASTER_PORT:-29531}"
+
+# 显式使用当前仓库代码，避免导入 conda 环境或其他目录下安装的 xtuner。
+export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
+export CUDA_VISIBLE_DEVICES
+export EP_SIZE
+export TP_SIZE
+export DP_SIZE
+
+cd "${REPO_ROOT}"
+torchrun \
+  --nproc-per-node="${NPROC_PER_NODE}" \
+  --master-port="${MASTER_PORT}" \
+  .dev_scripts/validate_xtuner_tpep_md.py
diff --git a/.dev_scripts/validate_xtuner_tpep_md.py b/.dev_scripts/validate_xtuner_tpep_md.py
new file mode 100644
index 000000000..33de6ab38
--- /dev/null
+++ b/.dev_scripts/validate_xtuner_tpep_md.py
@@ -0,0 +1,515 @@
+"""验证 XTuner TP+EP all2all 示例的中间顺序。
+
+参数设置（固定）:
+    EP = 2, TP = 2 → world_size = EP * TP * DP = 4 * DP_SIZE
+
+Device mesh 排列（mesh_shape=(dp, ep, tp)）:
+    rank 0 → (dp=0, ep=0, tp=0)  tokens: A0=10, A1=11
+    rank 1 → (dp=0, ep=0, tp=1)  tokens: A2=12, A3=13
+    rank 2 → (dp=0, ep=1, tp=0)  tokens: B0=20, B1=21
+    rank 3 → (dp=0, ep=1, tp=1)  tokens: B2=22, B3=23
+
+每个 TP rank 持有 N_local=2 个 token，EP+TP 后的流程：
+
+    dispatch_preprocess : 按 expert 排序（每 TP rank 独立）
+    dispatch            : EP AlltoAll（每 TP rank 独立，仅路由本 TP 的 token 副本）
+    dispatch_postprocess: TP AllGather → 将 TP slices 合并成 M_total token
+                          + 按 local expert 再排序（供 grouped GEMM）
+    [Expert GEMM]       : 冗余计算（同一 EP rank 内各 TP rank 计算结果相同）
+    combine_preprocess  : unpermute → TP ReduceScatterMean → 恢复每 TP rank M_ep_recv
+    combine             : EP AlltoAll 逆向
+    combine_postprocess : unpermute + topk 加权求和 → [N_local, H]
+
+运行方式：
+    EP_SIZE=2 TP_SIZE=2 DP_SIZE=1 torchrun --nproc-per-node=4 \
+        .dev_scripts/validate_xtuner_tpep_md.py
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import init_device_mesh
+
+from xtuner.v1.module.dispatcher.torch_all2all_tpep import TorchAll2AllTPEPDispatcher
+
+
+EP_SIZE = 2
+TP_SIZE = 2
+DEFAULT_DP_SIZE = 1
+N_ROUTED_EXPERTS = 6
+EXPERTS_PER_RANK = 3
+EXPERT_OUTPUT_SCALE = 100.0
+HIDDEN_SIZE = 128
+
+
+@dataclass(frozen=True)
+class RankCase:
+    token_values: tuple[float, ...]
+    topk_ids: tuple[tuple[int, int], ...]
+    topk_weights: tuple[tuple[float, float], ...]
+
+
+@dataclass(frozen=True)
+class RankExpected:
+    input_hidden: tuple[float, ...]
+    topk_ids: tuple[tuple[int, int], ...]
+    pre_hidden: tuple[float, ...]
+    pre_row_id_map: tuple[int, ...]
+    dispatch_hidden: tuple[float, ...]
+    input_splits: tuple[int, ...]
+    output_splits: tuple[int, ...]
+    tokens_per_expert_group: tuple[float, ...]
+    output_splits_tp: tuple[int, ...]
+    post_hidden: tuple[float, ...]
+    post_row_ids_map: tuple[int, ...]
+    tokens_per_expert: tuple[float, ...]
+    experts_out: tuple[float, ...]
+    pre_combine_hidden: tuple[float, ...]
+    combine_hidden: tuple[float, ...]
+    post_combine_hidden: tuple[float, ...]
+
+
+@dataclass(frozen=True)
+class ParallelInfo:
+    global_rank: int
+    dp_rank: int
+    ep_rank: int
+    tp_rank: int
+    device: torch.device
+    ep_group: dist.ProcessGroup
+    tp_group: dist.ProcessGroup
+
+
+# (ep_rank, tp_rank) → RankCase
+# ep0_tp0: A0, A1  |  ep0_tp1: A2, A3
+# ep1_tp0: B0, B1  |  ep1_tp1: B2, B3
+CASES: dict[tuple[int, int], RankCase] = {
+    (0, 0): RankCase(
+        token_values=(10.0, 11.0),
+        topk_ids=((0, 4), (3, 1)),
+        topk_weights=((0.25, 0.75), (0.4, 0.6)),
+    ),
+    (0, 1): RankCase(
+        token_values=(12.0, 13.0),
+        topk_ids=((2, 5), (4, 0)),
+        topk_weights=((0.7, 0.3), (0.8, 0.2)),
+    ),
+    (1, 0): RankCase(
+        token_values=(20.0, 21.0),
+        topk_ids=((1, 3), (4, 2)),
+        topk_weights=((0.2, 0.8), (0.5, 0.5)),
+    ),
+    (1, 1): RankCase(
+        token_values=(22.0, 23.0),
+        topk_ids=((5, 0), (3, 1)),
+        topk_weights=((0.9, 0.1), (0.35, 0.65)),
+    ),
+}
+
+
+# All expected values derived by hand.  See xtuner_tpep.md for the full derivation.
+#
+# Notation (token value as token id):
+#   A0=10, A1=11, A2=12, A3=13  (ep0 source tokens)
+#   B0=20, B1=21, B2=22, B3=23  (ep1 source tokens)
+#   expert mock: out = in + global_expert_id * 100
+EXPECTED: dict[tuple[int, int], RankExpected] = {
+    # rank 0: (ep=0, tp=0) — tokens A0, A1
+    (0, 0): RankExpected(
+        input_hidden=(10.0, 11.0),
+        topk_ids=((0, 4), (3, 1)),
+        # sorted (topk-slot-first then by expert): A0(e0), A1(e1), A1(e3), A0(e4)
+        pre_hidden=(10.0, 11.0, 11.0, 10.0),
+        pre_row_id_map=(0, 2, 3, 1),
+        # after EP A2A: from self=[A0(e0),A1(e1)], from ep1_tp0=[B0(e1),B1(e2)]
+        dispatch_hidden=(10.0, 11.0, 20.0, 21.0),
+        input_splits=(2, 2),
+        output_splits=(2, 2),
+        tokens_per_expert_group=(1.0, 1.0, 0.0, 0.0, 1.0, 1.0),
+        output_splits_tp=(4, 4),
+        # after TP AllGather (tp0||tp1) + sort by local expert:
+        # e0: A0,A3,B2  e1: A1,B0,B3  e2: B1,A2
+        post_hidden=(10.0, 13.0, 22.0, 11.0, 20.0, 23.0, 21.0, 12.0),
+        post_row_ids_map=(0, 3, 4, 6, 1, 7, 2, 5),
+        tokens_per_expert=(3.0, 3.0, 2.0),
+        # expert adds global_expert_id * 100
+        experts_out=(10.0, 13.0, 22.0, 111.0, 120.0, 123.0, 221.0, 212.0),
+        # after ReduceScatterMean — tp0 slice [0:4]
+        pre_combine_hidden=(10.0, 111.0, 120.0, 221.0),
+        # after EP A2A reverse: from self=[10,111], from ep1_tp0=[311,410]
+        combine_hidden=(10.0, 111.0, 311.0, 410.0),
+        post_combine_hidden=(310.0, 191.0),
+    ),
+    # rank 1: (ep=0, tp=1) — tokens A2, A3
+    (0, 1): RankExpected(
+        input_hidden=(12.0, 13.0),
+        topk_ids=((2, 5), (4, 0)),
+        # sorted: A3(e0), A2(e2), A3(e4), A2(e5)
+        pre_hidden=(13.0, 12.0, 13.0, 12.0),
+        pre_row_id_map=(1, 2, 3, 0),
+        # after EP A2A: from self=[A3(e0),A2(e2)], from ep1_tp1=[B2(e0),B3(e1)]
+        dispatch_hidden=(13.0, 12.0, 22.0, 23.0),
+        input_splits=(2, 2),
+        output_splits=(2, 2),
+        tokens_per_expert_group=(1.0, 0.0, 1.0, 1.0, 1.0, 0.0),
+        output_splits_tp=(4, 4),
+        # both tp ranks see the same gathered tensor after AllGather
+        post_hidden=(10.0, 13.0, 22.0, 11.0, 20.0, 23.0, 21.0, 12.0),
+        post_row_ids_map=(0, 3, 4, 6, 1, 7, 2, 5),
+        tokens_per_expert=(3.0, 3.0, 2.0),
+        experts_out=(10.0, 13.0, 22.0, 111.0, 120.0, 123.0, 221.0, 212.0),
+        # after ReduceScatterMean — tp1 slice [4:8]
+        pre_combine_hidden=(13.0, 212.0, 22.0, 123.0),
+        # after EP A2A reverse: from self=[13,212], from ep1_tp1=[413,512]
+        combine_hidden=(13.0, 212.0, 413.0, 512.0),
+        post_combine_hidden=(302.0, 333.0),
+    ),
+    # rank 2: (ep=1, tp=0) — tokens B0, B1
+    (1, 0): RankExpected(
+        input_hidden=(20.0, 21.0),
+        topk_ids=((1, 3), (4, 2)),
+        # sorted: B0(e1), B1(e2), B0(e3), B1(e4)
+        pre_hidden=(20.0, 21.0, 20.0, 21.0),
+        pre_row_id_map=(0, 3, 2, 1),
+        # after EP A2A: from ep0_tp0=[A1(e3),A0(e4)], from self=[B0(e3),B1(e4)]
+        dispatch_hidden=(11.0, 10.0, 20.0, 21.0),
+        input_splits=(2, 2),
+        output_splits=(2, 2),
+        tokens_per_expert_group=(1.0, 1.0, 0.0, 1.0, 1.0, 0.0),
+        output_splits_tp=(4, 4),
+        # after TP AllGather (tp0||tp1) + sort: e3: A1,B0,B3  e4: A0,B1,A3  e5: A2,B2
+        post_hidden=(11.0, 20.0, 23.0, 10.0, 21.0, 13.0, 12.0, 22.0),
+        post_row_ids_map=(0, 3, 1, 4, 5, 6, 2, 7),
+        tokens_per_expert=(3.0, 3.0, 2.0),
+        experts_out=(311.0, 320.0, 323.0, 410.0, 421.0, 413.0, 512.0, 522.0),
+        # after ReduceScatterMean — tp0 slice [0:4]
+        pre_combine_hidden=(311.0, 410.0, 320.0, 421.0),
+        # after EP A2A reverse: from ep0_tp0=[120,221], from self=[320,421]
+        combine_hidden=(120.0, 221.0, 320.0, 421.0),
+        post_combine_hidden=(280.0, 321.0),
+    ),
+    # rank 3: (ep=1, tp=1) — tokens B2, B3
+    (1, 1): RankExpected(
+        input_hidden=(22.0, 23.0),
+        topk_ids=((5, 0), (3, 1)),
+        # sorted: B2(e0), B3(e1), B3(e3), B2(e5)
+        pre_hidden=(22.0, 23.0, 23.0, 22.0),
+        pre_row_id_map=(3, 2, 0, 1),
+        # after EP A2A: from ep0_tp1=[A3(e4),A2(e5)], from self=[B3(e3),B2(e5)]
+        dispatch_hidden=(13.0, 12.0, 23.0, 22.0),
+        input_splits=(2, 2),
+        output_splits=(2, 2),
+        tokens_per_expert_group=(0.0, 1.0, 1.0, 1.0, 0.0, 1.0),
+        output_splits_tp=(4, 4),
+        post_hidden=(11.0, 20.0, 23.0, 10.0, 21.0, 13.0, 12.0, 22.0),
+        post_row_ids_map=(0, 3, 1, 4, 5, 6, 2, 7),
+        tokens_per_expert=(3.0, 3.0, 2.0),
+        experts_out=(311.0, 320.0, 323.0, 410.0, 421.0, 413.0, 512.0, 522.0),
+        # after ReduceScatterMean — tp1 slice [4:8]
+        pre_combine_hidden=(413.0, 512.0, 323.0, 522.0),
+        # after EP A2A reverse: from ep0_tp1=[22,123], from self=[323,522]
+        combine_hidden=(22.0, 123.0, 323.0, 522.0),
+        post_combine_hidden=(472.0, 193.0),
+    ),
+}
+
+
+def main() -> None:
+    try:
+        parallel_info = _init_distributed()
+        snapshots = _run_tpep_case(parallel_info)
+        _validate(parallel_info, snapshots)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+def _init_distributed() -> ParallelInfo:
+    if not torch.cuda.is_available():
+        raise RuntimeError("TorchAll2AllTPEPDispatcher 当前依赖 CUDA，请在 GPU 上用 torchrun 运行。")
+
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group(backend="nccl")
+
+    ep_size = _get_env_int("EP_SIZE", EP_SIZE)
+    tp_size = _get_env_int("TP_SIZE", TP_SIZE)
+    dp_size = _get_env_int("DP_SIZE", DEFAULT_DP_SIZE)
+
+    if ep_size != EP_SIZE or tp_size != TP_SIZE:
+        raise RuntimeError("本脚本固定为 EP=2, TP=2。")
+
+    world_size = dist.get_world_size()
+    if world_size != dp_size * ep_size * tp_size:
+        raise RuntimeError(f"需要 world_size = DP*EP*TP = {dp_size * ep_size * tp_size}，实际为 {world_size}。")
+
+    # mesh_shape=(dp, ep, tp):
+    #   rank 0 → (dp=0,ep=0,tp=0), rank 1 → (dp=0,ep=0,tp=1)
+    #   rank 2 → (dp=0,ep=1,tp=0), rank 3 → (dp=0,ep=1,tp=1)
+    mesh = init_device_mesh(
+        "cuda",
+        (dp_size, ep_size, tp_size),
+        mesh_dim_names=("dp", "ep", "tp"),
+    )
+
+    global_rank = dist.get_rank()
+    ep_rank = mesh["ep"].get_local_rank()
+    tp_rank = mesh["tp"].get_local_rank()
+    dp_rank = mesh["dp"].get_local_rank()
+
+    return ParallelInfo(
+        global_rank=global_rank,
+        dp_rank=dp_rank,
+        ep_rank=ep_rank,
+        tp_rank=tp_rank,
+        device=torch.device("cuda", local_rank),
+        ep_group=mesh["ep"].get_group(),
+        tp_group=mesh["tp"].get_group(),
+    )
+
+
+@torch.no_grad()
+def _run_tpep_case(parallel_info: ParallelInfo) -> dict[str, Any]:
+    case = CASES[(parallel_info.ep_rank, parallel_info.tp_rank)]
+    hidden_states = torch.zeros(
+        (len(case.token_values), HIDDEN_SIZE), dtype=torch.float32, device=parallel_info.device
+    )
+    hidden_states[:, 0] = torch.tensor(case.token_values, dtype=torch.float32, device=parallel_info.device)
+    topk_ids = torch.tensor(case.topk_ids, dtype=torch.long, device=parallel_info.device)
+    topk_weights = torch.tensor(case.topk_weights, dtype=torch.float32, device=parallel_info.device)
+
+    dispatcher = TorchAll2AllTPEPDispatcher(
+        n_routed_experts=N_ROUTED_EXPERTS,
+        ep_group=parallel_info.ep_group,
+        tp_group=parallel_info.tp_group,
+        training_dtype="bf16",
+    )
+
+    pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden_states, topk_ids=topk_ids)
+
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        decoding=False,
+    )
+
+    post_dispatched = dispatcher.dispatch_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+    )
+
+    experts_out = _mock_local_experts(
+        hidden_states=post_dispatched["hidden_states"],
+        tokens_per_expert=post_dispatched["tokens_per_expert"],
+        ep_rank=parallel_info.ep_rank,
+    )
+
+    pre_combined = dispatcher.combine_preprocess(
+        hidden_states=experts_out,
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        decoding=False,
+    )
+
+    combined = dispatcher.combine(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        decoding=False,
+    )
+
+    post_combined = dispatcher.combine_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        combined=combined,
+    )
+
+    return {
+        "input_hidden": hidden_states,
+        "topk_ids": topk_ids,
+        "pre_hidden": pre_dispatched["hidden_states"],
+        "pre_row_id_map": pre_dispatched["row_id_map"],
+        "dispatch_hidden": dispatched["hidden_states"],
+        "input_splits": dispatched["input_splits"],
+        "output_splits": dispatched["output_splits"],
+        "tokens_per_expert_group": dispatched["tokens_per_expert_group"],
+        "output_splits_tp": post_dispatched["output_splits_tp"],
+        "post_hidden": post_dispatched["hidden_states"],
+        "post_row_ids_map": post_dispatched["row_ids_map"],
+        "tokens_per_expert": post_dispatched["tokens_per_expert"],
+        "experts_out": experts_out,
+        "pre_combine_hidden": pre_combined["hidden_states"],
+        "combine_hidden": combined["hidden_states"],
+        "post_combine_hidden": post_combined["hidden_states"],
+    }
+
+
+def _mock_local_experts(
+    *,
+    hidden_states: torch.Tensor,
+    tokens_per_expert: torch.Tensor,
+    ep_rank: int,
+) -> torch.Tensor:
+    local_expert_ids = torch.arange(EXPERTS_PER_RANK, dtype=torch.float32, device=hidden_states.device)
+    local_expert_ids = torch.repeat_interleave(local_expert_ids, tokens_per_expert.to(torch.long))
+    global_expert_ids = ep_rank * EXPERTS_PER_RANK + local_expert_ids
+    return hidden_states + global_expert_ids.view(-1, 1) * EXPERT_OUTPUT_SCALE
+
+
+def _validate(parallel_info: ParallelInfo, snapshots: dict[str, Any]) -> None:
+    key = (parallel_info.ep_rank, parallel_info.tp_rank)
+    expected = EXPECTED[key]
+    error: AssertionError | None = None
+
+    try:
+        if os.getenv("XTUNER_TPEP_DEBUG", "0") == "1":
+            _print_snapshots(parallel_info, snapshots)
+
+        _assert_tensor_close(parallel_info, "pre_hidden", snapshots["pre_hidden"], expected.pre_hidden, first_col=True)
+        _assert_tensor_close(parallel_info, "pre_row_id_map", snapshots["pre_row_id_map"], expected.pre_row_id_map)
+        _assert_tensor_close(
+            parallel_info, "dispatch_hidden", snapshots["dispatch_hidden"], expected.dispatch_hidden, first_col=True
+        )
+        _assert_list_equal(parallel_info, "input_splits", snapshots["input_splits"], expected.input_splits)
+        _assert_list_equal(parallel_info, "output_splits", snapshots["output_splits"], expected.output_splits)
+        _assert_tensor_close(
+            parallel_info,
+            "tokens_per_expert_group",
+            snapshots["tokens_per_expert_group"],
+            expected.tokens_per_expert_group,
+        )
+        _assert_list_equal(parallel_info, "output_splits_tp", snapshots["output_splits_tp"], expected.output_splits_tp)
+        _assert_tensor_close(
+            parallel_info, "post_hidden", snapshots["post_hidden"], expected.post_hidden, first_col=True
+        )
+        _assert_tensor_close(
+            parallel_info, "post_row_ids_map", snapshots["post_row_ids_map"], expected.post_row_ids_map
+        )
+        _assert_tensor_close(
+            parallel_info, "tokens_per_expert", snapshots["tokens_per_expert"], expected.tokens_per_expert
+        )
+        _assert_tensor_close(
+            parallel_info, "experts_out", snapshots["experts_out"], expected.experts_out, first_col=True
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "pre_combine_hidden",
+            snapshots["pre_combine_hidden"],
+            expected.pre_combine_hidden,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "combine_hidden",
+            snapshots["combine_hidden"],
+            expected.combine_hidden,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "post_combine_hidden",
+            snapshots["post_combine_hidden"],
+            expected.post_combine_hidden,
+            atol=1e-4,
+            first_col=True,
+        )
+    except AssertionError as exc:
+        error = exc
+
+    failed = torch.tensor([int(error is not None)], dtype=torch.int32, device=parallel_info.device)
+    dist.all_reduce(failed, op=dist.ReduceOp.SUM)
+
+    if failed.item() != 0:
+        if error is not None:
+            raise error
+        raise AssertionError("其他 rank 的 TP+EP 示例校验失败。")
+
+    if parallel_info.global_rank == 0:
+        print("xtuner TP+EP EP=2 TP=2 all2all 示例校验通过。")
+
+
+def _assert_tensor_close(
+    parallel_info: ParallelInfo,
+    name: str,
+    actual: torch.Tensor,
+    expected: tuple[float, ...] | tuple[int, ...],
+    *,
+    atol: float = 0.0,
+    first_col: bool = False,
+) -> None:
+    actual_1d = actual.detach()
+    if first_col and actual_1d.dim() > 1:
+        actual_1d = actual_1d[:, 0]
+    actual_1d = actual_1d.reshape(-1).to(torch.float32)
+    expected_tensor = torch.tensor(expected, dtype=torch.float32, device=actual.device)
+    try:
+        torch.testing.assert_close(actual_1d, expected_tensor, rtol=0.0, atol=atol)
+    except AssertionError as exc:
+        raise AssertionError(
+            f"global_rank={parallel_info.global_rank} ep_rank={parallel_info.ep_rank} "
+            f"tp_rank={parallel_info.tp_rank} 的 {name} 不符合预期："
+            f"actual={actual_1d.cpu().tolist()}, expected={expected_tensor.cpu().tolist()}"
+        ) from exc
+
+
+def _assert_list_equal(
+    parallel_info: ParallelInfo,
+    name: str,
+    actual: list[int],
+    expected: tuple[int, ...],
+) -> None:
+    if actual != list(expected):
+        raise AssertionError(
+            f"global_rank={parallel_info.global_rank} ep_rank={parallel_info.ep_rank} "
+            f"tp_rank={parallel_info.tp_rank} 的 {name} 不符合预期："
+            f"actual={actual}, expected={list(expected)}"
+        )
+
+
+def _get_env_int(name: str, default: int) -> int:
+    value = os.getenv(name)
+    if value is None:
+        return default
+    return int(value)
+
+
+def _print_snapshots(parallel_info: ParallelInfo, snapshots: dict[str, Any]) -> None:
+    hidden_names = {
+        "input_hidden",
+        "pre_hidden",
+        "dispatch_hidden",
+        "post_hidden",
+        "experts_out",
+        "pre_combine_hidden",
+        "combine_hidden",
+        "post_combine_hidden",
+    }
+    for name, value in snapshots.items():
+        if isinstance(value, torch.Tensor):
+            tensor = value.detach()
+            if name in hidden_names and tensor.dim() > 1:
+                tensor = tensor[:, 0]
+            print(
+                f"[global_rank={parallel_info.global_rank} ep_rank={parallel_info.ep_rank} "
+                f"tp_rank={parallel_info.tp_rank}] {name}: {tensor.reshape(-1).cpu().tolist()}",
+                flush=True,
+            )
+        else:
+            print(
+                f"[global_rank={parallel_info.global_rank} ep_rank={parallel_info.ep_rank} "
+                f"tp_rank={parallel_info.tp_rank}] {name}: {value}",
+                flush=True,
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/megatron_tp_ep.md b/megatron_tp_ep.md
new file mode 100644
index 000000000..e255948fd
--- /dev/null
+++ b/megatron_tp_ep.md
@@ -0,0 +1,207 @@
+以下是 EP + TP 同时开启时，`MoELayer.forward` 调用 `MoEAlltoAllTokenDispatcher` 的完整流程。
+
+---
+
+## 前置形状约定
+
+| 符号             | 含义                                               |
+| ---------------- | -------------------------------------------------- |
+| `S/TP * B`       | 每个设备持有的 local tokens（SP 下序列按 TP 切分） |
+| `H`              | hidden size（专家计算不按 TP 切分 H 维）           |
+| `E`              | 总专家数                                           |
+| `E_local = E/EP` | 每个 EP rank 持有的本地专家数                      |
+
+输入：`hidden_states [S/TP, B, H]`，每个设备只持有序列的 `1/TP` 片段。
+
+---
+
+## token_permutation 流程
+
+### 1. `preprocess(routing_map)`
+
+在 `tp_ep_group`（TP × EP 域）上做一次 AllGather，收集全局的 `num_tokens → expert` 分布，计算：
+
+- `input_splits [EP]`：本 rank 要向各 EP rank 发送多少 token
+- `output_splits [EP]`：本 rank 将从各 EP rank 收到多少 token（仅计我的 TP 切片）
+- `output_splits_tp [TP]`：EP A2A 后，各 TP rank 各持有多少 token（用于后续 AllGather 的不等分）
+- `num_global_tokens_per_local_expert_cpu`：每个本地专家将处理多少 token（用于 sort_chunks）
+
+---
+
+### 2. Permutation 1：按专家排序（本地）
+
+```
+hidden_states [N_local, H]
+    → permute(routing_map)
+    → permutated_local_input_tokens [num_out_tokens, H]
+```
+
+将本地 token 按 **目标 EP rank → 目标专家** 的顺序排列，为 EP A2A 的连续内存布局做准备。同时保存逆映射 `reversed_local_input_permutation_mapping`。
+
+---
+
+### 3. EP AlltoAll（第一次 A2A）
+
+```
+all_to_all(ep_group,
+           send=permutated_local_input_tokens,
+           output_splits=output_splits,   # 我将收到多少
+           input_splits=input_splits)     # 我将发出多少
+→ global_input_tokens [M_ep_recv, H]
+```
+
+每个 EP rank 收到来自所有 EP rank 的、目标是本 rank 本地专家的 token，但**仍只是每个 EP rank 的 TP 切片**（即来自同一 EP rank 不同 TP rank 的 token 还未合并）。
+
+---
+
+### 4. TP AllGather（补全序列切片）
+
+```python
+if self.tp_size > 1:
+    global_input_tokens = gather_from_sequence_parallel_region(
+        global_input_tokens, group=tp_group,
+        output_split_sizes=output_splits_tp.tolist()
+    )
+→ global_input_tokens [M_total, H]
+```
+
+在 TP 组内 AllGather，把同一 EP rank 下不同 TP rank 持有的 token 片段拼合。之后每个设备（同一 EP rank 内的所有 TP rank）都持有完整的、需要送入本地专家的 token 集合。
+
+---
+
+### 5. Permutation 2：按本地专家排序（为 Grouped GEMM）
+
+```python
+if self.num_local_experts > 1:
+    global_input_tokens = sort_chunks_by_idxs(
+        global_input_tokens,
+        num_global_tokens_per_local_expert_cpu.ravel(),
+        sort_input_by_local_experts
+    )
+→ dispatched_input [M_total, H]，按 local expert 连续分组
+```
+
+AllGather 后的顺序是 `[TP rank 0 的 block | TP rank 1 的 block | ...]`，每块内部已按本地专家排序，但整体不连续。这里做一次 sort_chunks 让同一专家的 token 在内存中连续，满足 Grouped GEMM 的输入要求。
+
+---
+
+## 专家计算
+
+```
+experts(dispatched_input, tokens_per_expert)
+→ expert_output [M_total, H]
+```
+
+每个 EP rank 用 Grouped GEMM 计算本地 `E_local` 个专家，各 TP rank 计算相同的数据（专家权重本身不按 TP 切分 H 维，是完整权重的副本）。
+
+---
+
+## token_unpermutation 流程（逆序）
+
+### 6. Unpermutation 2：逆 sort_chunks
+
+```python
+if self.num_local_experts > 1:
+    hidden_states = sort_chunks_by_idxs(
+        hidden_states,
+        num_global_tokens_per_local_expert_cpu.T.ravel(),
+        restore_output_by_local_experts
+    )
+→ [M_total, H]，恢复为 [TP rank 0 block | TP rank 1 block | ...] 顺序
+```
+
+---
+
+### 7. TP ReduceScatter
+
+```python
+if self.tp_size > 1:
+    hidden_states = reduce_scatter_to_sequence_parallel_region(
+        hidden_states, group=tp_group,
+        input_split_sizes=output_splits_tp.tolist()
+    )
+→ [M_ep_recv, H]
+```
+
+对专家输出在 TP 组内做 ReduceScatter：各 TP rank 持有相同的专家输出，reduce（求和）后 scatter，每个 TP rank 只保留属于自己的 token 片段。
+
+---
+
+### 8. EP AlltoAll（第二次 A2A，逆向）
+
+```python
+all_to_all(ep_group,
+           send=hidden_states,
+           output_splits=input_splits,    # 逆向：原来发多少现在收多少
+           input_splits=output_splits)
+→ permutated_local_input_tokens [num_out_tokens, H]
+```
+
+将专家输出发回各 source EP rank，每个 rank 收回自己原来发出的 token 的专家输出。
+
+---
+
+### 9. Unpermutation 1：还原 token 顺序 + topK 加权求和
+
+```python
+output = unpermute(
+    permutated_local_input_tokens,
+    reversed_local_input_permutation_mapping,
+    restore_shape=hidden_shape_before_permute,
+    probs=self.probs,
+    routing_map=self.routing_map
+)
+→ output [N_local, H]
+```
+
+用 Permutation 1 保存的逆映射，将 token 还原到原始顺序，并对 topK 个专家的输出按 `probs` 加权求和。
+
+最终 `reshape` 回 `[S/TP, B, H]`。
+
+---
+
+## 整体数据流一览
+
+```
+[S/TP, B, H]
+  │
+  ▼ Permutation 1（按 EP rank/expert 排序）
+[num_out_tokens, H]
+  │
+  ▼ EP AlltoAll → 各 EP rank 收到目标 token（仍是 TP 切片）
+[M_ep_recv, H]
+  │
+  ▼ TP AllGather → 补全序列切片，每 TP rank 数据一致
+[M_total, H]
+  │
+  ▼ Permutation 2（按 local expert 连续分组）
+[M_total, H]
+  │
+  ▼ Grouped GEMM（E_local 个专家）
+[M_total, H]
+  │
+  ▼ Unpermutation 2（逆 sort_chunks）
+[M_total, H]
+  │
+  ▼ TP ReduceScatter → 各 TP rank 只保留自己的片段
+[M_ep_recv, H]
+  │
+  ▼ EP AlltoAll（逆向）→ token 回到 source rank
+[num_out_tokens, H]
+  │
+  ▼ Unpermutation 1 + topK 加权求和
+[S/TP, B, H]
+```
+
+---
+
+## 关键设计要点
+
+| 通信             | 作用                                                                                                    |
+| ---------------- | ------------------------------------------------------------------------------------------------------- |
+| EP A2A（正向）   | 将 token 路由到持有目标专家的 EP rank                                                                   |
+| TP AllGather     | 每个 EP rank 内合并 TP 切片，得到完整待计算 token 集；各 TP rank 计算完全相同的专家输出（**冗余计算**） |
+| TP ReduceScatter | 对冗余输出 reduce，并按 SP 切分还给各 TP rank                                                           |
+| EP A2A（逆向）   | 将专家输出归还 source rank                                                                              |
+
+TP 维度的 AllGather + ReduceScatter 是对称的，引入冗余计算但避免了对专家权重做 TP 切分，保持专家计算的完整性。EP 维度的两次 A2A 实现了 token 到专家的路由与归还。
\ No newline at end of file
diff --git a/xtuner/v1/module/dispatcher/__init__.py b/xtuner/v1/module/dispatcher/__init__.py
index 970f18297..710360b94 100644
--- a/xtuner/v1/module/dispatcher/__init__.py
+++ b/xtuner/v1/module/dispatcher/__init__.py
@@ -20,6 +20,7 @@
     PreDispatchResult,
 )
 from .torch_all2all import TorchAll2AllDispatcher
+from .torch_all2all_tpep import TorchAll2AllTPEPDispatcher
 
 
 logger = get_logger()
@@ -31,6 +32,7 @@ def build_dispatcher(
     dispatcher: Literal["deepep", "all2all", "agrs"] | None,
     n_routed_experts: int,
     ep_group: dist.ProcessGroup | None = None,
+    tp_group: dist.ProcessGroup | None = None,
     training_dtype: Literal["bf16", "fp8"] = "bf16",
     generate_dtype: Literal["bf16", "fp8"] = "bf16",
 ) -> DispacherInterface:
@@ -60,7 +62,15 @@ def build_dispatcher(
             generate_dtype=generate_dtype,
         )  # type: ignore
     elif dispatcher == "all2all":
-        assert ep_group is not None, "DeepEPDispatcher requires a non-null process group."
+        assert ep_group is not None, "TorchAll2AllDispatcher requires a non-null ep_group."
+        if tp_group is not None and tp_group.size() > 1:
+            return TorchAll2AllTPEPDispatcher(
+                n_routed_experts=n_routed_experts,
+                ep_group=ep_group,
+                tp_group=tp_group,
+                training_dtype=training_dtype,
+                generate_dtype=generate_dtype,
+            )  # type: ignore[return-value]
         return TorchAll2AllDispatcher(
             n_routed_experts=n_routed_experts,
             process_group=ep_group,
@@ -83,6 +93,7 @@ def build_dispatcher(
     "DispacherInterface",
     "NaiveDispatcher",
     "TorchAll2AllDispatcher",
+    "TorchAll2AllTPEPDispatcher",
     "MoEAGRSDispatcher",
     "build_dispatcher",
     "PreDispatchResult",
diff --git a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
new file mode 100644
index 000000000..d53905afd
--- /dev/null
+++ b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
@@ -0,0 +1,329 @@
+"""TorchAll2AllTPEPDispatcher: EP AlltoAll dispatcher with TP AllGather/ReduceScatter.
+
+Forward data flow (adds two TP collectives around the existing EP dispatcher):
+
+    dispatch_preprocess : permute by expert (each TP rank independently, N_local tokens)
+    dispatch            : EP AlltoAll (each TP rank independently, routing N_local token copies)
+    dispatch_postprocess: TP AllGather → merge TP slices into M_total tokens
+                          then permute by local expert (for grouped GEMM)
+    [Expert GEMM]       : each TP rank computes full expert output (redundant across TP)
+    combine_preprocess  : unpermute back to TP-AllGather order
+                          then TP ReduceScatterMean → restore M_ep_recv per TP rank
+    combine             : EP AlltoAll reverse (each TP rank independently)
+    combine_postprocess : unpermute with topk_weights → [N_local, H] per TP rank
+
+Design rationale (mirrors Megatron MoEAlltoAllTokenDispatcher with TP+EP):
+  - Expert weights are NOT sharded by TP; each TP rank holds a full copy.
+  - TP AllGather before experts and TP ReduceScatterMean after experts form a symmetric pair
+    that keeps the forward values numerically identical to the EP-only case.
+  - ReduceScatterMean (avg reduce) is used so that the redundant expert outputs from all TP
+    ranks reduce back to the original values without a TP-factor scaling in the forward pass.
+  - The backward of ReduceScatterMean (AllGather) and AllGather backward (AllReduce+slice)
+    introduce a 1/TP scaling in the gradient. This is a known design trade-off consistent
+    with the Megatron approach; the model learns to compensate via weight initialisation.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal, cast
+
+import torch
+import torch.distributed as dist
+from typing_extensions import override
+
+from xtuner.v1.ops import permute, unpermute
+
+from . import XTUNER_DISPATCHER_DEBUG
+from .torch_all2all import (
+    TorchAll2AllDispatcher,
+    TorchAll2AllDispatchResult,
+    TorchAll2AllPostDispatchResult,
+    TorchAll2AllPreCombineResult,
+    TorchAll2AllPreDispatchResult,
+    get_backward_hook,
+    get_backward_pre_hook,
+)
+
+
+class TorchAll2AllTPEPPostDispatchResult(TorchAll2AllPostDispatchResult):
+    """Post-dispatch result for TP+EP dispatcher.
+
+    Extends the EP-only result with per-TP-rank token counts needed to perform the
+    TP ReduceScatterMean in ``combine_preprocess``.
+    """
+
+    output_splits_tp: list[int]
+
+
+class _TPAllGather(torch.autograd.Function):
+    """TP AllGather with autograd support.
+
+    Forward : ``all_gather`` across the TP group, concatenating along the token dim.
+    Backward: ``all_reduce`` (SUM) the gradient then slice — equivalent to a reduce-scatter
+              sum in the unequal-size case.  This introduces a 1/TP factor relative to the
+              mathematically exact gradient when computation is redundant across TP ranks,
+              consistent with the Megatron redundant-TP-expert design.
+    """
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        hidden: torch.Tensor,
+        all_sizes: list[int],
+        tp_group: dist.ProcessGroup,
+        tp_size: int,
+        tp_rank: int,
+    ) -> torch.Tensor:
+        chunks = [torch.empty(s, hidden.shape[1], dtype=hidden.dtype, device=hidden.device) for s in all_sizes]
+        dist.all_gather(chunks, hidden.contiguous(), group=tp_group)
+        ctx.tp_group = tp_group
+        ctx.tp_size = tp_size
+        ctx.tp_rank = tp_rank
+        ctx.all_sizes = all_sizes
+        return torch.cat(chunks, dim=0)
+
+    @staticmethod
+    def backward(
+        ctx: Any,
+        grad: torch.Tensor,
+    ) -> tuple[torch.Tensor, None, None, None, None]:
+        grad = grad.contiguous()
+        dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=ctx.tp_group)
+        offset = sum(ctx.all_sizes[: ctx.tp_rank])
+        return grad[offset : offset + ctx.all_sizes[ctx.tp_rank]].clone(), None, None, None, None
+
+
+class _TPReduceScatterMean(torch.autograd.Function):
+    """TP ReduceScatterMean with autograd support.
+
+    Forward : ``all_reduce`` (SUM) / TP_size then slice — equivalent to a mean reduce-scatter.
+              When all TP ranks hold identical tensors (redundant expert computation), this
+              returns the original un-scaled value for each rank's slice.
+    Backward: ``all_gather`` the gradient slices to reconstruct the full gradient tensor,
+              then divide by TP_size (chain rule through the /TP_size division).
+    """
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        hidden: torch.Tensor,
+        all_sizes: list[int],
+        tp_group: dist.ProcessGroup,
+        tp_size: int,
+        tp_rank: int,
+    ) -> torch.Tensor:
+        hidden = hidden.clone()
+        dist.all_reduce(hidden, op=dist.ReduceOp.SUM, group=tp_group)
+        hidden = hidden / tp_size
+        offset = sum(all_sizes[:tp_rank])
+        ctx.tp_group = tp_group
+        ctx.tp_size = tp_size
+        ctx.tp_rank = tp_rank
+        ctx.all_sizes = all_sizes
+        return hidden[offset : offset + all_sizes[tp_rank]].contiguous()
+
+    @staticmethod
+    def backward(
+        ctx: Any,
+        grad_slice: torch.Tensor,
+    ) -> tuple[torch.Tensor, None, None, None, None]:
+        chunks = [
+            torch.empty(s, grad_slice.shape[1], dtype=grad_slice.dtype, device=grad_slice.device)
+            for s in ctx.all_sizes
+        ]
+        dist.all_gather(chunks, grad_slice.contiguous(), group=ctx.tp_group)
+        full_grad = torch.cat(chunks, dim=0) / ctx.tp_size
+        return full_grad, None, None, None, None
+
+
+def _tp_all_gather(
+    hidden: torch.Tensor,
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, list[int]]:
+    """All-gather ``hidden`` across the TP group and return the gathered tensor
+    plus per-rank sizes."""
+    tp_size = tp_group.size()
+    if tp_size == 1:
+        return hidden, [hidden.shape[0]]
+
+    tp_rank = dist.get_rank(group=tp_group)
+    local_size = hidden.new_tensor([hidden.shape[0]], dtype=torch.long)
+    all_sizes_t = hidden.new_empty([tp_size], dtype=torch.long)
+    dist.all_gather_into_tensor(all_sizes_t, local_size, group=tp_group)
+    all_sizes = [int(s) for s in all_sizes_t.tolist()]
+
+    gathered = _TPAllGather.apply(hidden, all_sizes, tp_group, tp_size, tp_rank)
+    return gathered, all_sizes
+
+
+def _tp_reduce_scatter_mean(
+    hidden: torch.Tensor,
+    all_sizes: list[int],
+    tp_group: dist.ProcessGroup,
+) -> torch.Tensor:
+    """Mean-reduce-scatter ``hidden`` across the TP group, returning this
+    rank's slice."""
+    tp_size = tp_group.size()
+    if tp_size == 1:
+        return hidden
+
+    tp_rank = dist.get_rank(group=tp_group)
+    return _TPReduceScatterMean.apply(hidden, all_sizes, tp_group, tp_size, tp_rank)
+
+
+def _tp_all_gather_tokens_per_expert_group(
+    tokens_per_expert_group: torch.Tensor,
+    tp_group: dist.ProcessGroup,
+) -> torch.Tensor:
+    """Gather per-TP expert counts in the same TP-rank order as
+    ``_tp_all_gather``."""
+    tp_size = tp_group.size()
+    if tp_size == 1:
+        return tokens_per_expert_group.unsqueeze(0)
+
+    gathered = tokens_per_expert_group.new_empty((tp_size, *tokens_per_expert_group.shape))
+    dist.all_gather_into_tensor(gathered, tokens_per_expert_group.contiguous(), group=tp_group)
+    return gathered
+
+
+class TorchAll2AllTPEPDispatcher(TorchAll2AllDispatcher):
+    """TP+EP dispatcher: wraps ``TorchAll2AllDispatcher`` with TP AllGather and
+    ReduceScatterMean.
+
+    Overrides only ``dispatch_postprocess`` and ``combine_preprocess``; all other steps
+    (dispatch_preprocess, dispatch, combine, combine_postprocess) are unchanged from the
+    EP-only base class.
+
+    Args:
+        n_routed_experts (int): Total number of routed experts across all EP ranks.
+        ep_group (dist.ProcessGroup): Expert parallel process group.
+        tp_group (dist.ProcessGroup): Tensor parallel process group.
+        training_dtype (str): Dtype for training, ``"bf16"`` or ``"fp8"``.
+        generate_dtype (str): Dtype for generation, ``"bf16"`` or ``"fp8"``.
+    """
+
+    def __init__(
+        self,
+        *,
+        n_routed_experts: int,
+        ep_group: dist.ProcessGroup,
+        tp_group: dist.ProcessGroup,
+        training_dtype: Literal["fp8", "bf16"] = "bf16",
+        generate_dtype: Literal["fp8", "bf16"] = "bf16",
+    ) -> None:
+        super().__init__(
+            n_routed_experts=n_routed_experts,
+            process_group=ep_group,
+            training_dtype=training_dtype,
+            generate_dtype=generate_dtype,
+        )
+        self._tp_group = tp_group
+        self._tp_size = tp_group.size()
+
+    @override
+    def dispatch_postprocess(
+        self,
+        *,
+        pre_dispatched: TorchAll2AllPreDispatchResult,
+        dispatched: TorchAll2AllDispatchResult,
+        async_op: bool = False,
+        decoding: bool = False,
+    ) -> TorchAll2AllTPEPPostDispatchResult:
+        if async_op:
+            # async_op for TP collectives is not yet implemented; fall back to synchronous.
+            assert dispatched["forward_finished_event"] is not None, "Use async_op=True for dispatch!"
+            self.wait_comm_stream(dispatched["forward_finished_event"])
+
+        # TP AllGather: [M_ep_recv, H] → [M_total, H]; also returns per-TP-rank sizes.
+        gathered_hidden, output_splits_tp = _tp_all_gather(
+            dispatched["hidden_states"],
+            tp_group=self._tp_group,
+        )
+
+        # Permute [M_total, H] into local-expert order for grouped GEMM.  Since
+        # TP AllGather concatenates tp0_block | tp1_block | ..., expert counts
+        # must be gathered in the same TP order before building the row labels.
+        gathered_tokens_per_expert_group = _tp_all_gather_tokens_per_expert_group(
+            dispatched["tokens_per_expert_group"],
+            tp_group=self._tp_group,
+        )
+        token_counts = gathered_tokens_per_expert_group.ravel()
+        local_expert_ids = self._expert_ids_per_ep_rank.repeat(self._tp_size)
+        global_input_tokens_local_experts_indices = torch.repeat_interleave(
+            local_expert_ids,
+            token_counts,
+            output_size=gathered_hidden.shape[0],
+        )
+        global_input_tokens, row_ids_map = permute(
+            gathered_hidden,
+            global_input_tokens_local_experts_indices.to(torch.int32),
+        )
+        tokens_per_expert = gathered_tokens_per_expert_group.sum(dim=(0, 1))
+
+        if async_op:
+            assert dispatched["backward_previous_event"] is not None, "Use async_op=True for dispatch!"
+            if global_input_tokens.grad_fn is not None:
+                global_input_tokens.grad_fn.register_hook(
+                    get_backward_hook(
+                        dispatched["backward_previous_event"],
+                        name="TorchAll2AllTPEPDispatcher.dispatch_postprocess",
+                        debug=XTUNER_DISPATCHER_DEBUG,
+                    )
+                )
+
+        if decoding:
+            raise NotImplementedError("Decoding is not yet supported for TorchAll2AllTPEPDispatcher.")
+
+        return TorchAll2AllTPEPPostDispatchResult(
+            hidden_states=global_input_tokens,
+            row_ids_map=row_ids_map,
+            tokens_per_expert=tokens_per_expert,
+            output_splits_tp=output_splits_tp,
+        )
+
+    @override
+    def combine_preprocess(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        pre_dispatched: TorchAll2AllPreDispatchResult,
+        dispatched: TorchAll2AllDispatchResult,
+        post_dispatched: TorchAll2AllPostDispatchResult,
+        async_op: bool = False,
+        decoding: bool = False,
+    ) -> TorchAll2AllPreCombineResult:
+        tpep_post = cast(TorchAll2AllTPEPPostDispatchResult, post_dispatched)
+        # Unpermute [M_total, H] back to TP-AllGather order (tp0_block | tp1_block | ...).
+        hidden_states = unpermute(hidden_states, tpep_post["row_ids_map"])
+
+        # TP ReduceScatterMean: [M_total, H] → [M_ep_recv, H] for this TP rank.
+        hidden_states = _tp_reduce_scatter_mean(
+            hidden_states,
+            all_sizes=tpep_post["output_splits_tp"],
+            tp_group=self._tp_group,
+        )
+
+        if async_op:
+            backward_previous_event = cast(torch.cuda.Event, torch.cuda.Event())
+            forward_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
+            forward_finished_event.record()
+            if hidden_states.grad_fn is not None:
+                hidden_states.grad_fn.register_prehook(
+                    get_backward_pre_hook(
+                        backward_previous_event=backward_previous_event,
+                        name="TorchAll2AllTPEPDispatcher.combine_preprocess",
+                        debug=XTUNER_DISPATCHER_DEBUG,
+                    )
+                )
+        else:
+            backward_previous_event = None
+            forward_finished_event = None
+
+        if decoding:
+            raise NotImplementedError("Decoding is not yet supported for TorchAll2AllTPEPDispatcher.")
+
+        return TorchAll2AllPreCombineResult(
+            hidden_states=hidden_states,
+            backward_previous_event=backward_previous_event,
+            forward_finished_event=forward_finished_event,
+        )

From 111f35e04e07efb09c889d64a7f8820165fc0164 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Tue, 28 Apr 2026 12:12:40 +0000
Subject: [PATCH 04/25] add tp ep demo case with moe block

---
 .../run_validate_moeblock_tpep_vs_single.sh   |  33 ++
 .../validate_moeblock_tpep_vs_single.py       | 374 ++++++++++++++++++
 2 files changed, 407 insertions(+)
 create mode 100644 .dev_scripts/run_validate_moeblock_tpep_vs_single.sh
 create mode 100644 .dev_scripts/validate_moeblock_tpep_vs_single.py

diff --git a/.dev_scripts/run_validate_moeblock_tpep_vs_single.sh b/.dev_scripts/run_validate_moeblock_tpep_vs_single.sh
new file mode 100644
index 000000000..d706ebbf9
--- /dev/null
+++ b/.dev_scripts/run_validate_moeblock_tpep_vs_single.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# 默认使用用户指定的 fla 环境；需要切换时可在命令前覆盖 CONDA_ENV。
+CONDA_ENV="${CONDA_ENV:-fla}"
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate "${CONDA_ENV}"
+
+# 本脚本固定验证 EP=2, TP=2。
+EP_SIZE="${EP_SIZE:-2}"
+TP_SIZE="${TP_SIZE:-2}"
+DP_SIZE="${DP_SIZE:-1}"
+NPROC_PER_NODE="${NPROC_PER_NODE:-$((EP_SIZE * TP_SIZE * DP_SIZE))}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
+MASTER_PORT="${MASTER_PORT:-29532}"
+XTUNER_USE_CUTLASS_GROUP_GEMM="${XTUNER_USE_CUTLASS_GROUP_GEMM:-1}"
+
+# 显式使用当前仓库代码，避免导入 conda 环境或其他目录下安装的 xtuner。
+export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
+export CUDA_VISIBLE_DEVICES
+export EP_SIZE
+export TP_SIZE
+export DP_SIZE
+export XTUNER_USE_CUTLASS_GROUP_GEMM
+
+cd "${REPO_ROOT}"
+torchrun \
+  --nproc-per-node="${NPROC_PER_NODE}" \
+  --master-port="${MASTER_PORT}" \
+  .dev_scripts/validate_moeblock_tpep_vs_single.py
diff --git a/.dev_scripts/validate_moeblock_tpep_vs_single.py b/.dev_scripts/validate_moeblock_tpep_vs_single.py
new file mode 100644
index 000000000..33fc5e557
--- /dev/null
+++ b/.dev_scripts/validate_moeblock_tpep_vs_single.py
@@ -0,0 +1,374 @@
+"""Compare real MoEBlock grouped-GEMM outputs with and without TP+EP.
+
+The TP+EP path uses the same token layout as ``validate_xtuner_tpep_md.py``:
+
+    rank 0 -> (ep=0, tp=0): A0, A1
+    rank 1 -> (ep=0, tp=1): A2, A3
+    rank 2 -> (ep=1, tp=0): B0, B1
+    rank 3 -> (ep=1, tp=1): B2, B3
+
+Rank 0 additionally runs a non-parallel reference over all 8 tokens with a full
+MoEBlock.  Each distributed rank runs the TP+EP dispatcher plus a sharded
+MoEBlock.  The local TP+EP outputs are gathered back to rank 0 and compared
+against the non-parallel reference in global-rank token order.
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+from torch.distributed.tensor import DTensor, Shard, distribute_tensor
+
+# The Triton TMA grouped-GEMM kernel can fail to compile on some local Triton/LLVM
+# combinations.  Use XTuner's Cutlass backend by default while still exercising
+# the real grouped-GEMM operator path.
+os.environ.setdefault("XTUNER_USE_CUTLASS_GROUP_GEMM", "1")
+
+from xtuner.v1.module.decoder_layer.moe_decoder_layer import MoEActFnConfig, MoEBlock
+from xtuner.v1.module.dispatcher.base import NaiveDispatcher
+from xtuner.v1.module.dispatcher.torch_all2all_tpep import TorchAll2AllTPEPDispatcher
+
+
+EP_SIZE = 2
+TP_SIZE = 2
+DEFAULT_DP_SIZE = 1
+N_ROUTED_EXPERTS = 6
+HIDDEN_SIZE = 128
+MOE_INTERMEDIATE_SIZE = 256
+DTYPE = torch.bfloat16
+ATOL = 3e-2
+RTOL = 3e-2
+
+
+@dataclass(frozen=True)
+class RankCase:
+    token_values: tuple[float, ...]
+    topk_ids: tuple[tuple[int, int], ...]
+    topk_weights: tuple[tuple[float, float], ...]
+
+
+@dataclass(frozen=True)
+class ParallelInfo:
+    global_rank: int
+    ep_rank: int
+    tp_rank: int
+    device: torch.device
+    ep_mesh: DeviceMesh
+    ep_group: dist.ProcessGroup
+    tp_group: dist.ProcessGroup
+
+
+CASES: dict[tuple[int, int], RankCase] = {
+    # (ep, tp) -> RankCase(token_values, topk_ids, topk_weights)
+    (0, 0): RankCase(
+        token_values=(10.0, 11.0),
+        topk_ids=((0, 4), (3, 1)),
+        topk_weights=((0.25, 0.75), (0.4, 0.6)),
+    ),
+    (0, 1): RankCase(
+        token_values=(12.0, 13.0),
+        topk_ids=((2, 5), (4, 0)),
+        topk_weights=((0.7, 0.3), (0.8, 0.2)),
+    ),
+    (1, 0): RankCase(
+        token_values=(20.0, 21.0),
+        topk_ids=((1, 3), (4, 2)),
+        topk_weights=((0.2, 0.8), (0.5, 0.5)),
+    ),
+    (1, 1): RankCase(
+        token_values=(22.0, 23.0),
+        topk_ids=((5, 0), (3, 1)),
+        topk_weights=((0.9, 0.1), (0.35, 0.65)),
+    ),
+}
+
+CASE_ORDER = ((0, 0), (0, 1), (1, 0), (1, 1))
+
+
+def main() -> None:
+    try:
+        parallel_info = _init_distributed()
+        full_w1w3, full_w2 = _make_full_weights(parallel_info.device)
+        local_hidden, local_topk_ids, local_topk_weights = _make_local_inputs(parallel_info)
+
+        local_output = _run_tpep_moeblock(
+            parallel_info=parallel_info,
+            hidden_states=local_hidden,
+            topk_ids=local_topk_ids,
+            topk_weights=local_topk_weights,
+            full_w1w3=full_w1w3,
+            full_w2=full_w2,
+        )
+
+        gathered_outputs: list[torch.Tensor] | None = None
+        if parallel_info.global_rank == 0:
+            gathered_outputs = [torch.empty_like(local_output) for _ in range(dist.get_world_size())]
+        dist.gather(local_output.contiguous(), gather_list=gathered_outputs, dst=0)
+
+        if parallel_info.global_rank == 0:
+            assert gathered_outputs is not None
+            parallel_output = torch.cat(gathered_outputs, dim=0)
+            reference_output = _run_single_moeblock_reference(
+                device=parallel_info.device,
+                full_w1w3=full_w1w3,
+                full_w2=full_w2,
+            )
+            _assert_close(parallel_output, reference_output)
+            max_abs_diff = (parallel_output.float() - reference_output.float()).abs().max().item()
+            print(
+                "真实 MoEBlock grouped-GEMM TP+EP 输出与无并行输出一致，"
+                f"max_abs_diff={max_abs_diff:.6e}。"
+            )
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+def _init_distributed() -> ParallelInfo:
+    if not torch.cuda.is_available():
+        raise RuntimeError("真实 MoEBlock TP+EP 校验依赖 CUDA，请在 GPU 上用 torchrun 运行。")
+
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group(backend="nccl")
+
+    ep_size = _get_env_int("EP_SIZE", EP_SIZE)
+    tp_size = _get_env_int("TP_SIZE", TP_SIZE)
+    dp_size = _get_env_int("DP_SIZE", DEFAULT_DP_SIZE)
+    if ep_size != EP_SIZE or tp_size != TP_SIZE:
+        raise RuntimeError("本脚本固定验证 EP=2, TP=2。")
+
+    world_size = dist.get_world_size()
+    if world_size != dp_size * ep_size * tp_size:
+        raise RuntimeError(f"需要 world_size = DP*EP*TP = {dp_size * ep_size * tp_size}，实际为 {world_size}。")
+
+    mesh = init_device_mesh(
+        "cuda",
+        (dp_size, ep_size, tp_size),
+        mesh_dim_names=("dp", "ep", "tp"),
+    )
+    ep_mesh = mesh["ep"]
+    return ParallelInfo(
+        global_rank=dist.get_rank(),
+        ep_rank=ep_mesh.get_local_rank(),
+        tp_rank=mesh["tp"].get_local_rank(),
+        device=torch.device("cuda", local_rank),
+        ep_mesh=ep_mesh,
+        ep_group=ep_mesh.get_group(),
+        tp_group=mesh["tp"].get_group(),
+    )
+
+
+def _make_full_weights(device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
+    generator = torch.Generator(device=device)
+    generator.manual_seed(20260428)
+    w1w3 = torch.randn(
+        N_ROUTED_EXPERTS * 2 * MOE_INTERMEDIATE_SIZE,
+        HIDDEN_SIZE,
+        generator=generator,
+        device=device,
+        dtype=DTYPE,
+    )
+    w2 = torch.randn(
+        N_ROUTED_EXPERTS * HIDDEN_SIZE,
+        MOE_INTERMEDIATE_SIZE,
+        generator=generator,
+        device=device,
+        dtype=DTYPE,
+    )
+    return w1w3 * 0.02, w2 * 0.02
+
+
+def _make_local_inputs(parallel_info: ParallelInfo) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    case = CASES[(parallel_info.ep_rank, parallel_info.tp_rank)]
+    hidden = _make_full_hidden(parallel_info.device)[_local_slice(parallel_info)]
+    hidden[:, 0] = torch.tensor(case.token_values, dtype=DTYPE, device=parallel_info.device)
+    topk_ids = torch.tensor(case.topk_ids, dtype=torch.long, device=parallel_info.device)
+    topk_weights = torch.tensor(case.topk_weights, dtype=torch.float32, device=parallel_info.device)
+    return hidden, topk_ids, topk_weights
+
+
+def _make_full_hidden(device: torch.device) -> torch.Tensor:
+    generator = torch.Generator(device=device)
+    generator.manual_seed(20260429)
+    hidden = torch.randn(len(CASE_ORDER) * 2, HIDDEN_SIZE, generator=generator, device=device, dtype=DTYPE)
+    token_values = [token for key in CASE_ORDER for token in CASES[key].token_values]
+    hidden[:, 0] = torch.tensor(token_values, dtype=DTYPE, device=device)
+    return hidden
+
+
+def _local_slice(parallel_info: ParallelInfo) -> slice:
+    rank_offset = CASE_ORDER.index((parallel_info.ep_rank, parallel_info.tp_rank))
+    start = rank_offset * 2
+    return slice(start, start + 2)
+
+
+def _run_tpep_moeblock(
+    *,
+    parallel_info: ParallelInfo,
+    hidden_states: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    full_w1w3: torch.Tensor,
+    full_w2: torch.Tensor,
+) -> torch.Tensor:
+    dispatcher = TorchAll2AllTPEPDispatcher(
+        n_routed_experts=N_ROUTED_EXPERTS,
+        ep_group=parallel_info.ep_group,
+        tp_group=parallel_info.tp_group,
+        training_dtype="bf16",
+    )
+    experts = _build_moeblock(parallel_info.device, ep_mesh=parallel_info.ep_mesh)
+    _load_weights(experts, full_w1w3, full_w2)
+
+    pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden_states, topk_ids=topk_ids)
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        decoding=False,
+    )
+    post_dispatched = dispatcher.dispatch_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+    )
+    experts_out = experts(
+        post_dispatched["hidden_states"],
+        post_dispatched["tokens_per_expert"],
+        decoding=False,
+    )
+    pre_combined = dispatcher.combine_preprocess(
+        hidden_states=experts_out,
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        decoding=False,
+    )
+    combined = dispatcher.combine(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        decoding=False,
+    )
+    post_combined = dispatcher.combine_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        combined=combined,
+    )
+    return post_combined["hidden_states"]
+
+
+def _run_single_moeblock_reference(
+    *,
+    device: torch.device,
+    full_w1w3: torch.Tensor,
+    full_w2: torch.Tensor,
+) -> torch.Tensor:
+    hidden_states = _make_full_hidden(device)
+    topk_ids = torch.tensor(
+        [topk_id for key in CASE_ORDER for topk_id in CASES[key].topk_ids],
+        dtype=torch.long,
+        device=device,
+    )
+    topk_weights = torch.tensor(
+        [topk_weight for key in CASE_ORDER for topk_weight in CASES[key].topk_weights],
+        dtype=torch.float32,
+        device=device,
+    )
+
+    dispatcher = NaiveDispatcher(n_routed_experts=N_ROUTED_EXPERTS)
+    experts = _build_moeblock(device, ep_mesh=None)
+    _load_weights(experts, full_w1w3, full_w2)
+
+    pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden_states, topk_ids=topk_ids)
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        decoding=False,
+    )
+    post_dispatched = dispatcher.dispatch_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+    )
+    experts_out = experts(
+        post_dispatched["hidden_states"],
+        post_dispatched["tokens_per_expert"],
+        decoding=False,
+    )
+    pre_combined = dispatcher.combine_preprocess(
+        hidden_states=experts_out,
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        decoding=False,
+    )
+    combined = dispatcher.combine(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        decoding=False,
+    )
+    post_combined = dispatcher.combine_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        combined=combined,
+    )
+    return post_combined["hidden_states"]
+
+
+def _build_moeblock(device: torch.device, ep_mesh: DeviceMesh | None) -> MoEBlock:
+    block = MoEBlock(
+        hidden_size=HIDDEN_SIZE,
+        moe_intermediate_size=MOE_INTERMEDIATE_SIZE,
+        n_routed_experts=N_ROUTED_EXPERTS,
+        moe_bias=False,
+        ep_mesh=ep_mesh,
+        float8_cfg=None,
+        moe_act_fn_cfg=MoEActFnConfig(),
+    )
+    return block.to(device=device, dtype=DTYPE).eval()
+
+
+def _load_weights(experts: MoEBlock, full_w1w3: torch.Tensor, full_w2: torch.Tensor) -> None:
+    with torch.no_grad():
+        _copy_weight(experts.fused_w1w3.weight, full_w1w3)
+        _copy_weight(experts.fused_w2.weight, full_w2)
+
+
+def _copy_weight(param: torch.Tensor, full_weight: torch.Tensor) -> None:
+    if isinstance(param, DTensor):
+        param.copy_(distribute_tensor(full_weight, param.device_mesh, [Shard(0)]))
+    else:
+        param.copy_(full_weight)
+
+
+def _assert_close(actual: torch.Tensor, expected: torch.Tensor) -> None:
+    try:
+        torch.testing.assert_close(actual.float(), expected.float(), rtol=RTOL, atol=ATOL)
+    except AssertionError as exc:
+        max_abs_diff = (actual.float() - expected.float()).abs().max().item()
+        raise AssertionError(
+            "真实 MoEBlock grouped-GEMM TP+EP 输出与无并行输出不一致："
+            f"max_abs_diff={max_abs_diff:.6f}, actual_first_col={actual[:, 0].float().tolist()}, "
+            f"expected_first_col={expected[:, 0].float().tolist()}"
+        ) from exc
+
+
+def _get_env_int(name: str, default: int) -> int:
+    value = os.getenv(name)
+    if value is None:
+        return default
+    return int(value)
+
+
+if __name__ == "__main__":
+    main()

From fa8cabdf1faf161f875e1c545890e576d7d32ebe Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Tue, 28 Apr 2026 12:56:09 +0000
Subject: [PATCH 05/25] add tp_mesh into moe model and decoder layer

---
 .../run_test_moe_train_engine_tpep.sh         |  26 ++
 tests/engine/test_moe_train_engine_tpep.py    | 268 ++++++++++++++++++
 xtuner/v1/model/moe/moe.py                    |  69 ++++-
 .../module/decoder_layer/moe_decoder_layer.py |   3 +
 4 files changed, 354 insertions(+), 12 deletions(-)
 create mode 100755 .dev_scripts/run_test_moe_train_engine_tpep.sh
 create mode 100644 tests/engine/test_moe_train_engine_tpep.py

diff --git a/.dev_scripts/run_test_moe_train_engine_tpep.sh b/.dev_scripts/run_test_moe_train_engine_tpep.sh
new file mode 100755
index 000000000..4d1f8811c
--- /dev/null
+++ b/.dev_scripts/run_test_moe_train_engine_tpep.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Run the EP+TP training unit test.
+# Requires 4 GPUs (EP=2 * TP=2 * DP=1).
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+CONDA_ENV="${CONDA_ENV:-fla}"
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate "${CONDA_ENV}"
+
+XTUNER_USE_CUTLASS_GROUP_GEMM="${XTUNER_USE_CUTLASS_GROUP_GEMM:-1}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
+MASTER_PORT="${MASTER_PORT:-29533}"
+
+export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
+export CUDA_VISIBLE_DEVICES
+export XTUNER_USE_CUTLASS_GROUP_GEMM
+
+cd "${REPO_ROOT}"
+python -m pytest \
+  tests/engine/test_moe_train_engine_tpep.py \
+  -v \
+  -x \
+  --no-header
diff --git a/tests/engine/test_moe_train_engine_tpep.py b/tests/engine/test_moe_train_engine_tpep.py
new file mode 100644
index 000000000..c2efec555
--- /dev/null
+++ b/tests/engine/test_moe_train_engine_tpep.py
@@ -0,0 +1,268 @@
+"""Validate that EP+TP training produces the same forward loss and backward
+gradients as a pure single-GPU (EP=1, TP=1) run.
+
+Test topology: world_size = EP * TP * DP = 2 * 2 * 1 = 4 GPUs.
+
+Strategy
+--------
+1. Build a tiny Qwen3MoE model with EP=2, TP=2.
+2. Build the same model with EP=1, TP=1 (4 identical DP replicas).
+3. Init both engines with ``init_model_weights()``.  Because weights for EP+TP
+   models are Shard(0) on ep_mesh for experts and Replicate for non-experts,
+   and ``init_params`` always initialises the *full* tensor before sharding,
+   the underlying full weight values are identical when the same RNG seed is
+   active on all ranks.
+4. Sync expert weights from EP=1 engine to EP=2 engine via DCP so the two
+   models start from the exact same checkpoint.
+5. Run one ``train_step`` + ``clip_grad_norm`` on both engines with the same
+   input.
+6. Assert:
+   - losses agree within tolerance
+   - gate (router) gradients agree within tolerance (non-expert, replicated
+     on all ranks in both configs)
+"""
+
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+
+import parametrize
+import torch
+import torch.distributed as dist
+
+from xtuner._testing import DeterministicDDPTestCase
+from xtuner.v1.config import AdamWConfig, FSDPConfig
+from xtuner.v1.engine.train_engine import TrainEngine
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model.base import ModelItem
+from xtuner.v1.model.moe.moe import SequenceContext
+from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
+from xtuner.v1.utils.device import get_device
+
+DEVICE = get_device()
+
+# Tolerance for bfloat16 numerical differences between the two configs.
+ATOL = 2e-1
+RTOL = 2e-1
+
+# Use a very small model to keep test runtime manageable.
+_TINY_LAYERS = 2
+_SEQ_LEN = 64
+
+
+def _build_tiny_moe_cfg(ep_size: int = 1, tp_size: int = 1) -> Qwen3MoE30BA3Config:
+    return Qwen3MoE30BA3Config(
+        num_hidden_layers=_TINY_LAYERS,
+        ep_size=ep_size,
+        tp_size=tp_size,
+        dispatcher="all2all" if ep_size > 1 else None,
+        compile_cfg=False,
+        # Disable auxiliary losses to keep the comparison clean.
+        balancing_loss_cfg=None,
+        z_loss_cfg=None,
+    )
+
+
+def _build_engine(ep_size: int, tp_size: int) -> TrainEngine:
+    moe_cfg = _build_tiny_moe_cfg(ep_size, tp_size)
+    optim_cfg = AdamWConfig()
+    fsdp_cfg = FSDPConfig(
+        ep_size=ep_size,
+        tp_size=tp_size,
+        cpu_offload=False,
+    )
+    return TrainEngine(model_cfg=moe_cfg, optim_cfg=optim_cfg, fsdp_cfg=fsdp_cfg)
+
+
+def _make_engine_input(device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
+    """Return (input_ids [1, SEQ_LEN-1], shifted_labels [1, SEQ_LEN-1]) on *device*."""
+    torch.manual_seed(12345)
+    full_ids = torch.randint(0, 151936, (1, _SEQ_LEN), dtype=torch.long, device=device)
+    input_ids = full_ids[:, :-1]  # [1, SEQ_LEN-1]
+    labels = full_ids[:, 1:]      # [1, SEQ_LEN-1] already shifted
+    return input_ids, labels
+
+
+def _run_one_step(
+    engine: TrainEngine,
+    loss_cfg: CELossConfig,
+    input_ids: torch.Tensor,
+    labels: torch.Tensor,
+) -> tuple[float, dict[str, torch.Tensor]]:
+    """Run one train step; return (loss_value, {param_name: grad_tensor})."""
+    seq_ctx = SequenceContext.from_input_ids((input_ids,), device=DEVICE)
+    shifted_labels = labels.to(DEVICE)
+
+    LossContext = loss_cfg.loss_ctx_cls
+    loss_ctx = loss_cfg.build(data={"shifted_labels": shifted_labels}, sp_mesh=None)
+    loss_ctx_list = LossContext.build_batches([loss_ctx])
+    loss_ctx = loss_ctx_list[0]
+
+    engine_input = [ModelItem(seq_ctx=seq_ctx, loss_ctx={"lm": loss_ctx})]
+    step_info = engine.train_step(engine_input)
+    engine.clip_grad_norm()
+
+    loss_val: float = step_info["logs_info"]["reduced_llm_loss"]
+
+    # Collect gradients from gate (router) parameters; these are non-expert
+    # parameters replicated on all ranks in both configs, so they're easy to
+    # compare directly.
+    grads: dict[str, torch.Tensor] = {}
+    for name, param in engine.model.named_parameters():
+        if "gate.weight" in name and param.grad is not None:
+            grad = param.grad
+            if hasattr(grad, "full_tensor"):
+                grad = grad.full_tensor()  # type: ignore[attr-defined]
+            grads[name] = grad.detach().float().cpu()
+            break  # one gate layer is sufficient
+
+    return loss_val, grads
+
+
+class TestMoETrainEngineTPEP(DeterministicDDPTestCase):
+    """Verify EP+TP training matches single-GPU (EP=1, TP=1) forward and backward."""
+
+    @parametrize.parametrize(
+        "device,ep_size,tp_size",
+        [
+            ("cuda", 2, 2),
+        ],
+    )
+    def test_tpep_forward_backward_matches_single(
+        self, device: str, ep_size: int, tp_size: int
+    ) -> None:
+        """Loss and gate gradients with EP+TP must match the EP=1, TP=1 baseline."""
+        pg = self.create_pg(device)
+
+        # ------------------------------------------------------------------
+        # Build reference engine: EP=1, TP=1 (world acts as pure DP).
+        # ------------------------------------------------------------------
+        engine_ref = _build_engine(ep_size=1, tp_size=1)
+        engine_ref.init_model_weights()
+
+        # ------------------------------------------------------------------
+        # Build EP+TP engine.
+        # ------------------------------------------------------------------
+        engine_tpep = _build_engine(ep_size=ep_size, tp_size=tp_size)
+        engine_tpep.init_model_weights()
+
+        # ------------------------------------------------------------------
+        # Sync weights: save reference engine, load into EP+TP engine.
+        # DCP handles the translation between different tensor layouts.
+        # ------------------------------------------------------------------
+        tmp: list[str] = [tempfile.mkdtemp() if dist.get_rank() == 0 else ""]
+        dist.broadcast_object_list(tmp, src=0)
+        ckpt_root = Path(tmp[0])
+        model_dir = ckpt_root / "model"
+
+        engine_ref.save_dcp(model_dir=model_dir)
+        dist.barrier()
+        engine_tpep.load_dcp(model_dir=model_dir)
+        dist.barrier()
+
+        # ------------------------------------------------------------------
+        # Prepare shared input (identical on all ranks – no SP).
+        # ------------------------------------------------------------------
+        input_ids, labels = _make_engine_input(torch.device(device, dist.get_rank() % torch.cuda.device_count()))
+        loss_cfg = CELossConfig()
+
+        # Run EP+TP step.
+        loss_tpep, grads_tpep = _run_one_step(engine_tpep, loss_cfg, input_ids, labels)
+
+        # Run reference step.
+        loss_ref, grads_ref = _run_one_step(engine_ref, loss_cfg, input_ids, labels)
+
+        # ------------------------------------------------------------------
+        # Assert losses match.
+        # ------------------------------------------------------------------
+        if dist.get_rank() == 0:
+            self.assertAlmostEqual(
+                loss_tpep,
+                loss_ref,
+                delta=ATOL,
+                msg=f"Loss mismatch: EP+TP={loss_tpep:.6f}, ref={loss_ref:.6f}",
+            )
+
+        # ------------------------------------------------------------------
+        # Assert gate gradients match (key non-expert parameter).
+        # ------------------------------------------------------------------
+        if grads_tpep and grads_ref:
+            for name in grads_ref:
+                if name not in grads_tpep:
+                    continue
+                g_tpep = grads_tpep[name]
+                g_ref = grads_ref[name]
+                if dist.get_rank() == 0:
+                    try:
+                        torch.testing.assert_close(
+                            g_tpep,
+                            g_ref,
+                            atol=ATOL,
+                            rtol=RTOL,
+                        )
+                    except AssertionError as exc:
+                        max_diff = (g_tpep - g_ref).abs().max().item()
+                        raise AssertionError(
+                            f"Gate gradient mismatch for '{name}': "
+                            f"max_abs_diff={max_diff:.4e}, EP+TP shape={g_tpep.shape}, ref shape={g_ref.shape}"
+                        ) from exc
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @parametrize.parametrize(
+        "device,ep_size,tp_size",
+        [
+            ("cuda", 2, 2),
+        ],
+    )
+    def test_tpep_training_stability(self, device: str, ep_size: int, tp_size: int) -> None:
+        """EP+TP training should produce finite losses and decreasing trend."""
+        pg = self.create_pg(device)
+
+        engine = _build_engine(ep_size=ep_size, tp_size=tp_size)
+        engine.init_model_weights()
+
+        input_ids, labels = _make_engine_input(torch.device(device, dist.get_rank() % torch.cuda.device_count()))
+        loss_cfg = CELossConfig()
+
+        losses: list[float] = []
+        for _ in range(4):
+            seq_ctx = SequenceContext.from_input_ids((input_ids,), device=DEVICE)
+            shifted_labels = labels.to(DEVICE)
+            LossContext = loss_cfg.loss_ctx_cls
+            loss_ctx = loss_cfg.build(data={"shifted_labels": shifted_labels}, sp_mesh=None)
+            loss_ctx_list = LossContext.build_batches([loss_ctx])
+            engine_input = [ModelItem(seq_ctx=seq_ctx, loss_ctx={"lm": loss_ctx_list[0]})]
+            step_info = engine.train_step(engine_input)
+            grad_norm = engine.clip_grad_norm()
+            engine.step_optimizer(grad_norm)
+            losses.append(step_info["logs_info"]["reduced_llm_loss"])
+
+        if dist.get_rank() == 0:
+            for i, loss_val in enumerate(losses):
+                self.assertTrue(
+                    torch.isfinite(torch.tensor(loss_val)),
+                    f"Loss at step {i} is not finite: {loss_val}",
+                )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @property
+    def world_size(self) -> int:
+        # EP=2, TP=2, DP=1 → 4 GPUs
+        return 4
+
+    @property
+    def destroy_pg_upon_exit(self) -> bool:
+        return False
diff --git a/xtuner/v1/model/moe/moe.py b/xtuner/v1/model/moe/moe.py
index cb1dcc6fa..9ae7a47c2 100644
--- a/xtuner/v1/model/moe/moe.py
+++ b/xtuner/v1/model/moe/moe.py
@@ -138,6 +138,7 @@ class MoEConfig(TransformerConfig):
     hidden_factor: Annotated[float, Parameter(group="moe")] = 1.0
     moe_intermediate_size: Annotated[int, Parameter(group="moe")]
     ep_size: Annotated[int, Parameter(group="moe")] = 1
+    tp_size: Annotated[int, Parameter(group="moe")] = 1
     dispatcher: Annotated[Literal["deepep", "all2all", "agrs"] | None, Parameter(group="moe")] = None
     router: GreedyRouterConfig | NoAuxRouterConfig
     balancing_loss_cfg: BalancingLossConfig | None = BalancingLossConfig()
@@ -171,18 +172,37 @@ class MoE(BaseModel):
 
     config: MoEConfig
     ep_mesh: DeviceMesh | None = None
+    tp_mesh: DeviceMesh | None = None
 
     def __init__(self, config: MoEConfig):
         super().__init__(config)
         if config.ep_size is not None and config.ep_size > 1:
             world_size = dist.get_world_size()
-            self.ep_mesh = init_device_mesh(
-                DEVICE,
-                (world_size // config.ep_size, config.ep_size),
-                mesh_dim_names=(f"{self.config.mesh_prefix}.dp", f"{self.config.mesh_prefix}.ep"),
-            )[f"{self.config.mesh_prefix}.ep"]
+            tp_size = config.tp_size if config.tp_size > 1 else 1
+            fsdp_size = world_size // (config.ep_size * tp_size)
+            if tp_size > 1:
+                _init_mesh = init_device_mesh(
+                    DEVICE,
+                    (fsdp_size, config.ep_size, tp_size),
+                    mesh_dim_names=(
+                        f"{self.config.mesh_prefix}.dp",
+                        f"{self.config.mesh_prefix}.ep",
+                        f"{self.config.mesh_prefix}.tp",
+                    ),
+                )
+                self.ep_mesh = _init_mesh[f"{self.config.mesh_prefix}.ep"]
+                self.tp_mesh = _init_mesh[f"{self.config.mesh_prefix}.tp"]
+            else:
+                _init_mesh = init_device_mesh(
+                    DEVICE,
+                    (fsdp_size, config.ep_size),
+                    mesh_dim_names=(f"{self.config.mesh_prefix}.dp", f"{self.config.mesh_prefix}.ep"),
+                )
+                self.ep_mesh = _init_mesh[f"{self.config.mesh_prefix}.ep"]
+                self.tp_mesh = None
         else:
             self.ep_mesh = None
+            self.tp_mesh = None
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, type=config.rms_norm_type)
         self.lm_head = LMHead(config.hidden_size, config.vocab_size, bias=False)
@@ -819,6 +839,7 @@ def build_layers(self, config: MoEConfig) -> nn.ModuleDict:
                     layer_idx=layer_idx,
                     dispatcher=config.dispatcher,
                     ep_mesh=self.ep_mesh,
+                    tp_mesh=self.tp_mesh,
                 )
                 if self.config.freeze_routers:
                     layers[str(layer_idx)].gate.requires_grad_(False)
@@ -883,6 +904,7 @@ def build_mtp_block(self, config: MoEConfig) -> MTPBlock:
                 layer_idx=config.num_hidden_layers + i,
                 dispatcher=config.dispatcher,
                 ep_mesh=self.ep_mesh,
+                tp_mesh=self.tp_mesh,
             )
 
             # Wrap decoder layer in MTPLayer
@@ -920,6 +942,7 @@ def fully_shard(
     ) -> Self:
         self.fsdp_config = fsdp_config
         assert self.fsdp_config.ep_size == self.config.ep_size
+        assert self.fsdp_config.tp_size == self.config.tp_size
         self.mp_policy = MixedPrecisionPolicy(
             param_dtype=self.fsdp_config.param_dtype, reduce_dtype=fsdp_config.reduce_dtype
         )
@@ -1075,9 +1098,16 @@ def scale_and_reduce_grad(self):
                 continue
 
             ep_enabled = self.ep_mesh is not None and self.ep_mesh.size() > 1
+            tp_enabled = self.tp_mesh is not None and self.tp_mesh.size() > 1
             # Scale moe parameters
             if ep_enabled and ".experts" in name:
                 param.grad.div_(self.ep_mesh.size())  # type: ignore
+                # Each TP replica computes an identical expert gradient (redundant computation).
+                # Average across TP replicas so the effective update matches single-GPU.
+                if tp_enabled:
+                    grad = param.grad.to_local() if isinstance(param.grad, DTensor) else param.grad
+                    dist.all_reduce(grad, op=ReduceOp.SUM, group=self.tp_mesh.get_group())  # type: ignore
+                    grad.div_(self.tp_mesh.size())  # type: ignore
                 continue
 
             if isinstance(param, DTensor):
@@ -1105,14 +1135,26 @@ def _init_device_mesh(self, fsdp_config: FSDPConfig):
 
         device = DEVICE
         world_size = dist.get_world_size()
-        experts_fsdp_size = world_size // self.fsdp_config.ep_size
+        tp_size = self.config.tp_size if self.config.tp_size > 1 else 1
+        experts_fsdp_size = world_size // (self.fsdp_config.ep_size * tp_size)
 
         if self.fsdp_config.hsdp_sharding_size is None:
-            model_mesh = init_device_mesh(
-                device,
-                (experts_fsdp_size, self.fsdp_config.ep_size),
-                mesh_dim_names=(f"{self.config.mesh_prefix}.fsdp", f"{self.config.mesh_prefix}.ep"),
-            )
+            if tp_size > 1:
+                model_mesh = init_device_mesh(
+                    device,
+                    (experts_fsdp_size, self.fsdp_config.ep_size, tp_size),
+                    mesh_dim_names=(
+                        f"{self.config.mesh_prefix}.fsdp",
+                        f"{self.config.mesh_prefix}.ep",
+                        f"{self.config.mesh_prefix}.tp",
+                    ),
+                )
+            else:
+                model_mesh = init_device_mesh(
+                    device,
+                    (experts_fsdp_size, self.fsdp_config.ep_size),
+                    mesh_dim_names=(f"{self.config.mesh_prefix}.fsdp", f"{self.config.mesh_prefix}.ep"),
+                )
             self._world_mesh = model_mesh
             if self.ep_mesh is not None:
                 # WARN: This assertion is **VERY** important.
@@ -1174,10 +1216,13 @@ def _init_device_mesh(self, fsdp_config: FSDPConfig):
             self.fsdp_mesh = self.hsdp_mesh[f"{self.config.mesh_prefix}.hsdp_shard"]
 
     def _replicate_other_params(self, model: nn.Module):
-        def traverse(module):
+        def traverse(module: nn.Module) -> None:
             if isinstance(module, MoEBlock):
+                # Expert params are already Shard(0) on ep_mesh (from build_grouped_linear).
+                # Gradient averaging across TP replicas is handled in scale_and_reduce_grad.
                 return
             for name, param in module.named_parameters(recurse=False):
+                assert self.ep_mesh is not None
                 dist_param = nn.Parameter(
                     distribute_tensor(param, self.ep_mesh, [Replicate()]), requires_grad=param.requires_grad
                 )
diff --git a/xtuner/v1/module/decoder_layer/moe_decoder_layer.py b/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
index b5972e8e7..80e8986bb 100644
--- a/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
+++ b/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
@@ -215,6 +215,7 @@ def __init__(
         layer_idx: int = 0,
         dispatcher: Literal["deepep", "all2all", "agrs"] | None,
         ep_mesh: DeviceMesh | None = None,
+        tp_mesh: DeviceMesh | None = None,
     ):
         super().__init__()
         self.ep_mesh = ep_mesh
@@ -273,10 +274,12 @@ def __init__(
         )
         # TODO: (yehaochen) Maybe should be replaced by build_dispatcher
         process_group = ep_mesh.get_group() if ep_mesh is not None else None
+        tp_group = tp_mesh.get_group() if tp_mesh is not None else None
         self.dispatcher = build_dispatcher(
             dispatcher=dispatcher,
             n_routed_experts=n_routed_experts,
             ep_group=process_group,
+            tp_group=tp_group,
             training_dtype="fp8" if float8_cfg is not None else "bf16",
             generate_dtype=generate_config.dtype if generate_config is not None else "bf16",
         )

From 30c3fd9e94bd530129b79a63e079619edcc5731f Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Tue, 12 May 2026 15:30:52 +0000
Subject: [PATCH 06/25] add more backgroud docs for parallel training

---
 tp_general.md                           |  32 ++
 xtuner_ep.md => xtuner_ep_dispatcher.md |   0
 xtuner_ep_domino.md                     | 601 ++++++++++++++++++++++++
 xtuner_fsdp_ep.md                       | 488 +++++++++++++++++++
 xtuner_fsdp_loss_grad_norm.md           | 312 ++++++++++++
 5 files changed, 1433 insertions(+)
 create mode 100644 tp_general.md
 rename xtuner_ep.md => xtuner_ep_dispatcher.md (100%)
 create mode 100644 xtuner_ep_domino.md
 create mode 100644 xtuner_fsdp_ep.md
 create mode 100644 xtuner_fsdp_loss_grad_norm.md

diff --git a/tp_general.md b/tp_general.md
new file mode 100644
index 000000000..30a899732
--- /dev/null
+++ b/tp_general.md
@@ -0,0 +1,32 @@
+
+TP原理是矩阵乘的权重按列或者按行放在不同的卡上。
+具体的计算流程： 
+以TP=2为例，输入X [N1, D_h] 先经过AllGather得到 [N1+N2, D_h]，
+再经过ColumnParallelLinear（即原始权重 A 按列分 [A1, A2]，每个 rank 一份权重，比如 rank0 持有 A1 权重）的矩阵乘法得到中间分片的Y，
+再经过RowParallelLinear（即原始权重 B 按行分，每个 rank 一份权重，比如 rank0 持有 B1 权重）的矩阵乘法，
+然后做reduce scatter得到最终输出Z。
+
+这在数学上和普通MLP是等价的
+
+$$
+\begin{bmatrix}
+X1 \\
+X2
+\end{bmatrix}  \times 
+\begin{bmatrix}
+A1 & A2
+\end{bmatrix}
+= 
+\begin{bmatrix}
+Y1 & Y2
+\end{bmatrix}
+$$
+
+$$
+\begin{bmatrix}
+Y1 & Y2
+\end{bmatrix} \times \begin{bmatrix}
+B1 \\
+B2
+\end{bmatrix} = Y1B1+Y2B2 
+$$
diff --git a/xtuner_ep.md b/xtuner_ep_dispatcher.md
similarity index 100%
rename from xtuner_ep.md
rename to xtuner_ep_dispatcher.md
diff --git a/xtuner_ep_domino.md b/xtuner_ep_domino.md
new file mode 100644
index 000000000..d26b0eac7
--- /dev/null
+++ b/xtuner_ep_domino.md
@@ -0,0 +1,601 @@
+# XTuner 中 Domino EP 的原理和实现
+
+本文只梳理当前 XTuner 已有实现，重点解释 `intra_layer_micro_batch=2` 时，
+`MoE._micro_batch_forward` 和 `MoEDecoderLayer._micro_batch_forward` 如何把 MoE 层中的 EP 通信拆出来，
+并用异步通信和 autograd hook 在前向/反向中尝试做计算通信重叠。
+
+相关背景：
+
+- EP 单个 micro batch 的 dispatch/combine 数据流见 `xtuner_ep_dispatcher.md`。
+- TP 中专家权重切分和 TP collectives 的背景见 `TP.md`。
+- Domino 论文（https://arxiv.org/html/2409.15241v1）的核心思想是把一个 batch 沿无依赖维度切成多个独立片段，
+  再把这些片段的通信和计算流水起来，从而隐藏通信开销。XTuner 这里采用的是面向 MoE EP 的变种：
+  切的是 layer 内的 micro batch，通信对象从 TP AllReduce 变成 EP dispatch/combine。
+
+## 1. 原版 Domino 论文中的 TP 流程和实现
+
+原版 Domino 主要针对 dense Transformer 的 TP AllReduce。论文把 self-attention 和 MLP 都抽象成两段线性计算：
+
+```text
+X -> A -> B -> AllReduce
+```
+
+在 Megatron-LM 风格 TP 中，`A` 做 column parallel，`B` 做 row parallel。每个 TP rank 持有一份
+`A_i` 和 `B_i`，本地计算得到一份 partial output，最后通过 AllReduce 恢复完整输出。每个 transformer block
+里 self-attention 和 MLP 在前向各有一次 AllReduce，反向也各有一次 AllReduce，所以 TP 通信天然在关键路径上。
+
+Domino 的做法不是改变 TP 的数学等价性，而是在原 TP 切分之上再切出更小的、彼此无依赖的计算单元，
+然后把这些计算单元和 AllReduce 流水起来。
+
+### 1.1 输入 batch 维 row split
+
+第一种切法是在输入 `X` 的 batch 维切分。假设切成两块：
+
+```text
+X = [X0; X1]
+```
+
+因为 batch 维之间没有数据依赖，MLP 的 GeMM、element-wise 激活/dropout，以及 attention 中按 sequence 维做的
+softmax，都可以分别在 `X0` 和 `X1` 上独立计算。前向可以调度成：
+
+```text
+compute stream:
+  attn(X0)  launch AllReduce(attn0)  attn(X1)  launch AllReduce(attn1)
+  LN/dropout/residual(X0, X1)
+  mlp(X0)   launch AllReduce(mlp0)   mlp(X1)   launch AllReduce(mlp1)
+
+comm stream:
+            AllReduce(attn0) ----->            AllReduce(attn1) ----->
+                                           AllReduce(mlp0) ----->      AllReduce(mlp1) ----->
+```
+
+这里的重点是：
+
+- `AllReduce(attn0)` 可以和 `attn(X1)` 重叠。
+- `AllReduce(attn1)` 可以和后面的 layernorm、dropout、residual 等本地算子重叠。
+- `AllReduce(mlp0)` 可以和 `mlp(X1)` 重叠。
+- `AllReduce(mlp1)` 可以和下一层中 `X0` 的计算重叠，因此 row split 同时提供 intra-layer 和 inter-layer 重叠。
+
+论文中提到，batch 维 row split 的通信隐藏比例可以接近 100%。但切得太细会让单个 GeMM 变窄，影响 kernel
+效率，所以实际 partition 数需要通过 benchmark/grid search 选。
+
+### 1.2 权重 `B` 的 column split
+
+第二种切法是在第二段权重 `B` 的输出列维切分。假设 `B` 切成两块：
+
+```text
+B = [B0, B1]
+```
+
+本地可以先算第一半输出，再异步启动这半输出的 AllReduce，同时计算第二半输出：
+
+```text
+compute stream:
+  Y0 = hidden @ B0  launch AllReduce(Y0)  Y1 = hidden @ B1  launch AllReduce(Y1)  concat(Y0, Y1)
+
+comm stream:
+                    AllReduce(Y0) ----->                  AllReduce(Y1) ----->
+```
+
+这种切法的总通信量和原始 TP 一样，因为只是把同一个输出 hidden 维拆成多个 piece 后分别 AllReduce。
+但它有一个同步边界：下一层或后续算子需要完整 hidden 维，所以必须等所有 piece 都完成并拼回完整输出。
+因此 weight column split 主要提供 intra-layer 重叠，不像 input row split 那样自然跨层流水。
+
+实现上，论文没有直接依赖 `torch.cat()` 频繁拼接；它预分配大 buffer，把各个 piece 写到对应位置，以减少额外
+GPU 内存分配和 OOM 风险。论文报告这种切法通常隐藏 50% 到 70% 的通信。
+
+### 1.3 hybrid split
+
+第三种是 hybrid split：同时在输入 batch 维切 `X`，并在第二段权重输出列维切 `B`。这样能得到更细粒度的
+计算通信流水，同时保持总通信量不变。
+
+hybrid 的依赖继承自 `B` 的 column split：row 维上仍然没有跨 chunk 同步，但 hidden 维 piece 最终必须 concat，
+所以整体更偏向 intra-layer 重叠。论文把它作为大模型上的实用方案，因为只切 batch 或只切 hidden 都可能让
+kernel shape 太窄。
+
+### 1.4 反向和工程实现
+
+反向大体按前向的相反顺序执行，但 Domino 额外利用两个重叠窗口：
+
+1. 跨 batch chunk 的重叠：例如一个 chunk 的梯度 AllReduce 和另一个 chunk 的本地反向计算重叠。
+2. 同一个 chunk 内的 sub-module 重叠：把输入梯度 matmul 和权重梯度 matmul 分开，先启动输入梯度相关通信，
+   同时继续计算权重梯度。
+
+论文没有手写完整 backward，因为绕开 PyTorch autograd 会损失高效 kernel。它使用一个 no-op module 保存前向
+阶段的异步通信 handle，并在反向图中控制通信何时等待完成。这样既保留 autograd 生成的 kernel，又能把等待点放到
+真正消费梯度之前。
+
+此外，Domino 还用固定数量的全局 CUDA streams 承载独立计算单元，避免从 stream pool 反复取 stream 的开销。
+配合 `torch.compile()`、CUDA Graph 等优化，可以减少切成小 kernel 后的 launch bubble。
+
+## 2. 原始 EP MoE 的关键路径
+
+对单个 micro batch，一个 MoE decoder layer 的主路径是：
+
+```text
+attention + gate
+  -> dispatch_preprocess      # 本地按 expert 排序
+  -> dispatch                 # EP all2all，把 token copy 发到 expert 所在 rank
+  -> dispatch_postprocess     # 接收端再按 local expert 排序
+  -> experts grouped GEMM
+  -> combine_preprocess       # 恢复 all2all receive 顺序
+  -> combine                  # EP all2all，把 expert 输出送回 source rank
+  -> combine_postprocess      # 按 topK weight 合并回 token
+  -> residual / shared expert
+```
+
+如果完全同步执行，两个 EP all2all 都在本层关键路径上：
+
+```text
+pre_moe -> dispatch_comm -> expert_compute -> combine_comm -> post_moe
+```
+
+这里的 `dispatch_comm` 必须先完成，接收端才能跑本地专家；`combine_comm` 必须完成，source rank 才能得到
+本层 MoE 输出。所以单个 micro batch 内部很难把这两段通信藏在自己的后续计算后面。
+
+## 3. XTuner 的 Domino EP 切分单位
+
+训练引擎在 `intra_layer_micro_batch > 1` 时，每次从 `data_batches` 中取出多个 `seq_ctx`：
+
+```text
+seq_ctx_list = [seq_ctx0, seq_ctx1]
+loss_ctx_list = [loss_ctx0, loss_ctx1]
+output = model(seq_ctx=seq_ctx_list, loss_ctx=loss_ctx_list)
+loss.backward()
+```
+
+模型侧 `xtuner/v1/model/moe/moe.py::MoE._micro_batch_forward` 做两件事：
+
+1. MoE 层之前的 dense 层仍然在 concat 后的大 batch 上执行。
+2. 进入第一层 MoE 后，把 hidden states 沿 batch/sequence 维切回两个 micro batch：
+
+```text
+hidden_states_list = [hidden0, hidden1]
+```
+
+后续每一层 MoE decoder layer 都以这两个独立 hidden state 为输入：
+
+```text
+layer_results = decoder_layer(
+    hidden0,
+    hidden1,
+    position_embeddings=[pos0, pos1],
+    seq_ctx=[seq_ctx0, seq_ctx1],
+)
+```
+
+这就是 XTuner 里 Domino EP 的基本独立性来源：`seq_ctx0` 和 `seq_ctx1` 在同一层的 attention、gate、EP dispatch、
+expert、combine 都是数学上互不依赖的。实现上不改变路由结果和专家计算，只改变两个 micro batch 的调度顺序。
+
+## 4. 单层内的前向调度
+
+核心代码在 `xtuner/v1/module/decoder_layer/moe_decoder_layer.py::MoEDecoderLayer._micro_batch_forward`。
+设 `mb0 = seq_ctx_list[0]`，`mb1 = seq_ctx_list[1]`，当前实现的前向调度可以分成 5 段。
+
+### 4.1 先完成两个 micro batch 的 pre-MoE
+
+第一段循环依次处理 `mb0` 和 `mb1`：
+
+```text
+mb0: attention + residual + post_attention_layernorm + gate
+mb0: dispatch_preprocess(async_op=True)
+
+mb1: attention + residual + post_attention_layernorm + gate
+mb1: dispatch_preprocess(async_op=True)
+```
+
+`dispatch_preprocess` 仍是本地操作，主要是按 expert 对 token copy 做 `permute`，生成：
+
+```text
+pre_dispatched["hidden_states"]
+pre_dispatched["row_id_map"]
+pre_dispatched["topk_ids"]
+```
+
+当 `async_op=True` 时，它额外记录两个事件：
+
+- `forward_finished_event`：在当前 compute stream 上记录，表示本地 pre-dispatch 已经完成。
+- `backward_previous_event`：留给反向使用，表示 dispatch backward 的通信完成点。
+
+注意：当前代码没有在 `mb0` pre-dispatch 后立刻启动 `mb0` 的 dispatch all2all，而是先继续做 `mb1` 的
+attention/gate/pre-dispatch。因此这一步主要完成输入切片和前向事件准备。
+
+### 4.2 再依次做 dispatch、expert、combine_preprocess
+
+第二段循环依次处理两个 micro batch：
+
+```text
+mb0: dispatch(async_op=True)            # 在 dispatcher 的 comm stream 上发起 EP all2all
+mb0: dispatch_postprocess(async_op=True) # compute stream 等 dispatch 完成，再本地重排
+mb0: experts grouped GEMM
+mb0: combine_preprocess(async_op=True)  # 本地 unpermute，准备 combine all2all
+
+mb1: dispatch(async_op=True)
+mb1: dispatch_postprocess(async_op=True)
+mb1: experts grouped GEMM
+mb1: combine_preprocess(async_op=True)
+```
+
+对 `TorchAll2AllDispatcher`，`dispatch(async_op=True)` 会调用 `_AsyncDispatch`：
+
+```text
+comm_stream.wait_event(pre_dispatched.forward_finished_event)
+EP all2all
+forward_finished_event.record(comm_stream)
+```
+
+随后 `dispatch_postprocess(async_op=True)` 会在当前 compute stream 等待这个 `forward_finished_event`。
+也就是说，当前实现保证同一个 micro batch 的 expert 计算一定在 dispatch all2all 完成后开始。
+
+`combine_preprocess(async_op=True)` 是本地重排：
+
+```text
+experts_out --unpermute(row_ids_map)--> pre_combined["hidden_states"]
+```
+
+并记录一个新的 `forward_finished_event`，表示 combine 的输入已经准备好。
+
+### 4.3 批量发起两个 combine all2all
+
+第三段循环只负责发起通信，不立刻做最终 postprocess：
+
+```text
+mb0: combine(async_op=True)  # 在 comm stream 上发起回程 EP all2all
+mb1: combine(async_op=True)
+```
+
+对 `TorchAll2AllDispatcher`，`combine(async_op=True)` 会调用 `_AsyncCombine`：
+
+```text
+comm_stream.wait_event(pre_combined.forward_finished_event)
+EP all2all
+forward_finished_event.record(comm_stream)
+```
+
+这里是前向中最明确的流水点：两个 `combine` 都先被挂到独立 comm stream 上，当前 compute stream 可以继续往下执行。
+
+### 4.4 combine 通信期间计算 shared experts
+
+如果配置了 shared experts，代码会在 `combine` 已经发起后，计算两个 micro batch 的 shared expert：
+
+```text
+mb0: shared_experts(pre_moe_forward_out0)
+mb1: shared_experts(pre_moe_forward_out1)
+```
+
+因此前向中可见的主要重叠是：
+
+```text
+comm stream   : combine(mb0) -> combine(mb1)
+compute stream: shared_expert(mb0) -> shared_expert(mb1)
+```
+
+如果 `n_shared_experts=0`，这一段为空，`combine` 之后会很快进入 `combine_postprocess` 的等待，前向可隐藏的
+通信就会少很多。
+
+### 4.5 等 combine 完成并做 post-MoE
+
+最后一段依次完成两个 micro batch：
+
+```text
+mb0: combine_postprocess(async_op=True)
+mb0: _post_moe_forward(...)
+
+mb1: combine_postprocess(async_op=True)
+mb1: _post_moe_forward(...)
+```
+
+`combine_postprocess(async_op=True)` 会先让 compute stream 等待 `combine.forward_finished_event`，再做：
+
+```text
+combined["hidden_states"]
+  --unpermute(pre_dispatched["row_id_map"], probs=topk_weights)-->
+post_combined["hidden_states"]
+```
+
+这一步把 `[N * topK, hidden]` 的 expert 输出按最初的 topK token copy 顺序 gather 回来，乘以
+`topk_weights` 后对 topK 求和，恢复成 `[N, hidden]`。随后 `_post_moe_forward` 加上 shared expert 输出和
+residual，得到本层输出。
+
+## 5. `intra_layer_micro_batch=2` 的前向时间线
+
+这一节不能简单理解成“CPU 先调用什么，GPU 就一定先执行什么”。CUDA kernel/collective 的 launch 只是把操作放进
+某个 stream 的队列：
+
+- 同一个 stream 内部保持 FIFO 顺序。
+- 不同 stream 之间没有天然先后关系。
+- 跨 stream 的先后只由 `cudaEventRecord` / `cudaStreamWaitEvent` 这类 event 操作建立。
+
+因此，下面更准确地分成两层：CPU 侧调用顺序，以及 CUDA stream 上由 event 建立的偏序。
+表中的 `wait x` 表示 CPU 在对应 CUDA stream 上插入 `cudaStreamWaitEvent(x)`，不是 CPU 阻塞等待
+这个 event 完成。
+
+### 5.1 图一：CPU/host 侧顺序
+
+`MoEDecoderLayer._micro_batch_forward` 在 host 侧大致按下面顺序调用：
+表中加粗的 `A/D/E/C/S` 是相对耗时大的主算子，后续时间线主要围绕它们观察重叠。
+
+
+| CPU/host 操作                                                                                               |
+| ------------------------------------------------------------------------------------------------------------- |
+| **`A0`** -> `Dpre0` -> `record Fa0`                                                                         |
+| **`A1`** -> `Dpre1` -> `record Fa1`                                                                         |
+| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`     |
+| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`     |
+| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                      |
+| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                      |
+| **`S0`** -> **`S1`**                                                                                        |
+| `wait Fd0` -> `Cpost0`                                                                                      |
+| `wait Fd1` -> `Cpost1`                                                                                      |
+
+其中：
+
+- `A{i}`：第 `i` 个 micro batch 的 attention + gate，即 `_pre_moe_forward`。
+- `Dpre{i}`：`dispatch_preprocess`，本地 permute。
+- `D{i}`：`dispatch`，EP all2all。
+- `Dpost{i}`：`dispatch_postprocess`，接收端本地按 local expert 重排。
+- `E{i}`：本地 experts grouped GEMM。
+- `Cpre{i}`：`combine_preprocess`，本地 unpermute。
+- `C{i}`：`combine`，EP all2all。
+- `S{i}`：shared experts；如果 `n_shared_experts=0`，这一段不存在。
+- `Cpost{i}`：`combine_postprocess + _post_moe_forward`。
+- `Fa{i}`：`Dpre{i}` 在 compute stream 上完成后记录，`D{i}` 在 comm stream 上等待它。
+- `Fb{i}`：`D{i}` 在 comm stream 上完成后记录，`Dpost{i}` 在 compute stream 上等待它。
+- `Fc{i}`：`Cpre{i}` 在 compute stream 上完成后记录，`C{i}` 在 comm stream 上等待它。
+- `Fd{i}`：`C{i}` 在 comm stream 上完成后记录，`Cpost{i}` 在 compute stream 上等待它。
+
+这里的 `wait Fa0 -> D0 -> record Fb0; wait Fb0 -> Dpost0 -> E0 -> Cpre0 -> record Fc0` 是 CPU 连续调用；
+`Dpost0` 内部会先在 compute stream 上发起
+`wait Fb0`，所以 GPU 上的 `Dpost0/E0/Cpre0` 仍必须等 comm stream 上的 `D0` 完成。`D1` 同理。
+
+但这个 host 顺序不能直接当作 GPU 执行顺序。例如 CPU 上先在 compute stream 上发起 `A1/Dpre1`，再在
+comm stream 上发起 `D0`，并不意味着 `D0` 一定在 `A1/Dpre1` 之后执行。`D0` 只需要等待 `Dpre0` 后记录的
+event；如果 `Dpre0` 已完成，而 `A1/Dpre1` 还在 compute stream 中排队或执行，`D0` 就可能和
+`A1/Dpre1` 重叠。
+
+### 5.2 图二：CUDA stream 上的实际依赖顺序
+
+对 `TorchAll2AllDispatcher`，CUDA 侧更接近下面这张图。这里画的是 event 约束下的一种典型执行偏序，
+不是一个所有机器都完全相同的绝对时间轴。
+
+`record Fa0` 表示在 compute stream 上记录 `mb0` 的 `dispatch_preprocess.forward_finished_event`，
+`wait Fa0` 表示 comm stream 等这个 event。其他 event 同理。
+
+
+| 计算 stream                                                                       | 通信 stream                                  |
+| ----------------------------------------------------------------------------------- | ---------------------------------------------- |
+| **`A0`**                                                                          |                                              |
+| `Dpre0` -> `record Fa0`                                                           |                                              |
+| **`A1`**                                                                          | `wait Fa0` -> **`D0`** -> `record Fb0`      |
+| `Dpre1` -> `record Fa1`                                                           |                                              |
+| `wait Fb0` -> `Dpost0`                                                            |                                              |
+| **`E0`** -> `Cpre0` -> `record Fc0`                                               | `wait Fa1` -> **`D1`** -> `record Fb1`      |
+| `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`                     | `wait Fc0` -> **`C0`** -> `record Fd0`      |
+| **`S0`** -> **`S1`**                                                              | `wait Fc1` -> **`C1`** -> `record Fd1`      |
+| `wait Fd0` -> `Cpost0`                                                            |                                              |
+| `wait Fd1` -> `Cpost1`                                                            |                                              |
+
+同一行两列表示这两个 stream 上的操作可以重叠；长通信可能延续到后面的行。每一行到下一行的顺序只表达同一
+stream FIFO 或 event 约束能保证的偏序。为避免表格过长，主算子和紧邻的 event `record/wait` 写在同一个
+单元格里，单元格内部按左到右顺序执行。
+
+如果没有 shared experts，则 compute stream 中的 **`S0`** -> **`S1`** 为空，`record Fc1` 后会直接进入 `wait Fd0`。
+
+从这个依赖图可以看出：
+
+- `D0` 只依赖 `Fa0`，不依赖 `Fa1`。所以即使 CPU 是在 `A1/Dpre1` launch 之后才调用 `dispatch(mb0)`，
+  CUDA 上 `D0` 仍然可以在 `A1/Dpre1` 完成前开始。
+- `D1` 依赖 `Fa1`，并且因为和 `D0` 在同一个 comm stream 上，所以不能越过 `D0`。一旦 `D0` 完成且 `Fa1`
+  已记录，`D1` 可以和 compute stream 上的 `E0/Cpre0` 重叠。
+- `C0` 只依赖 `Fc0`，不依赖 `Fc1`。虽然 CPU 是在两个 micro batch 的 `Cpre` 都调用完以后才进入
+  `combine` 循环，但 CUDA 上 `C0` 可以在 `Dpost1/E1/Cpre1` 完成前执行，因为 `Fc0` 早在 `Cpre0` 后就记录了。
+- `C1` 依赖 `Fc1`，并且在同一个 comm stream 上排在 `C0` 后面。它可以和 **`S0`**/**`S1`**、甚至 `Cpost0` 的一部分重叠；
+  `Cpost1` 必须等 `Fd1`。
+
+因此，前向的重叠不应理解成一条严格线性的时间轴，而应理解成 event 约束下的跨 stream 流水：
+
+- `dispatch` 的 `D0` 可以覆盖 `A1/Dpre1`，`D1` 可以覆盖 `E0/Cpre0`。
+- `combine` 的 `C0` 可以覆盖 `Dpost1/E1/Cpre1`，`C1` 还可以覆盖 shared expert 和后续 postprocess 的一部分。
+- 当前代码仍会在 `dispatch_postprocess` / `combine_postprocess` 处插入 compute stream 对对应通信完成 event 的等待，
+  所以每个 micro batch 真正消费通信结果前仍有明确同步点。
+- 这种实现仍保留了 Domino 的关键前提：两个 micro batch 沿 batch/sequence 维独立，通信和计算可以用事件显式串依赖。
+
+### 5.3 图三：CPU 与 CUDA stream 合并表
+
+下表第一列是严格 CPU 时间轴，行内容和 5.1 的单列表一致。第二、三列展示这一 CPU 步之后，
+compute/comm stream 上已经允许出现的操作。某个 GPU 操作可以出现在其 CPU 行之后的后续行；
+这样才能表达 CUDA 异步执行导致的计算通信重叠。
+
+
+| CPU/host 严格时间轴                                                                                         | 计算 stream                                                                       | 通信 stream                                  |
+| ------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | ---------------------------------------------- |
+| **`A0`** -> `Dpre0` -> `record Fa0`                                                                         |                                                                                   |                                              |
+| **`A1`** -> `Dpre1` -> `record Fa1`                                                                         | **`A0`** -> `Dpre0` -> `record Fa0`                                               |                                              |
+| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`     | **`A1`** -> `Dpre1` -> `record Fa1`                                               | `wait Fa0` -> **`D0`** -> `record Fb0`      |
+| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`     | `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`                    | `wait Fa1` -> **`D1`** -> `record Fb1`      |
+| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                      | `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`                    | `wait Fc0` -> **`C0`** -> `record Fd0`      |
+| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                      |                                                                                   |                                              |
+| **`S0`** -> **`S1`**                                                                                        | **`S0`** -> **`S1`**                                                              | `wait Fc1` -> **`C1`** -> `record Fd1`      |
+| `wait Fd0` -> `Cpost0`                                                                                      | `wait Fd0` -> `Cpost0`                                                            |                                              |
+| `wait Fd1` -> `Cpost1`                                                                                      | `wait Fd1` -> `Cpost1`                                                            |                                              |
+
+## 6. 反向中的事件链
+
+反向不在 `MoEDecoderLayer._micro_batch_forward` 里手写循环，而是通过 dispatcher 的 autograd `Function` 和
+hook 串起依赖。以 `TorchAll2AllDispatcher` 为例，前向 `async_op=True` 会布置四类事件：
+
+```text
+dispatch_preprocess.forward_finished_event
+dispatch.backward_previous_event
+combine_preprocess.forward_finished_event
+combine.backward_previous_event
+```
+
+它们在反向中的含义和前向相反：
+
+1. `combine_postprocess` 的 backward hook 在当前 compute stream 上记录 `combine.backward_previous_event`，
+   表示 `combine` 的反向通信输入梯度已经准备好。
+2. `_AsyncCombine.backward` 在 comm stream 上等待 `combine.backward_previous_event`，
+   然后执行 forward combine 的反向 all2all；完成后记录 `combine_preprocess.backward_previous_event`。
+3. `combine_preprocess` 的 backward pre-hook 让当前 compute stream 等
+   `combine_preprocess.backward_previous_event`，确保 expert 输出梯度已经从 comm stream 回来，然后才继续专家反向。
+4. `dispatch_postprocess` 的 backward hook 在 expert 反向结束后记录 `dispatch.backward_previous_event`。
+5. `_AsyncDispatch.backward` 在 comm stream 上等待这个事件，执行 forward dispatch 的反向 all2all；
+   完成后记录 `dispatch_preprocess.backward_previous_event`。
+6. `dispatch_preprocess` 的 backward pre-hook 等 `dispatch_preprocess.backward_previous_event`，
+   然后才把梯度传回 pre-MoE 的 attention/gate 部分。
+
+反向单个 micro batch 的依赖关系可以写成：
+
+```text
+grad Cpost
+  -> combine_postprocess backward
+  -> [comm stream] combine backward all2all
+  -> combine_preprocess backward
+  -> experts backward
+  -> dispatch_postprocess backward
+  -> [comm stream] dispatch backward all2all
+  -> dispatch_preprocess backward
+  -> pre_moe backward
+```
+
+## 7. `intra_layer_micro_batch=2` 的反向重叠
+
+反向同样不能只看 CPU/autograd 的调用顺序。autograd engine 在 host 上访问到某个 backward node 时，只是向当前
+compute stream 或 dispatcher 的 comm stream 继续写入待执行操作。真正的 GPU 先后关系仍然由同 stream FIFO 和
+event 决定。
+本节表格里的 `wait Ba*` / `wait Bb*` / `wait Bc*` / `wait Bd*` 也表示向 CUDA stream 插入 event wait，
+不表示 host 线程同步等待。
+
+下面用一个例子画图：假设 autograd 先处理 `mb1` 的 combine 反向，再处理 `mb0` 的 combine 反向。
+如果 autograd 实际遍历顺序相反，comm stream 上同类通信的排队顺序也会相反。
+
+### 7.1 图一：CPU/autograd 侧顺序
+
+CPU/autograd 侧看到的是 backward node 的遍历顺序：
+表中加粗的 `A/D/E/C/S` 同样表示反向中相对耗时大的主算子。
+
+
+| CPU/autograd 操作示例                                                                                                     |
+| ---------------------------------------------------------------------------------------------------------------------------- |
+| `Cpost1_bwd` -> `record Bd1`; `wait Bd1` -> **`C1_bwd`** -> `record Bc1`                                                   |
+| `Cpost0_bwd` -> `record Bd0`; `wait Bd0` -> **`C0_bwd`** -> `record Bc0`                                                   |
+| `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`; `wait Bb1` -> **`D1_bwd`** -> `record Ba1`     |
+| `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`; `wait Bb0` -> **`D0_bwd`** -> `record Ba0`     |
+| `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                                                  |
+| `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                                                  |
+
+其中：
+
+- `Ba{i}` 和前向 `Fa{i}` 对应：`D{i}_bwd` 在 comm stream 上完成后记录，`Dpre{i}_bwd` 在 compute stream 上等待它。
+- `Bb{i}` 和前向 `Fb{i}` 对应：`Dpost{i}_bwd` 在 compute stream 上完成后记录，`D{i}_bwd` 在 comm stream 上等待它。
+- `Bc{i}` 和前向 `Fc{i}` 对应：`C{i}_bwd` 在 comm stream 上完成后记录，`Cpre{i}_bwd` 在 compute stream 上等待它。
+- `Bd{i}` 和前向 `Fd{i}` 对应：`Cpost{i}_bwd` 在 compute stream 上完成后记录，`C{i}_bwd` 在 comm stream 上等待它。
+
+这张图仍然只是 CPU 发起顺序，不等价于 CUDA 实际执行顺序。比如 CPU 先发起 `C1_bwd`，后发起某些
+compute stream 上的 `Cpost0_bwd`，只要 `Bd1` 已经被记录，`C1_bwd` 就可以在 `Cpost0_bwd` 还没完成时开始。
+
+### 7.2 图二：CUDA stream 上的实际依赖顺序
+
+在上述 autograd 发起顺序下，CUDA 侧更接近下面这张 event 依赖图：
+
+
+| 计算 stream                                                                                         | 通信 stream                                  |
+| ------------------------------------------------------------------------------------------------------- | ---------------------------------------------- |
+| `Cpost1_bwd` -> `record Bd1`                                                                          |                                              |
+| `Cpost0_bwd` -> `record Bd0`                                                                          | `wait Bd1` -> **`C1_bwd`** -> `record Bc1`  |
+| `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`                              | `wait Bd0` -> **`C0_bwd`** -> `record Bc0`  |
+| `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`                              | `wait Bb1` -> **`D1_bwd`** -> `record Ba1`  |
+| `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                            | `wait Bb0` -> **`D0_bwd`** -> `record Ba0`  |
+| `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                            |                                              |
+
+同一行两列表示可重叠窗口；长通信可能延续到后面的行。每个 `wait Ba*` / `wait Bc*` 都位于对应
+`record Ba*` / `record Bc*` 同一行或之后，每个 `wait Bb*` / `wait Bd*` 都位于对应
+`record Bb*` / `record Bd*` 同一行或之后。为避免表格过长，主算子和紧邻
+的 event `record/wait` 写在同一个单元格里，单元格内部按左到右顺序执行。
+
+上图只表达 event 约束下的一种可能执行。两个 micro batch 之间没有额外的显式 event 依赖，除了共享同一条
+`comm_stream`，因此通信在 comm stream 上按发起顺序串行执行。这个发起顺序由 autograd 实际遍历到
+backward node 的顺序决定，不能仅凭 `hidden0, hidden1` 的返回顺序推断。若 autograd 先发起 `mb0` 的
+`C0_bwd`，再发起 `mb1` 的 `C1_bwd`，则 comm stream 上会变成 `C0_bwd -> C1_bwd`。
+
+### 7.3 图三：前向/反向六列对齐视图
+
+下表把 5.3 的前向三列表和 7.2 的反向 stream 表放在一起。前三列按前向实际时间正序排列；
+后三列把反向 GPU 时间线按实际执行的相反方向排列，并尽量让第 2/3 列和第 5/6 列的主算子在同一行：
+**`A`** 对 **`A_bwd`**，**`D`** 对 **`D_bwd`**，**`E`** 对 **`E_bwd`**，**`C`** 对 **`C_bwd`**。
+第 4 列是反向 CPU/autograd 的对应阶段，它相对第 1 列整体滞后一行；第 4 列内部仍保持“对应前向阶段”的顺序。
+
+注意：第 5/6 列是反向实际执行顺序的反向视图，所以其中 event 的 `wait/record` 在视觉上可能和 7.2 的正向
+反向时间线相反；严格 event 约束以 7.2 为准。
+
+
+| 前向 CPU/host 严格时间轴                                                                                      | 前向计算 stream                                                                  | 前向通信 stream                                 | 反向 CPU/autograd 对应阶段（滞后）                                                                                                               | 反向计算 stream（逆序，对齐前向 GPU）                                                         | 反向通信 stream（逆序，对齐前向 GPU）             |
+| -------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | --------------------------------------------------- |
+| **`A0`** -> `Dpre0` -> `record Fa0`                                                                         |                                                                                  |                                                 |                                                                                                                                                   |                                                                                               |                                                   |
+| **`A1`** -> `Dpre1` -> `record Fa1`                                                                         | **`A0`** -> `Dpre0` -> `record Fa0`                                             |                                                 | `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                                                                       | `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                   |                                                   |
+| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`      | **`A1`** -> `Dpre1` -> `record Fa1`                                             | `wait Fa0` -> **`D0`** -> `record Fb0`         | `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                                                                       | `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                   | `wait Bb0` -> **`D0_bwd`** -> `record Ba0`       |
+| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`      | `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`                  | `wait Fa1` -> **`D1`** -> `record Fb1`         | `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`; `wait Bb0` -> **`D0_bwd`** -> `record Ba0`                            | `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`                | `wait Bb1` -> **`D1_bwd`** -> `record Ba1`       |
+| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                      | `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`                  | `wait Fc0` -> **`C0`** -> `record Fd0`         | `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`; `wait Bb1` -> **`D1_bwd`** -> `record Ba1`                            | `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`                | `wait Bd0` -> **`C0_bwd`** -> `record Bc0`       |
+| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                      |                                                                                  |                                                 | `wait Bd0` -> **`C0_bwd`** -> `record Bc0`                                                                                                     |                                                                                               |                                                   |
+| **`S0`** -> **`S1`**                                                                                        | **`S0`** -> **`S1`**                                                            | `wait Fc1` -> **`C1`** -> `record Fd1`         | `wait Bd1` -> **`C1_bwd`** -> `record Bc1`                                                                                                     | **`S*_bwd`**                                                                                | `wait Bd1` -> **`C1_bwd`** -> `record Bc1`       |
+| `wait Fd0` -> `Cpost0`                                                                                      | `wait Fd0` -> `Cpost0`                                                          |                                                 | `S*_bwd`，如果开启 shared experts                                                                                                                | `Cpost0_bwd` -> `record Bd0`                                                               |                                                   |
+| `wait Fd1` -> `Cpost1`                                                                                      | `wait Fd1` -> `Cpost1`                                                          |                                                 | `Cpost0_bwd` -> `record Bd0`                                                                                                                    | `Cpost1_bwd` -> `record Bd1`                                                               |                                                   |
+|                                                                                                                |                                                                                  |                                                 | `Cpost1_bwd` -> `record Bd1`                                                                                                                    |                                                                                               |                                                   |
+
+shared experts 的反向本地计算没有在上面的 EP dispatcher event 链中单独展开；如果开启 `n_shared_experts`，
+`S*_bwd` 也是 compute stream 上的耗时计算，能否覆盖某段 EP 通信取决于 autograd 对 shared 分支和 MoE 分支的实际调度。
+
+重叠的关键也在 event：
+
+- 如果 `Bd1` 已经在 compute stream 上记录，而 compute stream 后面还排着 `Cpost0_bwd` 或其他本地反向计算，
+  那么 comm stream 上的 `C1_bwd` 可以在这些后续 compute 操作完成前开始。
+- compute stream 只有走到 `Cpre1_bwd` 前的 pre-hook 时，才会等待 `Bc1`。因此 `C1_bwd` 的等待点靠近
+  梯度真正被消费的位置，而不是通信发起位置。
+- `D{i}_bwd` 同理：它等待 `Bb{i}`，但 pre-MoE 的反向只在 `dispatch_preprocess` 的 backward pre-hook 处等待 `Ba{i}`。
+- 由于 `C0_bwd/C1_bwd/D0_bwd/D1_bwd` 都在同一条 comm stream 上，较早排队且尚未满足 event 的通信会挡住
+  后面通信，后面的通信不能绕过它。这也是判断实际重叠时必须看 event 和 stream 队列的原因。
+
+这里的重叠来自两点：
+
+- comm stream 上的反向 EP all2all 不阻塞 CPU 继续构建/执行其他 autograd 节点。
+- compute stream 只在 `combine_preprocess` / `dispatch_preprocess` 的 backward pre-hook 处等待对应事件，
+  等待位置尽量靠近梯度真正被消费的地方。
+
+因此，反向比前向更依赖 autograd 图的调度，但事件链的目标很明确：把 `combine` 和 `dispatch` 的反向通信从
+compute stream 中剥离出来，让它们尽可能和另一个 micro batch 的本地反向计算重叠。
+
+## 8. TP+EP 情况下的差异
+
+当同时打开 TP 和 EP 时，`build_dispatcher` 会选择 `TorchAll2AllTPEPDispatcher`。它继承 EP-only 的
+`dispatch_preprocess`、`dispatch`、`combine`、`combine_postprocess`，只改两处：
+
+1. `dispatch_postprocess`：EP all2all 后先做 TP AllGather，把同一 EP rank 上不同 TP rank 的 token slice 拼成
+   `[M_total, hidden]`，再按 local expert 排序给 grouped GEMM。
+2. `combine_preprocess`：expert 输出先按 local expert 的 `row_ids_map` unpermute 回 TP AllGather 顺序，再做
+   TP ReduceScatterSum，恢复每个 TP rank 自己的 `[M_ep_recv, hidden]`，最后进入 EP combine all2all。
+
+专家权重本身由 `GroupedLinear` 按 TP 切分：
+
+- `fused_w1w3` 是 column parallel。
+- `fused_w2` 是 row parallel。
+
+需要注意的是，当前 TPEP dispatcher 的 TP AllGather / ReduceScatterSum 仍是同步实现；`async_op=True` 只复用
+EP all2all 的事件链。也就是说，Domino EP 的异步重叠主要作用在 EP dispatch/combine 上，TP collectives 还没有
+被同样地放到独立通信 stream 中流水。
+
+## 9. 小结
+
+XTuner 当前 Domino EP 实现可以概括为：
+
+- 用 `intra_layer_micro_batch` 把一个 layer 的输入沿 batch/sequence 维切成多个独立 micro batch。
+- 模型级 `MoE._micro_batch_forward` 负责在进入 MoE 层后维护 `hidden_states_list`，逐层调用 decoder layer 的
+  micro-batch forward。
+- 层级 `MoEDecoderLayer._micro_batch_forward` 负责重新排列单层内两个 micro batch 的 attention/gate、EP
+  dispatch、expert、combine、shared expert、postprocess。
+- dispatcher 的 `async_op=True` 负责把 EP all2all 放到独立 comm stream，并用 CUDA event 和 autograd hook
+  维持正确依赖。
+- 前向重叠需要按 event 判断：`D0` 可覆盖 `A1/Dpre1`，`D1` 可覆盖 `E0/Cpre0`，`C0/C1` 可覆盖后续
+  compute；但每个 micro batch 在 `dispatch_postprocess` / `combine_postprocess` 消费通信结果前仍会等待。
+- 反向通过 `_AsyncDispatch.backward`、`_AsyncCombine.backward` 和 backward hook，把 dispatch/combine 的反向
+  all2all 延后到梯度准备好后异步发起，并只在梯度消费点等待，从而给两个 micro batch 之间的反向计算通信重叠留下空间。
diff --git a/xtuner_fsdp_ep.md b/xtuner_fsdp_ep.md
new file mode 100644
index 000000000..ee48d7b31
--- /dev/null
+++ b/xtuner_fsdp_ep.md
@@ -0,0 +1,488 @@
+# XTuner FSDP + EP 机制说明
+
+本文说明 XTuner v1 MoE 模型中 FSDP 和 EP 如何配合。EP dispatcher 内部的 token
+排序、all2all、combine 细节已经在 `xtuner_ep_dispatcher.md` 中展开，本文只说明这些 dispatcher
+步骤在 FSDP 并行体系中的位置和边界。
+
+主要代码入口：
+
+- `xtuner/v1/model/moe/moe.py`
+- `xtuner/v1/module/decoder_layer/moe_decoder_layer.py`
+- `xtuner/v1/module/grouped_linear/moe_group_linear.py`
+- `xtuner/v1/module/dispatcher/torch_all2all.py`
+
+## 1. 并行维度
+
+记：
+
+```text
+world_size = 全部训练 rank 数
+EP         = ep_size
+FSDP       = world_size / EP
+E          = n_routed_experts
+E_local    = E / EP
+```
+
+FSDP + EP 的核心约定是：
+
+- EP 维负责专家归属，不同 EP rank 拥有不同 routed experts。
+- FSDP 维负责数据并行和参数切分，同一个 EP rank 列上的 FSDP ranks 拥有同一批专家的不同 FSDP shard。
+- 非 expert 参数在 EP 维是 replicated，在 FSDP 维由 FSDP shard。
+- expert 参数在 EP 维是 sharded，在 FSDP 维继续被 FSDP shard。
+
+例如 `world_size=8, EP=4` 时，`FSDP=2`，FSDP 模式下的 root mesh 逻辑上是：
+
+```text
+mesh shape = (FSDP=2, EP=4)
+
+          ep0   ep1   ep2   ep3
+fsdp0      0     1     2     3
+fsdp1      4     5     6     7
+```
+
+对应的通信组：
+
+```text
+EP group:
+  fsdp0 行: [0, 1, 2, 3]
+  fsdp1 行: [4, 5, 6, 7]
+
+FSDP group:
+  ep0 列: [0, 4]
+  ep1 列: [1, 5]
+  ep2 列: [2, 6]
+  ep3 列: [3, 7]
+```
+
+也就是说，dispatcher 的 all2all 只发生在同一 FSDP 数据副本内部的 EP group
+里；FSDP 的参数 all-gather / reduce-scatter 只发生在同一 EP rank 对应的 FSDP
+group 里。
+
+## 2. mesh 建立
+
+### 2.1 `MoE.__init__` 先建立 EP mesh
+
+`MoE.__init__` 在 `config.ep_size > 1` 时先建立一个用于 MoE 模块构造的 mesh：
+
+```python
+fsdp_size = world_size // config.ep_size
+init_device_mesh(DEVICE, (fsdp_size, config.ep_size), mesh_dim_names=("*.dp", "*.ep"))
+```
+
+这一阶段虽然变量名叫 `fsdp_size`，但 mesh 维度名是 `*.dp`。它的作用主要是让模型在
+FSDP 之前也能拿到 EP group：
+
+- `GroupedLinear` 构造 expert 参数时要知道 `ep_mesh`。
+- `MoEDecoderLayer` 构造 dispatcher 时要传入 `ep_mesh.get_group()`。
+- 推理或非 FSDP 运行也可以直接使用这个 EP mesh。
+
+### 2.2 `fully_shard()` 重新建立 FSDP root mesh
+
+训练引擎会在 meta device 上构造模型，然后调用：
+
+```python
+model = model.fully_shard(fsdp_cfg)
+```
+
+`MoE.fully_shard()` 首先要求：
+
+```python
+fsdp_config.ep_size == model.config.ep_size
+```
+
+然后 `_init_device_mesh()` 建立真正的 FSDP root mesh：
+
+```python
+model_mesh = init_device_mesh(
+    DEVICE,
+    (FSDP, EP),
+    mesh_dim_names=("*.fsdp", "*.ep"),
+)
+self.fsdp_mesh = model_mesh["*.fsdp"]
+self.ep_mesh = model_mesh["*.ep"]
+```
+
+这里有一个关键细节：模型在 `__init__` 中已经创建过旧的 `ep_mesh`，而 FSDP 要求参与
+组合的 submesh 来自同一个 root mesh。`_init_device_mesh()` 会从新的 `model_mesh`
+中访问同名 EP submesh，并检查它和旧 `ep_mesh` 的 rank layout 完全一致，然后把
+`self.ep_mesh` 绑定到新的 submesh。这样 FSDP 看到的是同一个 root mesh 下的
+`fsdp` 和 `ep` 维。
+
+当前代码中 HSDP 与 EP 不同时支持：
+
+```python
+assert fsdp_config.ep_size == 1, "Currently, HSDP requires expert parallel size to be 1"
+```
+
+## 3. 参数切分
+
+参数可以分为 expert 参数和非 expert 参数。
+
+### 3.1 expert 参数：EP shard 后再 FSDP shard
+
+routed experts 位于 `MoEBlock`：
+
+```text
+MoEBlock.experts.fused_w1w3
+MoEBlock.experts.fused_w2
+```
+
+它们由 `build_grouped_linear()` 创建。`GroupedLinear.__init__` 先构造全局排布的融合权重：
+
+```text
+fused_w1w3.weight: [E * 2 * moe_intermediate_size, hidden_size]
+fused_w2.weight:   [E * hidden_size, moe_intermediate_size]
+```
+
+如果 `ep_mesh.size() > 1`，权重会被：
+
+```python
+distribute_tensor(weight, ep_mesh, [Shard(0)])
+```
+
+因为 dim0 按 expert 连续排布，`Shard(0)` 等价于按专家范围切分。每个 EP rank 只保留：
+
+```text
+E_local = E / EP
+local_expert_start = ep_rank * E_local
+local_expert_end   = local_expert_start + E_local
+```
+
+本地 shape 变成：
+
+```text
+fused_w1w3.weight local: [E_local * 2 * moe_intermediate_size, hidden_size]
+fused_w2.weight local:   [E_local * hidden_size, moe_intermediate_size]
+```
+
+随后 `MoE.fully_shard()` 对每个 decoder layer 调用 FSDP `fully_shard()`。因此 expert
+参数的逻辑布局是：
+
+```text
+EP 维:   Shard(0), 不同 EP rank 拥有不同专家
+FSDP 维: Shard(0), 同一批本地专家的参数继续被 FSDP 切分
+```
+
+前向时，FSDP 在 FSDP group 内 all-gather 当前 layer 的本地专家参数；`GroupedLinear.forward()`
+再通过：
+
+```python
+weight = self.weight.to_local() if isinstance(self.weight, DTensor) else self.weight
+weight = weight.view(-1, self.local_out_features, self.local_in_features)
+```
+
+把当前 rank 可见的本地 expert 权重交给 grouped GEMM。
+
+### 3.2 非 expert 参数：EP replicated 后再 FSDP shard
+
+非 expert 参数包括：
+
+- embedding、final norm、lm head
+- attention、layer norm
+- router gate
+- shared experts, 如果 `n_shared_experts > 0`
+
+这些参数不是按 expert 归属切开的。开启 EP 时，`MoE.fully_shard()` 会先调用：
+
+```python
+self._replicate_other_params(self)
+```
+
+该函数递归遍历模型，但遇到 `MoEBlock` 会直接返回，因为 routed expert 参数已经由
+`GroupedLinear` 负责 EP 切分。其余参数会被替换为：
+
+```python
+distribute_tensor(param, self.ep_mesh, [Replicate()])
+```
+
+然后再由 FSDP 在 `fsdp_mesh` 上切分。逻辑布局是：
+
+```text
+EP 维:   Replicate(), 每个 EP rank 都有同一份逻辑参数
+FSDP 维: Shard(0), FSDP 负责参数分片和梯度同步
+```
+
+router gate 也属于这一类。每个 EP rank 都要用完整 gate 权重计算对全部 `E` 个专家的
+logits，这样 `topk_ids` 才是全局 expert id，dispatcher 才能按 global expert id 把
+token 发到正确的 EP rank。
+
+### 3.3 FSDP 包裹顺序
+
+`MoE.fully_shard()` 的大致顺序是：
+
+1. 初始化 FSDP/EP mesh。
+2. 必要时把可训练参数转成 fp32 参数。
+3. EP 开启时复制非 expert 参数到 EP 维。
+4. 按 layer 逐个调用 `_fully_shard()`，可按 `recompute_ratio` 加 checkpoint wrapper。
+5. 对相邻 layer 设置 FSDP forward prefetch。
+6. 分别 shard `embed_tokens`、`norm`、`lm_head`。
+7. 最后对 root model 调用一次 `_fully_shard()`。
+8. 对 embedding patch forward，让 DTensor weight 先 `to_local()` 再进入 `F.embedding()`。
+9. `_to_empty_meta()` 只物化本 rank 需要的 local shard。
+
+这种顺序的目标是：构造阶段可以在 meta device 上完成，真正占显存的是 FSDP/EP 切分后的
+本地 shard。
+
+## 4. HF 权重加载与保存
+
+`BaseModel._init_load_spec()` 在模型初始化末尾执行。对 MoE 来说，这发生在 EP 参数已经
+由 `GroupedLinear` 切好之后、FSDP 切分之前。
+
+因此 load spec 表达的是“EP 切分后、FSDP 切分前”的参数布局。后续 FSDP 再根据
+`self.fsdp_mesh` 做第二次 slicing。
+
+### 4.1 fused expert 权重
+
+Qwen3 MoE 的 HF 权重是逐 expert 保存的：
+
+```text
+experts.{i}.gate_proj.weight
+experts.{i}.up_proj.weight
+experts.{i}.down_proj.weight
+```
+
+XTuner 内部为了 grouped GEMM 使用融合参数：
+
+```text
+fused_w1w3.weight
+fused_w2.weight
+```
+
+`Qwen3MoE.to_hf_key_list()` 会把一个融合参数映射到多个 HF key。开启 EP 后，
+`_init_load_spec()` 看到 expert 参数是 `Shard(0)` DTensor，会根据当前 EP rank 的
+global offset 只保留本地专家对应的 HF keys。
+
+开启 FSDP 后，`_load_fused_hf_param()` 再根据：
+
+```python
+compute_local_shape_and_global_offset(load_spec.shape, self.fsdp_mesh, [Shard(0)])
+```
+
+计算本 FSDP rank 在 EP-local 参数中的 dim0 范围，只加载和拷贝这一段。代码里明确要求：
+
+```python
+assert load_spec.dim == self.FSDP_SHARD_DIM
+```
+
+也就是当前只支持 FSDP 和专家并行都沿同一个维度切 fused expert 参数。
+
+### 4.2 非 expert 权重
+
+非 expert 参数通常只有一个 HF key。EP 维是 `Replicate()`，所以每个 EP rank 逻辑上加载同一份
+参数；FSDP 再按本 rank 的 local offset 取 dim0 shard。
+
+保存 HF 时，fused 参数和普通参数分开处理。fused expert 参数可以由多个 rank 分摊保存，
+普通 replicated 参数只需要避免重复写同一个 HF key。
+
+## 5. 前向流程
+
+下面只描述 FSDP 与 EP 的交界，不展开 dispatcher 内部 token 排列。dispatcher 细节见
+`xtuner_ep_dispatcher.md`。
+
+### 5.1 layer 进入前
+
+每个 FSDP-wrapped module 前向时，FSDP 会在对应 `fsdp_mesh` group 内 all-gather 当前
+module 的参数。对一个 MoE decoder layer 来说：
+
+- attention、norm、gate 等非 expert 参数是 EP replicated + FSDP sharded。
+- routed expert 参数是 EP sharded + FSDP sharded。
+
+所以当前 layer 前向开始时，本 rank 可以使用：
+
+- 本 EP rank 对应的完整 local experts 参数。
+- 本 EP rank 上 replicated 的非 expert 参数。
+
+这里的“完整”只是在当前 FSDP group 内 all-gather 后完整，不表示跨 EP 收集了所有专家。
+
+### 5.2 `_pre_moe_forward`
+
+`MoEDecoderLayer._pre_moe_forward()` 做三件事：
+
+1. input layernorm。
+2. self attention。
+3. post attention layernorm + gate。
+
+gate 在每个 EP rank 上都会计算完整的 `[N, E]` router logits，并输出：
+
+```text
+topk_ids:     [N, K], global expert id
+topk_weights: [N, K]
+```
+
+因为 gate 参数是 EP replicated，所以同一个输入 token 在不同 EP rank 上看到的是同一套
+router 参数。
+
+### 5.3 dispatcher
+
+之后进入 dispatcher：
+
+```python
+pre_dispatched = dispatcher.dispatch_preprocess(...)
+dispatched = dispatcher.dispatch(...)
+post_dispatched = dispatcher.dispatch_postprocess(...)
+```
+
+FSDP + EP 下需要注意两点：
+
+- dispatcher 使用的是 `ep_mesh.get_group()`，只在同一 FSDP 行内做 EP 通信。
+- dispatcher 只搬 activation 和 routing 信息，不搬 expert 参数。
+
+经过 `dispatch_postprocess()` 后，每个 EP rank 得到的 hidden states 都已经按本地
+experts 排好，并提供：
+
+```text
+post_dispatched["hidden_states"]:     [M_local, hidden_size]
+post_dispatched["tokens_per_expert"]: [E_local]
+```
+
+这里的 `E_local` 正好和当前 EP rank 持有的 local experts 数一致。
+
+### 5.4 local experts grouped GEMM
+
+`MoEBlock.forward()` 只计算本 EP rank 的 local experts：
+
+```python
+gate_up_out = self.fused_w1w3(x, tokens_per_expert, decoding)
+out = self.moe_act(gate_up_out, split_dim=-1)
+res = self.fused_w2(out, tokens_per_expert, decoding)
+```
+
+`GroupedLinear.forward()` 取本地权重：
+
+```python
+weight = self.weight.to_local() if isinstance(self.weight, DTensor) else self.weight
+```
+
+然后按：
+
+```text
+weight:            [E_local, out_features, in_features]
+tokens_per_expert: [E_local]
+```
+
+调用 grouped GEMM。由于 dispatcher 已经保证输入按 local expert 连续分组，grouped GEMM
+不需要再跨 EP 通信。
+
+### 5.5 combine 和 layer 输出
+
+expert 输出再经过：
+
+```python
+pre_combined = dispatcher.combine_preprocess(...)
+combined = dispatcher.combine(...)
+post_combined = dispatcher.combine_postprocess(...)
+```
+
+被送回 token 的 source EP rank，并按 `topk_weights` 合并回 `[N, hidden_size]`。这部分
+的行号映射和 all2all 反向 split 见 `xtuner_ep_dispatcher.md`。
+
+如果有 shared experts，它们是非 routed dense MLP，属于 EP replicated + FSDP sharded 参数，
+在本 rank 本地计算，不经过 dispatcher。最后：
+
+```python
+hidden_states = (routed_out + shared_out) * hidden_factor + residual
+```
+
+## 6. 反向流程
+
+反向可以看成前向的逆序。
+
+### 6.1 activation 梯度
+
+`combine_postprocess -> combine -> combine_preprocess` 的 autograd 会把 source token 上的
+梯度送回 expert 输出所在的 EP rank。随后 grouped GEMM 计算：
+
+- 对输入 activation 的梯度。
+- 对当前 EP rank local expert 参数的梯度。
+
+接着 `dispatch_postprocess -> dispatch -> dispatch_preprocess` 的 autograd 再把 activation
+梯度送回原 token 所在 rank。
+
+dispatcher 内部 all2all 的反向通信仍然只在 EP group 内发生，具体顺序见 `xtuner_ep_dispatcher.md`。
+
+### 6.2 expert 参数梯度
+
+expert 参数不是 EP replicated 参数。每个 EP rank 只拥有自己那段 experts，所以不能对
+expert 参数在 EP 维 all-reduce。
+
+FSDP 会在同一 EP rank 列对应的 FSDP group 内对 expert 参数梯度做 reduce-scatter。
+这会聚合不同 FSDP 数据副本上同一批 local experts 的梯度。
+
+在 `TrainEngine.clip_grad_norm()` 开始时会调用：
+
+```python
+self.model.scale_and_reduce_grad()
+```
+
+`MoE.scale_and_reduce_grad()` 对 expert 参数有特殊逻辑：
+
+```python
+if ep_enabled and ".experts" in name:
+    param.grad.div_(self.ep_mesh.size())
+    continue
+```
+
+它只除以 `EP`，不做 EP all-reduce。原因是 expert 参数在 EP 维不是同一个参数的多个副本；
+不同 EP rank 上是不同专家。这里的除法用于抵消全局 loss/backward 在 EP 维带来的重复缩放，
+而不是同步专家参数。
+
+### 6.3 非 expert 参数梯度
+
+非 expert 参数在 EP 维是 replicated。FSDP backward 已经处理了 FSDP 维的梯度同步，但
+EP 维上的多个 replicas 仍然需要得到一致梯度。
+
+`scale_and_reduce_grad()` 会检查 DTensor placement 中的 `Replicate()` 维度，并在这些维度
+上执行平均 all-reduce：
+
+```python
+grad.div_(replicate_world_size)
+dist.all_reduce(grad, ReduceOp.SUM, group=replicate_group)
+```
+
+因此：
+
+- router、attention、norm、embedding、lm head 等 replicated 参数在 EP ranks 上保持一致更新。
+- expert 参数不经过这个分支，因为前面已经按 `".experts"` 单独处理。
+
+### 6.4 grad norm 和 clip
+
+所有 micro-batch 都 backward 完之后，训练流程才进入：
+
+```python
+grad_norm = engine.clip_grad_norm()
+engine.step_optimizer(grad_norm)
+```
+
+`clip_grad_norm()` 的顺序是：
+
+1. `model.scale_and_reduce_grad()` 处理 EP expert 缩放和 replicated 参数同步。
+2. 收集所有 trainable 参数的 `.grad`。
+3. `cal_grad_norm()` 按 DTensor placement 计算全局 grad norm。
+4. 如需 clip，对各组梯度乘同一个 clip 系数。
+
+所以 optimizer step 看到的是已经完成 FSDP 同步、EP replicated 参数同步、expert 梯度缩放后的
+梯度。
+
+## 7. 关键约束
+
+- `model.config.ep_size` 必须和 `FSDPConfig.ep_size` 一致。Trainer 会在其中一个为 1 时做一次
+  自动对齐，`MoE.fully_shard()` 内部仍然会 assert。
+- `n_routed_experts % ep_size == 0`，否则 `GroupedLinear` 无法按 EP 均分 experts。
+- HSDP 当前要求 `ep_size == 1`，所以不能和 EP 同时使用。
+- routed expert 参数的 EP shard 和 FSDP shard 当前都沿 dim0，`BaseModel.FSDP_SHARD_DIM = 0`。
+- dispatcher 只处理 activation，不处理参数。参数归属由 `GroupedLinear` 和 FSDP 决定。
+- 非 expert 参数必须在 EP 维 replicated，否则不同 EP rank 的 router/attention 等参数会分叉。
+- expert 参数不能在 EP 维 all-reduce，因为不同 EP rank 上不是同一批 experts。
+
+## 8. 一句话总结
+
+XTuner 的 FSDP + EP 可以理解为二维并行：
+
+```text
+EP 维决定“这个 rank 负责哪些 experts”
+FSDP 维决定“这些参数在数据并行副本之间如何切片、all-gather 和 reduce-scatter”
+```
+
+前向时 dispatcher 在 EP 维移动 token，FSDP 在 FSDP 维移动参数；反向时 dispatcher 把
+activation 梯度送回 token/source 和 expert/destination，FSDP 聚合同一专家 shard 的数据并行
+梯度，`scale_and_reduce_grad()` 再补齐 EP 维上 expert 梯度缩放和 replicated 参数同步。
diff --git a/xtuner_fsdp_loss_grad_norm.md b/xtuner_fsdp_loss_grad_norm.md
new file mode 100644
index 000000000..7bba60a3b
--- /dev/null
+++ b/xtuner_fsdp_loss_grad_norm.md
@@ -0,0 +1,312 @@
+# XTuner FSDP Loss 校准与 Grad Norm 机制
+
+## 背景
+
+XTuner 的 loss 校准目标是：在相同 global batch 下，不论使用多少张卡、是否使用 FSDP、是否使用 SP、以及一个 optimizer step 内拆成多少个 micro-batch，最终用于 optimizer update 的梯度都应等价于单卡一次性计算同一批数据的梯度。
+
+这里有一个关键前提：FSDP 反向阶段对参数梯度做 `ReduceScatter` 时采用的是 reduce mean。也就是说，如果上游 loss 只按普通的全局平均来构造，FSDP 的梯度同步会额外除以 FSDP world size，导致梯度比期望值小。
+
+## 相关代码入口
+
+- 训练前准备 loss ctx：`xtuner/v1/train/trainer.py::_prepare_model_input`
+- 模型批量构建并校准 loss ctx：`xtuner/v1/model/base.py::build_loss_ctx_batch`
+- CE loss 校准核心：`xtuner/v1/loss/ce_loss.py::LMHeadLossContext.build_batches`
+- CE loss 前向与 autograd all-reduce：`xtuner/v1/loss/ce_loss.py::LMHeadLossContext.forward`
+- 逐 micro-batch backward：`xtuner/v1/engine/train_engine.py::train_step`
+- grad norm/clip：`xtuner/v1/engine/train_engine.py::clip_grad_norm`
+- MoE FSDP + EP mesh：`xtuner/v1/model/moe/moe.py::MoE._init_device_mesh`
+- MoE EP 参数复制与梯度修正：`xtuner/v1/model/moe/moe.py::_replicate_other_params`、`xtuner/v1/model/moe/moe.py::scale_and_reduce_grad`
+
+## Step 内一次性构建 loss ctx
+
+Trainer 在拿到一个 optimizer step 对应的 `data_batch` 后，会先把每个样本的 `seq_ctx` 移到设备上，并在 SP 开启时切分序列：
+
+```python
+if self.sp_mesh.size() > 1:
+    seq_ctx = seq_ctx.split(sequence_parallel_mesh=self.sp_mesh)
+```
+
+随后调用：
+
+```python
+loss_ctx_dict_list = self._engine.model.build_loss_ctx_batch(data_batch, sp_mesh=self.sp_mesh)
+```
+
+这里的重点是：loss ctx 不是在每个 micro-batch forward 时临时独立构建，而是对当前 step 的所有 micro-batch 一次性构建并校准。这样梯度累积的分母天然覆盖整个 optimizer step。
+
+## loss weight 的构造
+
+CE loss 使用 `shifted_labels` 和 `loss_weight`。`CELossConfig.loss_reduction` 支持三种模式：
+
+- `token`：每个有效 token 的原始权重为 1。
+- `sample`：每个样本内有效 token 的原始权重为 `1 / num_grad_tokens`。
+- `square`：每个样本内有效 token 的原始权重为 `1 / sqrt(num_grad_tokens)`。
+
+所有 `label == ignore_idx`，默认 `-100`，的位置都会被置为 0：
+
+```python
+loss_weight[shifted_labels == loss_cfg.ignore_idx] = 0.0
+```
+
+SP 下需要注意 `sample` 和 `square`：因为样本被按 sequence 维切到不同 SP rank 上，代码会先 gather 出完整 `shifted_labels` 来统计每个样本真实有效 token 数，再把算好的 `loss_weight` split 回各个 SP rank。
+
+## 全局分母
+
+构造完当前 rank 上、当前 step 内所有 micro-batch 的原始 `loss_weight` 后，XTuner 计算：
+
+```python
+rank_denominator = sum(loss_weight.sum() for loss_weight in loss_weight_list)
+global_denominator = rank_denominator
+if dist.is_initialized():
+    dist.all_reduce(global_denominator, op=dist.ReduceOp.SUM)
+```
+
+然后对每个 loss ctx 的权重做归一化：
+
+```python
+loss_ctx.loss_kwargs.loss_weight /= global_denominator + 1e-12
+```
+
+因此：
+
+- `token` 模式下，`global_denominator` 等价于当前 step 内所有 rank、所有 micro-batch 的有效 token 数。
+- `sample/square` 模式下，`global_denominator` 是当前 step 内所有 rank、所有 micro-batch 的原始 loss weight 总和，而不是简单 token 数。
+
+## 本地 loss 计算
+
+前向时，CE loss 先以 `reduction="none"` 算出逐 token loss：
+
+```python
+loss = F.cross_entropy(
+    logits,
+    shifted_labels,
+    reduction="none",
+    ignore_index=self.loss_cfg.ignore_idx,
+)
+loss = (loss * loss_weight).sum()
+```
+
+由于 `loss_weight` 已经除过 `global_denominator`，这个 `local_loss` 表示当前 rank 当前 micro-batch 对全局 loss 的局部贡献。
+
+`eager`、`chunk`、`liger` 的差异主要在实现方式：
+
+- `eager`：直接算 logits 和 CE。
+- `chunk`：按 sequence chunk 计算，降低 lm_head logits 和 CE backward 的显存峰值。
+- `liger`：用 fused linear CE，只支持 `token` reduction。
+
+这三种模式的校准目标是一致的。
+
+## autograd all-reduce 与 FSDP reduce mean 的抵消
+
+本地 loss 算完后，XTuner 会在返回前做 autograd 版 all-reduce sum：
+
+```python
+if dist.is_initialized():
+    loss = all_reduce(loss, op=dist.ReduceOp.SUM, group=dist.group.WORLD)
+```
+
+这是 FSDP loss 校准里最关键的一步。
+
+先看没有 EP 的普通 FSDP 情况。
+
+记：
+
+- `N` 为 FSDP mesh size。
+- `L_r` 为 rank `r` 上已经除过 `global_denominator` 的本地 loss。
+- 期望的全局 loss 为 `L = sum_r L_r`。
+
+如果不做 autograd all-reduce，而是直接对 `L_r` backward，FSDP 反向的 `ReduceScatter(mean)` 会让最终梯度变成期望值的 `1 / N`。
+
+XTuner 做了：
+
+```text
+forward:  L = all_reduce_sum(L_r)
+backward: dL/dL_r = N
+```
+
+于是每个 rank 本地 loss 的反向梯度先被放大 `N` 倍。随后 FSDP `ReduceScatter(mean)` 再除以 `N`。两者抵消后，参数梯度等价于：
+
+```text
+sum_over_all_ranks_and_micro_batches grad(ce * raw_loss_weight / global_denominator)
+```
+
+也就是单卡一次性在同一 global batch 上计算校准后 loss 的梯度。
+
+开启 EP 后，loss 仍然对 `dist.group.WORLD` 做 autograd all-reduce，但 FSDP 的 `ReduceScatter(mean)` 只发生在同一 EP rank 对应的 FSDP group 内。因此 EP 情况下不能只看这一处抵消，剩余的 EP 缩放会在 `MoE.scale_and_reduce_grad()` 中处理，详见后文。
+
+## 梯度累积
+
+训练循环中，XTuner 对每个 micro-batch 直接执行：
+
+```python
+loss.backward()
+```
+
+没有再除以 `grad_accumulation_steps`。原因是 `build_batches` 的 `global_denominator` 已经覆盖了当前 optimizer step 内的所有 micro-batch。
+
+因此一个 step 内多个 micro-batch 的 backward 累积结果为：
+
+```text
+step_grad =
+  sum_over_micro_batches_and_ranks grad(ce * raw_loss_weight / global_denominator)
+```
+
+这正是全局 batch 一次性 backward 的结果。
+
+## FSDP + EP 下的 Loss 校准
+
+MoE 开启 EP 后，训练 mesh 可以简化成二维：
+
+```text
+F = fsdp_mesh.size()
+E = ep_mesh.size()
+world_size = F * E
+```
+
+忽略 TP 时，逻辑布局类似：
+
+```text
+          ep0   ep1   ...   ep(E-1)
+fsdp0      *     *            *
+fsdp1      *     *            *
+...
+fsdp(F-1)  *     *            *
+```
+
+EP 维负责 expert 归属，FSDP 维负责同一批参数在数据并行副本之间的 shard、all-gather 和 reduce-scatter。
+
+参数分两类：
+
+- routed expert 参数：EP 维 `Shard(0)`，每个 EP rank 只拥有一部分 experts；FSDP 维继续 shard。
+- 非 expert 参数：EP 维 `Replicate()`，包括 embedding、attention、norm、router、lm head、shared experts；FSDP 维 shard。
+
+Loss 分母仍然按全局 rank 统计。`LMHeadLossContext.build_batches()` 对当前 step 内所有 micro-batch 构造 raw `loss_weight` 后，直接用默认分布式组做：
+
+```python
+dist.all_reduce(global_denominator, op=dist.ReduceOp.SUM)
+```
+
+这意味着 `global_denominator` 覆盖所有 FSDP rank、EP rank 和 micro-batch。EP dispatcher 后续会移动 activation，但 label/loss ctx 仍按 source token 所在 rank 构造；每个 token 在分母中只贡献一次。
+
+前向返回前的 loss 也仍然做 `WORLD` 范围的 autograd all-reduce：
+
+```text
+L = sum_{f=0}^{F-1} sum_{e=0}^{E-1} L_{f,e}
+```
+
+因此 backward 时，每个本地 `L_{f,e}` 收到的上游缩放是：
+
+```text
+world_size = F * E
+```
+
+而 FSDP 对参数梯度的 reduce mean 只在 FSDP 维发生，会除以 `F`。所以 FSDP 反向后还会剩下一个 `E` 倍缩放。这个剩余缩放不能在 loss 里统一处理，因为 expert 参数和非 expert 参数在 EP 维的语义不同。
+
+## FSDP + EP 下的 Expert 梯度
+
+routed expert 参数在 EP 维不是副本。不同 EP rank 上是不同专家，所以不能在 EP 维 all-reduce。
+
+前向时，dispatcher 在同一 FSDP 行内把 token 发送到 owning EP rank。本地 expert grouped GEMM 计算当前 EP rank 持有的 `E_local` 个 experts。反向时，dispatcher 的 autograd 会把来自所有 source EP rank 的 token 梯度送回对应 expert owner。因此某个 expert 参数在一个 FSDP 行上已经收到了这一行内所有 EP source token 对它的贡献。
+
+但 loss 的 autograd all-reduce 是 `WORLD` 范围，给每个本地 loss 带来 `F * E` 的 backward 缩放；FSDP reduce mean 只除以 `F`。所以 expert 参数梯度还多了 `E` 倍。
+
+`MoE.scale_and_reduce_grad()` 对 expert 参数的处理是：
+
+```python
+if ep_enabled and ".experts" in name:
+    param.grad.div_(self.ep_mesh.size())
+    continue
+```
+
+这里的语义是：
+
+- `div_(E)`：消掉 loss `WORLD` all-reduce 相对 FSDP mean 多出来的 EP 倍数。
+- `continue`：不做 EP all-reduce，因为 EP 维上不是同一个参数的多个副本，而是不同 experts。
+
+修正后，expert 参数梯度等价于：
+
+```text
+sum_over_fsdp_rows_and_source_ep_ranks grad(local_expert_loss / global_denominator)
+```
+
+即该 expert 在整个 global batch 中实际接收到的 token 对它的梯度。
+
+## FSDP + EP 下的 Replicated 参数梯度
+
+非 expert 参数在 EP 维是 replicated，例如 router、attention、norm、embedding、lm head。每个 EP rank 上是同一个逻辑参数的副本，但它们处理的 source token 不同，所以反向后各 EP replica 的梯度先是各自数据切片上的贡献。
+
+对某个 EP rank `e` 的非 expert 参数副本，FSDP reduce mean 后梯度形如：
+
+```text
+E * sum_f grad(L_{f,e})
+```
+
+还多了一个 `E`。但和 expert 不同，replicated 参数需要聚合所有 EP rank 的数据贡献，并让每个 replica 得到一致梯度。
+
+`MoE.scale_and_reduce_grad()` 会检查 DTensor placement 中的 `Replicate()` 维度，并在 replicate mesh 上做平均 all-reduce：
+
+```python
+grad.div_(replicate_world_size)
+dist.all_reduce(grad, ReduceOp.SUM, group=replicate_group)
+```
+
+对单个 EP replicate 维来说，这等价于：
+
+```text
+sum_e (E * sum_f grad(L_{f,e}) / E)
+= sum_e sum_f grad(L_{f,e})
+```
+
+因此它同时完成两件事：
+
+- 消掉 EP 维多出来的 `E` 倍缩放。
+- 聚合所有 EP rank 的数据贡献，使 replicated 参数的各个副本保持一致。
+
+如果一个参数有多个 `Replicate()` 维，代码会 flatten 对应 submesh 后做同样的平均 all-reduce。
+
+## Grad Norm 与 Clip
+
+一个 train step 内所有 micro-batch 都 backward 完后，Trainer 调用：
+
+```python
+grad_norm = self._engine.clip_grad_norm(do_clip=self._do_clip, dtype=self._grad_norm_dtype)
+self._engine.step_optimizer(grad_norm)
+```
+
+`clip_grad_norm` 里会先调用：
+
+```python
+self.model.scale_and_reduce_grad()
+```
+
+随后收集所有可训练参数的 `.grad`，调用 `cal_grad_norm` 计算全局 grad norm。
+
+对 Dense FSDP 模型，`BaseModel.scale_and_reduce_grad()` 默认是空操作。常规 FSDP 参数的梯度同步已经由 FSDP backward 完成，且 loss 校准已经处理了 reduce mean 的缩放问题。
+
+对 MoE 模型，`MoE.scale_and_reduce_grad()` 会额外处理 EP/replicated 参数：
+
+- expert 参数在 EP 下只除以 `ep_mesh.size()`，不做 EP all-reduce。
+- replicated DTensor 参数会在 replicate mesh 上做平均 all-reduce，使这些未按普通 FSDP shard 语义同步的参数也得到一致梯度。
+
+然后 `cal_grad_norm` 会按 DTensor 的 mesh 和 placement 分组计算 norm。对于 sharded placement，会对局部 norm square 做 all-reduce sum，再开方得到全局 norm。这样 clip 使用的是全局参数梯度范数，而不是单 rank 的局部范数。
+
+在 FSDP + EP 下，这个顺序很重要：grad norm 是在 expert 梯度除 EP、replicated 参数 EP 平均 all-reduce 之后计算的。`cal_grad_norm()` 对 `Shard()` 维度累加 norm square，对 `Replicate()` 维度不重复计数。因此：
+
+- expert 参数的 norm 会覆盖所有 EP shard 上的 experts。
+- replicated 参数的 norm 只按一份逻辑参数计数，不会因为 EP replica 数量而重复放大。
+- clip 系数作用在已经完成 FSDP/EP 校准后的梯度上，optimizer step 看到的是校准后的全局梯度。
+
+## 总结
+
+XTuner FSDP loss 校准可以概括为三步：
+
+1. 在当前 optimizer step 的所有 micro-batch 上构造 raw `loss_weight`，并跨 rank 求 `global_denominator`。
+2. 每个 rank/micro-batch 计算 `sum(ce_per_token * raw_loss_weight / global_denominator)`。
+3. 对 loss 做 autograd `all_reduce(SUM)`，用其 backward 放大效应抵消 FSDP `ReduceScatter(mean)`。
+
+FSDP + EP 时还要再区分两类参数：
+
+- expert 参数：FSDP mean 后剩余的 EP 倍数通过 `grad.div_(ep_size)` 消掉，不能 EP all-reduce。
+- EP replicated 参数：通过 replicate mesh 上的平均 all-reduce 同时消掉 EP 倍数并聚合所有 EP rank 的数据贡献。
+
+最终效果是：FSDP、EP、SP、梯度累积和不同卡数不应改变同一 global batch 对参数更新的数学含义；grad norm/clip 发生在所有 micro-batch backward 完成之后，基于已经校准和同步后的全局梯度计算。

From 6c18915b419718b41904017a6f9204457fbd6eae Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Tue, 12 May 2026 16:06:25 +0000
Subject: [PATCH 07/25] [WIP] GroupedLinear support real TP shard; but loss
 grad is wrong now

---
 .../validate_moeblock_tpep_vs_single.py       |  63 ++++++++--
 tests/engine/test_moe_train_engine_tpep.py    | 109 +++++++++++++++---
 xtuner/v1/model/moe/moe.py                    |  38 +++---
 .../module/decoder_layer/moe_decoder_layer.py |   6 +
 .../module/dispatcher/torch_all2all_tpep.py   |  58 +++++-----
 .../module/grouped_linear/moe_group_linear.py |  83 +++++++++++--
 6 files changed, 278 insertions(+), 79 deletions(-)

diff --git a/.dev_scripts/validate_moeblock_tpep_vs_single.py b/.dev_scripts/validate_moeblock_tpep_vs_single.py
index 33fc5e557..679cfb53d 100644
--- a/.dev_scripts/validate_moeblock_tpep_vs_single.py
+++ b/.dev_scripts/validate_moeblock_tpep_vs_single.py
@@ -58,6 +58,7 @@ class ParallelInfo:
     tp_rank: int
     device: torch.device
     ep_mesh: DeviceMesh
+    tp_mesh: DeviceMesh
     ep_group: dist.ProcessGroup
     tp_group: dist.ProcessGroup
 
@@ -152,14 +153,16 @@ def _init_distributed() -> ParallelInfo:
         mesh_dim_names=("dp", "ep", "tp"),
     )
     ep_mesh = mesh["ep"]
+    tp_mesh = mesh["tp"]
     return ParallelInfo(
         global_rank=dist.get_rank(),
         ep_rank=ep_mesh.get_local_rank(),
-        tp_rank=mesh["tp"].get_local_rank(),
+        tp_rank=tp_mesh.get_local_rank(),
         device=torch.device("cuda", local_rank),
         ep_mesh=ep_mesh,
+        tp_mesh=tp_mesh,
         ep_group=ep_mesh.get_group(),
-        tp_group=mesh["tp"].get_group(),
+        tp_group=tp_mesh.get_group(),
     )
 
 
@@ -222,7 +225,7 @@ def _run_tpep_moeblock(
         tp_group=parallel_info.tp_group,
         training_dtype="bf16",
     )
-    experts = _build_moeblock(parallel_info.device, ep_mesh=parallel_info.ep_mesh)
+    experts = _build_moeblock(parallel_info.device, ep_mesh=parallel_info.ep_mesh, tp_mesh=parallel_info.tp_mesh)
     _load_weights(experts, full_w1w3, full_w2)
 
     pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden_states, topk_ids=topk_ids)
@@ -283,7 +286,7 @@ def _run_single_moeblock_reference(
     )
 
     dispatcher = NaiveDispatcher(n_routed_experts=N_ROUTED_EXPERTS)
-    experts = _build_moeblock(device, ep_mesh=None)
+    experts = _build_moeblock(device, ep_mesh=None, tp_mesh=None)
     _load_weights(experts, full_w1w3, full_w2)
 
     pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden_states, topk_ids=topk_ids)
@@ -325,13 +328,14 @@ def _run_single_moeblock_reference(
     return post_combined["hidden_states"]
 
 
-def _build_moeblock(device: torch.device, ep_mesh: DeviceMesh | None) -> MoEBlock:
+def _build_moeblock(device: torch.device, ep_mesh: DeviceMesh | None, tp_mesh: DeviceMesh | None) -> MoEBlock:
     block = MoEBlock(
         hidden_size=HIDDEN_SIZE,
         moe_intermediate_size=MOE_INTERMEDIATE_SIZE,
         n_routed_experts=N_ROUTED_EXPERTS,
         moe_bias=False,
         ep_mesh=ep_mesh,
+        tp_mesh=tp_mesh,
         float8_cfg=None,
         moe_act_fn_cfg=MoEActFnConfig(),
     )
@@ -340,17 +344,60 @@ def _build_moeblock(device: torch.device, ep_mesh: DeviceMesh | None) -> MoEBloc
 
 def _load_weights(experts: MoEBlock, full_w1w3: torch.Tensor, full_w2: torch.Tensor) -> None:
     with torch.no_grad():
-        _copy_weight(experts.fused_w1w3.weight, full_w1w3)
-        _copy_weight(experts.fused_w2.weight, full_w2)
+        _copy_weight(experts.fused_w1w3, full_w1w3, fused_gate_up=True)
+        _copy_weight(experts.fused_w2, full_w2, fused_gate_up=False)
 
 
-def _copy_weight(param: torch.Tensor, full_weight: torch.Tensor) -> None:
+def _copy_weight(grouped_linear: torch.nn.Module, full_weight: torch.Tensor, *, fused_gate_up: bool) -> None:
+    param = grouped_linear.weight
     if isinstance(param, DTensor):
         param.copy_(distribute_tensor(full_weight, param.device_mesh, [Shard(0)]))
+    elif getattr(grouped_linear, "tp_enabled", False):
+        param.copy_(_slice_tpep_weight(grouped_linear, full_weight, fused_gate_up=fused_gate_up))
     else:
         param.copy_(full_weight)
 
 
+def _slice_tpep_weight(grouped_linear: torch.nn.Module, full_weight: torch.Tensor, *, fused_gate_up: bool) -> torch.Tensor:
+    num_experts = grouped_linear.num_routed_experts
+    out_features = grouped_linear.out_features
+    in_features = grouped_linear.in_features
+    expert_weight = full_weight.view(num_experts, out_features, in_features)
+    expert_weight = expert_weight[grouped_linear.local_expert_start : grouped_linear.local_expert_end]
+
+    tp_rank = grouped_linear.tp_rank
+    tp_size = grouped_linear.tp_size
+    if grouped_linear.parallel_style == "column":
+        if fused_gate_up:
+            intermediate_size = out_features // 2
+            local_intermediate_size = intermediate_size // tp_size
+            gate_start = tp_rank * local_intermediate_size
+            gate_end = gate_start + local_intermediate_size
+            up_start = intermediate_size + gate_start
+            up_end = intermediate_size + gate_end
+            expert_weight = torch.cat(
+                [
+                    expert_weight[:, gate_start:gate_end, :],
+                    expert_weight[:, up_start:up_end, :],
+                ],
+                dim=1,
+            )
+        else:
+            local_out_features = out_features // tp_size
+            out_start = tp_rank * local_out_features
+            out_end = out_start + local_out_features
+            expert_weight = expert_weight[:, out_start:out_end, :]
+    elif grouped_linear.parallel_style == "row":
+        local_in_features = in_features // tp_size
+        in_start = tp_rank * local_in_features
+        in_end = in_start + local_in_features
+        expert_weight = expert_weight[:, :, in_start:in_end]
+    else:
+        raise RuntimeError(f"Unexpected grouped linear parallel style: {grouped_linear.parallel_style}.")
+
+    return expert_weight.reshape(grouped_linear.weight.shape)
+
+
 def _assert_close(actual: torch.Tensor, expected: torch.Tensor) -> None:
     try:
         torch.testing.assert_close(actual.float(), expected.float(), rtol=RTOL, atol=ATOL)
diff --git a/tests/engine/test_moe_train_engine_tpep.py b/tests/engine/test_moe_train_engine_tpep.py
index c2efec555..290278e68 100644
--- a/tests/engine/test_moe_train_engine_tpep.py
+++ b/tests/engine/test_moe_train_engine_tpep.py
@@ -24,17 +24,16 @@
 
 from __future__ import annotations
 
-import tempfile
-from pathlib import Path
-
 import parametrize
 import torch
 import torch.distributed as dist
+from torch.distributed.tensor import DTensor, distribute_tensor
 
 from xtuner._testing import DeterministicDDPTestCase
 from xtuner.v1.config import AdamWConfig, FSDPConfig
 from xtuner.v1.engine.train_engine import TrainEngine
 from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.module.grouped_linear.moe_group_linear import GroupedLinear
 from xtuner.v1.model.base import ModelItem
 from xtuner.v1.model.moe.moe import SequenceContext
 from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
@@ -120,6 +119,97 @@ def _run_one_step(
     return loss_val, grads
 
 
+def _full_tensor(tensor: torch.Tensor) -> torch.Tensor:
+    if isinstance(tensor, DTensor):
+        return tensor.full_tensor()
+    return tensor
+
+
+def _copy_param_from_full(param: torch.nn.Parameter, full_tensor: torch.Tensor) -> None:
+    if isinstance(param, DTensor):
+        param.copy_(distribute_tensor(full_tensor, param.device_mesh, param.placements))
+    else:
+        param.copy_(full_tensor)
+
+
+def _sync_engine_weights(engine_ref: TrainEngine, engine_tpep: TrainEngine) -> None:
+    """Synchronize a non-TP reference model into the EP+TP model layout."""
+    ref_params = dict(engine_ref.model.named_parameters())
+    ref_modules = dict(engine_ref.model.named_modules())
+    tpep_modules = dict(engine_tpep.model.named_modules())
+
+    with torch.no_grad():
+        for name, param in engine_tpep.model.named_parameters():
+            ref_param = ref_params[name]
+            full_param = _full_tensor(ref_param.detach()).to(device=param.device, dtype=param.dtype)
+
+            module_name, _, param_name = name.rpartition(".")
+            module = tpep_modules[module_name]
+            ref_module = ref_modules[module_name]
+            if isinstance(module, GroupedLinear) and getattr(module, "tp_enabled", False):
+                if param_name == "weight":
+                    shard = _slice_tpep_weight(module, full_param, fused_gate_up="fused_w1w3" in module_name)
+                    _copy_param_from_full(param, shard)
+                elif param_name == "bias":
+                    shard = _slice_tpep_bias(module, full_param)
+                    _copy_param_from_full(param, shard)
+                else:
+                    raise RuntimeError(f"Unexpected GroupedLinear parameter: {name}.")
+            else:
+                ref_full = _full_tensor(getattr(ref_module, param_name).detach()).to(device=param.device, dtype=param.dtype)
+                _copy_param_from_full(param, ref_full)
+
+
+def _slice_tpep_weight(grouped_linear: GroupedLinear, full_weight: torch.Tensor, *, fused_gate_up: bool) -> torch.Tensor:
+    num_experts = grouped_linear.num_routed_experts
+    out_features = grouped_linear.out_features
+    in_features = grouped_linear.in_features
+    expert_weight = full_weight.view(num_experts, out_features, in_features)
+    expert_weight = expert_weight[grouped_linear.local_expert_start : grouped_linear.local_expert_end]
+
+    tp_rank = grouped_linear.tp_rank
+    tp_size = grouped_linear.tp_size
+    if grouped_linear.parallel_style == "column":
+        if fused_gate_up:
+            intermediate_size = out_features // 2
+            local_intermediate_size = intermediate_size // tp_size
+            gate_start = tp_rank * local_intermediate_size
+            gate_end = gate_start + local_intermediate_size
+            up_start = intermediate_size + gate_start
+            up_end = intermediate_size + gate_end
+            expert_weight = torch.cat(
+                [
+                    expert_weight[:, gate_start:gate_end, :],
+                    expert_weight[:, up_start:up_end, :],
+                ],
+                dim=1,
+            )
+        else:
+            local_out_features = out_features // tp_size
+            out_start = tp_rank * local_out_features
+            out_end = out_start + local_out_features
+            expert_weight = expert_weight[:, out_start:out_end, :]
+    elif grouped_linear.parallel_style == "row":
+        local_in_features = in_features // tp_size
+        in_start = tp_rank * local_in_features
+        in_end = in_start + local_in_features
+        expert_weight = expert_weight[:, :, in_start:in_end]
+    else:
+        raise RuntimeError(f"Unexpected grouped linear parallel style: {grouped_linear.parallel_style}.")
+
+    return expert_weight.reshape(grouped_linear.weight.shape)
+
+
+def _slice_tpep_bias(grouped_linear: GroupedLinear, full_bias: torch.Tensor) -> torch.Tensor:
+    expert_bias = full_bias[grouped_linear.local_expert_start : grouped_linear.local_expert_end]
+    if grouped_linear.parallel_style == "column":
+        local_out_features = grouped_linear.out_features // grouped_linear.tp_size
+        out_start = grouped_linear.tp_rank * local_out_features
+        out_end = out_start + local_out_features
+        expert_bias = expert_bias[:, out_start:out_end]
+    return expert_bias.reshape(grouped_linear.bias.shape)
+
+
 class TestMoETrainEngineTPEP(DeterministicDDPTestCase):
     """Verify EP+TP training matches single-GPU (EP=1, TP=1) forward and backward."""
 
@@ -148,17 +238,10 @@ def test_tpep_forward_backward_matches_single(
         engine_tpep.init_model_weights()
 
         # ------------------------------------------------------------------
-        # Sync weights: save reference engine, load into EP+TP engine.
-        # DCP handles the translation between different tensor layouts.
+        # Sync weights by explicitly slicing full expert weights into the real
+        # TP column/row shards used by GroupedLinear.
         # ------------------------------------------------------------------
-        tmp: list[str] = [tempfile.mkdtemp() if dist.get_rank() == 0 else ""]
-        dist.broadcast_object_list(tmp, src=0)
-        ckpt_root = Path(tmp[0])
-        model_dir = ckpt_root / "model"
-
-        engine_ref.save_dcp(model_dir=model_dir)
-        dist.barrier()
-        engine_tpep.load_dcp(model_dir=model_dir)
+        _sync_engine_weights(engine_ref, engine_tpep)
         dist.barrier()
 
         # ------------------------------------------------------------------
diff --git a/xtuner/v1/model/moe/moe.py b/xtuner/v1/model/moe/moe.py
index 9ae7a47c2..e19a3520b 100644
--- a/xtuner/v1/model/moe/moe.py
+++ b/xtuner/v1/model/moe/moe.py
@@ -942,7 +942,8 @@ def fully_shard(
     ) -> Self:
         self.fsdp_config = fsdp_config
         assert self.fsdp_config.ep_size == self.config.ep_size
-        assert self.fsdp_config.tp_size == self.config.tp_size
+        # TODO: self.config.tp_size is expert tp size, which can be different from fsdp_config.tp_size. Rename it to expert_tp_size.
+        # assert self.fsdp_config.tp_size == self.config.tp_size
         self.mp_policy = MixedPrecisionPolicy(
             param_dtype=self.fsdp_config.param_dtype, reduce_dtype=fsdp_config.reduce_dtype
         )
@@ -1098,16 +1099,9 @@ def scale_and_reduce_grad(self):
                 continue
 
             ep_enabled = self.ep_mesh is not None and self.ep_mesh.size() > 1
-            tp_enabled = self.tp_mesh is not None and self.tp_mesh.size() > 1
             # Scale moe parameters
             if ep_enabled and ".experts" in name:
                 param.grad.div_(self.ep_mesh.size())  # type: ignore
-                # Each TP replica computes an identical expert gradient (redundant computation).
-                # Average across TP replicas so the effective update matches single-GPU.
-                if tp_enabled:
-                    grad = param.grad.to_local() if isinstance(param.grad, DTensor) else param.grad
-                    dist.all_reduce(grad, op=ReduceOp.SUM, group=self.tp_mesh.get_group())  # type: ignore
-                    grad.div_(self.tp_mesh.size())  # type: ignore
                 continue
 
             if isinstance(param, DTensor):
@@ -1117,13 +1111,22 @@ def scale_and_reduce_grad(self):
                     if isinstance(p, Replicate)
                 )
                 if replicate_dim_names:
-                    # `DeviceMesh.get_group()` only supports a single mesh dimension,
-                    # so calling it directly on a multi-dim sub-mesh raises RuntimeError.
-                    # `_flatten()` collapses all Replicate dims into a 1D mesh whose
-                    # process group covers every rank across those dimensions, allowing
-                    # a single all_reduce regardless of how many Replicate dims exist.
-                    flat_mesh = param.device_mesh[replicate_dim_names]._flatten()
                     grad = param.grad.to_local() if isinstance(param.grad, DTensor) else param.grad
+                    if len(replicate_dim_names) == 1:
+                        replicate_dim = replicate_dim_names[0]
+                        replicate_dim_idx = param.device_mesh.mesh_dim_names.index(replicate_dim)
+                        group = param.device_mesh.get_group(replicate_dim)
+                        grad.div_(param.device_mesh.size(replicate_dim_idx))  # type: ignore
+                        dist.all_reduce(grad, ReduceOp.SUM, group=group)
+                        continue
+                    # DTensor 的 device_mesh 可能已经是从全局 mesh 切出来的 submesh。
+                    # 当所有维度都是 Replicate 时，可以直接 flatten 当前 submesh；
+                    # 否则才继续按 Replicate 维度切子 mesh。这样可以避免对已经
+                    # 覆盖目标维度的 submesh 再切一次，触发 PyTorch 的限制。
+                    if len(replicate_dim_names) == len(param.device_mesh.mesh_dim_names):
+                        flat_mesh = param.device_mesh._flatten()
+                    else:
+                        flat_mesh = param.device_mesh[replicate_dim_names]._flatten()
                     dist.all_reduce(
                         grad.div_(flat_mesh.size()),  # type: ignore
                         ReduceOp.SUM,
@@ -1218,13 +1221,14 @@ def _init_device_mesh(self, fsdp_config: FSDPConfig):
     def _replicate_other_params(self, model: nn.Module):
         def traverse(module: nn.Module) -> None:
             if isinstance(module, MoEBlock):
-                # Expert params are already Shard(0) on ep_mesh (from build_grouped_linear).
-                # Gradient averaging across TP replicas is handled in scale_and_reduce_grad.
+                # Expert params are already partitioned by build_grouped_linear.
                 return
             for name, param in module.named_parameters(recurse=False):
                 assert self.ep_mesh is not None
                 dist_param = nn.Parameter(
-                    distribute_tensor(param, self.ep_mesh, [Replicate()]), requires_grad=param.requires_grad
+                    # TODO: replicate on ep_tp_mesh instead of ep_mesh?
+                    distribute_tensor(param, self.ep_mesh, [Replicate()]),
+                    requires_grad=param.requires_grad,
                 )
                 module.register_parameter(name, dist_param)
             for child in module.children():
diff --git a/xtuner/v1/module/decoder_layer/moe_decoder_layer.py b/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
index 80e8986bb..b0deb154a 100644
--- a/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
+++ b/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
@@ -150,6 +150,7 @@ def __init__(
         n_routed_experts: int,
         moe_bias: bool = False,
         ep_mesh: DeviceMesh | None = None,
+        tp_mesh: DeviceMesh | None = None,
         float8_cfg: Float8Config | None = None,
         moe_act_fn_cfg: MoEActFnConfig,
     ):
@@ -166,6 +167,8 @@ def __init__(
             self.num_routed_experts,
             moe_bias=moe_bias,
             ep_mesh=self.ep_mesh,
+            tp_mesh=tp_mesh,
+            parallel_style="column",
             float8_cfg=float8_cfg,
         )
         self.fused_w2 = build_grouped_linear(
@@ -174,6 +177,8 @@ def __init__(
             self.num_routed_experts,
             moe_bias=moe_bias,
             ep_mesh=self.ep_mesh,
+            tp_mesh=tp_mesh,
+            parallel_style="row",
             float8_cfg=float8_cfg,
         )
         self.moe_act = moe_act_fn_cfg.build()
@@ -269,6 +274,7 @@ def __init__(
             n_routed_experts=n_routed_experts,
             moe_bias=moe_bias,
             ep_mesh=ep_mesh,
+            tp_mesh=tp_mesh,
             float8_cfg=float8_cfg,
             moe_act_fn_cfg=moe_act_fn_cfg,
         )
diff --git a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
index d53905afd..225e8956e 100644
--- a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
+++ b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
@@ -4,23 +4,21 @@
 
     dispatch_preprocess : permute by expert (each TP rank independently, N_local tokens)
     dispatch            : EP AlltoAll (each TP rank independently, routing N_local token copies)
-    dispatch_postprocess: TP AllGather → merge TP slices into M_total tokens
+    dispatch_postprocess: TP AllGather → merge TP token slices into M_total tokens
                           then permute by local expert (for grouped GEMM)
-    [Expert GEMM]       : each TP rank computes full expert output (redundant across TP)
+    [Expert GEMM]       : column-parallel gate/up + row-parallel down projection
     combine_preprocess  : unpermute back to TP-AllGather order
-                          then TP ReduceScatterMean → restore M_ep_recv per TP rank
+                          then TP ReduceScatterSum → restore M_ep_recv per TP rank
     combine             : EP AlltoAll reverse (each TP rank independently)
     combine_postprocess : unpermute with topk_weights → [N_local, H] per TP rank
 
 Design rationale (mirrors Megatron MoEAlltoAllTokenDispatcher with TP+EP):
-  - Expert weights are NOT sharded by TP; each TP rank holds a full copy.
-  - TP AllGather before experts and TP ReduceScatterMean after experts form a symmetric pair
-    that keeps the forward values numerically identical to the EP-only case.
-  - ReduceScatterMean (avg reduce) is used so that the redundant expert outputs from all TP
-    ranks reduce back to the original values without a TP-factor scaling in the forward pass.
-  - The backward of ReduceScatterMean (AllGather) and AllGather backward (AllReduce+slice)
-    introduce a 1/TP scaling in the gradient. This is a known design trade-off consistent
-    with the Megatron approach; the model learns to compensate via weight initialisation.
+  - Expert weights are sharded by TP: gate/up use column parallelism, down uses row
+    parallelism.
+  - TP AllGather before experts gives every TP rank the same token batch for its local
+    expert weight shard.
+  - TP ReduceScatterSum after the row-parallel down projection sums partial hidden states
+    across TP ranks, then returns each rank's original token slice.
 """
 
 from __future__ import annotations
@@ -49,7 +47,7 @@ class TorchAll2AllTPEPPostDispatchResult(TorchAll2AllPostDispatchResult):
     """Post-dispatch result for TP+EP dispatcher.
 
     Extends the EP-only result with per-TP-rank token counts needed to perform the
-    TP ReduceScatterMean in ``combine_preprocess``.
+    TP ReduceScatterSum in ``combine_preprocess``.
     """
 
     output_splits_tp: list[int]
@@ -59,10 +57,8 @@ class _TPAllGather(torch.autograd.Function):
     """TP AllGather with autograd support.
 
     Forward : ``all_gather`` across the TP group, concatenating along the token dim.
-    Backward: ``all_reduce`` (SUM) the gradient then slice — equivalent to a reduce-scatter
-              sum in the unequal-size case.  This introduces a 1/TP factor relative to the
-              mathematically exact gradient when computation is redundant across TP ranks,
-              consistent with the Megatron redundant-TP-expert design.
+    Backward: ``all_reduce`` (SUM) the gradient then slice, accumulating gradients from
+              each TP weight shard into the original local token slice.
     """
 
     @staticmethod
@@ -87,20 +83,20 @@ def backward(
         ctx: Any,
         grad: torch.Tensor,
     ) -> tuple[torch.Tensor, None, None, None, None]:
+        # TODO: use reduce_scatter instead of all_reduce
         grad = grad.contiguous()
         dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=ctx.tp_group)
         offset = sum(ctx.all_sizes[: ctx.tp_rank])
         return grad[offset : offset + ctx.all_sizes[ctx.tp_rank]].clone(), None, None, None, None
 
 
-class _TPReduceScatterMean(torch.autograd.Function):
-    """TP ReduceScatterMean with autograd support.
+class _TPReduceScatterSum(torch.autograd.Function):
+    """TP ReduceScatterSum with autograd support.
 
-    Forward : ``all_reduce`` (SUM) / TP_size then slice — equivalent to a mean reduce-scatter.
-              When all TP ranks hold identical tensors (redundant expert computation), this
-              returns the original un-scaled value for each rank's slice.
+    Forward : ``all_reduce`` (SUM) then slice — equivalent to a sum reduce-scatter
+              for the unequal-size token case used here.
     Backward: ``all_gather`` the gradient slices to reconstruct the full gradient tensor,
-              then divide by TP_size (chain rule through the /TP_size division).
+              matching the sum reduction in the forward pass.
     """
 
     @staticmethod
@@ -112,9 +108,9 @@ def forward(
         tp_size: int,
         tp_rank: int,
     ) -> torch.Tensor:
+        # TODO: use reduce_scatter instead of all_reduce
         hidden = hidden.clone()
         dist.all_reduce(hidden, op=dist.ReduceOp.SUM, group=tp_group)
-        hidden = hidden / tp_size
         offset = sum(all_sizes[:tp_rank])
         ctx.tp_group = tp_group
         ctx.tp_size = tp_size
@@ -132,7 +128,7 @@ def backward(
             for s in ctx.all_sizes
         ]
         dist.all_gather(chunks, grad_slice.contiguous(), group=ctx.tp_group)
-        full_grad = torch.cat(chunks, dim=0) / ctx.tp_size
+        full_grad = torch.cat(chunks, dim=0)
         return full_grad, None, None, None, None
 
 
@@ -156,19 +152,19 @@ def _tp_all_gather(
     return gathered, all_sizes
 
 
-def _tp_reduce_scatter_mean(
+def _tp_reduce_scatter_sum(
     hidden: torch.Tensor,
     all_sizes: list[int],
     tp_group: dist.ProcessGroup,
 ) -> torch.Tensor:
-    """Mean-reduce-scatter ``hidden`` across the TP group, returning this
-    rank's slice."""
+    """Sum-reduce-scatter ``hidden`` across the TP group, returning this rank's
+    slice."""
     tp_size = tp_group.size()
     if tp_size == 1:
         return hidden
 
     tp_rank = dist.get_rank(group=tp_group)
-    return _TPReduceScatterMean.apply(hidden, all_sizes, tp_group, tp_size, tp_rank)
+    return _TPReduceScatterSum.apply(hidden, all_sizes, tp_group, tp_size, tp_rank)
 
 
 def _tp_all_gather_tokens_per_expert_group(
@@ -188,7 +184,7 @@ def _tp_all_gather_tokens_per_expert_group(
 
 class TorchAll2AllTPEPDispatcher(TorchAll2AllDispatcher):
     """TP+EP dispatcher: wraps ``TorchAll2AllDispatcher`` with TP AllGather and
-    ReduceScatterMean.
+    ReduceScatterSum.
 
     Overrides only ``dispatch_postprocess`` and ``combine_preprocess``; all other steps
     (dispatch_preprocess, dispatch, combine, combine_postprocess) are unchanged from the
@@ -296,8 +292,8 @@ def combine_preprocess(
         # Unpermute [M_total, H] back to TP-AllGather order (tp0_block | tp1_block | ...).
         hidden_states = unpermute(hidden_states, tpep_post["row_ids_map"])
 
-        # TP ReduceScatterMean: [M_total, H] → [M_ep_recv, H] for this TP rank.
-        hidden_states = _tp_reduce_scatter_mean(
+        # TP ReduceScatterSum: [M_total, H] → [M_ep_recv, H] for this TP rank.
+        hidden_states = _tp_reduce_scatter_sum(
             hidden_states,
             all_sizes=tpep_post["output_splits_tp"],
             tp_group=self._tp_group,
diff --git a/xtuner/v1/module/grouped_linear/moe_group_linear.py b/xtuner/v1/module/grouped_linear/moe_group_linear.py
index 71f819504..2887c1958 100644
--- a/xtuner/v1/module/grouped_linear/moe_group_linear.py
+++ b/xtuner/v1/module/grouped_linear/moe_group_linear.py
@@ -1,3 +1,5 @@
+from typing import Literal
+
 import torch
 import torch.nn as nn
 from torch.distributed.device_mesh import DeviceMesh
@@ -8,6 +10,9 @@
 from xtuner.v1.ops import group_gemm
 
 
+GroupedLinearParallelStyle = Literal["column", "row"]
+
+
 class GroupedLinear(nn.Module):
     # TODO:Missng example docs
     def __init__(
@@ -17,34 +22,80 @@ def __init__(
         num_routed_experts: int,
         moe_bias: bool = False,
         ep_mesh: DeviceMesh | None = None,
+        tp_mesh: DeviceMesh | None = None,
+        parallel_style: GroupedLinearParallelStyle | None = None,
     ):
         super().__init__()
         self.in_features = in_features
         self.out_features = out_features
         self.num_routed_experts = num_routed_experts
-        weight = torch.empty(num_routed_experts * out_features, in_features)
 
         self.ep_mesh = ep_mesh
-        if self.ep_mesh is not None and self.ep_mesh.size() > 1:
-            self.weight = nn.Parameter(distribute_tensor(weight, ep_mesh, [Shard(0)]))
-        else:
+        self.tp_mesh = tp_mesh
+        self.parallel_style: GroupedLinearParallelStyle | None = parallel_style
+        self.ep_size = ep_mesh.size() if ep_mesh is not None else 1
+        self.tp_size = tp_mesh.size() if tp_mesh is not None else 1
+        self.ep_rank = ep_mesh.get_local_rank() if ep_mesh is not None else 0
+        self.tp_rank = tp_mesh.get_local_rank() if tp_mesh is not None else 0
+        self.tp_enabled = self.tp_mesh is not None and self.tp_size > 1 and self.parallel_style is not None
+        if self.tp_mesh is not None and self.tp_mesh.size() > 1 and self.parallel_style is None:
+            raise ValueError("parallel_style must be set when tp_mesh size is greater than 1.")
+        if self.num_routed_experts % self.ep_size != 0:
+            raise ValueError(
+                f"num_routed_experts ({self.num_routed_experts}) must be divisible by ep_size ({self.ep_size})."
+            )
+
+        self.local_num_routed_experts = self.num_routed_experts // self.ep_size
+        self.local_expert_start = self.ep_rank * self.local_num_routed_experts
+        self.local_expert_end = self.local_expert_start + self.local_num_routed_experts
+        self.local_in_features = in_features
+        self.local_out_features = out_features
+        if self.tp_enabled:
+            if self.parallel_style == "column":
+                if out_features % self.tp_size != 0:
+                    raise ValueError(f"out_features ({out_features}) must be divisible by tp_size ({self.tp_size}).")
+                self.local_out_features = out_features // self.tp_size
+            elif self.parallel_style == "row":
+                if in_features % self.tp_size != 0:
+                    raise ValueError(f"in_features ({in_features}) must be divisible by tp_size ({self.tp_size}).")
+                self.local_in_features = in_features // self.tp_size
+            else:
+                raise ValueError(f"Unsupported parallel_style: {self.parallel_style}.")
+
+            # TODO: use DTensor instead of Tensor? for weight load?
+            weight = torch.empty(
+                self.local_num_routed_experts * self.local_out_features,
+                self.local_in_features,
+            )
             self.weight = nn.Parameter(weight)
+        else:
+            weight = torch.empty(num_routed_experts * out_features, in_features)
+            if self.ep_mesh is not None and self.ep_mesh.size() > 1:
+                self.weight = nn.Parameter(distribute_tensor(weight, ep_mesh, [Shard(0)]))
+            else:
+                self.weight = nn.Parameter(weight)
 
         self.moe_bias = moe_bias
         if self.moe_bias:
-            bias = torch.zeros(num_routed_experts, out_features)
-            if self.ep_mesh is not None and self.ep_mesh.size() > 1:
-                self.bias = nn.Parameter(distribute_tensor(bias, ep_mesh, [Shard(0)]))
+            if self.tp_enabled:
+                bias_out_features = self.local_out_features if self.parallel_style == "column" else self.out_features
+                self.bias = nn.Parameter(torch.zeros(self.local_num_routed_experts, bias_out_features))
             else:
-                self.bias = nn.Parameter(torch.zeros(num_routed_experts, out_features))
+                bias = torch.zeros(num_routed_experts, out_features)
+                if self.ep_mesh is not None and self.ep_mesh.size() > 1:
+                    self.bias = nn.Parameter(distribute_tensor(bias, ep_mesh, [Shard(0)]))
+                else:
+                    self.bias = nn.Parameter(torch.zeros(num_routed_experts, out_features))
 
     def forward(self, x: torch.Tensor, tokens_per_expert: torch.Tensor, decoding: bool = False):
         weight = self.weight.to_local() if isinstance(self.weight, DTensor) else self.weight
-        weight = weight.view(-1, self.out_features, self.in_features)
+        weight = weight.view(-1, self.local_out_features, self.local_in_features)
         out = group_gemm(x, weight, tokens_per_expert)
 
         if self.moe_bias:
             bias = self.bias.to_local() if isinstance(self.bias, DTensor) else self.bias
+            if self.tp_enabled and self.parallel_style == "row" and self.tp_rank != 0:
+                return out
             out = out + bias.repeat_interleave(tokens_per_expert, dim=0)  # TODO: 无法 compile
         return out
 
@@ -55,12 +106,24 @@ def build_grouped_linear(
     num_routed_experts: int,
     moe_bias: bool = False,
     ep_mesh: DeviceMesh | None = None,
+    tp_mesh: DeviceMesh | None = None,
+    parallel_style: GroupedLinearParallelStyle | None = None,
     float8_cfg: Float8Config | None = None,
 ):
     """Build a grouped linear layer with optional float8 support."""
     if float8_cfg is None or float8_cfg.scaling_granularity_gemm is None:
-        return GroupedLinear(in_features, out_features, num_routed_experts, moe_bias=moe_bias, ep_mesh=ep_mesh)
+        return GroupedLinear(
+            in_features,
+            out_features,
+            num_routed_experts,
+            moe_bias=moe_bias,
+            ep_mesh=ep_mesh,
+            tp_mesh=tp_mesh,
+            parallel_style=parallel_style,
+        )
     elif float8_cfg.scaling_granularity_grouped_gemm == ScalingGranularity.TILEWISE:
+        if tp_mesh is not None and tp_mesh.size() > 1:
+            raise NotImplementedError("Tile-wise float8 grouped linear does not support TP sharding yet.")
         return TileWiseFloat8GroupedLinear(
             in_features, out_features, num_routed_experts, moe_bias=moe_bias, ep_mesh=ep_mesh
         )

From 71c40ae9d6b9860ee4de89f379eaa357e5f80907 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Wed, 13 May 2026 04:36:57 +0000
Subject: [PATCH 08/25] [Fix] ETP calculates correct loss grad; Tighten the
 numerical precision tolerance for tests

---
 .../validate_moeblock_tpep_vs_single.py       |   8 +-
 .dev_scripts/validate_xtuner_tpep_md.py       |  42 +--
 tests/engine/test_moe_train_engine_tpep.py    | 308 +++++++++++++++---
 xtuner/v1/engine/train_engine.py              |   3 +-
 xtuner/v1/model/base.py                       |   5 +
 xtuner/v1/model/moe/moe.py                    |  82 ++++-
 xtuner_fsdp_loss_grad_norm.md                 |  95 +++++-
 7 files changed, 463 insertions(+), 80 deletions(-)

diff --git a/.dev_scripts/validate_moeblock_tpep_vs_single.py b/.dev_scripts/validate_moeblock_tpep_vs_single.py
index 679cfb53d..16f8d3dc5 100644
--- a/.dev_scripts/validate_moeblock_tpep_vs_single.py
+++ b/.dev_scripts/validate_moeblock_tpep_vs_single.py
@@ -40,8 +40,10 @@
 HIDDEN_SIZE = 128
 MOE_INTERMEDIATE_SIZE = 256
 DTYPE = torch.bfloat16
-ATOL = 3e-2
-RTOL = 3e-2
+# TP 分片会改变 bf16 grouped-GEMM 的累加/规约顺序；这里比较整块输出矩阵，
+# atol 使用约 1 个 bf16 ulp 的量级，rtol 仍对齐 torch.testing 的 bf16 默认值。
+BF16_GEMM_ATOL = 1e-3
+BF16_GEMM_RTOL = 1.6e-2
 
 
 @dataclass(frozen=True)
@@ -400,7 +402,7 @@ def _slice_tpep_weight(grouped_linear: torch.nn.Module, full_weight: torch.Tenso
 
 def _assert_close(actual: torch.Tensor, expected: torch.Tensor) -> None:
     try:
-        torch.testing.assert_close(actual.float(), expected.float(), rtol=RTOL, atol=ATOL)
+        torch.testing.assert_close(actual, expected, rtol=BF16_GEMM_RTOL, atol=BF16_GEMM_ATOL)
     except AssertionError as exc:
         max_abs_diff = (actual.float() - expected.float()).abs().max().item()
         raise AssertionError(
diff --git a/.dev_scripts/validate_xtuner_tpep_md.py b/.dev_scripts/validate_xtuner_tpep_md.py
index 33de6ab38..cef1b40ff 100644
--- a/.dev_scripts/validate_xtuner_tpep_md.py
+++ b/.dev_scripts/validate_xtuner_tpep_md.py
@@ -16,7 +16,7 @@
     dispatch_postprocess: TP AllGather → 将 TP slices 合并成 M_total token
                           + 按 local expert 再排序（供 grouped GEMM）
     [Expert GEMM]       : 冗余计算（同一 EP rank 内各 TP rank 计算结果相同）
-    combine_preprocess  : unpermute → TP ReduceScatterMean → 恢复每 TP rank M_ep_recv
+    combine_preprocess  : unpermute → TP ReduceScatterSum → 恢复每 TP rank M_ep_recv
     combine             : EP AlltoAll 逆向
     combine_postprocess : unpermute + topk 加权求和 → [N_local, H]
 
@@ -139,11 +139,11 @@ class ParallelInfo:
         tokens_per_expert=(3.0, 3.0, 2.0),
         # expert adds global_expert_id * 100
         experts_out=(10.0, 13.0, 22.0, 111.0, 120.0, 123.0, 221.0, 212.0),
-        # after ReduceScatterMean — tp0 slice [0:4]
-        pre_combine_hidden=(10.0, 111.0, 120.0, 221.0),
-        # after EP A2A reverse: from self=[10,111], from ep1_tp0=[311,410]
-        combine_hidden=(10.0, 111.0, 311.0, 410.0),
-        post_combine_hidden=(310.0, 191.0),
+        # after ReduceScatterSum — tp0 slice [0:4]
+        pre_combine_hidden=(20.0, 222.0, 240.0, 442.0),
+        # after EP A2A reverse: from self=[20,222], from ep1_tp0=[622,820]
+        combine_hidden=(20.0, 222.0, 622.0, 820.0),
+        post_combine_hidden=(620.0, 382.0),
     ),
     # rank 1: (ep=0, tp=1) — tokens A2, A3
     (0, 1): RankExpected(
@@ -163,11 +163,11 @@ class ParallelInfo:
         post_row_ids_map=(0, 3, 4, 6, 1, 7, 2, 5),
         tokens_per_expert=(3.0, 3.0, 2.0),
         experts_out=(10.0, 13.0, 22.0, 111.0, 120.0, 123.0, 221.0, 212.0),
-        # after ReduceScatterMean — tp1 slice [4:8]
-        pre_combine_hidden=(13.0, 212.0, 22.0, 123.0),
-        # after EP A2A reverse: from self=[13,212], from ep1_tp1=[413,512]
-        combine_hidden=(13.0, 212.0, 413.0, 512.0),
-        post_combine_hidden=(302.0, 333.0),
+        # after ReduceScatterSum — tp1 slice [4:8]
+        pre_combine_hidden=(26.0, 424.0, 44.0, 246.0),
+        # after EP A2A reverse: from self=[26,424], from ep1_tp1=[826,1024]
+        combine_hidden=(26.0, 424.0, 826.0, 1024.0),
+        post_combine_hidden=(604.0, 666.0),
     ),
     # rank 2: (ep=1, tp=0) — tokens B0, B1
     (1, 0): RankExpected(
@@ -187,11 +187,11 @@ class ParallelInfo:
         post_row_ids_map=(0, 3, 1, 4, 5, 6, 2, 7),
         tokens_per_expert=(3.0, 3.0, 2.0),
         experts_out=(311.0, 320.0, 323.0, 410.0, 421.0, 413.0, 512.0, 522.0),
-        # after ReduceScatterMean — tp0 slice [0:4]
-        pre_combine_hidden=(311.0, 410.0, 320.0, 421.0),
-        # after EP A2A reverse: from ep0_tp0=[120,221], from self=[320,421]
-        combine_hidden=(120.0, 221.0, 320.0, 421.0),
-        post_combine_hidden=(280.0, 321.0),
+        # after ReduceScatterSum — tp0 slice [0:4]
+        pre_combine_hidden=(622.0, 820.0, 640.0, 842.0),
+        # after EP A2A reverse: from ep0_tp0=[240,442], from self=[640,842]
+        combine_hidden=(240.0, 442.0, 640.0, 842.0),
+        post_combine_hidden=(560.0, 642.0),
     ),
     # rank 3: (ep=1, tp=1) — tokens B2, B3
     (1, 1): RankExpected(
@@ -210,11 +210,11 @@ class ParallelInfo:
         post_row_ids_map=(0, 3, 1, 4, 5, 6, 2, 7),
         tokens_per_expert=(3.0, 3.0, 2.0),
         experts_out=(311.0, 320.0, 323.0, 410.0, 421.0, 413.0, 512.0, 522.0),
-        # after ReduceScatterMean — tp1 slice [4:8]
-        pre_combine_hidden=(413.0, 512.0, 323.0, 522.0),
-        # after EP A2A reverse: from ep0_tp1=[22,123], from self=[323,522]
-        combine_hidden=(22.0, 123.0, 323.0, 522.0),
-        post_combine_hidden=(472.0, 193.0),
+        # after ReduceScatterSum — tp1 slice [4:8]
+        pre_combine_hidden=(826.0, 1024.0, 646.0, 1044.0),
+        # after EP A2A reverse: from ep0_tp1=[44,246], from self=[646,1044]
+        combine_hidden=(44.0, 246.0, 646.0, 1044.0),
+        post_combine_hidden=(944.0, 386.0),
     ),
 }
 
diff --git a/tests/engine/test_moe_train_engine_tpep.py b/tests/engine/test_moe_train_engine_tpep.py
index 290278e68..733a15ec2 100644
--- a/tests/engine/test_moe_train_engine_tpep.py
+++ b/tests/engine/test_moe_train_engine_tpep.py
@@ -24,6 +24,13 @@
 
 from __future__ import annotations
 
+import os
+
+# 本测试关注 FSDP + EP + expert TP 的 loss/梯度校准。
+# Triton TMA grouped-GEMM 在部分本地 Triton/LLVM 组合下会编译失败，
+# 因此沿用 .dev_scripts 的做法，用 Cutlass 后端跑真实 grouped-GEMM。
+os.environ.setdefault("XTUNER_USE_CUTLASS_GROUP_GEMM", "1")
+
 import parametrize
 import torch
 import torch.distributed as dist
@@ -33,28 +40,55 @@
 from xtuner.v1.config import AdamWConfig, FSDPConfig
 from xtuner.v1.engine.train_engine import TrainEngine
 from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.module.attention import MHAConfig
 from xtuner.v1.module.grouped_linear.moe_group_linear import GroupedLinear
+from xtuner.v1.module.router.greedy import GreedyRouterConfig
 from xtuner.v1.model.base import ModelItem
 from xtuner.v1.model.moe.moe import SequenceContext
-from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
+from xtuner.v1.model.moe.qwen3 import Qwen3MoEConfig
 from xtuner.v1.utils.device import get_device
 
 DEVICE = get_device()
 
-# Tolerance for bfloat16 numerical differences between the two configs.
-ATOL = 2e-1
-RTOL = 2e-1
+# 本测试的模型参数和主要计算是 bf16，容忍度对齐 torch.testing 的
+# bf16 默认值，避免过宽阈值掩盖 expert TP 维度缺失这类校准错误。
+BF16_ATOL = 1e-5
+BF16_RTOL = 1.6e-2
+# grouped-GEMM 和 TP 分片规约会改变 bf16 的累加顺序；逐元素梯度矩阵
+# 在接近 0 的位置会有数个 ulp 的差异，不能用它承载 loss/norm 校准红灯。
+BF16_GEMM_ATOL = 1e-4
+BF16_GEMM_RTOL = BF16_RTOL
 
 # Use a very small model to keep test runtime manageable.
 _TINY_LAYERS = 2
-_SEQ_LEN = 64
+_SEQ_LEN = 32
+_VOCAB_SIZE = 128
 
 
-def _build_tiny_moe_cfg(ep_size: int = 1, tp_size: int = 1) -> Qwen3MoE30BA3Config:
-    return Qwen3MoE30BA3Config(
+def _build_tiny_moe_cfg(ep_size: int = 1, expert_tp_size: int = 1) -> Qwen3MoEConfig:
+    return Qwen3MoEConfig(
+        vocab_size=_VOCAB_SIZE,
+        max_position_embeddings=128,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
         num_hidden_layers=_TINY_LAYERS,
+        hidden_size=128,
+        intermediate_size=256,
+        rms_norm_eps=1e-6,
+        rope_theta=1e6,
+        hidden_act="silu",
+        attention=MHAConfig(num_attention_heads=4, num_key_value_heads=2, head_dim=32, qk_norm=True),
+        tie_word_embeddings=False,
+        n_routed_experts=4,
+        n_shared_experts=0,
+        num_experts_per_tok=2,
+        first_k_dense_replace=0,
+        hidden_factor=1.0,
+        moe_intermediate_size=64,
+        router=GreedyRouterConfig(scoring_func="softmax", norm_topk_prob=True, router_scaling_factor=1.0),
         ep_size=ep_size,
-        tp_size=tp_size,
+        expert_tp_size=expert_tp_size,
         dispatcher="all2all" if ep_size > 1 else None,
         compile_cfg=False,
         # Disable auxiliary losses to keep the comparison clean.
@@ -63,21 +97,21 @@ def _build_tiny_moe_cfg(ep_size: int = 1, tp_size: int = 1) -> Qwen3MoE30BA3Conf
     )
 
 
-def _build_engine(ep_size: int, tp_size: int) -> TrainEngine:
-    moe_cfg = _build_tiny_moe_cfg(ep_size, tp_size)
+def _build_engine(ep_size: int, expert_tp_size: int, data_tp_size: int = 1) -> TrainEngine:
+    moe_cfg = _build_tiny_moe_cfg(ep_size, expert_tp_size)
     optim_cfg = AdamWConfig()
     fsdp_cfg = FSDPConfig(
         ep_size=ep_size,
-        tp_size=tp_size,
+        tp_size=data_tp_size,
         cpu_offload=False,
     )
     return TrainEngine(model_cfg=moe_cfg, optim_cfg=optim_cfg, fsdp_cfg=fsdp_cfg)
 
 
-def _make_engine_input(device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
+def _make_engine_input(device: torch.device, seed_offset: int = 0) -> tuple[torch.Tensor, torch.Tensor]:
     """Return (input_ids [1, SEQ_LEN-1], shifted_labels [1, SEQ_LEN-1]) on *device*."""
-    torch.manual_seed(12345)
-    full_ids = torch.randint(0, 151936, (1, _SEQ_LEN), dtype=torch.long, device=device)
+    torch.manual_seed(12345 + seed_offset)
+    full_ids = torch.randint(0, _VOCAB_SIZE, (1, _SEQ_LEN), dtype=torch.long, device=device)
     input_ids = full_ids[:, :-1]  # [1, SEQ_LEN-1]
     labels = full_ids[:, 1:]      # [1, SEQ_LEN-1] already shifted
     return input_ids, labels
@@ -90,6 +124,41 @@ def _run_one_step(
     labels: torch.Tensor,
 ) -> tuple[float, dict[str, torch.Tensor]]:
     """Run one train step; return (loss_value, {param_name: grad_tensor})."""
+    loss_val, grads, _ = _run_one_step_with_norm(engine, loss_cfg, input_ids, labels)
+    return loss_val, grads
+
+
+def _run_one_step_with_norm(
+    engine: TrainEngine,
+    loss_cfg: CELossConfig,
+    input_ids: torch.Tensor,
+    labels: torch.Tensor,
+) -> tuple[float, dict[str, torch.Tensor], torch.Tensor]:
+    """Run one train step; return loss, gate grads and un-clipped grad norm."""
+    loss_val = _run_train_step_without_clip(engine, loss_cfg, input_ids, labels)
+    grad_norm = engine.clip_grad_norm(do_clip=False)
+
+    # Collect gradients from gate (router) parameters; these are non-expert
+    # parameters replicated on all ranks in both configs, so they're easy to
+    # compare directly.
+    grads: dict[str, torch.Tensor] = {}
+    for name, param in engine.model.named_parameters():
+        if "gate.weight" in name and param.grad is not None:
+            grad = param.grad
+            if hasattr(grad, "full_tensor"):
+                grad = grad.full_tensor()  # type: ignore[attr-defined]
+            grads[name] = grad.detach().float().cpu()
+            break  # one gate layer is sufficient
+
+    return loss_val, grads, grad_norm.detach().float().cpu()
+
+
+def _run_train_step_without_clip(
+    engine: TrainEngine,
+    loss_cfg: CELossConfig,
+    input_ids: torch.Tensor,
+    labels: torch.Tensor,
+) -> float:
     seq_ctx = SequenceContext.from_input_ids((input_ids,), device=DEVICE)
     shifted_labels = labels.to(DEVICE)
 
@@ -100,23 +169,38 @@ def _run_one_step(
 
     engine_input = [ModelItem(seq_ctx=seq_ctx, loss_ctx={"lm": loss_ctx})]
     step_info = engine.train_step(engine_input)
-    engine.clip_grad_norm()
+    return step_info["logs_info"]["reduced_llm_loss"]
 
-    loss_val: float = step_info["logs_info"]["reduced_llm_loss"]
 
-    # Collect gradients from gate (router) parameters; these are non-expert
-    # parameters replicated on all ranks in both configs, so they're easy to
-    # compare directly.
-    grads: dict[str, torch.Tensor] = {}
+def _get_param_grad(engine: TrainEngine, name_suffix: str) -> torch.Tensor:
     for name, param in engine.model.named_parameters():
-        if "gate.weight" in name and param.grad is not None:
+        if _canonical_name(name).endswith(name_suffix):
             grad = param.grad
+            assert grad is not None, f"Missing gradient for {name}"
             if hasattr(grad, "full_tensor"):
                 grad = grad.full_tensor()  # type: ignore[attr-defined]
-            grads[name] = grad.detach().float().cpu()
-            break  # one gate layer is sufficient
+            return grad.detach().float().cpu()
+    raise AssertionError(f"Cannot find parameter ending with {name_suffix}")
+
+
+def _get_tpep_grouped_linear(engine: TrainEngine, module_suffix: str) -> GroupedLinear:
+    for name, module in engine.model.named_modules():
+        if _canonical_name(name).endswith(module_suffix):
+            assert isinstance(module, GroupedLinear)
+            return module
+    raise AssertionError(f"Cannot find grouped linear module ending with {module_suffix}")
 
-    return loss_val, grads
+
+def _canonical_name(name: str) -> str:
+    # 第一层会被 activation checkpoint wrapper 包一层，比较逻辑不关心该包装。
+    return name.replace("._checkpoint_wrapped_module", "")
+
+
+def _zero_non_expert_grads(engine: TrainEngine) -> None:
+    with torch.no_grad():
+        for name, param in engine.model.named_parameters():
+            if ".experts" not in _canonical_name(name) and param.grad is not None:
+                param.grad.zero_()
 
 
 def _full_tensor(tensor: torch.Tensor) -> torch.Tensor:
@@ -214,13 +298,13 @@ class TestMoETrainEngineTPEP(DeterministicDDPTestCase):
     """Verify EP+TP training matches single-GPU (EP=1, TP=1) forward and backward."""
 
     @parametrize.parametrize(
-        "device,ep_size,tp_size",
+        "device,ep_size,expert_tp_size",
         [
             ("cuda", 2, 2),
         ],
     )
     def test_tpep_forward_backward_matches_single(
-        self, device: str, ep_size: int, tp_size: int
+        self, device: str, ep_size: int, expert_tp_size: int
     ) -> None:
         """Loss and gate gradients with EP+TP must match the EP=1, TP=1 baseline."""
         pg = self.create_pg(device)
@@ -228,13 +312,13 @@ def test_tpep_forward_backward_matches_single(
         # ------------------------------------------------------------------
         # Build reference engine: EP=1, TP=1 (world acts as pure DP).
         # ------------------------------------------------------------------
-        engine_ref = _build_engine(ep_size=1, tp_size=1)
+        engine_ref = _build_engine(ep_size=1, expert_tp_size=1)
         engine_ref.init_model_weights()
 
         # ------------------------------------------------------------------
         # Build EP+TP engine.
         # ------------------------------------------------------------------
-        engine_tpep = _build_engine(ep_size=ep_size, tp_size=tp_size)
+        engine_tpep = _build_engine(ep_size=ep_size, expert_tp_size=expert_tp_size)
         engine_tpep.init_model_weights()
 
         # ------------------------------------------------------------------
@@ -260,11 +344,11 @@ def test_tpep_forward_backward_matches_single(
         # Assert losses match.
         # ------------------------------------------------------------------
         if dist.get_rank() == 0:
-            self.assertAlmostEqual(
-                loss_tpep,
-                loss_ref,
-                delta=ATOL,
-                msg=f"Loss mismatch: EP+TP={loss_tpep:.6f}, ref={loss_ref:.6f}",
+            torch.testing.assert_close(
+                torch.tensor(loss_tpep),
+                torch.tensor(loss_ref),
+                atol=BF16_ATOL,
+                rtol=BF16_RTOL,
             )
 
         # ------------------------------------------------------------------
@@ -281,8 +365,8 @@ def test_tpep_forward_backward_matches_single(
                         torch.testing.assert_close(
                             g_tpep,
                             g_ref,
-                            atol=ATOL,
-                            rtol=RTOL,
+                            atol=BF16_GEMM_ATOL,
+                            rtol=BF16_GEMM_RTOL,
                         )
                     except AssertionError as exc:
                         max_diff = (g_tpep - g_ref).abs().max().item()
@@ -299,16 +383,164 @@ def test_tpep_forward_backward_matches_single(
             pass
 
     @parametrize.parametrize(
-        "device,ep_size,tp_size",
+        "device,ep_size,expert_tp_size",
+        [
+            ("cuda", 2, 2),
+        ],
+    )
+    def test_tpep_expert_gradients_match_single_with_distinct_expert_tp_data(
+        self, device: str, ep_size: int, expert_tp_size: int
+    ) -> None:
+        """Expert TP shards should match the corresponding single-model expert gradients."""
+        pg = self.create_pg(device)
+
+        engine_ref = _build_engine(ep_size=1, expert_tp_size=1)
+        engine_ref.init_model_weights()
+
+        engine_tpep = _build_engine(ep_size=ep_size, expert_tp_size=expert_tp_size)
+        engine_tpep.init_model_weights()
+        _sync_engine_weights(engine_ref, engine_tpep)
+        dist.barrier()
+
+        input_ids, labels = _make_engine_input(
+            torch.device(device, dist.get_rank() % torch.cuda.device_count()),
+            seed_offset=dist.get_rank(),
+        )
+        loss_cfg = CELossConfig()
+
+        _run_one_step(engine_tpep, loss_cfg, input_ids, labels)
+        _run_one_step(engine_ref, loss_cfg, input_ids, labels)
+
+        ref_grad = _get_param_grad(engine_ref, "layers.0.experts.fused_w1w3.weight")
+        tpep_grad = _get_param_grad(engine_tpep, "layers.0.experts.fused_w1w3.weight")
+        tpep_module = _get_tpep_grouped_linear(engine_tpep, "layers.0.experts.fused_w1w3")
+        expected_tpep_grad = _slice_tpep_weight(tpep_module, ref_grad, fused_gate_up=True)
+
+        torch.testing.assert_close(
+            tpep_grad,
+            expected_tpep_grad,
+            atol=BF16_GEMM_ATOL,
+            rtol=BF16_GEMM_RTOL,
+        )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @parametrize.parametrize(
+        "device,ep_size,expert_tp_size",
+        [
+            ("cuda", 2, 2),
+        ],
+    )
+    def test_tpep_replicated_gradients_and_norm_match_single_with_distinct_expert_tp_data(
+        self, device: str, ep_size: int, expert_tp_size: int
+    ) -> None:
+        """Non-expert replicas and grad norm should match the single-model baseline."""
+        pg = self.create_pg(device)
+
+        engine_ref = _build_engine(ep_size=1, expert_tp_size=1)
+        engine_ref.init_model_weights()
+
+        engine_tpep = _build_engine(ep_size=ep_size, expert_tp_size=expert_tp_size)
+        engine_tpep.init_model_weights()
+        _sync_engine_weights(engine_ref, engine_tpep)
+        dist.barrier()
+
+        input_ids, labels = _make_engine_input(
+            torch.device(device, dist.get_rank() % torch.cuda.device_count()),
+            seed_offset=dist.get_rank(),
+        )
+        loss_cfg = CELossConfig()
+
+        _, _, norm_tpep = _run_one_step_with_norm(engine_tpep, loss_cfg, input_ids, labels)
+        _, _, norm_ref = _run_one_step_with_norm(engine_ref, loss_cfg, input_ids, labels)
+
+        gate_grad_ref = _get_param_grad(engine_ref, "layers.0.gate.weight")
+        gate_grad_tpep = _get_param_grad(engine_tpep, "layers.0.gate.weight")
+
+        torch.testing.assert_close(
+            gate_grad_tpep,
+            gate_grad_ref,
+            atol=BF16_GEMM_ATOL,
+            rtol=BF16_GEMM_RTOL,
+        )
+        torch.testing.assert_close(
+            norm_tpep,
+            norm_ref,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @parametrize.parametrize(
+        "device,ep_size,expert_tp_size",
+        [
+            ("cuda", 2, 2),
+        ],
+    )
+    def test_tpep_expert_only_grad_norm_matches_single_with_distinct_expert_tp_data(
+        self, device: str, ep_size: int, expert_tp_size: int
+    ) -> None:
+        """Expert-only grad norm must sum norm square across EP and expert TP shards."""
+        pg = self.create_pg(device)
+
+        engine_ref = _build_engine(ep_size=1, expert_tp_size=1)
+        engine_ref.init_model_weights()
+
+        engine_tpep = _build_engine(ep_size=ep_size, expert_tp_size=expert_tp_size)
+        engine_tpep.init_model_weights()
+        _sync_engine_weights(engine_ref, engine_tpep)
+        dist.barrier()
+
+        input_ids, labels = _make_engine_input(
+            torch.device(device, dist.get_rank() % torch.cuda.device_count()),
+            seed_offset=dist.get_rank(),
+        )
+        loss_cfg = CELossConfig()
+
+        _run_train_step_without_clip(engine_tpep, loss_cfg, input_ids, labels)
+        _run_train_step_without_clip(engine_ref, loss_cfg, input_ids, labels)
+        _zero_non_expert_grads(engine_tpep)
+        _zero_non_expert_grads(engine_ref)
+
+        norm_tpep = engine_tpep.clip_grad_norm(do_clip=False).detach().float().cpu()
+        norm_ref = engine_ref.clip_grad_norm(do_clip=False).detach().float().cpu()
+
+        torch.testing.assert_close(
+            norm_tpep,
+            norm_ref,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @parametrize.parametrize(
+        "device,ep_size,expert_tp_size",
         [
             ("cuda", 2, 2),
         ],
     )
-    def test_tpep_training_stability(self, device: str, ep_size: int, tp_size: int) -> None:
+    def test_tpep_training_stability(self, device: str, ep_size: int, expert_tp_size: int) -> None:
         """EP+TP training should produce finite losses and decreasing trend."""
         pg = self.create_pg(device)
 
-        engine = _build_engine(ep_size=ep_size, tp_size=tp_size)
+        engine = _build_engine(ep_size=ep_size, expert_tp_size=expert_tp_size)
         engine.init_model_weights()
 
         input_ids, labels = _make_engine_input(torch.device(device, dist.get_rank() % torch.cuda.device_count()))
diff --git a/xtuner/v1/engine/train_engine.py b/xtuner/v1/engine/train_engine.py
index 8414d8dc1..939f28445 100644
--- a/xtuner/v1/engine/train_engine.py
+++ b/xtuner/v1/engine/train_engine.py
@@ -34,7 +34,6 @@
 )
 from xtuner.v1.profiler.prober import ProberList
 from xtuner.v1.utils import get_device, get_logger, get_torch_device_module, profile_time_and_memory
-from xtuner.v1.utils.grad_norm import cal_grad_norm
 
 
 class TrainStepInfo(DataBatchInfo, BatchForwardInfo):
@@ -244,7 +243,7 @@ def clip_grad_norm(self, do_clip: bool = True, dtype=torch.float32):
         self.model.scale_and_reduce_grad()
         params = self.model.trainable_parameters()
         grads = [p.grad for _, p in params if p.grad is not None]
-        grad_norm, grouped_grads = cal_grad_norm(grads, dtype=dtype)
+        grad_norm, grouped_grads = self.model.cal_grad_norm(grads, dtype=dtype)
         if do_clip:
             clip_coef = self.optim_cfg.max_grad_norm / (grad_norm + 1e-6)
             clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
diff --git a/xtuner/v1/model/base.py b/xtuner/v1/model/base.py
index 8d1def4a9..f8f7e94a1 100644
--- a/xtuner/v1/model/base.py
+++ b/xtuner/v1/model/base.py
@@ -403,6 +403,11 @@ def from_hf(
     def scale_and_reduce_grad(self):
         return
 
+    def cal_grad_norm(self, grads: list[DTensor], dtype=torch.float32):
+        from xtuner.v1.utils.grad_norm import cal_grad_norm
+
+        return cal_grad_norm(grads, dtype=dtype)
+
     def to_hf_key_list(self, key: str) -> list[str]:
         raise NotImplementedError()
 
diff --git a/xtuner/v1/model/moe/moe.py b/xtuner/v1/model/moe/moe.py
index e19a3520b..3a27d6054 100644
--- a/xtuner/v1/model/moe/moe.py
+++ b/xtuner/v1/model/moe/moe.py
@@ -18,7 +18,7 @@
     CPUOffloadPolicy,
     MixedPrecisionPolicy,
 )
-from torch.distributed.tensor import DTensor, Replicate, distribute_tensor
+from torch.distributed.tensor import DTensor, Replicate, Shard, distribute_tensor
 from tqdm import tqdm
 from typing_extensions import overload, override
 
@@ -138,7 +138,7 @@ class MoEConfig(TransformerConfig):
     hidden_factor: Annotated[float, Parameter(group="moe")] = 1.0
     moe_intermediate_size: Annotated[int, Parameter(group="moe")]
     ep_size: Annotated[int, Parameter(group="moe")] = 1
-    tp_size: Annotated[int, Parameter(group="moe")] = 1
+    expert_tp_size: Annotated[int, Parameter(group="moe")] = 1
     dispatcher: Annotated[Literal["deepep", "all2all", "agrs"] | None, Parameter(group="moe")] = None
     router: GreedyRouterConfig | NoAuxRouterConfig
     balancing_loss_cfg: BalancingLossConfig | None = BalancingLossConfig()
@@ -178,12 +178,12 @@ def __init__(self, config: MoEConfig):
         super().__init__(config)
         if config.ep_size is not None and config.ep_size > 1:
             world_size = dist.get_world_size()
-            tp_size = config.tp_size if config.tp_size > 1 else 1
-            fsdp_size = world_size // (config.ep_size * tp_size)
-            if tp_size > 1:
+            expert_tp_size = config.expert_tp_size if config.expert_tp_size > 1 else 1
+            fsdp_size = world_size // (config.ep_size * expert_tp_size)
+            if expert_tp_size > 1:
                 _init_mesh = init_device_mesh(
                     DEVICE,
-                    (fsdp_size, config.ep_size, tp_size),
+                    (fsdp_size, config.ep_size, expert_tp_size),
                     mesh_dim_names=(
                         f"{self.config.mesh_prefix}.dp",
                         f"{self.config.mesh_prefix}.ep",
@@ -942,8 +942,6 @@ def fully_shard(
     ) -> Self:
         self.fsdp_config = fsdp_config
         assert self.fsdp_config.ep_size == self.config.ep_size
-        # TODO: self.config.tp_size is expert tp size, which can be different from fsdp_config.tp_size. Rename it to expert_tp_size.
-        # assert self.fsdp_config.tp_size == self.config.tp_size
         self.mp_policy = MixedPrecisionPolicy(
             param_dtype=self.fsdp_config.param_dtype, reduce_dtype=fsdp_config.reduce_dtype
         )
@@ -1101,7 +1099,7 @@ def scale_and_reduce_grad(self):
             ep_enabled = self.ep_mesh is not None and self.ep_mesh.size() > 1
             # Scale moe parameters
             if ep_enabled and ".experts" in name:
-                param.grad.div_(self.ep_mesh.size())  # type: ignore
+                param.grad.div_(self.ep_mesh.size() * self.config.expert_tp_size)  # type: ignore
                 continue
 
             if isinstance(param, DTensor):
@@ -1133,19 +1131,56 @@ def scale_and_reduce_grad(self):
                         group=flat_mesh.get_group(),  # type: ignore
                     )
 
+    def cal_grad_norm(self, grads: list[DTensor], dtype=torch.float32):
+        from xtuner.v1.utils.grad_norm import group_tensors_by_device_mesh_and_placements
+
+        grouped_grads = group_tensors_by_device_mesh_and_placements(grads)
+        if len(grads) == 0:
+            return torch.tensor(0.0, dtype=dtype), grouped_grads
+
+        total_norm_squared = torch.zeros((), dtype=dtype, device=grads[0].device)
+        for name, param in self.trainable_parameters():
+            grad = param.grad
+            if grad is None:
+                continue
+
+            local_grad = grad.to_local() if isinstance(grad, DTensor) else grad
+            local_norm_squared = torch.linalg.vector_norm(local_grad, ord=2.0, dtype=dtype) ** 2
+            if isinstance(grad, DTensor):
+                for i, placement in enumerate(grad.placements):
+                    if isinstance(placement, Shard):
+                        dist.all_reduce(local_norm_squared, group=grad.device_mesh.get_group(i))
+                    elif isinstance(placement, Replicate):
+                        pass
+                    else:
+                        raise ValueError(f"Unsupported placement type {placement} in clip_grad_norm")
+
+            if self.config.expert_tp_size > 1 and ".experts" in name:
+                assert self.ep_mesh is not None and self.tp_mesh is not None
+                # expert 参数的 EP / expert TP 分片不是 DTensor placement，
+                # norm square 需要显式跨这两个维度求和，clip 系数才是全局的。
+                dist.all_reduce(local_norm_squared, op=ReduceOp.SUM, group=self.ep_mesh.get_group())
+                dist.all_reduce(local_norm_squared, op=ReduceOp.SUM, group=self.tp_mesh.get_group())
+
+            total_norm_squared += local_norm_squared
+
+        grad_norm = total_norm_squared**0.5
+        grad_norm = grad_norm.to(grads[0].dtype)
+        return grad_norm, grouped_grads
+
     def _init_device_mesh(self, fsdp_config: FSDPConfig):
         self.fsdp_config = fsdp_config
 
         device = DEVICE
         world_size = dist.get_world_size()
-        tp_size = self.config.tp_size if self.config.tp_size > 1 else 1
-        experts_fsdp_size = world_size // (self.fsdp_config.ep_size * tp_size)
+        expert_tp_size = self.config.expert_tp_size if self.config.expert_tp_size > 1 else 1
+        experts_fsdp_size = world_size // (self.fsdp_config.ep_size * expert_tp_size)
 
         if self.fsdp_config.hsdp_sharding_size is None:
-            if tp_size > 1:
+            if expert_tp_size > 1:
                 model_mesh = init_device_mesh(
                     device,
-                    (experts_fsdp_size, self.fsdp_config.ep_size, tp_size),
+                    (experts_fsdp_size, self.fsdp_config.ep_size, expert_tp_size),
                     mesh_dim_names=(
                         f"{self.config.mesh_prefix}.fsdp",
                         f"{self.config.mesh_prefix}.ep",
@@ -1196,6 +1231,14 @@ def _init_device_mesh(self, fsdp_config: FSDPConfig):
             else:
                 self.ep_mesh = model_mesh[f"{self.config.mesh_prefix}.ep"]
 
+            if expert_tp_size > 1:
+                new_tp_mesh = model_mesh[f"{self.config.mesh_prefix}.tp"]
+                if self.tp_mesh is not None:
+                    assert new_tp_mesh.mesh_dim_names == self.tp_mesh.mesh_dim_names
+                    assert torch.equal(self.tp_mesh.mesh, new_tp_mesh.mesh)
+                else:
+                    self.tp_mesh = new_tp_mesh
+
             self.fsdp_mesh = model_mesh[f"{self.config.mesh_prefix}.fsdp"]
         else:
             assert self.fsdp_config.ep_size == 1, "Currently, HSDP requires expert parallel size to be 1"
@@ -1225,9 +1268,18 @@ def traverse(module: nn.Module) -> None:
                 return
             for name, param in module.named_parameters(recurse=False):
                 assert self.ep_mesh is not None
+                replicate_mesh = self.ep_mesh
+                placements = [Replicate()]
+                if self.tp_mesh is not None and self.tp_mesh.size() > 1:
+                    assert self._world_mesh is not None
+                    # 非 expert 参数在 EP 和 expert TP 上都是逻辑副本。
+                    # FSDP 只支持一维 TP/Replicate 布局，所以这里先把
+                    # EP x expert TP 子网格压平成一个 Replicate 维度。
+                    replicate_mesh = self._world_mesh[
+                        (f"{self.config.mesh_prefix}.ep", f"{self.config.mesh_prefix}.tp")
+                    ]._flatten(mesh_dim_name=f"{self.config.mesh_prefix}.ep_tp")
                 dist_param = nn.Parameter(
-                    # TODO: replicate on ep_tp_mesh instead of ep_mesh?
-                    distribute_tensor(param, self.ep_mesh, [Replicate()]),
+                    distribute_tensor(param, replicate_mesh, placements),
                     requires_grad=param.requires_grad,
                 )
                 module.register_parameter(name, dist_param)
diff --git a/xtuner_fsdp_loss_grad_norm.md b/xtuner_fsdp_loss_grad_norm.md
index 7bba60a3b..3978c279b 100644
--- a/xtuner_fsdp_loss_grad_norm.md
+++ b/xtuner_fsdp_loss_grad_norm.md
@@ -288,7 +288,7 @@ self.model.scale_and_reduce_grad()
 - expert 参数在 EP 下只除以 `ep_mesh.size()`，不做 EP all-reduce。
 - replicated DTensor 参数会在 replicate mesh 上做平均 all-reduce，使这些未按普通 FSDP shard 语义同步的参数也得到一致梯度。
 
-然后 `cal_grad_norm` 会按 DTensor 的 mesh 和 placement 分组计算 norm。对于 sharded placement，会对局部 norm square 做 all-reduce sum，再开方得到全局 norm。这样 clip 使用的是全局参数梯度范数，而不是单 rank 的局部范数。
+通用 `cal_grad_norm` 会按 DTensor 的 mesh 和 placement 分组计算 norm。对于 sharded placement，会对局部 norm square 做 all-reduce sum，再开方得到全局 norm。这样 clip 使用的是全局参数梯度范数，而不是单 rank 的局部范数。
 
 在 FSDP + EP 下，这个顺序很重要：grad norm 是在 expert 梯度除 EP、replicated 参数 EP 平均 all-reduce 之后计算的。`cal_grad_norm()` 对 `Shard()` 维度累加 norm square，对 `Replicate()` 维度不重复计数。因此：
 
@@ -296,6 +296,93 @@ self.model.scale_and_reduce_grad()
 - replicated 参数的 norm 只按一份逻辑参数计数，不会因为 EP replica 数量而重复放大。
 - clip 系数作用在已经完成 FSDP/EP 校准后的梯度上，optimizer step 看到的是校准后的全局梯度。
 
+## FSDP + EP + expert TP 相对 FSDP + EP 的差异
+
+新增的 TP 指 `MoEConfig.expert_tp_size`，这里称为 `T`。它是 expert tensor parallel，用来切分 routed expert 的 column/row 权重 shard；它和 `FSDPConfig.tp_size` 不是同一个概念。当前语境下，不同 expert TP rank 拿到的是不同数据。
+
+相对 FSDP + EP，mesh 从二维变为三维：
+
+```text
+F = fsdp_mesh.size()
+E = ep_mesh.size()
+T = expert_tp_size
+world_size = F * E * T
+```
+
+核心差异只有三类。
+
+### 参数布局多了一维 expert TP
+
+FSDP + EP 下，routed expert 参数只在 EP 维切 expert；开启 expert TP 后，同一个 expert 的权重还会在 expert TP 维继续切 shard：
+
+```text
+expert weight: EP 切 expert, expert TP 切 column/row, FSDP 继续 shard
+```
+
+非 expert 参数在 EP 和 expert TP 维都是 replicated。实现上会把 `EP x expert TP` 子网格 flatten 成一维 replicate mesh，避免 PyTorch FSDP 不支持二维 `Replicate(), Replicate()` TP 布局。
+
+### loss 分母和 autograd all-reduce 覆盖更大的 world
+
+FSDP + EP 下：
+
+```text
+L = sum_{f,e} L_{f,e}
+backward scale = F * E
+FSDP reduce mean 除以 F
+剩余缩放 = E
+```
+
+FSDP + EP + expert TP 下：
+
+```text
+L = sum_{f,e,t} L_{f,e,t}
+backward scale = F * E * T
+FSDP reduce mean 除以 F
+剩余缩放 = E * T
+```
+
+因此，所有 EP-only 里出现的剩余 `E`，在 expert TP 开启后都变成 `E * T`。loss 分母仍然按默认分布式组统计，覆盖所有 FSDP rank、EP rank、expert TP rank 和 micro-batch；每个 token 仍只按 source rank 贡献一次。
+
+### expert 与 replicated 参数的梯度修正多乘一个 T
+
+expert 参数在 expert TP 维不是副本，而是同一个 expert 权重的不同 shard。因此它和 EP 维一样，不能 all-reduce 成一份完整梯度，只能消掉多出来的缩放：
+
+```python
+if ep_enabled and ".experts" in name:
+    param.grad.div_(self.ep_mesh.size() * self.config.expert_tp_size)
+    continue
+```
+
+相对 EP-only 的 `div_(E)`，这里变成 `div_(E * T)`。
+
+非 expert 参数在 `EP x expert TP` 上是 replica，需要聚合所有 source 数据贡献，并让每个 replica 得到一致梯度。EP-only 是在 EP replicate mesh 上平均 all-reduce；开启 expert TP 后是在 flatten 后的 `EP x expert TP` replicate mesh 上平均 all-reduce：
+
+```text
+sum_{e,t} (E * T * sum_f grad(L_{f,e,t}) / (E * T))
+= sum_{e,t} sum_f grad(L_{f,e,t})
+```
+
+这同时完成两件事：
+
+- 消掉 `E * T` 倍缩放。
+- 聚合所有 EP / expert TP rank 的数据贡献。
+
+### grad norm 需要额外覆盖 expert TP shard
+
+FSDP + EP 下，通用 `cal_grad_norm()` 能根据 DTensor placement 汇总 `Shard()` 维的 norm square，并对 `Replicate()` 维不重复计数。
+
+开启 expert TP 后，grouped expert 权重的 EP / expert TP shard 是本地 tensor 布局，并没有编码成 DTensor 的 EP / TP `Shard()` placement。如果继续只用通用逻辑，expert 参数的 global norm 会漏掉跨 `expert_tp_size` 的 norm square 汇总，clip 系数也会偏小或偏大。
+
+因此 MoE 覆盖模型级 `cal_grad_norm()`：在普通 DTensor shard 汇总之外，对 expert 参数的 local norm square 额外沿 `ep_mesh` 和 `tp_mesh` 做 `SUM all_reduce`：
+
+```python
+if expert_tp_size > 1 and ".experts" in name:
+    dist.all_reduce(local_norm_squared, op=ReduceOp.SUM, group=ep_mesh.get_group())
+    dist.all_reduce(local_norm_squared, op=ReduceOp.SUM, group=tp_mesh.get_group())
+```
+
+这样 clip 使用的是覆盖所有 EP / expert TP shard 的 expert norm，同时 replicated 参数仍只按一份逻辑参数计数。
+
 ## 总结
 
 XTuner FSDP loss 校准可以概括为三步：
@@ -309,4 +396,10 @@ FSDP + EP 时还要再区分两类参数：
 - expert 参数：FSDP mean 后剩余的 EP 倍数通过 `grad.div_(ep_size)` 消掉，不能 EP all-reduce。
 - EP replicated 参数：通过 replicate mesh 上的平均 all-reduce 同时消掉 EP 倍数并聚合所有 EP rank 的数据贡献。
 
+FSDP + EP + expert TP 不改变上述主线，只是在 EP 之外多了一维 expert TP：
+
+- expert 参数：剩余缩放从 `E` 变为 `E * T`，通过 `grad.div_(ep_size * expert_tp_size)` 消掉。
+- replicated 参数：replicate mesh 从 EP 扩展为 flatten 后的 `EP x expert TP`。
+- grad norm：expert shard 没有用 DTensor placement 表达 expert TP shard，因此 MoE 需要额外跨 EP 和 expert TP 汇总 expert norm square。
+
 最终效果是：FSDP、EP、SP、梯度累积和不同卡数不应改变同一 global batch 对参数更新的数学含义；grad norm/clip 发生在所有 micro-batch backward 完成之后，基于已经校准和同步后的全局梯度计算。

From c2638b0f8560d0a81cb27ba22fac6c9fba5b8a9a Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Wed, 13 May 2026 13:36:51 +0000
Subject: [PATCH 09/25] Enhance TP/EP dispatcher with async operations for
 donimo

---
 .../test_torch_all2all_tpep_async.py          | 266 +++++++++
 .../module/dispatcher/torch_all2all_tpep.py   | 542 +++++++++++++++---
 xtuner_ep_domino.md                           | 158 ++---
 3 files changed, 825 insertions(+), 141 deletions(-)
 create mode 100644 tests/module/dispatcher/test_torch_all2all_tpep_async.py

diff --git a/tests/module/dispatcher/test_torch_all2all_tpep_async.py b/tests/module/dispatcher/test_torch_all2all_tpep_async.py
new file mode 100644
index 000000000..ce3eceb84
--- /dev/null
+++ b/tests/module/dispatcher/test_torch_all2all_tpep_async.py
@@ -0,0 +1,266 @@
+import pytest
+import torch
+import torch.distributed as dist
+
+from xtuner.v1.module.dispatcher import torch_all2all
+from xtuner.v1.module.dispatcher.torch_all2all_tpep import (
+    TorchAll2AllTPEPDispatcher,
+    _async_tp_all_gather,
+    _async_tp_reduce_scatter_sum,
+)
+
+
+pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for stream assertions.")
+
+
+class _FakeTPGroup:
+    def __init__(self, size: int = 2, rank: int = 0) -> None:
+        self._size = size
+        self.rank = rank
+
+    def size(self) -> int:
+        return self._size
+
+
+class _FakeEPGroup(_FakeTPGroup):
+    pass
+
+
+def _stream_id() -> int:
+    return torch.cuda.current_stream().cuda_stream
+
+
+def test_async_tpep_dispatch_returns_tp_gathered_payload(monkeypatch) -> None:
+    dispatcher = TorchAll2AllTPEPDispatcher(
+        n_routed_experts=4,
+        ep_group=_FakeEPGroup(size=1),  # type: ignore[arg-type]
+        tp_group=_FakeTPGroup(size=2),  # type: ignore[arg-type]
+    )
+
+    def fake_get_rank(group=None) -> int:
+        return getattr(group, "rank", 0)
+
+    def fake_all_to_all_single(output, input, *args, **kwargs) -> None:
+        output.copy_(input)
+
+    def fake_ep_all_to_all_single_autograd(input, *args, **kwargs):
+        return input.clone()
+
+    def fake_all_gather_into_tensor(output, input, group=None) -> None:
+        if output.numel() == 2 and input.numel() == 1:
+            output.fill_(input.item())
+        else:
+            output[0].copy_(input)
+            output[1].copy_(input)
+
+    def fake_all_gather(chunks, tensor, group=None) -> None:
+        chunks[0].copy_(tensor)
+        chunks[1].copy_(tensor + 10)
+
+    monkeypatch.setattr(dist, "get_rank", fake_get_rank)
+    monkeypatch.setattr(dist, "all_to_all_single", fake_all_to_all_single)
+    monkeypatch.setattr(torch_all2all, "all_to_all_single_autograd", fake_ep_all_to_all_single_autograd)
+    monkeypatch.setattr(dist, "all_gather_into_tensor", fake_all_gather_into_tensor)
+    monkeypatch.setattr(dist, "all_gather", fake_all_gather)
+
+    hidden = torch.randn(32, 128, device="cuda", dtype=torch.float32, requires_grad=True)
+    topk_ids = torch.randint(0, 4, (32, 1), device="cuda", dtype=torch.float32)
+    topk_weights = torch.ones(32, 1, device="cuda", dtype=torch.float32)
+    pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden, topk_ids=topk_ids, async_op=True)
+
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        async_op=True,
+    )
+    torch.cuda.current_stream().wait_event(dispatched["forward_finished_event"])
+    torch.cuda.synchronize()
+
+    # 中文注释：TP 通信的归属边界是 dispatch，postprocess 只能看到已经 gather 好的 token。
+    assert dispatched["hidden_states"].shape == (64, 128)
+    assert dispatched["output_splits_tp"] == [32, 32]
+    torch.testing.assert_close(dispatched["hidden_states"][32:], pre_dispatched["hidden_states"] + 10)
+
+
+def test_async_tpep_combine_owns_tp_reduce_scatter(monkeypatch) -> None:
+    dispatcher = TorchAll2AllTPEPDispatcher(
+        n_routed_experts=4,
+        ep_group=_FakeEPGroup(size=1),  # type: ignore[arg-type]
+        tp_group=_FakeTPGroup(size=2),  # type: ignore[arg-type]
+    )
+
+    def fake_get_rank(group=None) -> int:
+        return getattr(group, "rank", 0)
+
+    def fake_all_to_all_single(output, input, *args, **kwargs) -> None:
+        output.copy_(input)
+
+    def fake_ep_all_to_all_single_autograd(input, *args, **kwargs):
+        return input.clone()
+
+    def fake_all_gather_into_tensor(output, input, group=None) -> None:
+        if output.numel() == 2 and input.numel() == 1:
+            output.fill_(input.item())
+        else:
+            output[0].copy_(input)
+            output[1].copy_(input)
+
+    def fake_all_gather(chunks, tensor, group=None) -> None:
+        chunks[0].copy_(tensor)
+        chunks[1].copy_(tensor + 10)
+
+    def fake_all_reduce(tensor, op=None, group=None) -> None:
+        return None
+
+    monkeypatch.setattr(dist, "get_rank", fake_get_rank)
+    monkeypatch.setattr(dist, "all_to_all_single", fake_all_to_all_single)
+    monkeypatch.setattr(torch_all2all, "all_to_all_single_autograd", fake_ep_all_to_all_single_autograd)
+    monkeypatch.setattr(dist, "all_gather_into_tensor", fake_all_gather_into_tensor)
+    monkeypatch.setattr(dist, "all_gather", fake_all_gather)
+    monkeypatch.setattr(dist, "all_reduce", fake_all_reduce)
+
+    hidden = torch.randn(32, 128, device="cuda", dtype=torch.float32, requires_grad=True)
+    topk_ids = torch.randint(0, 4, (32, 1), device="cuda", dtype=torch.float32)
+    topk_weights = torch.ones(32, 1, device="cuda", dtype=torch.float32)
+    pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden, topk_ids=topk_ids, async_op=True)
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        async_op=True,
+    )
+    post_dispatched = dispatcher.dispatch_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        async_op=True,
+    )
+
+    pre_combined = dispatcher.combine_preprocess(
+        hidden_states=post_dispatched["hidden_states"],
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        async_op=True,
+    )
+    torch.cuda.current_stream().wait_event(pre_combined["forward_finished_event"])
+    torch.cuda.synchronize()
+
+    # 中文注释：preprocess 只做本地 layout，还保持 TP-gather 后的完整 token 数。
+    assert pre_combined["hidden_states"].shape == (64, 128)
+
+    combined = dispatcher.combine(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        async_op=True,
+    )
+    torch.cuda.current_stream().wait_event(combined["forward_finished_event"])
+    torch.cuda.synchronize()
+
+    # 中文注释：TP ReduceScatter 属于 combine，combine 后才回到当前 TP rank 的 token slice。
+    assert combined["hidden_states"].shape == (32, 128)
+
+
+def test_async_tp_all_gather_uses_comm_stream(monkeypatch) -> None:
+    comm_stream = torch.cuda.Stream()
+    group = _FakeTPGroup()
+    calls: list[tuple[str, int]] = []
+
+    def fake_get_rank(group=None) -> int:
+        return getattr(group, "rank", 0)
+
+    def fake_all_gather(chunks, tensor, group=None) -> None:
+        calls.append(("all_gather", _stream_id()))
+        for chunk in chunks:
+            chunk.copy_(tensor[: chunk.shape[0]])
+
+    def fake_all_reduce(tensor, op=None, group=None) -> None:
+        calls.append(("all_reduce", _stream_id()))
+
+    monkeypatch.setattr(dist, "get_rank", fake_get_rank)
+    monkeypatch.setattr(dist, "all_gather", fake_all_gather)
+    monkeypatch.setattr(dist, "all_reduce", fake_all_reduce)
+
+    hidden = torch.randn(2, 3, device="cuda", requires_grad=True)
+    forward_previous_event = torch.cuda.Event()
+    forward_finished_event = torch.cuda.Event()
+    backward_previous_event = torch.cuda.Event()
+    backward_finished_event = torch.cuda.Event()
+    forward_previous_event.record()
+
+    out = _async_tp_all_gather(
+        hidden,
+        all_sizes=[2, 2],
+        tp_group=group,  # type: ignore[arg-type]
+        forward_previous_event=forward_previous_event,
+        forward_finished_event=forward_finished_event,
+        backward_previous_event=backward_previous_event,
+        backward_finished_event=backward_finished_event,
+        comm_stream=comm_stream,
+    )
+    torch.cuda.current_stream().wait_event(forward_finished_event)
+    loss = out.sum()
+
+    # 中文注释：直接调用私有 helper 时没有 dispatcher hook，这里手动模拟梯度已就绪事件。
+    backward_previous_event.record()
+    loss.backward()
+    torch.cuda.current_stream().wait_event(backward_finished_event)
+    torch.cuda.synchronize()
+
+    assert hidden.grad is not None
+    assert calls == [
+        ("all_gather", comm_stream.cuda_stream),
+        ("all_reduce", comm_stream.cuda_stream),
+    ]
+
+
+def test_async_tp_reduce_scatter_uses_comm_stream(monkeypatch) -> None:
+    comm_stream = torch.cuda.Stream()
+    group = _FakeTPGroup()
+    calls: list[tuple[str, int]] = []
+
+    def fake_get_rank(group=None) -> int:
+        return getattr(group, "rank", 0)
+
+    def fake_all_reduce(tensor, op=None, group=None) -> None:
+        calls.append(("all_reduce", _stream_id()))
+
+    def fake_all_gather(chunks, tensor, group=None) -> None:
+        calls.append(("all_gather", _stream_id()))
+        for chunk in chunks:
+            chunk.copy_(tensor[: chunk.shape[0]])
+
+    monkeypatch.setattr(dist, "get_rank", fake_get_rank)
+    monkeypatch.setattr(dist, "all_reduce", fake_all_reduce)
+    monkeypatch.setattr(dist, "all_gather", fake_all_gather)
+
+    hidden = torch.randn(4, 3, device="cuda", requires_grad=True)
+    forward_previous_event = torch.cuda.Event()
+    forward_finished_event = torch.cuda.Event()
+    backward_previous_event = torch.cuda.Event()
+    backward_finished_event = torch.cuda.Event()
+    forward_previous_event.record()
+
+    out = _async_tp_reduce_scatter_sum(
+        hidden,
+        all_sizes=[2, 2],
+        tp_group=group,  # type: ignore[arg-type]
+        forward_previous_event=forward_previous_event,
+        forward_finished_event=forward_finished_event,
+        backward_previous_event=backward_previous_event,
+        backward_finished_event=backward_finished_event,
+        comm_stream=comm_stream,
+    )
+    torch.cuda.current_stream().wait_event(forward_finished_event)
+    loss = out.sum()
+
+    backward_previous_event.record()
+    loss.backward()
+    torch.cuda.current_stream().wait_event(backward_finished_event)
+    torch.cuda.synchronize()
+
+    assert hidden.grad is not None
+    assert calls == [
+        ("all_reduce", comm_stream.cuda_stream),
+        ("all_gather", comm_stream.cuda_stream),
+    ]
diff --git a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
index 225e8956e..c6ac2f7e8 100644
--- a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
+++ b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
@@ -3,13 +3,11 @@
 Forward data flow (adds two TP collectives around the existing EP dispatcher):
 
     dispatch_preprocess : permute by expert (each TP rank independently, N_local tokens)
-    dispatch            : EP AlltoAll (each TP rank independently, routing N_local token copies)
-    dispatch_postprocess: TP AllGather → merge TP token slices into M_total tokens
-                          then permute by local expert (for grouped GEMM)
+    dispatch            : EP AlltoAll → TP AllGather, merging TP token slices into M_total tokens
+    dispatch_postprocess: permute by local expert (for grouped GEMM)
     [Expert GEMM]       : column-parallel gate/up + row-parallel down projection
     combine_preprocess  : unpermute back to TP-AllGather order
-                          then TP ReduceScatterSum → restore M_ep_recv per TP rank
-    combine             : EP AlltoAll reverse (each TP rank independently)
+    combine             : TP ReduceScatterSum → EP AlltoAll reverse
     combine_postprocess : unpermute with topk_weights → [N_local, H] per TP rank
 
 Design rationale (mirrors Megatron MoEAlltoAllTokenDispatcher with TP+EP):
@@ -33,6 +31,7 @@
 
 from . import XTUNER_DISPATCHER_DEBUG
 from .torch_all2all import (
+    TorchAll2AllCombineResult,
     TorchAll2AllDispatcher,
     TorchAll2AllDispatchResult,
     TorchAll2AllPostDispatchResult,
@@ -43,16 +42,88 @@
 )
 
 
-class TorchAll2AllTPEPPostDispatchResult(TorchAll2AllPostDispatchResult):
-    """Post-dispatch result for TP+EP dispatcher.
+class TorchAll2AllTPEPDispatchResult(TorchAll2AllDispatchResult):
+    """Dispatch result after EP AlltoAll and TP AllGather.
 
-    Extends the EP-only result with per-TP-rank token counts needed to perform the
-    TP ReduceScatterSum in ``combine_preprocess``.
+    ``output_splits_tp`` records the pre-AllGather token count per TP rank.  The
+    later combine phase uses it to restore this TP rank's slice after the
+    row-parallel expert output is summed.
+
+    中文注释：TP size meta 指的就是 ``output_splits_tp``。例如 ``tp_size=2``，
+    EP dispatch 后 TP rank0 的 hidden 是 ``[3, H]``，rank1 是 ``[5, H]``，
+    两个 rank 都会拿到 ``output_splits_tp=[3, 5]``。TP AllGather 用它把
+    变长 hidden 拼成 ``[8, H]``，combine 再按相同边界切回本 rank 的
+    ``[3, H]`` 或 ``[5, H]``。
     """
 
     output_splits_tp: list[int]
 
 
+class TorchAll2AllTPEPPostDispatchResult(TorchAll2AllPostDispatchResult): ...
+
+
+def _record_stream(value: Any, stream: torch.cuda.Stream) -> None:
+    if isinstance(value, torch.Tensor):
+        value.record_stream(stream)
+    elif isinstance(value, (list, tuple)):
+        for item in value:
+            _record_stream(item, stream)
+
+
+def _local_tp_offset(all_sizes: list[int], tp_rank: int) -> int:
+    return sum(all_sizes[:tp_rank])
+
+
+def _tp_all_gather_forward_impl(
+    hidden: torch.Tensor,
+    all_sizes: list[int],
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    """Run TP AllGather forward and return tensors whose lifetime may need
+    recording."""
+    hidden = hidden.contiguous()
+    chunks = [torch.empty(s, hidden.shape[1], dtype=hidden.dtype, device=hidden.device) for s in all_sizes]
+    dist.all_gather(chunks, hidden, group=tp_group)
+    return torch.cat(chunks, dim=0), hidden, chunks
+
+
+def _tp_all_gather_backward_impl(
+    grad: torch.Tensor,
+    all_sizes: list[int],
+    tp_rank: int,
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    # TODO: use reduce_scatter instead of all_reduce
+    grad = grad.contiguous()
+    dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=tp_group)
+    offset = _local_tp_offset(all_sizes, tp_rank)
+    return grad[offset : offset + all_sizes[tp_rank]].clone(), grad
+
+
+def _tp_reduce_scatter_sum_forward_impl(
+    hidden: torch.Tensor,
+    all_sizes: list[int],
+    tp_rank: int,
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    # TODO: use reduce_scatter instead of all_reduce
+    reduced = hidden.contiguous().clone()
+    dist.all_reduce(reduced, op=dist.ReduceOp.SUM, group=tp_group)
+    offset = _local_tp_offset(all_sizes, tp_rank)
+    return reduced[offset : offset + all_sizes[tp_rank]].contiguous(), reduced
+
+
+def _tp_reduce_scatter_sum_backward_impl(
+    grad_slice: torch.Tensor,
+    all_sizes: list[int],
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    grad_slice = grad_slice.contiguous()
+    chunks = [torch.empty(s, grad_slice.shape[1], dtype=grad_slice.dtype, device=grad_slice.device) for s in all_sizes]
+    dist.all_gather(chunks, grad_slice, group=tp_group)
+    return torch.cat(chunks, dim=0), grad_slice, chunks
+
+
 class _TPAllGather(torch.autograd.Function):
     """TP AllGather with autograd support.
 
@@ -70,24 +141,79 @@ def forward(
         tp_size: int,
         tp_rank: int,
     ) -> torch.Tensor:
-        chunks = [torch.empty(s, hidden.shape[1], dtype=hidden.dtype, device=hidden.device) for s in all_sizes]
-        dist.all_gather(chunks, hidden.contiguous(), group=tp_group)
+        gathered, _, _ = _tp_all_gather_forward_impl(hidden, all_sizes, tp_group)
         ctx.tp_group = tp_group
         ctx.tp_size = tp_size
         ctx.tp_rank = tp_rank
         ctx.all_sizes = all_sizes
-        return torch.cat(chunks, dim=0)
+        return gathered
 
     @staticmethod
     def backward(
         ctx: Any,
         grad: torch.Tensor,
     ) -> tuple[torch.Tensor, None, None, None, None]:
-        # TODO: use reduce_scatter instead of all_reduce
-        grad = grad.contiguous()
-        dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=ctx.tp_group)
-        offset = sum(ctx.all_sizes[: ctx.tp_rank])
-        return grad[offset : offset + ctx.all_sizes[ctx.tp_rank]].clone(), None, None, None, None
+        grad_input, _ = _tp_all_gather_backward_impl(grad, ctx.all_sizes, ctx.tp_rank, ctx.tp_group)
+        return grad_input, None, None, None, None
+
+
+class _AsyncTPAllGather(torch.autograd.Function):
+    """TP AllGather on dispatcher comm stream.
+
+    Forward : wait for the previous event, then all-gather token slices.
+    Backward: wait until post-dispatch grad is ready, all-reduce grad, then
+              slice this TP rank's input grad.
+    """
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        hidden: torch.Tensor,
+        all_sizes: list[int],
+        tp_group: dist.ProcessGroup,
+        tp_size: int,
+        tp_rank: int,
+        forward_previous_event: torch.cuda.Event,
+        forward_finished_event: torch.cuda.Event,
+        backward_previous_event: torch.cuda.Event,
+        backward_finished_event: torch.cuda.Event,
+        comm_stream: torch.cuda.Stream,
+    ) -> torch.Tensor:
+        with torch.cuda.stream(comm_stream):
+            comm_stream.wait_event(forward_previous_event)
+            gathered, hidden_for_comm, chunks = _tp_all_gather_forward_impl(hidden, all_sizes, tp_group)
+
+            # 中文注释：同步/异步共用 TP AllGather 核心逻辑；异步只额外管理 stream/event 生命周期。
+            _record_stream((hidden_for_comm, chunks, gathered), comm_stream)
+            forward_finished_event.record(comm_stream)
+
+        ctx.tp_group = tp_group
+        ctx.tp_size = tp_size
+        ctx.tp_rank = tp_rank
+        ctx.all_sizes = all_sizes
+        ctx.backward_previous_event = backward_previous_event
+        ctx.backward_finished_event = backward_finished_event
+        ctx.comm_stream = comm_stream
+        return gathered
+
+    @staticmethod
+    def backward(
+        ctx: Any,
+        grad: torch.Tensor,
+    ) -> tuple[torch.Tensor, None, None, None, None, None, None, None, None, None]:
+        with torch.cuda.stream(ctx.comm_stream):
+            ctx.comm_stream.wait_event(ctx.backward_previous_event)
+            grad_input, grad_for_comm = _tp_all_gather_backward_impl(
+                grad,
+                ctx.all_sizes,
+                ctx.tp_rank,
+                ctx.tp_group,
+            )
+
+            _record_stream((grad_for_comm, grad_input), ctx.comm_stream)
+            ctx.backward_finished_event.record(ctx.comm_stream)
+
+        return grad_input, None, None, None, None, None, None, None, None, None
 
 
 class _TPReduceScatterSum(torch.autograd.Function):
@@ -108,33 +234,106 @@ def forward(
         tp_size: int,
         tp_rank: int,
     ) -> torch.Tensor:
-        # TODO: use reduce_scatter instead of all_reduce
-        hidden = hidden.clone()
-        dist.all_reduce(hidden, op=dist.ReduceOp.SUM, group=tp_group)
-        offset = sum(all_sizes[:tp_rank])
+        out, _ = _tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
         ctx.tp_group = tp_group
         ctx.tp_size = tp_size
         ctx.tp_rank = tp_rank
         ctx.all_sizes = all_sizes
-        return hidden[offset : offset + all_sizes[tp_rank]].contiguous()
+        return out
 
     @staticmethod
     def backward(
         ctx: Any,
         grad_slice: torch.Tensor,
     ) -> tuple[torch.Tensor, None, None, None, None]:
-        chunks = [
-            torch.empty(s, grad_slice.shape[1], dtype=grad_slice.dtype, device=grad_slice.device)
-            for s in ctx.all_sizes
-        ]
-        dist.all_gather(chunks, grad_slice.contiguous(), group=ctx.tp_group)
-        full_grad = torch.cat(chunks, dim=0)
+        full_grad, _, _ = _tp_reduce_scatter_sum_backward_impl(grad_slice, ctx.all_sizes, ctx.tp_group)
         return full_grad, None, None, None, None
 
 
+class _AsyncTPReduceScatterSum(torch.autograd.Function):
+    """TP ReduceScatterSum on dispatcher comm stream."""
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        hidden: torch.Tensor,
+        all_sizes: list[int],
+        tp_group: dist.ProcessGroup,
+        tp_size: int,
+        tp_rank: int,
+        forward_previous_event: torch.cuda.Event,
+        forward_finished_event: torch.cuda.Event,
+        backward_previous_event: torch.cuda.Event,
+        backward_finished_event: torch.cuda.Event,
+        comm_stream: torch.cuda.Stream,
+    ) -> torch.Tensor:
+        with torch.cuda.stream(comm_stream):
+            comm_stream.wait_event(forward_previous_event)
+            out, reduced = _tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
+
+            # 中文注释：同步/异步共用 TP ReduceScatter 核心逻辑；异步只额外管理 stream/event。
+            _record_stream((hidden, reduced, out), comm_stream)
+            forward_finished_event.record(comm_stream)
+
+        ctx.tp_group = tp_group
+        ctx.tp_size = tp_size
+        ctx.tp_rank = tp_rank
+        ctx.all_sizes = all_sizes
+        ctx.backward_previous_event = backward_previous_event
+        ctx.backward_finished_event = backward_finished_event
+        ctx.comm_stream = comm_stream
+        return out
+
+    @staticmethod
+    def backward(
+        ctx: Any,
+        grad_slice: torch.Tensor,
+    ) -> tuple[torch.Tensor, None, None, None, None, None, None, None, None, None]:
+        with torch.cuda.stream(ctx.comm_stream):
+            ctx.comm_stream.wait_event(ctx.backward_previous_event)
+            full_grad, grad_slice_for_comm, chunks = _tp_reduce_scatter_sum_backward_impl(
+                grad_slice,
+                ctx.all_sizes,
+                ctx.tp_group,
+            )
+
+            _record_stream((grad_slice_for_comm, chunks, full_grad), ctx.comm_stream)
+            ctx.backward_finished_event.record(ctx.comm_stream)
+
+        return full_grad, None, None, None, None, None, None, None, None, None
+
+
+def _tp_all_gather_sizes(
+    hidden: torch.Tensor,
+    tp_group: dist.ProcessGroup,
+    stream: torch.cuda.Stream | None = None,
+) -> list[int]:
+    """Gather per-TP-rank token counts as host ints for variable-size
+    gather."""
+    tp_size = tp_group.size()
+    if tp_size == 1:
+        return [hidden.shape[0]]
+
+    if stream is None:
+        local_size = hidden.new_tensor([hidden.shape[0]], dtype=torch.long)
+        all_sizes_t = hidden.new_empty([tp_size], dtype=torch.long)
+        dist.all_gather_into_tensor(all_sizes_t, local_size, group=tp_group)
+    else:
+        # 中文注释：尺寸通信不依赖计算流，避免为了取 Python list 等待前面的 compute kernel。
+        with torch.cuda.stream(stream):
+            local_size = hidden.new_tensor([hidden.shape[0]], dtype=torch.long)
+            all_sizes_t = hidden.new_empty([tp_size], dtype=torch.long)
+            dist.all_gather_into_tensor(all_sizes_t, local_size, group=tp_group)
+            local_size.record_stream(stream)
+            all_sizes_t.record_stream(stream)
+        stream.synchronize()
+    return [int(s) for s in all_sizes_t.tolist()]
+
+
 def _tp_all_gather(
     hidden: torch.Tensor,
     tp_group: dist.ProcessGroup,
+    all_sizes: list[int] | None = None,
 ) -> tuple[torch.Tensor, list[int]]:
     """All-gather ``hidden`` across the TP group and return the gathered tensor
     plus per-rank sizes."""
@@ -143,15 +342,44 @@ def _tp_all_gather(
         return hidden, [hidden.shape[0]]
 
     tp_rank = dist.get_rank(group=tp_group)
-    local_size = hidden.new_tensor([hidden.shape[0]], dtype=torch.long)
-    all_sizes_t = hidden.new_empty([tp_size], dtype=torch.long)
-    dist.all_gather_into_tensor(all_sizes_t, local_size, group=tp_group)
-    all_sizes = [int(s) for s in all_sizes_t.tolist()]
+    if all_sizes is None:
+        all_sizes = _tp_all_gather_sizes(hidden, tp_group)
 
     gathered = _TPAllGather.apply(hidden, all_sizes, tp_group, tp_size, tp_rank)
     return gathered, all_sizes
 
 
+def _async_tp_all_gather(
+    hidden: torch.Tensor,
+    all_sizes: list[int],
+    tp_group: dist.ProcessGroup,
+    forward_previous_event: torch.cuda.Event,
+    forward_finished_event: torch.cuda.Event,
+    backward_previous_event: torch.cuda.Event,
+    backward_finished_event: torch.cuda.Event,
+    comm_stream: torch.cuda.Stream,
+) -> torch.Tensor:
+    """Async TP AllGather wrapper used by Domino TP+EP path."""
+    tp_size = tp_group.size()
+    if tp_size == 1:
+        forward_finished_event.record()
+        return hidden
+
+    tp_rank = dist.get_rank(group=tp_group)
+    return _AsyncTPAllGather.apply(
+        hidden,
+        all_sizes,
+        tp_group,
+        tp_size,
+        tp_rank,
+        forward_previous_event,
+        forward_finished_event,
+        backward_previous_event,
+        backward_finished_event,
+        comm_stream,
+    )
+
+
 def _tp_reduce_scatter_sum(
     hidden: torch.Tensor,
     all_sizes: list[int],
@@ -167,6 +395,37 @@ def _tp_reduce_scatter_sum(
     return _TPReduceScatterSum.apply(hidden, all_sizes, tp_group, tp_size, tp_rank)
 
 
+def _async_tp_reduce_scatter_sum(
+    hidden: torch.Tensor,
+    all_sizes: list[int],
+    tp_group: dist.ProcessGroup,
+    forward_previous_event: torch.cuda.Event,
+    forward_finished_event: torch.cuda.Event,
+    backward_previous_event: torch.cuda.Event,
+    backward_finished_event: torch.cuda.Event,
+    comm_stream: torch.cuda.Stream,
+) -> torch.Tensor:
+    """Async TP ReduceScatterSum wrapper used by Domino TP+EP path."""
+    tp_size = tp_group.size()
+    if tp_size == 1:
+        forward_finished_event.record()
+        return hidden
+
+    tp_rank = dist.get_rank(group=tp_group)
+    return _AsyncTPReduceScatterSum.apply(
+        hidden,
+        all_sizes,
+        tp_group,
+        tp_size,
+        tp_rank,
+        forward_previous_event,
+        forward_finished_event,
+        backward_previous_event,
+        backward_finished_event,
+        comm_stream,
+    )
+
+
 def _tp_all_gather_tokens_per_expert_group(
     tokens_per_expert_group: torch.Tensor,
     tp_group: dist.ProcessGroup,
@@ -182,13 +441,38 @@ def _tp_all_gather_tokens_per_expert_group(
     return gathered
 
 
+def _async_tp_all_gather_tokens_per_expert_group(
+    tokens_per_expert_group: torch.Tensor,
+    tp_group: dist.ProcessGroup,
+    forward_previous_event: torch.cuda.Event,
+    forward_finished_event: torch.cuda.Event,
+    comm_stream: torch.cuda.Stream,
+) -> torch.Tensor:
+    """Async gather for routing counts; no autograd is needed for these
+    counts."""
+    tp_size = tp_group.size()
+    if tp_size == 1:
+        forward_finished_event.record()
+        return tokens_per_expert_group.unsqueeze(0)
+
+    gathered = tokens_per_expert_group.new_empty((tp_size, *tokens_per_expert_group.shape))
+    with torch.cuda.stream(comm_stream):
+        comm_stream.wait_event(forward_previous_event)
+        counts = tokens_per_expert_group.contiguous()
+        dist.all_gather_into_tensor(gathered, counts, group=tp_group)
+        counts.record_stream(comm_stream)
+        gathered.record_stream(comm_stream)
+        forward_finished_event.record(comm_stream)
+    return gathered
+
+
 class TorchAll2AllTPEPDispatcher(TorchAll2AllDispatcher):
     """TP+EP dispatcher: wraps ``TorchAll2AllDispatcher`` with TP AllGather and
     ReduceScatterSum.
 
-    Overrides only ``dispatch_postprocess`` and ``combine_preprocess``; all other steps
-    (dispatch_preprocess, dispatch, combine, combine_postprocess) are unchanged from the
-    EP-only base class.
+    Keeps ``dispatch_preprocess`` and ``combine_postprocess`` from the EP-only
+    base class, and moves the TP collectives into the communication methods
+    ``dispatch`` and ``combine``.
 
     Args:
         n_routed_experts (int): Total number of routed experts across all EP ranks.
@@ -198,6 +482,11 @@ class TorchAll2AllTPEPDispatcher(TorchAll2AllDispatcher):
         generate_dtype (str): Dtype for generation, ``"bf16"`` or ``"fp8"``.
     """
 
+    # 中文注释：_tp_meta_stream 只跑 output_splits_tp 这类小的尺寸 all_gather。
+    # 尺寸结果要同步回 Python list；如果复用 _comm_stream，会连同前面排队的大块
+    # EP AllToAll 一起等完，削弱 Domino 隐藏 TP/EP 通信的效果。
+    _tp_meta_stream: torch.cuda.Stream | None = None
+
     def __init__(
         self,
         *,
@@ -215,6 +504,85 @@ def __init__(
         )
         self._tp_group = tp_group
         self._tp_size = tp_group.size()
+        if TorchAll2AllTPEPDispatcher._tp_meta_stream is None:
+            TorchAll2AllTPEPDispatcher._tp_meta_stream = torch.cuda.Stream()
+        self._tp_meta_stream = TorchAll2AllTPEPDispatcher._tp_meta_stream
+
+    @override
+    def dispatch(
+        self,
+        *,
+        pre_dispatched: TorchAll2AllPreDispatchResult,
+        topk_weights: torch.Tensor,
+        async_op: bool = False,
+        decoding: bool = False,
+    ) -> TorchAll2AllTPEPDispatchResult:
+        ep_dispatched = super().dispatch(
+            pre_dispatched=pre_dispatched,
+            topk_weights=topk_weights,
+            async_op=async_op,
+            decoding=decoding,
+        )
+
+        if async_op:
+            assert ep_dispatched["forward_finished_event"] is not None, "Use async_op=True for dispatch!"
+            assert ep_dispatched["backward_previous_event"] is not None, "Use async_op=True for dispatch!"
+            comm_stream = cast(torch.cuda.Stream, self._comm_stream)
+            # 中文注释：只同步变长 all_gather 的尺寸；大块 TP hidden 通信放到 comm stream 中隐藏。
+            # 这里刻意使用 _tp_meta_stream，避免为了拿 output_splits_tp 的 Python list
+            # 去同步 _comm_stream 上已经排队的 EP hidden AllToAll。
+            output_splits_tp = _tp_all_gather_sizes(
+                ep_dispatched["hidden_states"],
+                self._tp_group,
+                stream=self._tp_meta_stream,
+            )
+            tp_hidden_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
+            tp_counts_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
+            tp_backward_previous_event = cast(torch.cuda.Event, torch.cuda.Event())
+            hidden_states = _async_tp_all_gather(
+                ep_dispatched["hidden_states"],
+                all_sizes=output_splits_tp,
+                tp_group=self._tp_group,
+                forward_previous_event=ep_dispatched["forward_finished_event"],
+                forward_finished_event=tp_hidden_finished_event,
+                backward_previous_event=tp_backward_previous_event,
+                backward_finished_event=ep_dispatched["backward_previous_event"],
+                comm_stream=comm_stream,
+            )
+            tokens_per_expert_group = _async_tp_all_gather_tokens_per_expert_group(
+                ep_dispatched["tokens_per_expert_group"],
+                tp_group=self._tp_group,
+                forward_previous_event=tp_hidden_finished_event,
+                forward_finished_event=tp_counts_finished_event,
+                comm_stream=comm_stream,
+            )
+            forward_finished_event = tp_counts_finished_event
+            backward_previous_event = tp_backward_previous_event
+        else:
+            hidden_states, output_splits_tp = _tp_all_gather(
+                ep_dispatched["hidden_states"],
+                tp_group=self._tp_group,
+            )
+            tokens_per_expert_group = _tp_all_gather_tokens_per_expert_group(
+                ep_dispatched["tokens_per_expert_group"],
+                tp_group=self._tp_group,
+            )
+            forward_finished_event = None
+            backward_previous_event = None
+
+        if decoding:
+            raise NotImplementedError("Decoding is not yet supported for TorchAll2AllTPEPDispatcher.")
+
+        return TorchAll2AllTPEPDispatchResult(
+            hidden_states=hidden_states,
+            topk_weights=ep_dispatched["topk_weights"],
+            tokens_per_expert_group=tokens_per_expert_group,
+            input_splits=ep_dispatched["input_splits"],
+            output_splits=ep_dispatched["output_splits"],
+            forward_finished_event=forward_finished_event,
+            backward_previous_event=backward_previous_event,
+            output_splits_tp=output_splits_tp,
+        )
 
     @override
     def dispatch_postprocess(
@@ -225,43 +593,30 @@ def dispatch_postprocess(
         async_op: bool = False,
         decoding: bool = False,
     ) -> TorchAll2AllTPEPPostDispatchResult:
+        tpep_dispatched = cast(TorchAll2AllTPEPDispatchResult, dispatched)
         if async_op:
-            # async_op for TP collectives is not yet implemented; fall back to synchronous.
-            assert dispatched["forward_finished_event"] is not None, "Use async_op=True for dispatch!"
-            self.wait_comm_stream(dispatched["forward_finished_event"])
-
-        # TP AllGather: [M_ep_recv, H] → [M_total, H]; also returns per-TP-rank sizes.
-        gathered_hidden, output_splits_tp = _tp_all_gather(
-            dispatched["hidden_states"],
-            tp_group=self._tp_group,
-        )
+            assert tpep_dispatched["forward_finished_event"] is not None, "Use async_op=True for dispatch!"
+            assert tpep_dispatched["backward_previous_event"] is not None, "Use async_op=True for dispatch!"
+            self.wait_comm_stream(tpep_dispatched["forward_finished_event"])
 
-        # Permute [M_total, H] into local-expert order for grouped GEMM.  Since
-        # TP AllGather concatenates tp0_block | tp1_block | ..., expert counts
-        # must be gathered in the same TP order before building the row labels.
-        gathered_tokens_per_expert_group = _tp_all_gather_tokens_per_expert_group(
-            dispatched["tokens_per_expert_group"],
-            tp_group=self._tp_group,
-        )
-        token_counts = gathered_tokens_per_expert_group.ravel()
+        token_counts = tpep_dispatched["tokens_per_expert_group"].ravel().to(torch.long)
         local_expert_ids = self._expert_ids_per_ep_rank.repeat(self._tp_size)
         global_input_tokens_local_experts_indices = torch.repeat_interleave(
             local_expert_ids,
             token_counts,
-            output_size=gathered_hidden.shape[0],
+            output_size=tpep_dispatched["hidden_states"].shape[0],
         )
         global_input_tokens, row_ids_map = permute(
-            gathered_hidden,
+            tpep_dispatched["hidden_states"],
             global_input_tokens_local_experts_indices.to(torch.int32),
         )
-        tokens_per_expert = gathered_tokens_per_expert_group.sum(dim=(0, 1))
+        tokens_per_expert = tpep_dispatched["tokens_per_expert_group"].sum(dim=(0, 1))
 
         if async_op:
-            assert dispatched["backward_previous_event"] is not None, "Use async_op=True for dispatch!"
             if global_input_tokens.grad_fn is not None:
                 global_input_tokens.grad_fn.register_hook(
                     get_backward_hook(
-                        dispatched["backward_previous_event"],
+                        cast(torch.cuda.Event, tpep_dispatched["backward_previous_event"]),
                         name="TorchAll2AllTPEPDispatcher.dispatch_postprocess",
                         debug=XTUNER_DISPATCHER_DEBUG,
                     )
@@ -274,7 +629,6 @@ def dispatch_postprocess(
             hidden_states=global_input_tokens,
             row_ids_map=row_ids_map,
             tokens_per_expert=tokens_per_expert,
-            output_splits_tp=output_splits_tp,
         )
 
     @override
@@ -288,16 +642,8 @@ def combine_preprocess(
         async_op: bool = False,
         decoding: bool = False,
     ) -> TorchAll2AllPreCombineResult:
-        tpep_post = cast(TorchAll2AllTPEPPostDispatchResult, post_dispatched)
         # Unpermute [M_total, H] back to TP-AllGather order (tp0_block | tp1_block | ...).
-        hidden_states = unpermute(hidden_states, tpep_post["row_ids_map"])
-
-        # TP ReduceScatterSum: [M_total, H] → [M_ep_recv, H] for this TP rank.
-        hidden_states = _tp_reduce_scatter_sum(
-            hidden_states,
-            all_sizes=tpep_post["output_splits_tp"],
-            tp_group=self._tp_group,
-        )
+        hidden_states = unpermute(hidden_states, post_dispatched["row_ids_map"])
 
         if async_op:
             backward_previous_event = cast(torch.cuda.Event, torch.cuda.Event())
@@ -323,3 +669,65 @@ def combine_preprocess(
             backward_previous_event=backward_previous_event,
             forward_finished_event=forward_finished_event,
         )
+
+    @override
+    def combine(
+        self,
+        *,
+        pre_dispatched: TorchAll2AllPreDispatchResult,
+        dispatched: TorchAll2AllDispatchResult,
+        post_dispatched: TorchAll2AllPostDispatchResult,
+        pre_combined: TorchAll2AllPreCombineResult,
+        async_op: bool = False,
+        decoding: bool = False,
+    ) -> TorchAll2AllCombineResult:
+        tpep_dispatched = cast(TorchAll2AllTPEPDispatchResult, dispatched)
+
+        if async_op:
+            forward_previous_event = pre_combined["forward_finished_event"]
+            backward_finished_event = pre_combined["backward_previous_event"]
+            assert forward_previous_event is not None, "Use async_op=True for combine_preprocess!"
+            assert backward_finished_event is not None, "Use async_op=True for combine_preprocess!"
+            comm_stream = cast(torch.cuda.Stream, self._comm_stream)
+
+            tp_forward_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
+            tp_backward_previous_event = cast(torch.cuda.Event, torch.cuda.Event())
+            # 中文注释：TP ReduceScatter 属于 combine 通信段，EP combine 等它完成后再发起。
+            hidden_states = _async_tp_reduce_scatter_sum(
+                pre_combined["hidden_states"],
+                all_sizes=tpep_dispatched["output_splits_tp"],
+                tp_group=self._tp_group,
+                forward_previous_event=forward_previous_event,
+                forward_finished_event=tp_forward_finished_event,
+                backward_previous_event=tp_backward_previous_event,
+                backward_finished_event=backward_finished_event,
+                comm_stream=comm_stream,
+            )
+            pre_combined_for_ep = TorchAll2AllPreCombineResult(
+                hidden_states=hidden_states,
+                backward_previous_event=tp_backward_previous_event,
+                forward_finished_event=tp_forward_finished_event,
+            )
+        else:
+            hidden_states = _tp_reduce_scatter_sum(
+                pre_combined["hidden_states"],
+                all_sizes=tpep_dispatched["output_splits_tp"],
+                tp_group=self._tp_group,
+            )
+            pre_combined_for_ep = TorchAll2AllPreCombineResult(
+                hidden_states=hidden_states,
+                backward_previous_event=None,
+                forward_finished_event=None,
+            )
+
+        return cast(
+            TorchAll2AllCombineResult,
+            super().combine(
+                pre_dispatched=pre_dispatched,
+                dispatched=dispatched,
+                post_dispatched=post_dispatched,
+                pre_combined=pre_combined_for_ep,
+                async_op=async_op,
+                decoding=decoding,
+            ),
+        )
diff --git a/xtuner_ep_domino.md b/xtuner_ep_domino.md
index d26b0eac7..e9ba82ca0 100644
--- a/xtuner_ep_domino.md
+++ b/xtuner_ep_domino.md
@@ -316,17 +316,17 @@ residual，得到本层输出。
 表中加粗的 `A/D/E/C/S` 是相对耗时大的主算子，后续时间线主要围绕它们观察重叠。
 
 
-| CPU/host 操作                                                                                               |
-| ------------------------------------------------------------------------------------------------------------- |
-| **`A0`** -> `Dpre0` -> `record Fa0`                                                                         |
-| **`A1`** -> `Dpre1` -> `record Fa1`                                                                         |
-| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`     |
-| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`     |
-| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                      |
-| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                      |
-| **`S0`** -> **`S1`**                                                                                        |
-| `wait Fd0` -> `Cpost0`                                                                                      |
-| `wait Fd1` -> `Cpost1`                                                                                      |
+| CPU/host 操作                                                                                         |
+| ------------------------------------------------------------------------------------------------------- |
+| **`A0`** -> `Dpre0` -> `record Fa0`                                                                   |
+| **`A1`** -> `Dpre1` -> `record Fa1`                                                                   |
+| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0` |
+| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1` |
+| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                |
+| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                |
+| **`S0`** -> **`S1`**                                                                                  |
+| `wait Fd0` -> `Cpost0`                                                                                |
+| `wait Fd1` -> `Cpost1`                                                                                |
 
 其中：
 
@@ -362,18 +362,18 @@ event；如果 `Dpre0` 已完成，而 `A1/Dpre1` 还在 compute stream 中排
 `wait Fa0` 表示 comm stream 等这个 event。其他 event 同理。
 
 
-| 计算 stream                                                                       | 通信 stream                                  |
-| ----------------------------------------------------------------------------------- | ---------------------------------------------- |
-| **`A0`**                                                                          |                                              |
-| `Dpre0` -> `record Fa0`                                                           |                                              |
-| **`A1`**                                                                          | `wait Fa0` -> **`D0`** -> `record Fb0`      |
-| `Dpre1` -> `record Fa1`                                                           |                                              |
-| `wait Fb0` -> `Dpost0`                                                            |                                              |
-| **`E0`** -> `Cpre0` -> `record Fc0`                                               | `wait Fa1` -> **`D1`** -> `record Fb1`      |
-| `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`                     | `wait Fc0` -> **`C0`** -> `record Fd0`      |
-| **`S0`** -> **`S1`**                                                              | `wait Fc1` -> **`C1`** -> `record Fd1`      |
-| `wait Fd0` -> `Cpost0`                                                            |                                              |
-| `wait Fd1` -> `Cpost1`                                                            |                                              |
+| 计算 stream                                                   | 通信 stream                            |
+| --------------------------------------------------------------- | ---------------------------------------- |
+| **`A0`**                                                      |                                        |
+| `Dpre0` -> `record Fa0`                                       |                                        |
+| **`A1`**                                                      | `wait Fa0` -> **`D0`** -> `record Fb0` |
+| `Dpre1` -> `record Fa1`                                       |                                        |
+| `wait Fb0` -> `Dpost0`                                        |                                        |
+| **`E0`** -> `Cpre0` -> `record Fc0`                           | `wait Fa1` -> **`D1`** -> `record Fb1` |
+| `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1` | `wait Fc0` -> **`C0`** -> `record Fd0` |
+| **`S0`** -> **`S1`**                                          | `wait Fc1` -> **`C1`** -> `record Fd1` |
+| `wait Fd0` -> `Cpost0`                                        |                                        |
+| `wait Fd1` -> `Cpost1`                                        |                                        |
 
 同一行两列表示这两个 stream 上的操作可以重叠；长通信可能延续到后面的行。每一行到下一行的顺序只表达同一
 stream FIFO 或 event 约束能保证的偏序。为避免表格过长，主算子和紧邻的 event `record/wait` 写在同一个
@@ -407,17 +407,17 @@ compute/comm stream 上已经允许出现的操作。某个 GPU 操作可以出
 这样才能表达 CUDA 异步执行导致的计算通信重叠。
 
 
-| CPU/host 严格时间轴                                                                                         | 计算 stream                                                                       | 通信 stream                                  |
-| ------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | ---------------------------------------------- |
-| **`A0`** -> `Dpre0` -> `record Fa0`                                                                         |                                                                                   |                                              |
-| **`A1`** -> `Dpre1` -> `record Fa1`                                                                         | **`A0`** -> `Dpre0` -> `record Fa0`                                               |                                              |
-| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`     | **`A1`** -> `Dpre1` -> `record Fa1`                                               | `wait Fa0` -> **`D0`** -> `record Fb0`      |
-| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`     | `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`                    | `wait Fa1` -> **`D1`** -> `record Fb1`      |
-| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                      | `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`                    | `wait Fc0` -> **`C0`** -> `record Fd0`      |
-| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                      |                                                                                   |                                              |
-| **`S0`** -> **`S1`**                                                                                        | **`S0`** -> **`S1`**                                                              | `wait Fc1` -> **`C1`** -> `record Fd1`      |
-| `wait Fd0` -> `Cpost0`                                                                                      | `wait Fd0` -> `Cpost0`                                                            |                                              |
-| `wait Fd1` -> `Cpost1`                                                                                      | `wait Fd1` -> `Cpost1`                                                            |                                              |
+| CPU/host 严格时间轴                                                                                   | 计算 stream                                                   | 通信 stream                            |
+| ------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------- | ---------------------------------------- |
+| **`A0`** -> `Dpre0` -> `record Fa0`                                                                   |                                                               |                                        |
+| **`A1`** -> `Dpre1` -> `record Fa1`                                                                   | **`A0`** -> `Dpre0` -> `record Fa0`                           |                                        |
+| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0` | **`A1`** -> `Dpre1` -> `record Fa1`                           | `wait Fa0` -> **`D0`** -> `record Fb0` |
+| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1` | `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0` | `wait Fa1` -> **`D1`** -> `record Fb1` |
+| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                | `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1` | `wait Fc0` -> **`C0`** -> `record Fd0` |
+| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                |                                                               |                                        |
+| **`S0`** -> **`S1`**                                                                                  | **`S0`** -> **`S1`**                                          | `wait Fc1` -> **`C1`** -> `record Fd1` |
+| `wait Fd0` -> `Cpost0`                                                                                | `wait Fd0` -> `Cpost0`                                        |                                        |
+| `wait Fd1` -> `Cpost1`                                                                                | `wait Fd1` -> `Cpost1`                                        |                                        |
 
 ## 6. 反向中的事件链
 
@@ -476,14 +476,14 @@ CPU/autograd 侧看到的是 backward node 的遍历顺序：
 表中加粗的 `A/D/E/C/S` 同样表示反向中相对耗时大的主算子。
 
 
-| CPU/autograd 操作示例                                                                                                     |
-| ---------------------------------------------------------------------------------------------------------------------------- |
-| `Cpost1_bwd` -> `record Bd1`; `wait Bd1` -> **`C1_bwd`** -> `record Bc1`                                                   |
-| `Cpost0_bwd` -> `record Bd0`; `wait Bd0` -> **`C0_bwd`** -> `record Bc0`                                                   |
-| `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`; `wait Bb1` -> **`D1_bwd`** -> `record Ba1`     |
-| `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`; `wait Bb0` -> **`D0_bwd`** -> `record Ba0`     |
-| `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                                                  |
-| `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                                                  |
+| CPU/autograd 操作示例                                                                                                 |
+| ----------------------------------------------------------------------------------------------------------------------- |
+| `Cpost1_bwd` -> `record Bd1`; `wait Bd1` -> **`C1_bwd`** -> `record Bc1`                                              |
+| `Cpost0_bwd` -> `record Bd0`; `wait Bd0` -> **`C0_bwd`** -> `record Bc0`                                              |
+| `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`; `wait Bb1` -> **`D1_bwd`** -> `record Ba1` |
+| `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`; `wait Bb0` -> **`D0_bwd`** -> `record Ba0` |
+| `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                                             |
+| `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                                             |
 
 其中：
 
@@ -500,14 +500,14 @@ compute stream 上的 `Cpost0_bwd`，只要 `Bd1` 已经被记录，`C1_bwd` 就
 在上述 autograd 发起顺序下，CUDA 侧更接近下面这张 event 依赖图：
 
 
-| 计算 stream                                                                                         | 通信 stream                                  |
-| ------------------------------------------------------------------------------------------------------- | ---------------------------------------------- |
-| `Cpost1_bwd` -> `record Bd1`                                                                          |                                              |
-| `Cpost0_bwd` -> `record Bd0`                                                                          | `wait Bd1` -> **`C1_bwd`** -> `record Bc1`  |
-| `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`                              | `wait Bd0` -> **`C0_bwd`** -> `record Bc0`  |
-| `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`                              | `wait Bb1` -> **`D1_bwd`** -> `record Ba1`  |
-| `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                            | `wait Bb0` -> **`D0_bwd`** -> `record Ba0`  |
-| `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                            |                                              |
+| 计算 stream                                                               | 通信 stream                                |
+| --------------------------------------------------------------------------- | -------------------------------------------- |
+| `Cpost1_bwd` -> `record Bd1`                                              |                                            |
+| `Cpost0_bwd` -> `record Bd0`                                              | `wait Bd1` -> **`C1_bwd`** -> `record Bc1` |
+| `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1` | `wait Bd0` -> **`C0_bwd`** -> `record Bc0` |
+| `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0` | `wait Bb1` -> **`D1_bwd`** -> `record Ba1` |
+| `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                 | `wait Bb0` -> **`D0_bwd`** -> `record Ba0` |
+| `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                 |                                            |
 
 同一行两列表示可重叠窗口；长通信可能延续到后面的行。每个 `wait Ba*` / `wait Bc*` 都位于对应
 `record Ba*` / `record Bc*` 同一行或之后，每个 `wait Bb*` / `wait Bd*` 都位于对应
@@ -530,18 +530,18 @@ backward node 的顺序决定，不能仅凭 `hidden0, hidden1` 的返回顺序
 反向时间线相反；严格 event 约束以 7.2 为准。
 
 
-| 前向 CPU/host 严格时间轴                                                                                      | 前向计算 stream                                                                  | 前向通信 stream                                 | 反向 CPU/autograd 对应阶段（滞后）                                                                                                               | 反向计算 stream（逆序，对齐前向 GPU）                                                         | 反向通信 stream（逆序，对齐前向 GPU）             |
-| -------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | --------------------------------------------------- |
-| **`A0`** -> `Dpre0` -> `record Fa0`                                                                         |                                                                                  |                                                 |                                                                                                                                                   |                                                                                               |                                                   |
-| **`A1`** -> `Dpre1` -> `record Fa1`                                                                         | **`A0`** -> `Dpre0` -> `record Fa0`                                             |                                                 | `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                                                                       | `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                   |                                                   |
-| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`      | **`A1`** -> `Dpre1` -> `record Fa1`                                             | `wait Fa0` -> **`D0`** -> `record Fb0`         | `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                                                                       | `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                   | `wait Bb0` -> **`D0_bwd`** -> `record Ba0`       |
-| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`      | `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0`                  | `wait Fa1` -> **`D1`** -> `record Fb1`         | `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`; `wait Bb0` -> **`D0_bwd`** -> `record Ba0`                            | `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`                | `wait Bb1` -> **`D1_bwd`** -> `record Ba1`       |
-| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                      | `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1`                  | `wait Fc0` -> **`C0`** -> `record Fd0`         | `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`; `wait Bb1` -> **`D1_bwd`** -> `record Ba1`                            | `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`                | `wait Bd0` -> **`C0_bwd`** -> `record Bc0`       |
-| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                      |                                                                                  |                                                 | `wait Bd0` -> **`C0_bwd`** -> `record Bc0`                                                                                                     |                                                                                               |                                                   |
-| **`S0`** -> **`S1`**                                                                                        | **`S0`** -> **`S1`**                                                            | `wait Fc1` -> **`C1`** -> `record Fd1`         | `wait Bd1` -> **`C1_bwd`** -> `record Bc1`                                                                                                     | **`S*_bwd`**                                                                                | `wait Bd1` -> **`C1_bwd`** -> `record Bc1`       |
-| `wait Fd0` -> `Cpost0`                                                                                      | `wait Fd0` -> `Cpost0`                                                          |                                                 | `S*_bwd`，如果开启 shared experts                                                                                                                | `Cpost0_bwd` -> `record Bd0`                                                               |                                                   |
-| `wait Fd1` -> `Cpost1`                                                                                      | `wait Fd1` -> `Cpost1`                                                          |                                                 | `Cpost0_bwd` -> `record Bd0`                                                                                                                    | `Cpost1_bwd` -> `record Bd1`                                                               |                                                   |
-|                                                                                                                |                                                                                  |                                                 | `Cpost1_bwd` -> `record Bd1`                                                                                                                    |                                                                                               |                                                   |
+| 前向 CPU/host 严格时间轴                                                                              | 前向计算 stream                                               | 前向通信 stream                        | 反向 CPU/autograd 对应阶段（滞后）                                                                                    | 反向计算 stream（逆序，对齐前向 GPU）                                     | 反向通信 stream（逆序，对齐前向 GPU）      |
+| ------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------- | -------------------------------------------- |
+| **`A0`** -> `Dpre0` -> `record Fa0`                                                                   |                                                               |                                        |                                                                                                                       |                                                                           |                                            |
+| **`A1`** -> `Dpre1` -> `record Fa1`                                                                   | **`A0`** -> `Dpre0` -> `record Fa0`                           |                                        | `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                                                             | `wait Ba0` -> `Dpre0_bwd` -> **`A0_bwd`**                                 |                                            |
+| `wait Fa0` -> **`D0`** -> `record Fb0`; `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0` | **`A1`** -> `Dpre1` -> `record Fa1`                           | `wait Fa0` -> **`D0`** -> `record Fb0` | `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                                                             | `wait Ba1` -> `Dpre1_bwd` -> **`A1_bwd`**                                 | `wait Bb0` -> **`D0_bwd`** -> `record Ba0` |
+| `wait Fa1` -> **`D1`** -> `record Fb1`; `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1` | `wait Fb0` -> `Dpost0` -> **`E0`** -> `Cpre0` -> `record Fc0` | `wait Fa1` -> **`D1`** -> `record Fb1` | `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0`; `wait Bb0` -> **`D0_bwd`** -> `record Ba0` | `wait Bc0` -> `Cpre0_bwd` -> **`E0_bwd`** -> `Dpost0_bwd` -> `record Bb0` | `wait Bb1` -> **`D1_bwd`** -> `record Ba1` |
+| `wait Fc0` -> **`C0`** -> `record Fd0`                                                                | `wait Fb1` -> `Dpost1` -> **`E1`** -> `Cpre1` -> `record Fc1` | `wait Fc0` -> **`C0`** -> `record Fd0` | `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1`; `wait Bb1` -> **`D1_bwd`** -> `record Ba1` | `wait Bc1` -> `Cpre1_bwd` -> **`E1_bwd`** -> `Dpost1_bwd` -> `record Bb1` | `wait Bd0` -> **`C0_bwd`** -> `record Bc0` |
+| `wait Fc1` -> **`C1`** -> `record Fd1`                                                                |                                                               |                                        | `wait Bd0` -> **`C0_bwd`** -> `record Bc0`                                                                            |                                                                           |                                            |
+| **`S0`** -> **`S1`**                                                                                  | **`S0`** -> **`S1`**                                          | `wait Fc1` -> **`C1`** -> `record Fd1` | `wait Bd1` -> **`C1_bwd`** -> `record Bc1`                                                                            | **`S1_bwd`** -> **`S0_bwd`**                                              | `wait Bd1` -> **`C1_bwd`** -> `record Bc1` |
+| `wait Fd0` -> `Cpost0`                                                                                | `wait Fd0` -> `Cpost0`                                        |                                        | **`S1_bwd`** -> **`S0_bwd`**，如果开启 shared experts                                                                 | `Cpost0_bwd` -> `record Bd0`                                              |                                            |
+| `wait Fd1` -> `Cpost1`                                                                                | `wait Fd1` -> `Cpost1`                                        |                                        | `Cpost0_bwd` -> `record Bd0`                                                                                          | `Cpost1_bwd` -> `record Bd1`                                              |                                            |
+|                                                                                                       |                                                               |                                        | `Cpost1_bwd` -> `record Bd1`                                                                                          |                                                                           |                                            |
 
 shared experts 的反向本地计算没有在上面的 EP dispatcher event 链中单独展开；如果开启 `n_shared_experts`，
 `S*_bwd` 也是 compute stream 上的耗时计算，能否覆盖某段 EP 通信取决于 autograd 对 shared 分支和 MoE 分支的实际调度。
@@ -568,21 +568,31 @@ compute stream 中剥离出来，让它们尽可能和另一个 micro batch 的
 ## 8. TP+EP 情况下的差异
 
 当同时打开 TP 和 EP 时，`build_dispatcher` 会选择 `TorchAll2AllTPEPDispatcher`。它继承 EP-only 的
-`dispatch_preprocess`、`dispatch`、`combine`、`combine_postprocess`，只改两处：
+`dispatch_preprocess` 和 `combine_postprocess`，并把 TP 通信归入 `dispatch` / `combine` 两个通信阶段：
 
-1. `dispatch_postprocess`：EP all2all 后先做 TP AllGather，把同一 EP rank 上不同 TP rank 的 token slice 拼成
-   `[M_total, hidden]`，再按 local expert 排序给 grouped GEMM。
-2. `combine_preprocess`：expert 输出先按 local expert 的 `row_ids_map` unpermute 回 TP AllGather 顺序，再做
-   TP ReduceScatterSum，恢复每个 TP rank 自己的 `[M_ep_recv, hidden]`，最后进入 EP combine all2all。
+1. `dispatch`：先做 EP all2all，再做 TP AllGather，把同一 EP rank 上不同 TP rank 的 token slice 拼成
+   `[M_total, hidden]`。
+2. `dispatch_postprocess`：只做本地按 local expert 排序，给 grouped GEMM 使用。
+3. `combine_preprocess`：只做本地 unpermute，把 expert 输出恢复到 TP AllGather 顺序。
+4. `combine`：先做 TP ReduceScatterSum，恢复每个 TP rank 自己的 `[M_ep_recv, hidden]`，再进入 EP combine all2all。
 
 专家权重本身由 `GroupedLinear` 按 TP 切分：
 
 - `fused_w1w3` 是 column parallel。
 - `fused_w2` 是 row parallel。
 
-需要注意的是，当前 TPEP dispatcher 的 TP AllGather / ReduceScatterSum 仍是同步实现；`async_op=True` 只复用
-EP all2all 的事件链。也就是说，Domino EP 的异步重叠主要作用在 EP dispatch/combine 上，TP collectives 还没有
-被同样地放到独立通信 stream 中流水。
+当前 TPEP dispatcher 在 `async_op=True` 时也把 TP AllGather / ReduceScatterSum 接入同一套事件链：
+
+- `dispatch` 中，TP AllGather 在 dispatcher 的 comm stream 上等待 EP dispatch 完成事件；compute stream 只在
+  `dispatch_postprocess` 做本地排序前等待 TP AllGather 完成。
+- `combine` 中，TP ReduceScatterSum 在 comm stream 上等待 `combine_preprocess` 的本地 unpermute 完成事件；
+  后续 EP combine 再等待 TP ReduceScatterSum 完成事件。
+- 反向中，TP AllGather / ReduceScatterSum 对应的反向 collective 也在 comm stream 上执行，并通过 autograd hook
+  把等待点放在梯度真正被消费的位置。
+
+因此 TP+EP 下的 Domino 流水不再只覆盖 EP dispatch/combine；TP collectives 也可以和另一个 micro batch 的
+attention、expert 或 shared expert 计算重叠。变长 TP AllGather 仍需要先收集每个 TP rank 的 token 数用于分配输出
+buffer，这一步只传输很小的 size 张量，不承载主要 hidden 通信量。
 
 ## 9. 小结
 
@@ -593,9 +603,9 @@ XTuner 当前 Domino EP 实现可以概括为：
   micro-batch forward。
 - 层级 `MoEDecoderLayer._micro_batch_forward` 负责重新排列单层内两个 micro batch 的 attention/gate、EP
   dispatch、expert、combine、shared expert、postprocess。
-- dispatcher 的 `async_op=True` 负责把 EP all2all 放到独立 comm stream，并用 CUDA event 和 autograd hook
-  维持正确依赖。
+- dispatcher 的 `async_op=True` 负责把 EP all2all 以及 TP+EP 中的 TP AllGather / ReduceScatterSum 放到独立
+  comm stream，并用 CUDA event 和 autograd hook 维持正确依赖。
 - 前向重叠需要按 event 判断：`D0` 可覆盖 `A1/Dpre1`，`D1` 可覆盖 `E0/Cpre0`，`C0/C1` 可覆盖后续
   compute；但每个 micro batch 在 `dispatch_postprocess` / `combine_postprocess` 消费通信结果前仍会等待。
-- 反向通过 `_AsyncDispatch.backward`、`_AsyncCombine.backward` 和 backward hook，把 dispatch/combine 的反向
-  all2all 延后到梯度准备好后异步发起，并只在梯度消费点等待，从而给两个 micro batch 之间的反向计算通信重叠留下空间。
+- 反向通过 `_AsyncDispatch.backward`、`_AsyncCombine.backward`、TP collective 的异步 backward 和 backward hook，
+  把通信延后到梯度准备好后异步发起，并只在梯度消费点等待，从而给两个 micro batch 之间的反向计算通信重叠留下空间。

From 802d6d0a30183babc144cd739ff2aa443cbf2f6c Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Wed, 13 May 2026 13:41:10 +0000
Subject: [PATCH 10/25] Enhance documentation on host metadata synchronization
 in variable-length all-to-all operations and  Clarify the impact on
 computation overlap for domino ep.

---
 xtuner_ep_dispatcher.md | 67 +++++++++++++++++++++++++++++++++++++++++
 xtuner_ep_domino.md     | 16 ++++++++++
 2 files changed, 83 insertions(+)

diff --git a/xtuner_ep_dispatcher.md b/xtuner_ep_dispatcher.md
index f530237bd..33db7d0a0 100644
--- a/xtuner_ep_dispatcher.md
+++ b/xtuner_ep_dispatcher.md
@@ -181,6 +181,36 @@ dispatched["tokens_per_expert_group"]:    [EP, E_local] = [2, 3]
 
 在这个例子里两个 rank 都是 `M_recv=8`，但真实训练里不保证均匀。
 
+### 2.1 变长 all2all 的 host metadata 同步
+
+上面的 `input_splits` / `output_splits` 在真实 `TorchAll2AllDispatcher` 中不是纯 GPU metadata。
+当前实现会先在 GPU 上统计和交换每个 expert 的 token 数，然后把 split sizes 拉回 CPU：
+
+```python
+tokens_per_expert = torch.histc(topk_ids, bins=n_routed_experts, min=0, max=n_routed_experts)
+dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert, group=process_group)
+
+input_splits = (
+    tokens_per_expert.reshape(ep_size, num_experts_per_rank)
+    .to(device=torch.device("cpu"))
+    .sum(dim=1)
+    .tolist()
+)
+output_splits = tokens_per_expert_group.to(device=torch.device("cpu")).sum(dim=-1).tolist()
+```
+
+这一步会形成 CPU/host 同步点，因为 PyTorch 变长 `all_to_all_single` 需要 Python `list[int]` 形式的
+`input_split_sizes` / `output_split_sizes`。也就是说，EP-only 的 `async_op=True` 并不是“完全无 host 同步”：
+
+- 大块 hidden 的 EP all2all 会被放到 dispatcher 的通信流中，并由 CUDA event 串依赖。
+- 但在真正发起大块 hidden all2all 之前，host 需要等 token count 交换完成并拿到 split list。
+- `combine` 会复用 dispatch 阶段保存的 `input_splits` / `output_splits`，通常不会再新增同类 split-size 同步。
+
+这个细节对 Domino EP 的计算通信重叠很重要。host 等 split list 时，已经 enqueue 到 GPU 的另一个 micro batch
+计算仍然可以继续执行；但 host 不能继续 enqueue 后续的 `dispatch_postprocess -> expert -> combine_preprocess`
+或下一个 dispatch。如果 split-size 同步能被另一个 micro batch 的 attention/gate/pre-dispatch 覆盖，7.3 中的
+流水基本成立；如果同步时间更长，就会吃掉一部分甚至全部重叠窗口。
+
 ## 3. `dispatch_postprocess`: destination rank 内按 local expert 再排序
 
 all2all 后的顺序是：
@@ -306,6 +336,10 @@ input_split_sizes  = dispatched["output_splits"]
 output_split_sizes = dispatched["input_splits"]
 ```
 
+这里没有重新统计 token，也不会再把新的 split tensor 拉回 CPU；它依赖第一次 dispatch 已经确定的
+source/destination 分片关系。因此对于 `TorchAll2AllDispatcher`，前向中最主要的 host metadata 同步点在第一次
+dispatch，而不是 combine。
+
 对 source `ep0` 来说，它会收回自己原来发出去的 8 个 token copy 输出：
 
 ```text
@@ -398,3 +432,36 @@ router_weights: [N, E]
 
 第二次 `post_dispatched["row_ids_map"] [M_recv]` 是 destination EP rank 上第二次 `permute` 产生的还原 map，
 语义相同（scatter，1D indices 无 topk 展开），只负责 expert 计算后恢复 source-block 顺序，方便反向 all2all。
+
+## DeepEP dispatcher 的对应差异
+
+`DeepEPDispatcher` 使用 DeepEP 的 `Buffer.get_dispatch_layout()` / `Buffer.dispatch()` / `Buffer.combine()` 来管理
+layout、通信 handle 和事件。它不像 `TorchAll2AllDispatcher` 那样显式执行：
+
+```python
+to(device=torch.device("cpu")).tolist()
+```
+
+但它仍然存在 host 可见的 metadata 准备点。`xtuner/v1/ops/comm/deepep_op.py::dispatch_forward()` 中已经注明：
+
+```python
+# NOTES: the CPU will wait for GPU's signal to arrive,
+# so this is not compatible with CUDA graph
+```
+
+DeepEP dispatch 会返回：
+
+```python
+num_recv_tokens_per_expert_list, handle, event
+```
+
+其中 `num_recv_tokens_per_expert_list` 是 Python list，`dispatch_postprocess` 需要用它计算 `num_out_tokens` 和
+`tokens_per_expert`。因此 DeepEP 也不是完全没有 host 同步；只是同步被 DeepEP 的 layout/dispatch handle 机制封装
+在库内部，不是 PyTorch split-size list 的 `.tolist()` 同步。
+
+对 Domino EP 来说，两者的影响边界一致：
+
+- 已经 enqueue 到 GPU 的另一个 micro batch 计算不会被 host 同步打断。
+- host 等 metadata 时无法继续 enqueue 后续本地算子和通信。
+- 如果 metadata 等待短于可覆盖的另一个 micro batch 计算，重叠效果基本保留。
+- 如果 metadata 等待更长，`xtuner_ep_domino.md` 7.3 中的理想时间线会被压缩，真实重叠比例下降。
diff --git a/xtuner_ep_domino.md b/xtuner_ep_domino.md
index e9ba82ca0..8a19bbf8d 100644
--- a/xtuner_ep_domino.md
+++ b/xtuner_ep_domino.md
@@ -310,6 +310,11 @@ residual，得到本层输出。
 表中的 `wait x` 表示 CPU 在对应 CUDA stream 上插入 `cudaStreamWaitEvent(x)`，不是 CPU 阻塞等待
 这个 event 完成。
 
+注意：本节时间线主要描述 CUDA event 和 stream 队列上的依赖。真实 dispatcher 还可能在 host 侧等待 routing
+metadata，例如变长 all2all 的 split sizes 或 DeepEP 的 dispatch layout signal。这个等待不会打断已经 enqueue 的
+GPU 计算，但会阻止 host 继续 enqueue 后续算子，从而压缩计算通信重叠窗口。具体同步点见
+`xtuner_ep_dispatcher.md` 的 “变长 all2all 的 host metadata 同步” 和 “DeepEP dispatcher 的对应差异”。
+
 ### 5.1 图一：CPU/host 侧顺序
 
 `MoEDecoderLayer._micro_batch_forward` 在 host 侧大致按下面顺序调用：
@@ -406,6 +411,11 @@ stream FIFO 或 event 约束能保证的偏序。为避免表格过长，主算
 compute/comm stream 上已经允许出现的操作。某个 GPU 操作可以出现在其 CPU 行之后的后续行；
 这样才能表达 CUDA 异步执行导致的计算通信重叠。
 
+这张表假设 host 能及时发起后续 dispatcher 调用。对于 `TorchAll2AllDispatcher`，第一次 dispatch 需要把变长
+split metadata 同步到 CPU；对于 `DeepEPDispatcher`，dispatch layout/handle 也有 host 可见的 GPU signal 等待。
+如果这些等待短于另一个 micro batch 已经 enqueue 的计算，表中的重叠基本成立；如果等待更长，host 无法继续发起
+`Dpost/E/Cpre` 或下一个 dispatch，实际时间线会比表中更串行。
+
 
 | CPU/host 严格时间轴                                                                                   | 计算 stream                                                   | 通信 stream                            |
 | ------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------- | ---------------------------------------- |
@@ -565,6 +575,10 @@ shared experts 的反向本地计算没有在上面的 EP dispatcher event 链
 因此，反向比前向更依赖 autograd 图的调度，但事件链的目标很明确：把 `combine` 和 `dispatch` 的反向通信从
 compute stream 中剥离出来，让它们尽可能和另一个 micro batch 的本地反向计算重叠。
 
+7.3 的六列表同样应理解为 GPU event 依赖的理想化对齐视图。前向 dispatch 阶段的 host metadata 同步不在表中展开；
+它会影响 host 继续 enqueue 后续前向节点的速度。反向通常复用前向保存的 split/handle metadata，但具体 dispatcher
+是否还有库内 signal 等待，应以 `xtuner_ep_dispatcher.md` 中对应 dispatcher 的说明为准。
+
 ## 8. TP+EP 情况下的差异
 
 当同时打开 TP 和 EP 时，`build_dispatcher` 会选择 `TorchAll2AllTPEPDispatcher`。它继承 EP-only 的
@@ -607,5 +621,7 @@ XTuner 当前 Domino EP 实现可以概括为：
   comm stream，并用 CUDA event 和 autograd hook 维持正确依赖。
 - 前向重叠需要按 event 判断：`D0` 可覆盖 `A1/Dpre1`，`D1` 可覆盖 `E0/Cpre0`，`C0/C1` 可覆盖后续
   compute；但每个 micro batch 在 `dispatch_postprocess` / `combine_postprocess` 消费通信结果前仍会等待。
+- 这些时间线没有展开 dispatcher 的 host metadata 同步；变长 all2all split list 和 DeepEP dispatch layout signal
+  会影响 host enqueue 进度，细节见 `xtuner_ep_dispatcher.md`。
 - 反向通过 `_AsyncDispatch.backward`、`_AsyncCombine.backward`、TP collective 的异步 backward 和 backward hook，
   把通信延后到梯度准备好后异步发起，并只在梯度消费点等待，从而给两个 micro batch 之间的反向计算通信重叠留下空间。

From 3e5bf67573b0d403360da4c14061f4e9ecf5e0ca Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 14 May 2026 03:12:03 +0000
Subject: [PATCH 11/25] Refactor TPEP TP collectives

Move TP communication into dispatch/combine, share sync and async collective cores, and use real TP reduce-scatter semantics. Update tests, docs, pseudocode, and validation snapshots for the new flow.
---
 .dev_scripts/validate_xtuner_tpep_md.py       |  53 +--
 CONTEXT.md                                    |  44 +++
 improve_tpep_dispatcher.py                    | 347 ++++++++++++++++++
 .../test_torch_all2all_tpep_async.py          |  36 +-
 .../module/dispatcher/torch_all2all_tpep.py   |  67 ++--
 xtuner_ep_dispatcher.md                       |  93 +++++
 xtuner_ep_domino.md                           |   3 +
 7 files changed, 580 insertions(+), 63 deletions(-)
 create mode 100644 CONTEXT.md
 create mode 100644 improve_tpep_dispatcher.py

diff --git a/.dev_scripts/validate_xtuner_tpep_md.py b/.dev_scripts/validate_xtuner_tpep_md.py
index cef1b40ff..30308b7b5 100644
--- a/.dev_scripts/validate_xtuner_tpep_md.py
+++ b/.dev_scripts/validate_xtuner_tpep_md.py
@@ -12,12 +12,11 @@
 每个 TP rank 持有 N_local=2 个 token，EP+TP 后的流程：
 
     dispatch_preprocess : 按 expert 排序（每 TP rank 独立）
-    dispatch            : EP AlltoAll（每 TP rank 独立，仅路由本 TP 的 token 副本）
-    dispatch_postprocess: TP AllGather → 将 TP slices 合并成 M_total token
-                          + 按 local expert 再排序（供 grouped GEMM）
+    dispatch            : EP AlltoAll → TP AllGather，将 TP slices 合并成 M_total token
+    dispatch_postprocess: 按 local expert 再排序（供 grouped GEMM）
     [Expert GEMM]       : 冗余计算（同一 EP rank 内各 TP rank 计算结果相同）
-    combine_preprocess  : unpermute → TP ReduceScatterSum → 恢复每 TP rank M_ep_recv
-    combine             : EP AlltoAll 逆向
+    combine_preprocess  : unpermute，恢复到 TP AllGather 顺序
+    combine             : TP ReduceScatterSum → EP AlltoAll 逆向
     combine_postprocess : unpermute + topk 加权求和 → [N_local, H]
 
 运行方式：
@@ -126,11 +125,12 @@ class ParallelInfo:
         # sorted (topk-slot-first then by expert): A0(e0), A1(e1), A1(e3), A0(e4)
         pre_hidden=(10.0, 11.0, 11.0, 10.0),
         pre_row_id_map=(0, 2, 3, 1),
-        # after EP A2A: from self=[A0(e0),A1(e1)], from ep1_tp0=[B0(e1),B1(e2)]
-        dispatch_hidden=(10.0, 11.0, 20.0, 21.0),
+        # after EP A2A + TP AllGather:
+        # tp0=[A0(e0),A1(e1),B0(e1),B1(e2)], tp1=[A3(e0),A2(e2),B2(e0),B3(e1)]
+        dispatch_hidden=(10.0, 11.0, 20.0, 21.0, 13.0, 12.0, 22.0, 23.0),
         input_splits=(2, 2),
         output_splits=(2, 2),
-        tokens_per_expert_group=(1.0, 1.0, 0.0, 0.0, 1.0, 1.0),
+        tokens_per_expert_group=(1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0),
         output_splits_tp=(4, 4),
         # after TP AllGather (tp0||tp1) + sort by local expert:
         # e0: A0,A3,B2  e1: A1,B0,B3  e2: B1,A2
@@ -139,8 +139,8 @@ class ParallelInfo:
         tokens_per_expert=(3.0, 3.0, 2.0),
         # expert adds global_expert_id * 100
         experts_out=(10.0, 13.0, 22.0, 111.0, 120.0, 123.0, 221.0, 212.0),
-        # after ReduceScatterSum — tp0 slice [0:4]
-        pre_combine_hidden=(20.0, 222.0, 240.0, 442.0),
+        # after local unpermute back to TP AllGather order
+        pre_combine_hidden=(10.0, 111.0, 120.0, 221.0, 13.0, 212.0, 22.0, 123.0),
         # after EP A2A reverse: from self=[20,222], from ep1_tp0=[622,820]
         combine_hidden=(20.0, 222.0, 622.0, 820.0),
         post_combine_hidden=(620.0, 382.0),
@@ -152,19 +152,19 @@ class ParallelInfo:
         # sorted: A3(e0), A2(e2), A3(e4), A2(e5)
         pre_hidden=(13.0, 12.0, 13.0, 12.0),
         pre_row_id_map=(1, 2, 3, 0),
-        # after EP A2A: from self=[A3(e0),A2(e2)], from ep1_tp1=[B2(e0),B3(e1)]
-        dispatch_hidden=(13.0, 12.0, 22.0, 23.0),
+        # after EP A2A + TP AllGather, same gathered tensor as ep0_tp0
+        dispatch_hidden=(10.0, 11.0, 20.0, 21.0, 13.0, 12.0, 22.0, 23.0),
         input_splits=(2, 2),
         output_splits=(2, 2),
-        tokens_per_expert_group=(1.0, 0.0, 1.0, 1.0, 1.0, 0.0),
+        tokens_per_expert_group=(1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0),
         output_splits_tp=(4, 4),
         # both tp ranks see the same gathered tensor after AllGather
         post_hidden=(10.0, 13.0, 22.0, 11.0, 20.0, 23.0, 21.0, 12.0),
         post_row_ids_map=(0, 3, 4, 6, 1, 7, 2, 5),
         tokens_per_expert=(3.0, 3.0, 2.0),
         experts_out=(10.0, 13.0, 22.0, 111.0, 120.0, 123.0, 221.0, 212.0),
-        # after ReduceScatterSum — tp1 slice [4:8]
-        pre_combine_hidden=(26.0, 424.0, 44.0, 246.0),
+        # after local unpermute back to TP AllGather order
+        pre_combine_hidden=(10.0, 111.0, 120.0, 221.0, 13.0, 212.0, 22.0, 123.0),
         # after EP A2A reverse: from self=[26,424], from ep1_tp1=[826,1024]
         combine_hidden=(26.0, 424.0, 826.0, 1024.0),
         post_combine_hidden=(604.0, 666.0),
@@ -176,19 +176,20 @@ class ParallelInfo:
         # sorted: B0(e1), B1(e2), B0(e3), B1(e4)
         pre_hidden=(20.0, 21.0, 20.0, 21.0),
         pre_row_id_map=(0, 3, 2, 1),
-        # after EP A2A: from ep0_tp0=[A1(e3),A0(e4)], from self=[B0(e3),B1(e4)]
-        dispatch_hidden=(11.0, 10.0, 20.0, 21.0),
+        # after EP A2A + TP AllGather:
+        # tp0=[A1(e3),A0(e4),B0(e3),B1(e4)], tp1=[A3(e4),A2(e5),B3(e3),B2(e5)]
+        dispatch_hidden=(11.0, 10.0, 20.0, 21.0, 13.0, 12.0, 23.0, 22.0),
         input_splits=(2, 2),
         output_splits=(2, 2),
-        tokens_per_expert_group=(1.0, 1.0, 0.0, 1.0, 1.0, 0.0),
+        tokens_per_expert_group=(1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0),
         output_splits_tp=(4, 4),
         # after TP AllGather (tp0||tp1) + sort: e3: A1,B0,B3  e4: A0,B1,A3  e5: A2,B2
         post_hidden=(11.0, 20.0, 23.0, 10.0, 21.0, 13.0, 12.0, 22.0),
         post_row_ids_map=(0, 3, 1, 4, 5, 6, 2, 7),
         tokens_per_expert=(3.0, 3.0, 2.0),
         experts_out=(311.0, 320.0, 323.0, 410.0, 421.0, 413.0, 512.0, 522.0),
-        # after ReduceScatterSum — tp0 slice [0:4]
-        pre_combine_hidden=(622.0, 820.0, 640.0, 842.0),
+        # after local unpermute back to TP AllGather order
+        pre_combine_hidden=(311.0, 410.0, 320.0, 421.0, 413.0, 512.0, 323.0, 522.0),
         # after EP A2A reverse: from ep0_tp0=[240,442], from self=[640,842]
         combine_hidden=(240.0, 442.0, 640.0, 842.0),
         post_combine_hidden=(560.0, 642.0),
@@ -200,18 +201,18 @@ class ParallelInfo:
         # sorted: B2(e0), B3(e1), B3(e3), B2(e5)
         pre_hidden=(22.0, 23.0, 23.0, 22.0),
         pre_row_id_map=(3, 2, 0, 1),
-        # after EP A2A: from ep0_tp1=[A3(e4),A2(e5)], from self=[B3(e3),B2(e5)]
-        dispatch_hidden=(13.0, 12.0, 23.0, 22.0),
+        # after EP A2A + TP AllGather, same gathered tensor as ep1_tp0
+        dispatch_hidden=(11.0, 10.0, 20.0, 21.0, 13.0, 12.0, 23.0, 22.0),
         input_splits=(2, 2),
         output_splits=(2, 2),
-        tokens_per_expert_group=(0.0, 1.0, 1.0, 1.0, 0.0, 1.0),
+        tokens_per_expert_group=(1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0),
         output_splits_tp=(4, 4),
         post_hidden=(11.0, 20.0, 23.0, 10.0, 21.0, 13.0, 12.0, 22.0),
         post_row_ids_map=(0, 3, 1, 4, 5, 6, 2, 7),
         tokens_per_expert=(3.0, 3.0, 2.0),
         experts_out=(311.0, 320.0, 323.0, 410.0, 421.0, 413.0, 512.0, 522.0),
-        # after ReduceScatterSum — tp1 slice [4:8]
-        pre_combine_hidden=(826.0, 1024.0, 646.0, 1044.0),
+        # after local unpermute back to TP AllGather order
+        pre_combine_hidden=(311.0, 410.0, 320.0, 421.0, 413.0, 512.0, 323.0, 522.0),
         # after EP A2A reverse: from ep0_tp1=[44,246], from self=[646,1044]
         combine_hidden=(44.0, 246.0, 646.0, 1044.0),
         post_combine_hidden=(944.0, 386.0),
@@ -342,7 +343,7 @@ def _run_tpep_case(parallel_info: ParallelInfo) -> dict[str, Any]:
         "input_splits": dispatched["input_splits"],
         "output_splits": dispatched["output_splits"],
         "tokens_per_expert_group": dispatched["tokens_per_expert_group"],
-        "output_splits_tp": post_dispatched["output_splits_tp"],
+        "output_splits_tp": dispatched["output_splits_tp"],
         "post_hidden": post_dispatched["hidden_states"],
         "post_row_ids_map": post_dispatched["row_ids_map"],
         "tokens_per_expert": post_dispatched["tokens_per_expert"],
diff --git a/CONTEXT.md b/CONTEXT.md
new file mode 100644
index 000000000..2490261dc
--- /dev/null
+++ b/CONTEXT.md
@@ -0,0 +1,44 @@
+# XTuner MoE Dispatch
+
+This context describes the communication language used by XTuner MoE dispatchers when Expert Parallelism and Tensor Parallelism are enabled together.
+
+## Language
+
+**TP ReduceScatterSum**:
+对同一 TP group 中完整 token 批的 hidden 做 SUM 归约，并只保留当前 TP rank 负责的 token slice 的通信语义。
+_Avoid_: all_reduce + slice
+
+**Variable TP ReduceScatterSum**:
+使用 **TP size meta** 描述不等长 token slice 的 **TP ReduceScatterSum**。
+_Avoid_: equal-only reduce scatter
+
+**TP size meta**:
+每个 TP rank 在 EP dispatch 后拥有的 token 行数列表，用来描述变长 TP token slice 的拼接和切分边界。
+_Avoid_: shape hack, split list
+
+## Relationships
+
+- **TP AllGather** 的反向通信是 **TP ReduceScatterSum**。
+- **TP ReduceScatterSum** 的反向通信是 **TP AllGather**。
+- **TP size meta** 定义 **TP ReduceScatterSum** 输出给每个 TP rank 的 token slice 边界。
+- **Variable TP ReduceScatterSum** 是 TP+EP MoE routing 下的默认语义；等长 fast path 只是实现优化。
+- **TP ReduceScatterSum** 的实现策略应集中在一个共享核心函数中，避免 combine forward 和 TP AllGather backward 分叉。
+- **TP ReduceScatterSum** 的输出 shape 严格由当前 TP rank 的 **TP size meta** 决定，允许 0 行，不引入 padding 或 capacity。
+
+## Example dialogue
+
+> **Dev:** "combine forward 和 TP AllGather backward 都能叫 **TP ReduceScatterSum** 吗？"
+> **Domain expert:** "可以。它们都是先跨 TP rank 做 SUM，再只保留当前 rank 的 token slice。具体用 reduce_scatter 还是 all_reduce + slice 是实现细节。"
+
+> **Dev:** "只支持等长 reduce scatter 够吗？"
+> **Domain expert:** "不够。EP routing 后每个 TP rank 的 token 数可能不同，默认要按 **TP size meta** 做 **Variable TP ReduceScatterSum**。"
+
+> **Dev:** "等长和变长 reduce scatter 要不要分别写在不同调用点？"
+> **Domain expert:** "不要。调用点只表达 **TP ReduceScatterSum**，共享核心函数内部选择等长 fast path 或变长路径。"
+
+> **Dev:** "如果某个 TP rank 没有 token，要不要 pad 到 1 行或固定容量？"
+> **Domain expert:** "不要。**TP ReduceScatterSum** 输出真实 token slice，0 行就是合法输出。"
+
+## Flagged ambiguities
+
+- "reduce scatter" 在本上下文中特指 **TP ReduceScatterSum**；不是只做 scatter，也不是不带 SUM 的切分。
diff --git a/improve_tpep_dispatcher.py b/improve_tpep_dispatcher.py
new file mode 100644
index 000000000..cb376c389
--- /dev/null
+++ b/improve_tpep_dispatcher.py
@@ -0,0 +1,347 @@
+"""TPEP dispatcher TP collective refactor sketch.
+
+这个文件是设计伪代码，不接入训练路径。它描述当前更轻量的改法：
+
+1. 不引入额外的执行上下文概念。
+2. 保留同步和异步两个 autograd Function，让流程仍然直观对应当前代码。
+3. 只把 TP AllGather / ReduceScatter 的核心通信、拼接、切片逻辑抽成共享函数。
+4. 异步 Function 只比同步 Function 多做 stream wait、event record、record_stream。
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+Tensor = Any
+ProcessGroup = Any
+CudaEvent = Any
+CudaStream = Any
+
+
+# =============================================================================
+# 1. 共享核心实现：同步/异步都调用这些函数
+# =============================================================================
+
+
+def tp_all_gather_forward_impl(
+    hidden: Tensor,
+    all_sizes: list[int],
+    tp_group: ProcessGroup,
+) -> tuple[Tensor, Tensor, list[Tensor]]:
+    """TP AllGather forward 的共享核心。
+
+    中文注释：这里只表达数学和 collective：
+    [M_local, H] -> all_gather -> [M_total, H]。
+    它不关心是否异步，也不关心 CUDA event。
+    """
+    hidden_for_comm = hidden.contiguous()
+    chunks = [empty_rows_like(hidden_for_comm, rows) for rows in all_sizes]
+    dist_all_gather(chunks, hidden_for_comm, group=tp_group)
+    gathered = cat_rows(chunks)
+    return gathered, hidden_for_comm, chunks
+
+
+def tp_all_gather_backward_impl(
+    grad: Tensor,
+    all_sizes: list[int],
+    tp_rank: int,
+    tp_group: ProcessGroup,
+) -> tuple[Tensor, Tensor, list[Tensor]]:
+    """TP AllGather backward 的共享核心。
+
+    中文注释：AllGather backward 的语义就是 TP ReduceScatterSum，
+    因此和 combine forward 共用同一个真正 reduce_scatter 实现。
+    """
+    return tp_reduce_scatter_sum_impl(grad, all_sizes, tp_rank, tp_group)
+
+
+def tp_reduce_scatter_sum_impl(
+    hidden: Tensor,
+    all_sizes: list[int],
+    tp_rank: int,
+    tp_group: ProcessGroup,
+) -> tuple[Tensor, Tensor, list[Tensor]]:
+    """TP ReduceScatterSum 的共享核心。
+
+    中文注释：等长时走 reduce_scatter_tensor fast path；变长时按 TP size meta
+    split 成 input_list，走 torch.distributed.reduce_scatter。
+    """
+    hidden_for_comm = hidden.contiguous()
+    out = empty_rows_like(hidden_for_comm, all_sizes[tp_rank])
+    if all_rows_are_empty(all_sizes):
+        return out, hidden_for_comm, []
+    if all_splits_equal(all_sizes):
+        dist_reduce_scatter_tensor(out, hidden_for_comm, group=tp_group)
+        return out, hidden_for_comm, []
+
+    input_chunks = split_rows(hidden_for_comm, all_sizes)
+    dist_reduce_scatter(out, input_chunks, group=tp_group)
+    return out, hidden_for_comm, input_chunks
+
+
+def tp_reduce_scatter_sum_forward_impl(
+    hidden: Tensor,
+    all_sizes: list[int],
+    tp_rank: int,
+    tp_group: ProcessGroup,
+) -> tuple[Tensor, Tensor, list[Tensor]]:
+    """TP ReduceScatterSum forward 的共享核心。"""
+    return tp_reduce_scatter_sum_impl(hidden, all_sizes, tp_rank, tp_group)
+
+
+def tp_reduce_scatter_sum_backward_impl(
+    grad_slice: Tensor,
+    all_sizes: list[int],
+    tp_group: ProcessGroup,
+) -> tuple[Tensor, Tensor, list[Tensor]]:
+    """TP ReduceScatterSum backward 的共享核心。"""
+    grad_slice_for_comm = grad_slice.contiguous()
+    chunks = [empty_rows_like(grad_slice_for_comm, rows) for rows in all_sizes]
+    dist_all_gather(chunks, grad_slice_for_comm, group=tp_group)
+    full_grad = cat_rows(chunks)
+    return full_grad, grad_slice_for_comm, chunks
+
+
+# =============================================================================
+# 2. 同步 Function：只调用共享核心
+# =============================================================================
+
+
+class TPAllGather:
+    """同步 TP AllGather 伪代码。真实代码继承 ``torch.autograd.Function``。"""
+
+    @staticmethod
+    def forward(ctx: Any, hidden: Tensor, all_sizes: list[int], tp_group: ProcessGroup, tp_rank: int) -> Tensor:
+        gathered, _, _ = tp_all_gather_forward_impl(hidden, all_sizes, tp_group)
+        ctx.all_sizes = all_sizes
+        ctx.tp_rank = tp_rank
+        ctx.tp_group = tp_group
+        return gathered
+
+    @staticmethod
+    def backward(ctx: Any, grad: Tensor) -> Tensor:
+        grad_input, _, _ = tp_all_gather_backward_impl(grad, ctx.all_sizes, ctx.tp_rank, ctx.tp_group)
+        return grad_input
+
+
+class TPReduceScatterSum:
+    """同步 TP ReduceScatterSum 伪代码。"""
+
+    @staticmethod
+    def forward(ctx: Any, hidden: Tensor, all_sizes: list[int], tp_group: ProcessGroup, tp_rank: int) -> Tensor:
+        out, _, _ = tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
+        ctx.all_sizes = all_sizes
+        ctx.tp_group = tp_group
+        return out
+
+    @staticmethod
+    def backward(ctx: Any, grad_slice: Tensor) -> Tensor:
+        full_grad, _, _ = tp_reduce_scatter_sum_backward_impl(grad_slice, ctx.all_sizes, ctx.tp_group)
+        return full_grad
+
+
+# =============================================================================
+# 3. 异步 Function：流程和同步一致，只额外包 stream/event
+# =============================================================================
+
+
+class AsyncTPAllGather:
+    """异步 TP AllGather 伪代码。"""
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        hidden: Tensor,
+        all_sizes: list[int],
+        tp_group: ProcessGroup,
+        tp_rank: int,
+        forward_previous_event: CudaEvent,
+        forward_finished_event: CudaEvent,
+        backward_previous_event: CudaEvent,
+        backward_finished_event: CudaEvent,
+        comm_stream: CudaStream,
+    ) -> Tensor:
+        with cuda_stream(comm_stream):
+            comm_stream.wait_event(forward_previous_event)
+            gathered, hidden_for_comm, chunks = tp_all_gather_forward_impl(hidden, all_sizes, tp_group)
+
+            # 中文注释：异步路径不重写 TP AllGather 逻辑，只管理 stream/event 生命周期。
+            record_stream((hidden_for_comm, chunks, gathered), comm_stream)
+            forward_finished_event.record(comm_stream)
+
+        ctx.all_sizes = all_sizes
+        ctx.tp_rank = tp_rank
+        ctx.tp_group = tp_group
+        ctx.backward_previous_event = backward_previous_event
+        ctx.backward_finished_event = backward_finished_event
+        ctx.comm_stream = comm_stream
+        return gathered
+
+    @staticmethod
+    def backward(ctx: Any, grad: Tensor) -> Tensor:
+        with cuda_stream(ctx.comm_stream):
+            ctx.comm_stream.wait_event(ctx.backward_previous_event)
+            grad_input, grad_for_comm, chunks = tp_all_gather_backward_impl(
+                grad,
+                ctx.all_sizes,
+                ctx.tp_rank,
+                ctx.tp_group,
+            )
+            record_stream((grad_for_comm, chunks, grad_input), ctx.comm_stream)
+            ctx.backward_finished_event.record(ctx.comm_stream)
+        return grad_input
+
+
+class AsyncTPReduceScatterSum:
+    """异步 TP ReduceScatterSum 伪代码。"""
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        hidden: Tensor,
+        all_sizes: list[int],
+        tp_group: ProcessGroup,
+        tp_rank: int,
+        forward_previous_event: CudaEvent,
+        forward_finished_event: CudaEvent,
+        backward_previous_event: CudaEvent,
+        backward_finished_event: CudaEvent,
+        comm_stream: CudaStream,
+    ) -> Tensor:
+        with cuda_stream(comm_stream):
+            comm_stream.wait_event(forward_previous_event)
+            out, hidden_for_comm, chunks = tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
+
+            # 中文注释：异步路径不重写 ReduceScatter 逻辑，只记录通信流持有的 tensor。
+            record_stream((hidden_for_comm, chunks, out), comm_stream)
+            forward_finished_event.record(comm_stream)
+
+        ctx.all_sizes = all_sizes
+        ctx.tp_group = tp_group
+        ctx.backward_previous_event = backward_previous_event
+        ctx.backward_finished_event = backward_finished_event
+        ctx.comm_stream = comm_stream
+        return out
+
+    @staticmethod
+    def backward(ctx: Any, grad_slice: Tensor) -> Tensor:
+        with cuda_stream(ctx.comm_stream):
+            ctx.comm_stream.wait_event(ctx.backward_previous_event)
+            full_grad, grad_slice_for_comm, chunks = tp_reduce_scatter_sum_backward_impl(
+                grad_slice,
+                ctx.all_sizes,
+                ctx.tp_group,
+            )
+            record_stream((grad_slice_for_comm, chunks, full_grad), ctx.comm_stream)
+            ctx.backward_finished_event.record(ctx.comm_stream)
+        return full_grad
+
+
+# =============================================================================
+# 4. dispatcher 仍然保持当前显式流程
+# =============================================================================
+
+
+def dispatch_tpep_pseudocode(ep_dispatched: Any, tp_group: ProcessGroup, async_op: bool) -> Any:
+    """EP dispatch 后做 TP AllGather；这里只展示同步/异步流程保持相似。"""
+    all_sizes = gather_tp_sizes(ep_dispatched.hidden_states, tp_group)
+    tp_rank = dist_get_rank(tp_group)
+
+    if async_op:
+        hidden_states = AsyncTPAllGather.forward(
+            ctx=new_ctx(),
+            hidden=ep_dispatched.hidden_states,
+            all_sizes=all_sizes,
+            tp_group=tp_group,
+            tp_rank=tp_rank,
+            forward_previous_event=ep_dispatched.forward_finished_event,
+            forward_finished_event=new_cuda_event(),
+            backward_previous_event=new_cuda_event(),
+            backward_finished_event=ep_dispatched.backward_previous_event,
+            comm_stream=get_comm_stream(),
+        )
+    else:
+        hidden_states = TPAllGather.forward(
+            ctx=new_ctx(),
+            hidden=ep_dispatched.hidden_states,
+            all_sizes=all_sizes,
+            tp_group=tp_group,
+            tp_rank=tp_rank,
+        )
+    return hidden_states
+
+
+def migration_plan() -> list[str]:
+    return [
+        "保留现有同步/异步 autograd Function，不新增 stage/context 抽象。",
+        "抽出 AllGather forward/backward 的共享核心函数。",
+        "抽出真正 reduce_scatter 的 TP ReduceScatterSum 共享核心函数。",
+        "异步 Function 只保留 wait_event、record_stream、record_event 这些异步胶水。",
+        "dispatcher 的 dispatch/combine 调用形状保持不变。",
+    ]
+
+
+# =============================================================================
+# 5. 伪代码占位函数
+# =============================================================================
+
+
+def empty_rows_like(tensor: Tensor, rows: int) -> Tensor:
+    raise NotImplementedError
+
+
+def dist_all_gather(chunks: list[Tensor], tensor: Tensor, *, group: ProcessGroup) -> None:
+    raise NotImplementedError
+
+
+def dist_reduce_scatter_tensor(output: Tensor, input: Tensor, *, group: ProcessGroup) -> None:
+    raise NotImplementedError
+
+
+def dist_reduce_scatter(output: Tensor, input_list: list[Tensor], *, group: ProcessGroup) -> None:
+    raise NotImplementedError
+
+
+def split_rows(tensor: Tensor, sizes: list[int]) -> list[Tensor]:
+    raise NotImplementedError
+
+
+def all_splits_equal(sizes: list[int]) -> bool:
+    raise NotImplementedError
+
+
+def all_rows_are_empty(sizes: list[int]) -> bool:
+    raise NotImplementedError
+
+
+def cat_rows(chunks: list[Tensor]) -> Tensor:
+    raise NotImplementedError
+
+
+def cuda_stream(stream: CudaStream) -> Any:
+    raise NotImplementedError
+
+
+def record_stream(value: Any, stream: CudaStream) -> None:
+    raise NotImplementedError
+
+
+def gather_tp_sizes(hidden: Tensor, tp_group: ProcessGroup) -> list[int]:
+    raise NotImplementedError
+
+
+def dist_get_rank(tp_group: ProcessGroup) -> int:
+    raise NotImplementedError
+
+
+def new_ctx() -> Any:
+    raise NotImplementedError
+
+
+def new_cuda_event() -> CudaEvent:
+    raise NotImplementedError
+
+
+def get_comm_stream() -> CudaStream:
+    raise NotImplementedError
diff --git a/tests/module/dispatcher/test_torch_all2all_tpep_async.py b/tests/module/dispatcher/test_torch_all2all_tpep_async.py
index ce3eceb84..9aba9a3f3 100644
--- a/tests/module/dispatcher/test_torch_all2all_tpep_async.py
+++ b/tests/module/dispatcher/test_torch_all2all_tpep_async.py
@@ -105,18 +105,26 @@ def fake_all_gather_into_tensor(output, input, group=None) -> None:
             output[0].copy_(input)
             output[1].copy_(input)
 
+    def fake_reduce_scatter_tensor(output, input, op=None, group=None) -> None:
+        output.copy_(input[: output.shape[0]])
+
+    def fake_reduce_scatter(output, input_list, op=None, group=None) -> None:
+        output.copy_(input_list[getattr(group, "rank", 0)])
+
+    def fake_all_reduce(tensor, op=None, group=None) -> None:
+        raise AssertionError("TP ReduceScatterSum should not use all_reduce + slice")
+
     def fake_all_gather(chunks, tensor, group=None) -> None:
         chunks[0].copy_(tensor)
         chunks[1].copy_(tensor + 10)
 
-    def fake_all_reduce(tensor, op=None, group=None) -> None:
-        return None
-
     monkeypatch.setattr(dist, "get_rank", fake_get_rank)
     monkeypatch.setattr(dist, "all_to_all_single", fake_all_to_all_single)
     monkeypatch.setattr(torch_all2all, "all_to_all_single_autograd", fake_ep_all_to_all_single_autograd)
     monkeypatch.setattr(dist, "all_gather_into_tensor", fake_all_gather_into_tensor)
     monkeypatch.setattr(dist, "all_gather", fake_all_gather)
+    monkeypatch.setattr(dist, "reduce_scatter_tensor", fake_reduce_scatter_tensor)
+    monkeypatch.setattr(dist, "reduce_scatter", fake_reduce_scatter)
     monkeypatch.setattr(dist, "all_reduce", fake_all_reduce)
 
     hidden = torch.randn(32, 128, device="cuda", dtype=torch.float32, requires_grad=True)
@@ -174,11 +182,16 @@ def fake_all_gather(chunks, tensor, group=None) -> None:
         for chunk in chunks:
             chunk.copy_(tensor[: chunk.shape[0]])
 
+    def fake_reduce_scatter_tensor(output, input, op=None, group=None) -> None:
+        calls.append(("reduce_scatter_tensor", _stream_id()))
+        output.copy_(input[: output.shape[0]])
+
     def fake_all_reduce(tensor, op=None, group=None) -> None:
-        calls.append(("all_reduce", _stream_id()))
+        raise AssertionError("TP AllGather backward should use reduce_scatter")
 
     monkeypatch.setattr(dist, "get_rank", fake_get_rank)
     monkeypatch.setattr(dist, "all_gather", fake_all_gather)
+    monkeypatch.setattr(dist, "reduce_scatter_tensor", fake_reduce_scatter_tensor)
     monkeypatch.setattr(dist, "all_reduce", fake_all_reduce)
 
     hidden = torch.randn(2, 3, device="cuda", requires_grad=True)
@@ -210,7 +223,7 @@ def fake_all_reduce(tensor, op=None, group=None) -> None:
     assert hidden.grad is not None
     assert calls == [
         ("all_gather", comm_stream.cuda_stream),
-        ("all_reduce", comm_stream.cuda_stream),
+        ("reduce_scatter_tensor", comm_stream.cuda_stream),
     ]
 
 
@@ -222,15 +235,20 @@ def test_async_tp_reduce_scatter_uses_comm_stream(monkeypatch) -> None:
     def fake_get_rank(group=None) -> int:
         return getattr(group, "rank", 0)
 
+    def fake_reduce_scatter(output, input_list, op=None, group=None) -> None:
+        calls.append(("reduce_scatter", _stream_id()))
+        output.copy_(input_list[getattr(group, "rank", 0)])
+
     def fake_all_reduce(tensor, op=None, group=None) -> None:
-        calls.append(("all_reduce", _stream_id()))
+        raise AssertionError("TP ReduceScatterSum should use reduce_scatter")
 
     def fake_all_gather(chunks, tensor, group=None) -> None:
         calls.append(("all_gather", _stream_id()))
         for chunk in chunks:
-            chunk.copy_(tensor[: chunk.shape[0]])
+            chunk.copy_(tensor[:1].expand_as(chunk))
 
     monkeypatch.setattr(dist, "get_rank", fake_get_rank)
+    monkeypatch.setattr(dist, "reduce_scatter", fake_reduce_scatter)
     monkeypatch.setattr(dist, "all_reduce", fake_all_reduce)
     monkeypatch.setattr(dist, "all_gather", fake_all_gather)
 
@@ -243,7 +261,7 @@ def fake_all_gather(chunks, tensor, group=None) -> None:
 
     out = _async_tp_reduce_scatter_sum(
         hidden,
-        all_sizes=[2, 2],
+        all_sizes=[1, 3],
         tp_group=group,  # type: ignore[arg-type]
         forward_previous_event=forward_previous_event,
         forward_finished_event=forward_finished_event,
@@ -261,6 +279,6 @@ def fake_all_gather(chunks, tensor, group=None) -> None:
 
     assert hidden.grad is not None
     assert calls == [
-        ("all_reduce", comm_stream.cuda_stream),
+        ("reduce_scatter", comm_stream.cuda_stream),
         ("all_gather", comm_stream.cuda_stream),
     ]
diff --git a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
index c6ac2f7e8..1774fd708 100644
--- a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
+++ b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
@@ -70,10 +70,6 @@ def _record_stream(value: Any, stream: torch.cuda.Stream) -> None:
             _record_stream(item, stream)
 
 
-def _local_tp_offset(all_sizes: list[int], tp_rank: int) -> int:
-    return sum(all_sizes[:tp_rank])
-
-
 def _tp_all_gather_forward_impl(
     hidden: torch.Tensor,
     all_sizes: list[int],
@@ -92,12 +88,33 @@ def _tp_all_gather_backward_impl(
     all_sizes: list[int],
     tp_rank: int,
     tp_group: dist.ProcessGroup,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    # TODO: use reduce_scatter instead of all_reduce
-    grad = grad.contiguous()
-    dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=tp_group)
-    offset = _local_tp_offset(all_sizes, tp_rank)
-    return grad[offset : offset + all_sizes[tp_rank]].clone(), grad
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    return _tp_reduce_scatter_sum_impl(grad, all_sizes, tp_rank, tp_group)
+
+
+def _tp_reduce_scatter_sum_impl(
+    hidden: torch.Tensor,
+    all_sizes: list[int],
+    tp_rank: int,
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    """Run TP ReduceScatterSum and return tensors whose lifetime may need
+    recording."""
+    hidden = hidden.contiguous()
+    assert hidden.shape[0] == sum(all_sizes), "TP ReduceScatterSum input rows must match TP size meta."
+
+    out = hidden.new_empty((all_sizes[tp_rank], *hidden.shape[1:]))
+    if hidden.shape[0] == 0:
+        # 中文注释：所有 TP rank 都没有 token 时没有实际通信量，直接返回合法的 0 行 slice。
+        return out, hidden, []
+
+    if all(size == all_sizes[0] for size in all_sizes):
+        dist.reduce_scatter_tensor(out, hidden, op=dist.ReduceOp.SUM, group=tp_group)
+        return out, hidden, []
+
+    input_chunks = list(torch.split(hidden, all_sizes, dim=0))
+    dist.reduce_scatter(out, input_chunks, op=dist.ReduceOp.SUM, group=tp_group)
+    return out, hidden, input_chunks
 
 
 def _tp_reduce_scatter_sum_forward_impl(
@@ -105,12 +122,8 @@ def _tp_reduce_scatter_sum_forward_impl(
     all_sizes: list[int],
     tp_rank: int,
     tp_group: dist.ProcessGroup,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    # TODO: use reduce_scatter instead of all_reduce
-    reduced = hidden.contiguous().clone()
-    dist.all_reduce(reduced, op=dist.ReduceOp.SUM, group=tp_group)
-    offset = _local_tp_offset(all_sizes, tp_rank)
-    return reduced[offset : offset + all_sizes[tp_rank]].contiguous(), reduced
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    return _tp_reduce_scatter_sum_impl(hidden, all_sizes, tp_rank, tp_group)
 
 
 def _tp_reduce_scatter_sum_backward_impl(
@@ -128,8 +141,7 @@ class _TPAllGather(torch.autograd.Function):
     """TP AllGather with autograd support.
 
     Forward : ``all_gather`` across the TP group, concatenating along the token dim.
-    Backward: ``all_reduce`` (SUM) the gradient then slice, accumulating gradients from
-              each TP weight shard into the original local token slice.
+    Backward: ``reduce_scatter`` (SUM) the gradient into the original local token slice.
     """
 
     @staticmethod
@@ -153,7 +165,7 @@ def backward(
         ctx: Any,
         grad: torch.Tensor,
     ) -> tuple[torch.Tensor, None, None, None, None]:
-        grad_input, _ = _tp_all_gather_backward_impl(grad, ctx.all_sizes, ctx.tp_rank, ctx.tp_group)
+        grad_input, _, _ = _tp_all_gather_backward_impl(grad, ctx.all_sizes, ctx.tp_rank, ctx.tp_group)
         return grad_input, None, None, None, None
 
 
@@ -161,8 +173,8 @@ class _AsyncTPAllGather(torch.autograd.Function):
     """TP AllGather on dispatcher comm stream.
 
     Forward : wait for the previous event, then all-gather token slices.
-    Backward: wait until post-dispatch grad is ready, all-reduce grad, then
-              slice this TP rank's input grad.
+    Backward: wait until post-dispatch grad is ready, then reduce-scatter grad
+              into this TP rank's input slice.
     """
 
     @staticmethod
@@ -203,14 +215,14 @@ def backward(
     ) -> tuple[torch.Tensor, None, None, None, None, None, None, None, None, None]:
         with torch.cuda.stream(ctx.comm_stream):
             ctx.comm_stream.wait_event(ctx.backward_previous_event)
-            grad_input, grad_for_comm = _tp_all_gather_backward_impl(
+            grad_input, grad_for_comm, chunks = _tp_all_gather_backward_impl(
                 grad,
                 ctx.all_sizes,
                 ctx.tp_rank,
                 ctx.tp_group,
             )
 
-            _record_stream((grad_for_comm, grad_input), ctx.comm_stream)
+            _record_stream((grad_for_comm, chunks, grad_input), ctx.comm_stream)
             ctx.backward_finished_event.record(ctx.comm_stream)
 
         return grad_input, None, None, None, None, None, None, None, None, None
@@ -219,8 +231,7 @@ def backward(
 class _TPReduceScatterSum(torch.autograd.Function):
     """TP ReduceScatterSum with autograd support.
 
-    Forward : ``all_reduce`` (SUM) then slice — equivalent to a sum reduce-scatter
-              for the unequal-size token case used here.
+    Forward : ``reduce_scatter`` (SUM) to this TP rank's local token slice.
     Backward: ``all_gather`` the gradient slices to reconstruct the full gradient tensor,
               matching the sum reduction in the forward pass.
     """
@@ -234,7 +245,7 @@ def forward(
         tp_size: int,
         tp_rank: int,
     ) -> torch.Tensor:
-        out, _ = _tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
+        out, _, _ = _tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
         ctx.tp_group = tp_group
         ctx.tp_size = tp_size
         ctx.tp_rank = tp_rank
@@ -269,10 +280,10 @@ def forward(
     ) -> torch.Tensor:
         with torch.cuda.stream(comm_stream):
             comm_stream.wait_event(forward_previous_event)
-            out, reduced = _tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
+            out, hidden_for_comm, chunks = _tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
 
             # 中文注释：同步/异步共用 TP ReduceScatter 核心逻辑；异步只额外管理 stream/event。
-            _record_stream((hidden, reduced, out), comm_stream)
+            _record_stream((hidden_for_comm, chunks, out), comm_stream)
             forward_finished_event.record(comm_stream)
 
         ctx.tp_group = tp_group
diff --git a/xtuner_ep_dispatcher.md b/xtuner_ep_dispatcher.md
index 33db7d0a0..13f2cf758 100644
--- a/xtuner_ep_dispatcher.md
+++ b/xtuner_ep_dispatcher.md
@@ -465,3 +465,96 @@ num_recv_tokens_per_expert_list, handle, event
 - host 等 metadata 时无法继续 enqueue 后续本地算子和通信。
 - 如果 metadata 等待短于可覆盖的另一个 micro batch 计算，重叠效果基本保留。
 - 如果 metadata 等待更长，`xtuner_ep_domino.md` 7.3 中的理想时间线会被压缩，真实重叠比例下降。
+
+## TP+EP 中 ReduceScatterSum 与 padding/capacity 取舍
+
+`TorchAll2AllTPEPDispatcher` 在 EP dispatch 之后会额外做 TP AllGather，在 combine 阶段会做 TP
+ReduceScatterSum。这里的 **TP ReduceScatterSum** 是语义名：对同一 TP group 中完整 token 批的 hidden 做
+SUM 归约，并只保留当前 TP rank 负责的 token slice。它同时出现在两个方向：
+
+- combine forward：row-parallel expert output 先做 TP ReduceScatterSum，再进入 EP combine all2all。
+- TP AllGather backward：AllGather 的反向也是 TP ReduceScatterSum。
+
+TP+EP MoE routing 后，同一个 EP rank 上的不同 TP rank 不一定收到相同数量的 token。以 `tp_size=2` 为例：
+
+```text
+EP dispatch 后：
+  TP rank0 hidden: [3, H]
+  TP rank1 hidden: [5, H]
+
+TP size meta:
+  output_splits_tp = [3, 5]
+
+TP AllGather 后每个 TP rank 都看到：
+  gathered hidden: [8, H] = rank0 rows [0:3] | rank1 rows [3:8]
+```
+
+expert 的 row-parallel down projection 后，两个 TP rank 都有 `[8, H]` 的 partial hidden。TP ReduceScatterSum 需要
+对这两个 `[8, H]` 做 SUM，并按同一个 TP size meta 切回：
+
+```text
+TP rank0 output: rows [0:3] -> [3, H]
+TP rank1 output: rows [3:8] -> [5, H]
+```
+
+因此当前设计选择是：**优先实现真正的变长 `reduce_scatter`，不引入 padding/capacity**。dispatcher 已经有
+`output_splits_tp` 作为 TP size meta，正好可以作为变长 reduce scatter 的 split 边界：
+
+```python
+input_tensor_list = list(torch.split(hidden.contiguous(), output_splits_tp, dim=0))
+output = torch.empty_like(input_tensor_list[tp_rank])
+dist.reduce_scatter(output, input_tensor_list, op=dist.ReduceOp.SUM, group=tp_group)
+```
+
+当 `output_splits_tp` 全部相等时，可以在共享核心函数内部走等长 fast path：
+
+```python
+dist.reduce_scatter_tensor(output, hidden.contiguous(), op=dist.ReduceOp.SUM, group=tp_group)
+```
+
+但这只是实现优化，不改变 dispatcher 对外的 TP size meta 语义。真正的 ReduceScatterSum 实现应集中在一个共享核心
+函数中，避免 combine forward 和 TP AllGather backward 分叉。
+
+### 为什么不先做 padding/capacity
+
+padding 和 capacity 带来的收益不同，需要分开看：
+
+- **padding 的收益** 是把一次变长 collective 包装成等长 collective。通信前把每个 TP rank 的真实 slice pad 到同一
+  长度，通信时就可以使用 `reduce_scatter_tensor` / `all_gather_into_tensor` 这类 tensor fast path。若 capacity
+  仍由本 step 的 `max(output_splits_tp)` 动态决定，padding 只减少大块 hidden collective 的 variable-list
+  split 开销，不能消除 TP size meta 的 CPU 同步。
+- **固定 capacity 的收益** 是让这个等长长度跨 step 稳定下来。只有 capacity 是配置值或静态上界时，shape 才稳定，
+  大块通信 shape 才能从本 step 的 Python split list 中解耦，后续也才更容易做 CUDA graph、buffer 复用或通信
+  buffer 预分配。
+- **对 Domino 的影响** 主要来自 host CPU split metadata 同步。只做动态 padding 时，host 仍要拿到
+  `output_splits_tp` 来决定 pad/unpad 边界和本步 capacity，因此这个同步点仍然存在；固定 capacity 才可能减少
+  运行时 shape 决策，并把大块通信从 split-list 发起路径中移出。这和前面 EP All2All 的 host metadata 同步问题
+  类似：host 等 split list 时，已经 enqueue 到 GPU 的另一个 micro batch 计算仍可继续，但 host 不能继续
+  enqueue 后续本地算子和通信；如果等待时间超过可覆盖窗口，会压缩 Domino 的真实 overlap。
+
+因此，如果只是每步动态取 `capacity = max(output_splits_tp)`，它仍然需要 TP size meta 的 CPU 同步，只能减少
+variable collective 的 split-list 开销，不能获得固定 shape / CUDA graph，也不能消除 TP size meta 对 Domino
+host enqueue 的影响。
+
+但它会把问题从通信层扩散到 layout 层。至少有两种做法：
+
+1. **通信内部 padding，通信后立刻 unpad。**
+
+   例如 TP size meta 是 `[3, 5]`，capacity 取 `5`。AllGather 前把 rank0 的 `[3, H]` pad 到 `[5, H]`，
+   rank1 保持 `[5, H]`；等长 AllGather 得到 `[10, H]` 后再按真实 sizes compact 回 `[8, H]`。ReduceScatter
+   则需要先按 `[3, 5]` 切分、分别 pad 到 `[5, H]`，concat 成 `[10, H]` 后走 `reduce_scatter_tensor`，
+   最后再 unpad 成当前 rank 的真实 `[3, H]` 或 `[5, H]`。
+
+   这个方案不改变 expert 看到的 token 数，但增加 pad/unpad copy，并且仍然需要 TP size meta。收益要靠 benchmark
+   证明。
+
+2. **端到端 capacity，让 padding token 进入 expert layout。**
+
+   这种方案会让 `[tp_size * capacity, H]` 直接进入 `dispatch_postprocess` 和 grouped GEMM。它需要定义 padding
+   token 的 expert 归属、`tokens_per_expert` 是否包含 padding、grouped GEMM 是否计算 padding、combine 如何剔除
+   padding，以及 `row_ids_map` / `topk_weights` 如何保证 padding 不影响真实 token。
+
+   这会把改动扩散到 routing、expert layout、postprocess/combine，不适合作为替换 `all_reduce + slice` 的第一步。
+
+因此当前阶段的目标是局部替换：用真正的 TP ReduceScatterSum 取代 `all_reduce + slice`，输出 shape 严格按照
+`output_splits_tp[tp_rank]` 分配，允许 0 行，不做 padding/capacity。
diff --git a/xtuner_ep_domino.md b/xtuner_ep_domino.md
index 8a19bbf8d..dd20e419b 100644
--- a/xtuner_ep_domino.md
+++ b/xtuner_ep_domino.md
@@ -603,6 +603,9 @@ compute stream 中剥离出来，让它们尽可能和另一个 micro batch 的
   后续 EP combine 再等待 TP ReduceScatterSum 完成事件。
 - 反向中，TP AllGather / ReduceScatterSum 对应的反向 collective 也在 comm stream 上执行，并通过 autograd hook
   把等待点放在梯度真正被消费的位置。
+- `TP ReduceScatterSum` 使用真正的 reduce-scatter 语义：等长 token slice 走 `reduce_scatter_tensor` fast path，
+  变长 token slice 按 TP size meta 切成 `input_list` 后走 `reduce_scatter`。这避免了 `all_reduce` 后再丢弃非本
+  rank slice 的额外通信和写入。
 
 因此 TP+EP 下的 Domino 流水不再只覆盖 EP dispatch/combine；TP collectives 也可以和另一个 micro batch 的
 attention、expert 或 shared expert 计算重叠。变长 TP AllGather 仍需要先收集每个 TP rank 的 token 数用于分配输出

From 419134f2eb78d7cbbdfce777eb4510fb174333b8 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 14 May 2026 12:47:16 +0000
Subject: [PATCH 12/25] Support Naive ExpertTP without EP

---
 tests/model/test_moe_expert_tp_without_ep.py  |  69 ++++++++
 .../module/dispatcher/test_noep_expert_tp.py  | 134 ++++++++++++++++
 xtuner/v1/model/moe/moe.py                    |  13 +-
 xtuner/v1/module/dispatcher/__init__.py       |   1 +
 xtuner/v1/module/dispatcher/base.py           |  30 +++-
 xtuner/v1/module/dispatcher/expert_tp.py      | 147 ++++++++++++++++++
 6 files changed, 386 insertions(+), 8 deletions(-)
 create mode 100644 tests/model/test_moe_expert_tp_without_ep.py
 create mode 100644 tests/module/dispatcher/test_noep_expert_tp.py
 create mode 100644 xtuner/v1/module/dispatcher/expert_tp.py

diff --git a/tests/model/test_moe_expert_tp_without_ep.py b/tests/model/test_moe_expert_tp_without_ep.py
new file mode 100644
index 000000000..3993c2dce
--- /dev/null
+++ b/tests/model/test_moe_expert_tp_without_ep.py
@@ -0,0 +1,69 @@
+import os
+import unittest
+
+import torch
+import torch.distributed as dist
+
+from xtuner._testing import DeterministicDDPTestCase
+from xtuner.v1.module.attention import MHAConfig
+from xtuner.v1.module.dispatcher.base import NaiveDispatcher
+from xtuner.v1.module.router.greedy import GreedyRouterConfig
+from xtuner.v1.model.moe.qwen3 import Qwen3MoEConfig
+
+
+def _tiny_moe_cfg() -> Qwen3MoEConfig:
+    return Qwen3MoEConfig(
+        vocab_size=32,
+        max_position_embeddings=32,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        num_hidden_layers=1,
+        hidden_size=16,
+        intermediate_size=32,
+        rms_norm_eps=1e-6,
+        rope_theta=1e6,
+        hidden_act="silu",
+        attention=MHAConfig(num_attention_heads=2, num_key_value_heads=1, head_dim=8, qk_norm=True),
+        tie_word_embeddings=False,
+        n_routed_experts=4,
+        n_shared_experts=0,
+        num_experts_per_tok=2,
+        first_k_dense_replace=0,
+        hidden_factor=1.0,
+        moe_intermediate_size=8,
+        router=GreedyRouterConfig(scoring_func="softmax", norm_topk_prob=True, router_scaling_factor=1.0),
+        ep_size=1,
+        expert_tp_size=2,
+        dispatcher=None,
+        compile_cfg=False,
+        balancing_loss_cfg=None,
+        z_loss_cfg=None,
+    )
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "CUDA/NCCL is required for real ExpertTP mesh validation.")
+class TestMoEExpertTPWithoutEP(DeterministicDDPTestCase):
+    def test_builds_real_ep_ownership_mesh_for_expert_tp_without_ep(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+
+        model = _tiny_moe_cfg().build()
+        layer = model.layers["0"]
+
+        # 中文注释：不开 EP 但开启 expert TP 时，EP ownership 维度仍然真实存在，只是 size=1。
+        assert model.ep_mesh is not None
+        assert model.tp_mesh is not None
+        assert model.ep_mesh.size() == 1
+        assert model.tp_mesh.size() == 2
+        assert layer.experts.fused_w1w3.ep_size == 1
+        assert layer.experts.fused_w1w3.tp_size == 2
+        assert isinstance(layer.dispatcher, NaiveDispatcher)
+
+        dist.barrier()
+        dist.destroy_process_group(pg)
+
+    @property
+    def world_size(self) -> int:
+        return int(os.getenv("XTUNER_TEST_WORLD_SIZE", "2"))
diff --git a/tests/module/dispatcher/test_noep_expert_tp.py b/tests/module/dispatcher/test_noep_expert_tp.py
new file mode 100644
index 000000000..ffb924ed9
--- /dev/null
+++ b/tests/module/dispatcher/test_noep_expert_tp.py
@@ -0,0 +1,134 @@
+import os
+import unittest
+
+import torch
+import torch.distributed as dist
+
+from xtuner._testing import DeterministicDDPTestCase
+from xtuner.v1.module.dispatcher import build_dispatcher
+from xtuner.v1.module.dispatcher.base import NaiveDispatcher
+
+
+def _payload_for_rank(rank: int, device: torch.device) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    rows = rank + 2
+    hidden_size = 8
+    start = sum(i + 2 for i in range(rank))
+    token_ids = torch.arange(start, start + rows, device=device)
+    hidden = token_ids.to(torch.float32).unsqueeze(1) * 10 + torch.arange(hidden_size, device=device)
+    topk_ids = torch.stack((token_ids % 4, (token_ids + 1) % 4), dim=1).to(torch.int64)
+    topk_weights = torch.stack(
+        (
+            torch.full((rows,), 1.0, device=device),
+            torch.full((rows,), 0.25 * (rank + 1), device=device),
+        ),
+        dim=1,
+    )
+    return hidden, topk_ids, topk_weights
+
+
+def _run_dispatcher(
+    dispatcher,
+    hidden_states: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    expert_scale: float = 1.0,
+):
+    pre_dispatched = dispatcher.dispatch_preprocess(
+        hidden_states=hidden_states,
+        topk_ids=topk_ids,
+    )
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        decoding=False,
+    )
+    post_dispatched = dispatcher.dispatch_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+    )
+    # 中文注释：dispatcher 测试不跑真实 row-parallel expert；
+    # 每个 TP rank 提供 1/tp_size 的 partial output，真实 ReduceScatterSum 后应回到 baseline。
+    experts_results = post_dispatched["hidden_states"] * expert_scale
+    pre_combined = dispatcher.combine_preprocess(
+        hidden_states=experts_results,
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+    )
+    combined = dispatcher.combine(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        decoding=False,
+    )
+    result = dispatcher.combine_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        combined=combined,
+    )
+    return result, dispatched, post_dispatched, pre_combined, combined
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "CUDA/NCCL is required for real ExpertTP dispatcher validation.")
+class TestNaiveExpertTPDispatcher(DeterministicDDPTestCase):
+    def test_sync_path_uses_real_tp_collectives(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        world_size = dist.get_world_size()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+
+        ep_groups = [dist.new_group([ep_rank], backend="nccl") for ep_rank in range(world_size)]
+        ep_group = ep_groups[rank]
+
+        local_hidden, local_topk_ids, local_topk_weights = _payload_for_rank(rank, device)
+        full_payloads = [_payload_for_rank(tp_rank, device) for tp_rank in range(world_size)]
+        full_hidden = torch.cat([payload[0] for payload in full_payloads], dim=0)
+        full_topk_ids = torch.cat([payload[1] for payload in full_payloads], dim=0)
+        full_topk_weights = torch.cat([payload[2] for payload in full_payloads], dim=0)
+
+        baseline = NaiveDispatcher(n_routed_experts=4)
+        baseline_result, _, baseline_post, _, _ = _run_dispatcher(
+            baseline,
+            full_hidden,
+            full_topk_ids,
+            full_topk_weights,
+        )
+
+        dispatcher = build_dispatcher(
+            dispatcher=None,
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=dist.group.WORLD,
+        )
+        result, dispatched, post_dispatched, pre_combined, combined = _run_dispatcher(
+            dispatcher,
+            local_hidden,
+            local_topk_ids,
+            local_topk_weights,
+            expert_scale=1.0 / world_size,
+        )
+
+        all_sizes = [tp_rank + 2 for tp_rank in range(world_size)]
+        slice_start = sum(all_sizes[:rank])
+        slice_end = slice_start + all_sizes[rank]
+
+        torch.testing.assert_close(dispatched["hidden_states"], full_hidden)
+        torch.testing.assert_close(dispatched["topk_ids"], full_topk_ids)
+        torch.testing.assert_close(dispatched["topk_weights"], full_topk_weights)
+        torch.testing.assert_close(post_dispatched["tokens_per_expert"], baseline_post["tokens_per_expert"])
+        torch.testing.assert_close(pre_combined["hidden_states"], baseline_result["hidden_states"] / world_size)
+        torch.testing.assert_close(combined["hidden_states"], baseline_result["hidden_states"][slice_start:slice_end])
+        torch.testing.assert_close(result["hidden_states"], baseline_result["hidden_states"][slice_start:slice_end])
+
+        dist.barrier()
+        for group in ep_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
+    @property
+    def world_size(self) -> int:
+        return int(os.getenv("XTUNER_TEST_WORLD_SIZE", "2"))
diff --git a/xtuner/v1/model/moe/moe.py b/xtuner/v1/model/moe/moe.py
index 3a27d6054..41b0b90f0 100644
--- a/xtuner/v1/model/moe/moe.py
+++ b/xtuner/v1/model/moe/moe.py
@@ -176,14 +176,17 @@ class MoE(BaseModel):
 
     def __init__(self, config: MoEConfig):
         super().__init__(config)
-        if config.ep_size is not None and config.ep_size > 1:
+        ep_size = config.ep_size if config.ep_size is not None else 1
+        expert_tp_size = config.expert_tp_size if config.expert_tp_size > 1 else 1
+        if ep_size > 1 or expert_tp_size > 1:
             world_size = dist.get_world_size()
-            expert_tp_size = config.expert_tp_size if config.expert_tp_size > 1 else 1
-            fsdp_size = world_size // (config.ep_size * expert_tp_size)
+            fsdp_size = world_size // (ep_size * expert_tp_size)
             if expert_tp_size > 1:
+                # 中文注释：即使不开 EP，也保留 size=1 的 expert ownership 维度，
+                # 这样 routed experts 和 expert TP 仍然使用同一套 mesh 语义。
                 _init_mesh = init_device_mesh(
                     DEVICE,
-                    (fsdp_size, config.ep_size, expert_tp_size),
+                    (fsdp_size, ep_size, expert_tp_size),
                     mesh_dim_names=(
                         f"{self.config.mesh_prefix}.dp",
                         f"{self.config.mesh_prefix}.ep",
@@ -195,7 +198,7 @@ def __init__(self, config: MoEConfig):
             else:
                 _init_mesh = init_device_mesh(
                     DEVICE,
-                    (fsdp_size, config.ep_size),
+                    (fsdp_size, ep_size),
                     mesh_dim_names=(f"{self.config.mesh_prefix}.dp", f"{self.config.mesh_prefix}.ep"),
                 )
                 self.ep_mesh = _init_mesh[f"{self.config.mesh_prefix}.ep"]
diff --git a/xtuner/v1/module/dispatcher/__init__.py b/xtuner/v1/module/dispatcher/__init__.py
index 710360b94..914a88acc 100644
--- a/xtuner/v1/module/dispatcher/__init__.py
+++ b/xtuner/v1/module/dispatcher/__init__.py
@@ -42,6 +42,7 @@ def build_dispatcher(
         return NaiveDispatcher(
             n_routed_experts=n_routed_experts,
             process_group=ep_group,
+            tp_group=tp_group,
             training_dtype=training_dtype,
             generate_dtype=generate_dtype,
         )  # type: ignore[return-value]
diff --git a/xtuner/v1/module/dispatcher/base.py b/xtuner/v1/module/dispatcher/base.py
index b268d75f6..072bffd21 100644
--- a/xtuner/v1/module/dispatcher/base.py
+++ b/xtuner/v1/module/dispatcher/base.py
@@ -11,6 +11,8 @@
 
 from xtuner.v1.ops import permute, unpermute
 
+from .expert_tp import ExpertTP
+
 
 HiddenStates: TypeAlias = torch.Tensor
 
@@ -174,7 +176,9 @@ class DispacherInterface(
 class NaivePreDispatchResult(PreDispatchResult): ...
 
 
-class NaiveDispatchResult(DispatchResult): ...
+class NaiveDispatchResult(DispatchResult, total=False):
+    topk_ids: torch.Tensor
+    tp_size_meta: list[int]
 
 
 class NaivePostDispatchResult(PostDispatchResult):
@@ -205,6 +209,7 @@ def __init__(
         *,
         n_routed_experts: int,
         process_group: torch.distributed.ProcessGroup | None = None,
+        tp_group: torch.distributed.ProcessGroup | None = None,
         training_dtype: Literal["fp8", "bf16"] = "bf16",
         generate_dtype: Literal["fp8", "bf16"] = "bf16",
     ):
@@ -216,6 +221,7 @@ def __init__(
         )
         if self._process_group is not None:
             assert self._process_group.size() == 1, "Naive dispatcher is only for ep=1."
+        self._expert_tp = ExpertTP(tp_group) if tp_group is not None and tp_group.size() > 1 else None
 
     @override
     def dispatch_preprocess(
@@ -245,6 +251,17 @@ def dispatch(
         if async_op:
             raise NotImplementedError("Naive dispatcher is only for ep=1.")
 
+        if self._expert_tp is not None:
+            hidden_states, tp_size_meta = self._expert_tp.all_gather(pre_dispatched["hidden_states"])
+            topk_ids = self._expert_tp.all_gather_metadata(pre_dispatched["topk_ids"], tp_size_meta)
+            topk_weights = self._expert_tp.all_gather_metadata(topk_weights, tp_size_meta)
+            return NaiveDispatchResult(
+                hidden_states=hidden_states,
+                topk_ids=topk_ids,
+                topk_weights=topk_weights,
+                tp_size_meta=tp_size_meta,
+            )
+
         return NaiveDispatchResult(
             hidden_states=pre_dispatched["hidden_states"],
             topk_weights=topk_weights,
@@ -262,11 +279,11 @@ def dispatch_postprocess(
         if async_op:
             raise NotImplementedError("Naive dispatcher is only for ep=1.")
 
+        topk_ids = dispatched["topk_ids"] if self._expert_tp is not None else pre_dispatched["topk_ids"]
         hidden_states, row_id_maps = permute(
             dispatched["hidden_states"],
-            pre_dispatched["topk_ids"].to(torch.int32),
+            topk_ids.to(torch.int32),
         )
-        topk_ids = pre_dispatched["topk_ids"]
         tokens_per_expert = torch.histc(topk_ids, bins=self._n_routed_experts, min=0, max=self._n_routed_experts)
         if decoding:
             raise NotImplementedError
@@ -318,6 +335,13 @@ def combine(
         if decoding:
             raise NotImplementedError
         else:
+            if self._expert_tp is not None:
+                hidden_states = self._expert_tp.reduce_scatter_sum(
+                    pre_combined["hidden_states"],
+                    dispatched["tp_size_meta"],
+                )
+                return NaiveCombineResult(hidden_states=hidden_states)
+
             return NaiveCombineResult(hidden_states=pre_combined["hidden_states"])
 
     @override
diff --git a/xtuner/v1/module/dispatcher/expert_tp.py b/xtuner/v1/module/dispatcher/expert_tp.py
new file mode 100644
index 000000000..e61411f19
--- /dev/null
+++ b/xtuner/v1/module/dispatcher/expert_tp.py
@@ -0,0 +1,147 @@
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+import torch.distributed as dist
+
+
+def _tp_all_gather_forward_impl(
+    tensor: torch.Tensor,
+    all_sizes: list[int],
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    tensor = tensor.contiguous()
+    chunks = [torch.empty((size, *tensor.shape[1:]), dtype=tensor.dtype, device=tensor.device) for size in all_sizes]
+    dist.all_gather(chunks, tensor, group=tp_group)
+    return torch.cat(chunks, dim=0), tensor, chunks
+
+
+def _tp_reduce_scatter_sum_impl(
+    tensor: torch.Tensor,
+    all_sizes: list[int],
+    tp_rank: int,
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    tensor = tensor.contiguous()
+    assert tensor.shape[0] == sum(all_sizes), "TP ReduceScatterSum input rows must match TP size meta."
+
+    out = tensor.new_empty((all_sizes[tp_rank], *tensor.shape[1:]))
+    if tensor.shape[0] == 0:
+        # 中文注释：所有 TP rank 都没有 token 时没有通信量，直接返回当前 rank 的 0 行 slice。
+        return out, tensor, []
+
+    if all(size == all_sizes[0] for size in all_sizes):
+        dist.reduce_scatter_tensor(out, tensor, op=dist.ReduceOp.SUM, group=tp_group)
+        return out, tensor, []
+
+    input_chunks = list(torch.split(tensor, all_sizes, dim=0))
+    dist.reduce_scatter(out, input_chunks, op=dist.ReduceOp.SUM, group=tp_group)
+    return out, tensor, input_chunks
+
+
+def _tp_all_gather_backward_impl(
+    grad: torch.Tensor,
+    all_sizes: list[int],
+    tp_rank: int,
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    return _tp_reduce_scatter_sum_impl(grad, all_sizes, tp_rank, tp_group)
+
+
+def _tp_reduce_scatter_sum_backward_impl(
+    grad_slice: torch.Tensor,
+    all_sizes: list[int],
+    tp_group: dist.ProcessGroup,
+) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
+    grad_slice = grad_slice.contiguous()
+    chunks = [
+        torch.empty((size, *grad_slice.shape[1:]), dtype=grad_slice.dtype, device=grad_slice.device)
+        for size in all_sizes
+    ]
+    dist.all_gather(chunks, grad_slice, group=tp_group)
+    return torch.cat(chunks, dim=0), grad_slice, chunks
+
+
+class _TPAllGather(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx: Any,
+        tensor: torch.Tensor,
+        all_sizes: list[int],
+        tp_group: dist.ProcessGroup,
+        tp_size: int,
+        tp_rank: int,
+    ) -> torch.Tensor:
+        gathered, _, _ = _tp_all_gather_forward_impl(tensor, all_sizes, tp_group)
+        ctx.all_sizes = all_sizes
+        ctx.tp_group = tp_group
+        ctx.tp_rank = tp_rank
+        return gathered
+
+    @staticmethod
+    def backward(ctx: Any, grad: torch.Tensor) -> tuple[torch.Tensor, None, None, None, None]:
+        grad_input, _, _ = _tp_all_gather_backward_impl(grad, ctx.all_sizes, ctx.tp_rank, ctx.tp_group)
+        return grad_input, None, None, None, None
+
+
+class _TPReduceScatterSum(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx: Any,
+        tensor: torch.Tensor,
+        all_sizes: list[int],
+        tp_group: dist.ProcessGroup,
+        tp_size: int,
+        tp_rank: int,
+    ) -> torch.Tensor:
+        out, _, _ = _tp_reduce_scatter_sum_impl(tensor, all_sizes, tp_rank, tp_group)
+        ctx.all_sizes = all_sizes
+        ctx.tp_group = tp_group
+        return out
+
+    @staticmethod
+    def backward(ctx: Any, grad_slice: torch.Tensor) -> tuple[torch.Tensor, None, None, None, None]:
+        full_grad, _, _ = _tp_reduce_scatter_sum_backward_impl(grad_slice, ctx.all_sizes, ctx.tp_group)
+        return full_grad, None, None, None, None
+
+
+class ExpertTP:
+    """Token-sliced Expert TP collectives shared by dispatcher routing
+    paths."""
+
+    def __init__(self, tp_group: dist.ProcessGroup) -> None:
+        self._tp_group = tp_group
+        self._tp_size = tp_group.size()
+
+    def gather_size_meta(self, tensor: torch.Tensor) -> list[int]:
+        if self._tp_size == 1:
+            return [tensor.shape[0]]
+
+        local_size = tensor.new_tensor([tensor.shape[0]], dtype=torch.long)
+        all_sizes_t = tensor.new_empty([self._tp_size], dtype=torch.long)
+        dist.all_gather_into_tensor(all_sizes_t, local_size, group=self._tp_group)
+        return [int(size) for size in all_sizes_t.tolist()]
+
+    def all_gather(self, tensor: torch.Tensor, all_sizes: list[int] | None = None) -> tuple[torch.Tensor, list[int]]:
+        if self._tp_size == 1:
+            return tensor, [tensor.shape[0]]
+
+        if all_sizes is None:
+            all_sizes = self.gather_size_meta(tensor)
+
+        tp_rank = dist.get_rank(group=self._tp_group)
+        gathered = _TPAllGather.apply(tensor, all_sizes, self._tp_group, self._tp_size, tp_rank)
+        return gathered, all_sizes
+
+    def all_gather_metadata(self, tensor: torch.Tensor, all_sizes: list[int]) -> torch.Tensor:
+        # 中文注释：topk_ids/topk_weights 和 hidden 使用同一份 TP size meta，保证 source token 对齐。
+        gathered, _ = self.all_gather(tensor, all_sizes)
+        return gathered
+
+    def reduce_scatter_sum(self, tensor: torch.Tensor, all_sizes: list[int]) -> torch.Tensor:
+        if self._tp_size == 1:
+            return tensor
+
+        tp_rank = dist.get_rank(group=self._tp_group)
+        return _TPReduceScatterSum.apply(tensor, all_sizes, self._tp_group, self._tp_size, tp_rank)

From a3ecc11f2f0ea8075b140c917f85d64d5f22b083 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Fri, 15 May 2026 03:35:00 +0000
Subject: [PATCH 13/25] Support async Naive ExpertTP events

---
 CONTEXT.md                                    |  31 ++-
 .../module/dispatcher/test_noep_expert_tp.py  | 174 +++++++++++++++
 xtuner/v1/module/dispatcher/base.py           | 183 +++++++++++++--
 xtuner/v1/module/dispatcher/expert_tp.py      | 191 ++++++++++++++++
 xtuner_ep_dispatcher.md                       | 142 +++++++++++-
 xtuner_etp.md                                 | 211 ++++++++++++++++++
 6 files changed, 905 insertions(+), 27 deletions(-)
 create mode 100644 xtuner_etp.md

diff --git a/CONTEXT.md b/CONTEXT.md
index 2490261dc..298b15b1c 100644
--- a/CONTEXT.md
+++ b/CONTEXT.md
@@ -1,6 +1,6 @@
 # XTuner MoE Dispatch
 
-This context describes the communication language used by XTuner MoE dispatchers when Expert Parallelism and Tensor Parallelism are enabled together.
+This context describes the communication language used by XTuner MoE dispatchers when routed experts use Expert Parallelism or Expert Tensor Parallelism.
 
 ## Language
 
@@ -13,17 +13,33 @@ _Avoid_: all_reduce + slice
 _Avoid_: equal-only reduce scatter
 
 **TP size meta**:
-每个 TP rank 在 EP dispatch 后拥有的 token 行数列表，用来描述变长 TP token slice 的拼接和切分边界。
+每个 expert TP rank 在 TP AllGather 前、当前 dispatcher token 空间中拥有的 token 行数列表，用来描述变长 TP token slice 的拼接和切分边界。
 _Avoid_: shape hack, split list
 
+**Token-sliced Expert TP**:
+expert MLP 权重按 TP 切分，并让每个 expert TP rank 只保留自己的 token slice；expert 前用 **TP AllGather** 得到完整 token 批，expert 后用 **TP ReduceScatterSum** 回到本 rank 的 token slice。
+_Also called_: ExpertTP in dispatcher code
+_Avoid_: replicated-token expert TP
+
+**Domino-compatible ExpertTP**:
+让 **Token-sliced Expert TP** 的 **TP AllGather** 属于 dispatcher dispatch 通信段，让 **TP ReduceScatterSum** 属于 dispatcher combine 通信段，从而能被 Domino micro-batch 流水隐藏的 MoE expert TP 语义。
+_Avoid_: attention TP, dense MLP TP
+
 ## Relationships
 
 - **TP AllGather** 的反向通信是 **TP ReduceScatterSum**。
 - **TP ReduceScatterSum** 的反向通信是 **TP AllGather**。
 - **TP size meta** 定义 **TP ReduceScatterSum** 输出给每个 TP rank 的 token slice 边界。
-- **Variable TP ReduceScatterSum** 是 TP+EP MoE routing 下的默认语义；等长 fast path 只是实现优化。
+- **Token-sliced Expert TP** 是 `expert_tp_size > 1` 的默认语义；`ep_size=1` 时 EP AllToAll 退化为空，但 TP AllGather / TP ReduceScatterSum 仍然保留。
+- **Variable TP ReduceScatterSum** 是 routed MoE token-sliced expert TP 下的默认语义；等长 fast path 只是实现优化。
 - **TP ReduceScatterSum** 的实现策略应集中在一个共享核心函数中，避免 combine forward 和 TP AllGather backward 分叉。
 - **TP ReduceScatterSum** 的输出 shape 严格由当前 TP rank 的 **TP size meta** 决定，允许 0 行，不引入 padding 或 capacity。
+- 当 `ep_size=1` 且 `expert_tp_size>1` 时，expert ownership 维度仍然存在，只是大小为 1；所有 routed experts 都属于这个唯一 EP rank。
+- 在 Naive routing + **Token-sliced Expert TP** 下，**TP size meta** 记录 source token rows；在 EP routing + **Token-sliced Expert TP** 下，**TP size meta** 记录 EP routing 后的 route-copy rows。
+- **Token-sliced Expert TP** 的异步边界由 TP AllGather 和 **TP ReduceScatterSum** 定义；这个边界不依赖 EP 是否开启。
+- 当前支持范围是 Naive routing + **Token-sliced Expert TP** 和 All2All routing + **Token-sliced Expert TP**；DeepEP routing + **Token-sliced Expert TP** 暂不作为目标语义。
+- **Domino-compatible ExpertTP** 只覆盖 MoE routed experts 的 **Token-sliced Expert TP** 通信隐藏，不表示 attention 或 dense MLP 的普通 TP。
+- 进入 routed experts 前，每个 expert TP rank 已经持有不重复的 source token slice；这些 slice 可以来自不同样本，也可以来自同一样本的不同序列片段。
 
 ## Example dialogue
 
@@ -39,6 +55,15 @@ _Avoid_: shape hack, split list
 > **Dev:** "如果某个 TP rank 没有 token，要不要 pad 到 1 行或固定容量？"
 > **Domain expert:** "不要。**TP ReduceScatterSum** 输出真实 token slice，0 行就是合法输出。"
 
+> **Dev:** "不开 EP 只开 expert TP 时，是不是可以让每个 TP rank 都持有完整 token 批，最后做 all-reduce？"
+> **Domain expert:** "不采用这个语义。无 EP expert TP 仍然是 **Token-sliced Expert TP**：前向按 TP token slice 进入 dispatcher，expert 前 all-gather，expert 后 reduce-scatter。"
+
+> **Dev:** "Naive routing + expert TP 时，TP AllGather 是 gather source tokens，还是 gather topK 展开后的 route-copy tokens？"
+> **Domain expert:** "gather source tokens。topK route-copy 展开仍然发生在 expert layout 阶段；expert 输出先 fold 回 source token partial output，再做 **TP ReduceScatterSum**。"
+
+> **Dev:** "Naive routing + expert TP 的异步路径要不要和 EP routing + expert TP 使用同一套分段语义？"
+> **Domain expert:** "要。Naive routing 没有 EP AllToAll，但 **TP AllGather** 和 **TP ReduceScatterSum** 仍然是 dispatcher 通信段，异步依赖边界应保持一致。"
+
 ## Flagged ambiguities
 
 - "reduce scatter" 在本上下文中特指 **TP ReduceScatterSum**；不是只做 scatter，也不是不带 SUM 的切分。
diff --git a/tests/module/dispatcher/test_noep_expert_tp.py b/tests/module/dispatcher/test_noep_expert_tp.py
index ffb924ed9..e119ae7ff 100644
--- a/tests/module/dispatcher/test_noep_expert_tp.py
+++ b/tests/module/dispatcher/test_noep_expert_tp.py
@@ -32,19 +32,23 @@ def _run_dispatcher(
     topk_ids: torch.Tensor,
     topk_weights: torch.Tensor,
     expert_scale: float = 1.0,
+    async_op: bool = False,
 ):
     pre_dispatched = dispatcher.dispatch_preprocess(
         hidden_states=hidden_states,
         topk_ids=topk_ids,
+        async_op=async_op,
     )
     dispatched = dispatcher.dispatch(
         pre_dispatched=pre_dispatched,
         topk_weights=topk_weights,
         decoding=False,
+        async_op=async_op,
     )
     post_dispatched = dispatcher.dispatch_postprocess(
         pre_dispatched=pre_dispatched,
         dispatched=dispatched,
+        async_op=async_op,
     )
     # 中文注释：dispatcher 测试不跑真实 row-parallel expert；
     # 每个 TP rank 提供 1/tp_size 的 partial output，真实 ReduceScatterSum 后应回到 baseline。
@@ -54,6 +58,7 @@ def _run_dispatcher(
         pre_dispatched=pre_dispatched,
         dispatched=dispatched,
         post_dispatched=post_dispatched,
+        async_op=async_op,
     )
     combined = dispatcher.combine(
         pre_dispatched=pre_dispatched,
@@ -61,6 +66,7 @@ def _run_dispatcher(
         post_dispatched=post_dispatched,
         pre_combined=pre_combined,
         decoding=False,
+        async_op=async_op,
     )
     result = dispatcher.combine_postprocess(
         pre_dispatched=pre_dispatched,
@@ -68,10 +74,15 @@ def _run_dispatcher(
         post_dispatched=post_dispatched,
         pre_combined=pre_combined,
         combined=combined,
+        async_op=async_op,
     )
     return result, dispatched, post_dispatched, pre_combined, combined
 
 
+def _assert_cuda_event(value: torch.cuda.Event | None) -> None:
+    assert isinstance(value, torch.cuda.Event)
+
+
 @unittest.skipIf(not torch.cuda.is_available(), "CUDA/NCCL is required for real ExpertTP dispatcher validation.")
 class TestNaiveExpertTPDispatcher(DeterministicDDPTestCase):
     def test_sync_path_uses_real_tp_collectives(self) -> None:
@@ -129,6 +140,169 @@ def test_sync_path_uses_real_tp_collectives(self) -> None:
             dist.destroy_process_group(group)
         dist.destroy_process_group(pg)
 
+    def test_async_path_exposes_events_at_stage_boundaries(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        world_size = dist.get_world_size()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+
+        ep_groups = [dist.new_group([ep_rank], backend="nccl") for ep_rank in range(world_size)]
+        ep_group = ep_groups[rank]
+        dispatcher = build_dispatcher(
+            dispatcher=None,
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=dist.group.WORLD,
+        )
+
+        local_hidden, local_topk_ids, local_topk_weights = _payload_for_rank(rank, device)
+        hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        hidden = hidden_leaf * 1.25
+        topk_weights = topk_weights_leaf * 0.5
+
+        pre_dispatched = dispatcher.dispatch_preprocess(
+            hidden_states=hidden,
+            topk_ids=local_topk_ids,
+            async_op=True,
+        )
+        _assert_cuda_event(pre_dispatched["forward_finished_event"])
+        _assert_cuda_event(pre_dispatched["backward_previous_event"])
+
+        dispatched = dispatcher.dispatch(
+            pre_dispatched=pre_dispatched,
+            topk_weights=topk_weights,
+            decoding=False,
+            async_op=True,
+        )
+        _assert_cuda_event(dispatched["forward_finished_event"])
+        _assert_cuda_event(dispatched["backward_previous_event"])
+        _assert_cuda_event(dispatched["topk_weights_backward_previous_event"])
+
+        # 中文注释：这里不手动 wait dispatch event，由 dispatch_postprocess 自己建立等待边界。
+        post_dispatched = dispatcher.dispatch_postprocess(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            async_op=True,
+        )
+
+        total_rows = sum(tp_rank + 2 for tp_rank in range(world_size))
+        assert dispatched["hidden_states"].shape == (total_rows, local_hidden.shape[1])
+        assert dispatched["topk_ids"].shape == (total_rows, local_topk_ids.shape[1])
+        assert dispatched["topk_weights"].shape == (total_rows, local_topk_weights.shape[1])
+        assert post_dispatched["hidden_states"].shape == (
+            total_rows * local_topk_ids.shape[1],
+            local_hidden.shape[1],
+        )
+
+        experts_results = post_dispatched["hidden_states"] / world_size
+        pre_combined = dispatcher.combine_preprocess(
+            hidden_states=experts_results,
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            async_op=True,
+        )
+        _assert_cuda_event(pre_combined["forward_finished_event"])
+        _assert_cuda_event(pre_combined["backward_previous_event"])
+        assert pre_combined["hidden_states"].shape == (total_rows, local_hidden.shape[1])
+
+        combined = dispatcher.combine(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            pre_combined=pre_combined,
+            decoding=False,
+            async_op=True,
+        )
+        _assert_cuda_event(combined["forward_finished_event"])
+        _assert_cuda_event(combined["backward_previous_event"])
+        assert combined["hidden_states"].shape == local_hidden.shape
+
+        # 中文注释：这里同样不手动 wait combine event，由 combine_postprocess 返回本 rank source token slice。
+        result = dispatcher.combine_postprocess(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            pre_combined=pre_combined,
+            combined=combined,
+            async_op=True,
+        )
+        assert result["hidden_states"].shape == local_hidden.shape
+
+        result["hidden_states"].square().sum().backward()
+        torch.cuda.synchronize()
+        assert hidden_leaf.grad is not None
+        assert topk_weights_leaf.grad is not None
+
+        dist.barrier()
+        for group in ep_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
+    def test_async_sync_path_matches_output_and_gradients(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        world_size = dist.get_world_size()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+
+        ep_groups = [dist.new_group([ep_rank], backend="nccl") for ep_rank in range(world_size)]
+        ep_group = ep_groups[rank]
+        dispatcher = build_dispatcher(
+            dispatcher=None,
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=dist.group.WORLD,
+        )
+
+        local_hidden, local_topk_ids, local_topk_weights = _payload_for_rank(rank, device)
+        sync_hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        sync_topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        sync_hidden = sync_hidden_leaf * 1.25
+        sync_topk_weights = sync_topk_weights_leaf * 0.5
+        sync_result, *_ = _run_dispatcher(
+            dispatcher,
+            sync_hidden,
+            local_topk_ids,
+            sync_topk_weights,
+            expert_scale=1.0 / world_size,
+            async_op=False,
+        )
+        sync_loss = sync_result["hidden_states"].square().sum()
+        sync_loss.backward()
+        torch.cuda.synchronize()
+
+        async_hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        async_topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        async_hidden = async_hidden_leaf * 1.25
+        async_topk_weights = async_topk_weights_leaf * 0.5
+        async_result, *_ = _run_dispatcher(
+            dispatcher,
+            async_hidden,
+            local_topk_ids,
+            async_topk_weights,
+            expert_scale=1.0 / world_size,
+            async_op=True,
+        )
+        async_loss = async_result["hidden_states"].square().sum()
+        async_loss.backward()
+        torch.cuda.synchronize()
+
+        torch.testing.assert_close(async_result["hidden_states"], sync_result["hidden_states"])
+        assert sync_hidden_leaf.grad is not None
+        assert async_hidden_leaf.grad is not None
+        assert sync_topk_weights_leaf.grad is not None
+        assert async_topk_weights_leaf.grad is not None
+        torch.testing.assert_close(async_hidden_leaf.grad, sync_hidden_leaf.grad)
+        torch.testing.assert_close(async_topk_weights_leaf.grad, sync_topk_weights_leaf.grad)
+
+        dist.barrier()
+        for group in ep_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
     @property
     def world_size(self) -> int:
         return int(os.getenv("XTUNER_TEST_WORLD_SIZE", "2"))
diff --git a/xtuner/v1/module/dispatcher/base.py b/xtuner/v1/module/dispatcher/base.py
index 072bffd21..94d1f0ed3 100644
--- a/xtuner/v1/module/dispatcher/base.py
+++ b/xtuner/v1/module/dispatcher/base.py
@@ -17,6 +17,20 @@
 HiddenStates: TypeAlias = torch.Tensor
 
 
+def _get_backward_pre_hook(backward_previous_event: torch.cuda.Event):
+    def _backward_pre_hook(*_):
+        torch.cuda.current_stream().wait_event(backward_previous_event)
+
+    return _backward_pre_hook
+
+
+def _get_backward_hook(backward_finished_event: torch.cuda.Event):
+    def _backward_hook(*_):
+        backward_finished_event.record()
+
+    return _backward_hook
+
+
 class PreDispatchResult(TypedDict):
     hidden_states: torch.Tensor
     topk_ids: torch.Tensor
@@ -173,22 +187,31 @@ class DispacherInterface(
 ): ...
 
 
-class NaivePreDispatchResult(PreDispatchResult): ...
+class NaivePreDispatchResult(PreDispatchResult, total=False):
+    forward_finished_event: torch.cuda.Event | None
+    backward_previous_event: torch.cuda.Event | None
 
 
 class NaiveDispatchResult(DispatchResult, total=False):
     topk_ids: torch.Tensor
     tp_size_meta: list[int]
+    forward_finished_event: torch.cuda.Event | None
+    backward_previous_event: torch.cuda.Event | None
+    topk_weights_backward_previous_event: torch.cuda.Event | None
 
 
 class NaivePostDispatchResult(PostDispatchResult):
     row_ids_map: torch.Tensor
 
 
-class NaivePreCombineResult(PreCombineResult): ...
+class NaivePreCombineResult(PreCombineResult, total=False):
+    forward_finished_event: torch.cuda.Event | None
+    backward_previous_event: torch.cuda.Event | None
 
 
-class NaiveCombineResult(CombineResult): ...
+class NaiveCombineResult(CombineResult, total=False):
+    forward_finished_event: torch.cuda.Event | None
+    backward_previous_event: torch.cuda.Event | None
 
 
 class NaivePostCombineResult(PostCombineResult): ...
@@ -204,6 +227,8 @@ class NaiveDispatcher(
         NaivePostCombineResult,
     ]
 ):
+    _comm_stream: torch.cuda.Stream | None = None
+
     def __init__(
         self,
         *,
@@ -222,6 +247,8 @@ def __init__(
         if self._process_group is not None:
             assert self._process_group.size() == 1, "Naive dispatcher is only for ep=1."
         self._expert_tp = ExpertTP(tp_group) if tp_group is not None and tp_group.size() > 1 else None
+        if self._expert_tp is not None and NaiveDispatcher._comm_stream is None:
+            NaiveDispatcher._comm_stream = torch.cuda.Stream()
 
     @override
     def dispatch_preprocess(
@@ -230,9 +257,23 @@ def dispatch_preprocess(
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         async_op: bool = False,
-    ) -> PreDispatchResult:
+    ) -> NaivePreDispatchResult:
         if async_op:
-            raise NotImplementedError("Naive dispatcher is only for ep=1.")
+            if self._expert_tp is None:
+                raise NotImplementedError("Naive dispatcher async_op=True requires ExpertTP.")
+
+            forward_finished_event = torch.cuda.Event()
+            forward_finished_event.record()
+            backward_previous_event = torch.cuda.Event()
+            if hidden_states.grad_fn is not None:
+                hidden_states.grad_fn.register_prehook(_get_backward_pre_hook(backward_previous_event))
+
+            return NaivePreDispatchResult(
+                hidden_states=hidden_states,
+                topk_ids=topk_ids,
+                forward_finished_event=forward_finished_event,
+                backward_previous_event=backward_previous_event,
+            )
 
         return NaivePreDispatchResult(
             hidden_states=hidden_states,
@@ -243,13 +284,66 @@ def dispatch_preprocess(
     def dispatch(
         self,
         *,
-        pre_dispatched: PreDispatchResult,
+        pre_dispatched: NaivePreDispatchResult,
         topk_weights: torch.Tensor,
         async_op: bool = False,
         decoding: bool = False,
     ) -> NaiveDispatchResult:
         if async_op:
-            raise NotImplementedError("Naive dispatcher is only for ep=1.")
+            if self._expert_tp is None:
+                raise NotImplementedError("Naive dispatcher async_op=True requires ExpertTP.")
+
+            forward_previous_event = pre_dispatched["forward_finished_event"]
+            backward_finished_event = pre_dispatched["backward_previous_event"]
+            assert forward_previous_event is not None, "Use async_op=True for dispatch_preprocess!"
+            assert backward_finished_event is not None, "Use async_op=True for dispatch_preprocess!"
+            assert self._comm_stream is not None
+
+            tp_size_meta = self._expert_tp.gather_size_meta(pre_dispatched["hidden_states"])
+            # 中文注释：dispatch 内部的 TP AllGather 都排在同一个 comm stream，
+            # 互相不需要 event 串行化；只在 dispatch 阶段边界记录最终完成事件。
+            forward_finished_event = torch.cuda.Event()
+            hidden_backward_previous_event = torch.cuda.Event()
+            topk_weights_backward_previous_event = torch.cuda.Event()
+            topk_weights_backward_finished_event = torch.cuda.Event()
+            if topk_weights.grad_fn is not None:
+                topk_weights.grad_fn.register_prehook(_get_backward_pre_hook(topk_weights_backward_finished_event))
+
+            hidden_states = self._expert_tp.async_all_gather(
+                pre_dispatched["hidden_states"],
+                all_sizes=tp_size_meta,
+                forward_previous_event=forward_previous_event,
+                forward_finished_event=None,
+                backward_previous_event=hidden_backward_previous_event,
+                backward_finished_event=backward_finished_event,
+                comm_stream=self._comm_stream,
+            )
+            topk_ids = self._expert_tp.async_all_gather_metadata(
+                pre_dispatched["topk_ids"],
+                all_sizes=tp_size_meta,
+                forward_previous_event=None,
+                forward_finished_event=None,
+                comm_stream=self._comm_stream,
+            )
+            topk_weights = self._expert_tp.async_all_gather(
+                topk_weights,
+                all_sizes=tp_size_meta,
+                forward_previous_event=None,
+                forward_finished_event=forward_finished_event,
+                backward_previous_event=topk_weights_backward_previous_event,
+                backward_finished_event=topk_weights_backward_finished_event,
+                comm_stream=self._comm_stream,
+            )
+
+            return NaiveDispatchResult(
+                hidden_states=hidden_states,
+                topk_ids=topk_ids,
+                topk_weights=topk_weights,
+                tp_size_meta=tp_size_meta,
+                forward_finished_event=forward_finished_event,
+                backward_previous_event=hidden_backward_previous_event,
+                topk_weights_backward_previous_event=topk_weights_backward_previous_event,
+            )
 
         if self._expert_tp is not None:
             hidden_states, tp_size_meta = self._expert_tp.all_gather(pre_dispatched["hidden_states"])
@@ -277,7 +371,11 @@ def dispatch_postprocess(
         decoding: bool = False,
     ) -> NaivePostDispatchResult:
         if async_op:
-            raise NotImplementedError("Naive dispatcher is only for ep=1.")
+            if self._expert_tp is None:
+                raise NotImplementedError("Naive dispatcher async_op=True requires ExpertTP.")
+            forward_finished_event = dispatched["forward_finished_event"]
+            assert forward_finished_event is not None, "Use async_op=True for dispatch!"
+            torch.cuda.current_stream().wait_event(forward_finished_event)
 
         topk_ids = dispatched["topk_ids"] if self._expert_tp is not None else pre_dispatched["topk_ids"]
         hidden_states, row_id_maps = permute(
@@ -285,6 +383,12 @@ def dispatch_postprocess(
             topk_ids.to(torch.int32),
         )
         tokens_per_expert = torch.histc(topk_ids, bins=self._n_routed_experts, min=0, max=self._n_routed_experts)
+        if async_op:
+            backward_previous_event = dispatched["backward_previous_event"]
+            assert backward_previous_event is not None, "Use async_op=True for dispatch!"
+            if hidden_states.grad_fn is not None:
+                hidden_states.grad_fn.register_hook(_get_backward_hook(backward_previous_event))
+
         if decoding:
             raise NotImplementedError
         else:
@@ -304,19 +408,37 @@ def combine_preprocess(
         post_dispatched: NaivePostDispatchResult,
         async_op: bool = False,
         decoding: bool = False,
-    ) -> PreCombineResult:
+    ) -> NaivePreCombineResult:
         if async_op:
-            raise NotImplementedError("Naive dispatcher is only for ep=1.")
+            if self._expert_tp is None:
+                raise NotImplementedError("Naive dispatcher async_op=True requires ExpertTP.")
 
         hidden_states = unpermute(
             input_act=hidden_states,
             row_id_map=post_dispatched["row_ids_map"],
             probs=dispatched["topk_weights"],
         )
+        if async_op:
+            backward_previous_event = torch.cuda.Event()
+            forward_finished_event = torch.cuda.Event()
+            forward_finished_event.record()
+            if hidden_states.grad_fn is not None:
+                hidden_states.grad_fn.register_prehook(_get_backward_pre_hook(backward_previous_event))
+                topk_weights_backward_previous_event = dispatched["topk_weights_backward_previous_event"]
+                assert topk_weights_backward_previous_event is not None, "Use async_op=True for dispatch!"
+                hidden_states.grad_fn.register_hook(_get_backward_hook(topk_weights_backward_previous_event))
+        else:
+            backward_previous_event = None
+            forward_finished_event = None
+
         if decoding:
             raise NotImplementedError("NaiveDispatcher does not support decoding.")
         else:
-            return PreCombineResult(hidden_states=hidden_states)
+            return NaivePreCombineResult(
+                hidden_states=hidden_states,
+                backward_previous_event=backward_previous_event,
+                forward_finished_event=forward_finished_event,
+            )
 
     @override
     def combine(
@@ -330,12 +452,37 @@ def combine(
         decoding: bool = False,
     ) -> NaiveCombineResult:
         if async_op:
-            raise NotImplementedError("Naive dispatcher is only for ep=1.")
+            if self._expert_tp is None:
+                raise NotImplementedError("Naive dispatcher async_op=True requires ExpertTP.")
 
         if decoding:
             raise NotImplementedError
         else:
             if self._expert_tp is not None:
+                if async_op:
+                    forward_previous_event = pre_combined["forward_finished_event"]
+                    backward_finished_event = pre_combined["backward_previous_event"]
+                    assert forward_previous_event is not None, "Use async_op=True for combine_preprocess!"
+                    assert backward_finished_event is not None, "Use async_op=True for combine_preprocess!"
+                    assert self._comm_stream is not None
+
+                    forward_finished_event = torch.cuda.Event()
+                    backward_previous_event = torch.cuda.Event()
+                    hidden_states = self._expert_tp.async_reduce_scatter_sum(
+                        pre_combined["hidden_states"],
+                        all_sizes=dispatched["tp_size_meta"],
+                        forward_previous_event=forward_previous_event,
+                        forward_finished_event=forward_finished_event,
+                        backward_previous_event=backward_previous_event,
+                        backward_finished_event=backward_finished_event,
+                        comm_stream=self._comm_stream,
+                    )
+                    return NaiveCombineResult(
+                        hidden_states=hidden_states,
+                        forward_finished_event=forward_finished_event,
+                        backward_previous_event=backward_previous_event,
+                    )
+
                 hidden_states = self._expert_tp.reduce_scatter_sum(
                     pre_combined["hidden_states"],
                     dispatched["tp_size_meta"],
@@ -356,6 +503,16 @@ def combine_postprocess(
         async_op: bool = False,
     ) -> PostCombineResult:
         if async_op:
-            raise NotImplementedError("Naive dispatcher is only for ep=1.")
+            if self._expert_tp is None:
+                raise NotImplementedError("Naive dispatcher async_op=True requires ExpertTP.")
+            forward_finished_event = combined["forward_finished_event"]
+            backward_previous_event = combined["backward_previous_event"]
+            assert forward_finished_event is not None, "Use async_op=True for combine!"
+            assert backward_previous_event is not None, "Use async_op=True for combine!"
+            torch.cuda.current_stream().wait_event(forward_finished_event)
+            hidden_states = combined["hidden_states"].view_as(combined["hidden_states"])
+            if hidden_states.grad_fn is not None:
+                hidden_states.grad_fn.register_hook(_get_backward_hook(backward_previous_event))
+            return PostCombineResult(hidden_states=hidden_states)
 
         return PostCombineResult(hidden_states=combined["hidden_states"])
diff --git a/xtuner/v1/module/dispatcher/expert_tp.py b/xtuner/v1/module/dispatcher/expert_tp.py
index e61411f19..3d5b4b5ef 100644
--- a/xtuner/v1/module/dispatcher/expert_tp.py
+++ b/xtuner/v1/module/dispatcher/expert_tp.py
@@ -6,6 +6,14 @@
 import torch.distributed as dist
 
 
+def _record_stream(value: Any, stream: torch.cuda.Stream) -> None:
+    if isinstance(value, torch.Tensor):
+        value.record_stream(stream)
+    elif isinstance(value, (list, tuple)):
+        for item in value:
+            _record_stream(item, stream)
+
+
 def _tp_all_gather_forward_impl(
     tensor: torch.Tensor,
     all_sizes: list[int],
@@ -85,6 +93,60 @@ def backward(ctx: Any, grad: torch.Tensor) -> tuple[torch.Tensor, None, None, No
         return grad_input, None, None, None, None
 
 
+class _AsyncTPAllGather(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx: Any,
+        tensor: torch.Tensor,
+        all_sizes: list[int],
+        tp_group: dist.ProcessGroup,
+        tp_size: int,
+        tp_rank: int,
+        forward_previous_event: torch.cuda.Event | None,
+        forward_finished_event: torch.cuda.Event | None,
+        backward_previous_event: torch.cuda.Event,
+        backward_finished_event: torch.cuda.Event,
+        comm_stream: torch.cuda.Stream,
+    ) -> torch.Tensor:
+        with torch.cuda.stream(comm_stream):
+            if forward_previous_event is not None:
+                comm_stream.wait_event(forward_previous_event)
+            gathered, tensor_for_comm, chunks = _tp_all_gather_forward_impl(tensor, all_sizes, tp_group)
+            # 中文注释：异步路径只增加 stream/event 管理，collective 核心逻辑和同步路径一致。
+            _record_stream((tensor_for_comm, chunks, gathered), comm_stream)
+            if forward_finished_event is not None:
+                forward_finished_event.record(comm_stream)
+
+        ctx.all_sizes = all_sizes
+        ctx.tp_group = tp_group
+        ctx.tp_rank = tp_rank
+        ctx.backward_previous_event = backward_previous_event
+        ctx.backward_finished_event = backward_finished_event
+        ctx.comm_stream = comm_stream
+        return gathered
+
+    @staticmethod
+    def backward(
+        ctx: Any,
+        grad: torch.Tensor,
+    ) -> tuple[torch.Tensor, None, None, None, None, None, None, None, None, None]:
+        grad_ready_event = torch.cuda.Event()
+        grad_ready_event.record()
+        with torch.cuda.stream(ctx.comm_stream):
+            ctx.comm_stream.wait_event(ctx.backward_previous_event)
+            ctx.comm_stream.wait_event(grad_ready_event)
+            grad_input, grad_for_comm, chunks = _tp_all_gather_backward_impl(
+                grad,
+                ctx.all_sizes,
+                ctx.tp_rank,
+                ctx.tp_group,
+            )
+            _record_stream((grad_for_comm, chunks, grad_input), ctx.comm_stream)
+            ctx.backward_finished_event.record(ctx.comm_stream)
+
+        return grad_input, None, None, None, None, None, None, None, None, None
+
+
 class _TPReduceScatterSum(torch.autograd.Function):
     @staticmethod
     def forward(
@@ -106,6 +168,56 @@ def backward(ctx: Any, grad_slice: torch.Tensor) -> tuple[torch.Tensor, None, No
         return full_grad, None, None, None, None
 
 
+class _AsyncTPReduceScatterSum(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx: Any,
+        tensor: torch.Tensor,
+        all_sizes: list[int],
+        tp_group: dist.ProcessGroup,
+        tp_size: int,
+        tp_rank: int,
+        forward_previous_event: torch.cuda.Event,
+        forward_finished_event: torch.cuda.Event,
+        backward_previous_event: torch.cuda.Event,
+        backward_finished_event: torch.cuda.Event,
+        comm_stream: torch.cuda.Stream,
+    ) -> torch.Tensor:
+        with torch.cuda.stream(comm_stream):
+            comm_stream.wait_event(forward_previous_event)
+            out, tensor_for_comm, chunks = _tp_reduce_scatter_sum_impl(tensor, all_sizes, tp_rank, tp_group)
+            # 中文注释：TP ReduceScatterSum 属于 combine 通信段，输出事件交给 combine_postprocess 等待。
+            _record_stream((tensor_for_comm, chunks, out), comm_stream)
+            forward_finished_event.record(comm_stream)
+
+        ctx.all_sizes = all_sizes
+        ctx.tp_group = tp_group
+        ctx.backward_previous_event = backward_previous_event
+        ctx.backward_finished_event = backward_finished_event
+        ctx.comm_stream = comm_stream
+        return out
+
+    @staticmethod
+    def backward(
+        ctx: Any,
+        grad_slice: torch.Tensor,
+    ) -> tuple[torch.Tensor, None, None, None, None, None, None, None, None, None]:
+        grad_ready_event = torch.cuda.Event()
+        grad_ready_event.record()
+        with torch.cuda.stream(ctx.comm_stream):
+            ctx.comm_stream.wait_event(ctx.backward_previous_event)
+            ctx.comm_stream.wait_event(grad_ready_event)
+            full_grad, grad_slice_for_comm, chunks = _tp_reduce_scatter_sum_backward_impl(
+                grad_slice,
+                ctx.all_sizes,
+                ctx.tp_group,
+            )
+            _record_stream((grad_slice_for_comm, chunks, full_grad), ctx.comm_stream)
+            ctx.backward_finished_event.record(ctx.comm_stream)
+
+        return full_grad, None, None, None, None, None, None, None, None, None
+
+
 class ExpertTP:
     """Token-sliced Expert TP collectives shared by dispatcher routing
     paths."""
@@ -139,9 +251,88 @@ def all_gather_metadata(self, tensor: torch.Tensor, all_sizes: list[int]) -> tor
         gathered, _ = self.all_gather(tensor, all_sizes)
         return gathered
 
+    def async_all_gather(
+        self,
+        tensor: torch.Tensor,
+        all_sizes: list[int],
+        forward_previous_event: torch.cuda.Event | None,
+        forward_finished_event: torch.cuda.Event | None,
+        backward_previous_event: torch.cuda.Event,
+        backward_finished_event: torch.cuda.Event,
+        comm_stream: torch.cuda.Stream,
+    ) -> torch.Tensor:
+        if self._tp_size == 1:
+            if forward_finished_event is not None:
+                forward_finished_event.record()
+            return tensor
+
+        tp_rank = dist.get_rank(group=self._tp_group)
+        return _AsyncTPAllGather.apply(
+            tensor,
+            all_sizes,
+            self._tp_group,
+            self._tp_size,
+            tp_rank,
+            forward_previous_event,
+            forward_finished_event,
+            backward_previous_event,
+            backward_finished_event,
+            comm_stream,
+        )
+
+    def async_all_gather_metadata(
+        self,
+        tensor: torch.Tensor,
+        all_sizes: list[int],
+        forward_previous_event: torch.cuda.Event | None,
+        forward_finished_event: torch.cuda.Event | None,
+        comm_stream: torch.cuda.Stream,
+    ) -> torch.Tensor:
+        if self._tp_size == 1:
+            if forward_finished_event is not None:
+                forward_finished_event.record()
+            return tensor
+
+        with torch.cuda.stream(comm_stream):
+            if forward_previous_event is not None:
+                comm_stream.wait_event(forward_previous_event)
+            gathered, tensor_for_comm, chunks = _tp_all_gather_forward_impl(tensor, all_sizes, self._tp_group)
+            _record_stream((tensor_for_comm, chunks, gathered), comm_stream)
+            if forward_finished_event is not None:
+                forward_finished_event.record(comm_stream)
+        return gathered
+
     def reduce_scatter_sum(self, tensor: torch.Tensor, all_sizes: list[int]) -> torch.Tensor:
         if self._tp_size == 1:
             return tensor
 
         tp_rank = dist.get_rank(group=self._tp_group)
         return _TPReduceScatterSum.apply(tensor, all_sizes, self._tp_group, self._tp_size, tp_rank)
+
+    def async_reduce_scatter_sum(
+        self,
+        tensor: torch.Tensor,
+        all_sizes: list[int],
+        forward_previous_event: torch.cuda.Event,
+        forward_finished_event: torch.cuda.Event,
+        backward_previous_event: torch.cuda.Event,
+        backward_finished_event: torch.cuda.Event,
+        comm_stream: torch.cuda.Stream,
+    ) -> torch.Tensor:
+        if self._tp_size == 1:
+            forward_finished_event.record()
+            return tensor
+
+        tp_rank = dist.get_rank(group=self._tp_group)
+        return _AsyncTPReduceScatterSum.apply(
+            tensor,
+            all_sizes,
+            self._tp_group,
+            self._tp_size,
+            tp_rank,
+            forward_previous_event,
+            forward_finished_event,
+            backward_previous_event,
+            backward_finished_event,
+            comm_stream,
+        )
diff --git a/xtuner_ep_dispatcher.md b/xtuner_ep_dispatcher.md
index 13f2cf758..f7cae1aff 100644
--- a/xtuner_ep_dispatcher.md
+++ b/xtuner_ep_dispatcher.md
@@ -433,31 +433,143 @@ router_weights: [N, E]
 第二次 `post_dispatched["row_ids_map"] [M_recv]` 是 destination EP rank 上第二次 `permute` 产生的还原 map，
 语义相同（scatter，1D indices 无 topk 展开），只负责 expert 计算后恢复 source-block 顺序，方便反向 all2all。
 
-## DeepEP dispatcher 的对应差异
+## DeepEPDispatcher: DeepEP Buffer dispatch/combine 原理
 
-`DeepEPDispatcher` 使用 DeepEP 的 `Buffer.get_dispatch_layout()` / `Buffer.dispatch()` / `Buffer.combine()` 来管理
-layout、通信 handle 和事件。它不像 `TorchAll2AllDispatcher` 那样显式执行：
+`DeepEPDispatcher` 仍然暴露和其他 dispatcher 一样的六阶段接口，但它把 EP all2all 的 routing layout、通信 handle
+和 event 管理交给 DeepSeek 开源 DeepEP 库的 `Buffer` API。DeepEP 的核心接口是：
+
+- `Buffer.get_dispatch_layout(topk_idx, num_experts, ...)`：根据 topK expert 选择计算 dispatch layout。
+- `Buffer.dispatch(...)`：把 token、`topk_idx`、`topk_weights` 发到拥有选中 expert 的 EP rank。
+- `Buffer.combine(...)`：使用 dispatch 返回的 handle，把 expert 输出或 dispatch backward 的梯度送回 source rank。
+- `EventOverlap`：DeepEP 对 CUDA event 的包装，支持 `current_stream_wait()` 让当前 compute stream 等通信完成。
+
+XTuner 的包装在 `xtuner/v1/ops/comm/deepep_op.py` 中：
 
 ```python
-to(device=torch.device("cpu")).tolist()
+num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, previous_event = \
+    buffer.get_dispatch_layout(topk_idx, num_experts, previous_event=previous_event, async_finish=True)
+
+recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = \
+    buffer.dispatch(
+        x,
+        topk_idx=topk_idx,
+        topk_weights=topk_weights,
+        num_tokens_per_rank=num_tokens_per_rank,
+        num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+        is_token_in_rank=is_token_in_rank,
+        num_tokens_per_expert=num_tokens_per_expert,
+        previous_event=previous_event,
+        async_finish=True,
+        allocate_on_comm_stream=True,
+    )
+```
+
+### DeepEP dispatch
+
+`DeepEPDispatcher.dispatch_preprocess` 不像 `TorchAll2AllDispatcher` 那样先本地 `permute`。它只保留原始 source token
+hidden，并把 `topk_ids` 转成 DeepEP 需要的 `int64`：
+
+```text
+hidden_states: [N, H]
+topk_ids:      [N, K]
+topk_weights:  [N, K]
+```
+
+跨 EP rank 搬运由 DeepEP dispatch kernel 完成；真正的 route-copy 展开仍在本 rank 的
+`dispatch_postprocess -> permute(recv_topk_idx)` 中完成。`Buffer.dispatch` 返回：
+
+```text
+recv_x                         # 本 EP rank 收到的 source token hidden
+recv_topk_idx                  # 与 recv_x 对齐的 [M_recv, K] expert ids；非本 rank expert 位置为 -1
+recv_topk_weights              # 与 recv_topk_idx 对齐的 topK weights
+num_recv_tokens_per_expert_list # 本 rank 每个 local expert 收到的 token 数
+handle                         # combine/backward 复用的通信 handle
+event                          # dispatch 完成事件
+```
+
+`handle` 是 DeepEP 的关键抽象。XTuner 注释里列出的 intranode handle 包括：
+
+```text
+rank_prefix_matrix
+channel_prefix_matrix
+recv_channel_prefix_matrix
+recv_src_idx
+is_token_in_rank
+send_head
+```
+
+这些张量记录了 dispatch 的源/目的映射、channel 前缀和接收源索引。后续 combine 不再重新根据 routing 计算布局，而是
+复用这个 handle 把 token 送回原 source rank；dispatch backward 和 combine backward 也复用同一个 handle。
+
+### DeepEP dispatch_postprocess
+
+DeepEP dispatch 已经把 token 发到拥有相关 local expert 的 EP rank，但输出还不是 grouped GEMM 需要的 local expert 连续分组。
+`dispatch_postprocess` 会先等待 dispatch event，然后用 `recv_topk_idx` 再做一次本地 `permute`：
+
+```text
+recv_x
+  --permute(recv_topk_idx, num_out_tokens=sum(num_recv_tokens_per_expert_list))-->
+local expert grouped hidden
+```
+
+`num_recv_tokens_per_expert_list` 被转换成 `tokens_per_expert`，供 grouped GEMM 使用。
+
+### DeepEP combine_preprocess / combine
+
+DeepEP 当前方案和 `TorchAll2AllDispatcher` 的一个重要差异是 `topk_weights` 的位置：
+
+- `TorchAll2AllDispatcher` 把 `topk_weights` 留在 source rank，最后 `combine_postprocess` 本地加权合并。
+- `DeepEPDispatcher` 在 dispatch 时把 `topk_weights` 一起发到拥有选中 expert 的 EP rank，并在
+  `combine_preprocess` 先加权合并：
+
+```python
+hidden_states = unpermute(
+    hidden_states,
+    post_dispatched["row_ids_map"],
+    probs=dispatched["topk_weights"],
+)
+```
+
+因此 DeepEP 的 forward combine 调用不再传 `topk_weights`：
+
+```python
+combined_x, _, event = buffer.combine(x, handle, async_finish=True, previous_event=previous_event)
 ```
 
-但它仍然存在 host 可见的 metadata 准备点。`xtuner/v1/ops/comm/deepep_op.py::dispatch_forward()` 中已经注明：
+进入 combine 的 hidden 已经是按 `recv_topk_weights` fold 过的 source-token partial output。DeepEP combine 只负责使用
+dispatch handle 把这些 hidden 送回 source rank 并做 SUM reduce。
+
+### DeepEP backward
+
+DeepEP 的反向复用相反方向的通信原语：
+
+- `DeepEPCombine.backward` 调用 `Buffer.dispatch(..., handle=handle)`：combine forward 的反向是 dispatch。
+- `DeepEPDispatch.backward` 调用 `Buffer.combine(grad_recv_x, handle, topk_weights=grad_recv_topk_weights)`：
+  dispatch forward 的反向是 combine，并且同时把 `grad_recv_topk_weights` 送回 source 侧，得到
+  `combined_grad_recv_topk_weights`。
+
+这解释了为什么 DeepEP dispatch 是一个 composite autograd op：它的 forward 同时产生 `recv_x` 和
+`recv_topk_weights`，backward 也同时返回 `x` 和 `topk_weights` 的梯度。
+
+### Host metadata 同步
+
+DeepEP 不像 `TorchAll2AllDispatcher` 那样在 XTuner 代码里显式执行：
 
 ```python
-# NOTES: the CPU will wait for GPU's signal to arrive,
-# so this is not compatible with CUDA graph
+to(device=torch.device("cpu")).tolist()
 ```
 
-DeepEP dispatch 会返回：
+但它仍然存在 host 可见的 metadata 准备点。DeepEP 的 legacy Buffer API 文档和 XTuner 包装都注明：dispatch 内部不知道
+当前 rank 会收到多少 token，因此 CPU 会等待 GPU signal，拿到 receive count 后才能继续。XTuner 代码中的表现是
+`Buffer.dispatch` 返回 Python list：
 
 ```python
 num_recv_tokens_per_expert_list, handle, event
 ```
 
-其中 `num_recv_tokens_per_expert_list` 是 Python list，`dispatch_postprocess` 需要用它计算 `num_out_tokens` 和
-`tokens_per_expert`。因此 DeepEP 也不是完全没有 host 同步；只是同步被 DeepEP 的 layout/dispatch handle 机制封装
-在库内部，不是 PyTorch split-size list 的 `.tolist()` 同步。
+`dispatch_postprocess` 必须用这个 list 计算 `num_out_tokens` 和 `tokens_per_expert`。因此 DeepEP 也不是完全无 host
+同步；只是同步被 DeepEP 的 layout/dispatch handle 机制封装在库内部，不是 PyTorch split-size list 的
+`.tolist()` 同步。
 
 对 Domino EP 来说，两者的影响边界一致：
 
@@ -466,6 +578,14 @@ num_recv_tokens_per_expert_list, handle, event
 - 如果 metadata 等待短于可覆盖的另一个 micro batch 计算，重叠效果基本保留。
 - 如果 metadata 等待更长，`xtuner_ep_domino.md` 7.3 中的理想时间线会被压缩，真实重叠比例下降。
 
+### 当前支持边界
+
+当前 `build_dispatcher(dispatcher="deepep", tp_group=...)` 会直接构造 `DeepEPDispatcher`，`tp_group` 没有接入
+DeepEP dispatcher。也就是说，XTuner 当前的 DeepEP 路径是 EP dispatcher，不包含 `TorchAll2AllTPEPDispatcher`
+那套 TP AllGather / TP ReduceScatterSum 通信段。DeepEP + ExpertTP 如果要成为 Domino-compatible ExpertTP，需要
+额外设计 DeepEP dispatch 后的 TP AllGather、combine 前的 TP ReduceScatterSum，以及相应的 `topk_weights`
+event 语义；这部分见 `xtuner_etp.md`。
+
 ## TP+EP 中 ReduceScatterSum 与 padding/capacity 取舍
 
 `TorchAll2AllTPEPDispatcher` 在 EP dispatch 之后会额外做 TP AllGather，在 combine 阶段会做 TP
diff --git a/xtuner_etp.md b/xtuner_etp.md
new file mode 100644
index 000000000..85071d24f
--- /dev/null
+++ b/xtuner_etp.md
@@ -0,0 +1,211 @@
+# XTuner ExpertTP Event Notes
+
+本文记录 XTuner MoE dispatcher 中 Expert Tensor Parallelism（ExpertTP）的异步 event 语义。
+
+## 几种 dispatcher 语义
+
+ExpertTP 相关路径在 XTuner 里有几种常见组合：
+
+1. Naive routing + ExpertTP：没有 EP AllToAll，TP rank 持有不重复的 source token slice。dispatch 阶段用
+   TP AllGather 把各 TP rank 的 source token slice 拼成完整 source-token batch，然后本地展开 topK route-copy。
+2. TorchAll2All EP + TP：先由 EP AllToAll 把 route-copy hidden 发到 expert 所在 EP rank，再由 TP AllGather
+   把同一 EP rank 内各 TP rank 的 route-copy token slice 拼成 expert 输入。
+3. DeepEP dispatcher：由 DeepEP `Buffer.dispatch` 同时通信 hidden、`topk_idx`、`topk_weights`，再用 DeepEP
+   `Buffer.combine` 送回 source rank。当前 XTuner 的 DeepEP dispatcher 尚未接入 ExpertTP 的 TP AllGather /
+   TP ReduceScatterSum。
+
+这几种方式最大的差异是 `topk_weights` 在哪里参与 topK folding。
+
+### Naive routing + ExpertTP
+
+Naive + ExpertTP 的 dispatch TP AllGather 发生在 source-token 空间：
+
+```text
+local source tokens [N_local, H]
+  --TP AllGather-->
+full source tokens  [N_total, H]
+  --dispatch_postprocess / permute(topk_ids)-->
+route-copy tokens   [N_total * K, H]
+```
+
+因此，`topk_ids` 和 `topk_weights` 也必须和 gathered hidden 对齐到 `N_total` 个 source token。否则
+`dispatch_postprocess` 无法基于完整 token batch 做 route-copy 展开，`combine_preprocess` 也无法在完整 source-token
+空间中按本 token 的 topK weight fold 回 `[N_total, H]`。
+
+所以 Naive + ExpertTP 的 dispatch 通信段需要：
+
+```text
+hidden_states TP AllGather
+topk_ids      TP AllGather
+topk_weights  TP AllGather
+```
+
+`topk_ids` 只是路由元数据，不需要 autograd。`topk_weights` 参与 `unpermute(..., probs=topk_weights)`，需要梯度，
+因此它的 TP AllGather backward 会执行 TP ReduceScatterSum，把完整 token 空间里的 `dtopk_weights` 切回本 TP rank
+的 source-token slice。
+
+### TorchAll2All EP + TP
+
+TorchAll2All EP + TP 的 dispatch 首先已经在 route-copy 空间中通信 hidden：
+
+```text
+source route-copy hidden
+  --EP AllToAll-->
+expert-rank route-copy hidden
+  --TP AllGather-->
+expert-rank full route-copy hidden
+```
+
+当前 XTuner 的 `TorchAll2AllTPEPDispatcher` 设计选择 **不通信 `topk_weights`**：专家侧只计算每个 route-copy
+的 expert output，combine 通信把 route-copy output 送回 source 侧，最后由 `combine_postprocess` 在 source 侧使用
+本地保留的 `topk_weights` 做 topK folding：
+
+```text
+expert output route-copy
+  --TP ReduceScatterSum + EP combine-->
+source route-copy output
+  --unpermute(..., probs=local topk_weights)-->
+source hidden [N_local, H]
+```
+
+这种设计下，`topk_weights` 一直留在 source rank / source TP slice 上，不需要 EP AllToAll，也不需要 TP AllGather。
+因此当前 `TorchAll2AllTPEPDispatcher` 不存在 Naive + ExpertTP 中那条 `topk_weights` TP AllGather backward 的额外
+event 问题。
+
+### DeepEP dispatcher
+
+DeepEP 的默认处理方式不同：`Buffer.dispatch` 会把 `topk_weights` 和 hidden、`topk_idx` 一起发到拥有选中
+expert 的 EP rank。
+XTuner 的 `DeepEPDispatcher.combine_preprocess` 随后在 expert rank 上执行：
+
+```python
+unpermute(expert_out, row_ids_map, probs=dispatched["topk_weights"])
+```
+
+也就是说，DeepEP 路径是在 expert 侧先按 `recv_topk_weights` 做 topK folding，再调用 `Buffer.combine` 把已经加权
+合并后的 hidden 送回 source rank。它不是 `TorchAll2AllTPEPDispatcher` 那种“`topk_weights` 留在 source 侧，
+最后再加权”的设计。
+
+DeepEP dispatch 本身是一个 composite autograd op：
+
+```text
+forward : Buffer.dispatch(x, topk_idx, topk_weights) -> recv_x, recv_topk_idx, recv_topk_weights, handle
+backward: Buffer.combine(grad_recv_x, handle, topk_weights=grad_recv_topk_weights)
+          -> grad_x, grad_topk_weights
+```
+
+因此 DeepEP 的 `topk_weights` 梯度会沿 dispatch handle 反向通信回 source rank。异步情况下，`grad_x` 和
+`grad_topk_weights` 都来自同一个 DeepEP backward communication event；如果 `topk_weights` 是非叶子张量并且上游
+router backward 会继续消费 `grad_topk_weights`，也必须等待这个 event。当前代码显式给 `hidden_states.grad_fn`
+挂了 dispatch backward pre-hook；从 event 语义上看，`topk_weights.grad_fn` 也应等待同一个 dispatch backward
+完成事件，除非实现改成在 composite op 内部统一保证两个返回梯度被消费前已经同步。
+
+### DeepEP + ExpertTP 的方案
+
+当前 XTuner 的 `DeepEPDispatcher` 没有接入 `tp_group`；`dispatcher="deepep"` 时不会自动获得
+`TorchAll2AllTPEPDispatcher` 那套 TP AllGather / TP ReduceScatterSum。因此 DeepEP + ExpertTP 还需要单独设计。
+
+如果保留 DeepEP 的“`topk_weights` 发到 expert 侧并在 combine 前加权”的语义，那么混合 ExpertTP 后 dispatch
+阶段应当这样对齐：
+
+```text
+DeepEP dispatch:
+  recv_x, recv_topk_idx, recv_topk_weights
+
+TP dispatch segment:
+  recv_x            TP AllGather
+  recv_topk_idx     TP AllGather
+  recv_topk_weights TP AllGather
+
+dispatch_postprocess:
+  基于 TP-gather 后的 recv_topk_idx 做 local expert layout
+
+combine_preprocess:
+  基于 TP-gather 后的 recv_topk_weights 做 topK folding
+```
+
+这会让 DeepEP + ExpertTP 出现和 Naive + ExpertTP 相同的 `topk_weights` TP AllGather backward 问题：
+`recv_topk_weights` 的 TP AllGather backward 需要 TP ReduceScatterSum，得到本 TP rank 的
+`grad_recv_topk_weights` 后，DeepEP dispatch backward 再用 `Buffer.combine(..., topk_weights=grad_recv_topk_weights)`
+把梯度送回 source rank。
+
+推荐的 event 方案是把 DeepEP dispatch 和后续 TP AllGather 封装成一个 dispatch-level composite autograd stage：
+
+1. 前向在同一个 dispatch 通信段中排队 DeepEP dispatch、TP hidden AllGather、TP metadata AllGather 和
+   TP `topk_weights` AllGather，只在最后记录一个 dispatch `forward_finished_event`。
+2. 反向先完成 TP `topk_weights` / hidden 的 ReduceScatterSum，再调用 DeepEP dispatch backward，把
+   `grad_x` 和 `grad_topk_weights` 都送回 source rank。
+3. 只有当 hidden 和 `topk_weights` 两条反向通信都完成后，才记录同一个 dispatch `backward_finished_event`。
+4. 如果实现上仍拆成多个 autograd Function，则必须给 `topk_weights` 分支保留独立完成 event，并让
+   `topk_weights.grad_fn` 的 pre-hook 等待它；否则 router backward 可能在 TP/DeepEP 通信仍在写
+   `grad_topk_weights` 时提前读取。
+
+## 前向 event 边界
+
+ExpertTP 的通信阶段应和 All2All dispatcher 保持同一套六阶段边界：
+
+1. `dispatch_preprocess`：本地准备 dispatch 输入，并在 compute stream 上记录 `forward_finished_event`。
+2. `dispatch`：在 dispatcher 的通信 stream 上发起 TP AllGather。
+3. `dispatch_postprocess`：compute stream 等待 dispatch 的 `forward_finished_event`，再做本地 expert layout。
+4. `combine_preprocess`：本地 topK folding，并在 compute stream 上记录 `forward_finished_event`。
+5. `combine`：在 dispatcher 的通信 stream 上发起 TP ReduceScatterSum。
+6. `combine_postprocess`：compute stream 等待 combine 的 `forward_finished_event`，再返回本 rank 的 source token slice。
+
+同一个通信阶段里的多个 NCCL collective 如果都排在同一条 communication stream 上，阶段内部不需要额外 event 串行化。
+例如 Naive + ExpertTP 的 dispatch 会依次发起：
+
+```text
+hidden_states TP AllGather
+topk_ids      TP AllGather
+topk_weights  TP AllGather
+```
+
+它们都 enqueue 到同一条 communication stream，CUDA stream FIFO 已经保证顺序。因此前向只需要：
+
+- 阶段开始：communication stream 等待上一阶段的 `forward_finished_event`。
+- 阶段结束：最后一个 collective 后记录本阶段的 `forward_finished_event`。
+- 本地 postprocess：compute stream 等待本阶段的 `forward_finished_event`。
+
+## 反向 `topk_weights` event
+
+反向也应尽量保持“一阶段一组 event”的模型：
+
+- `backward_previous_event`：下游本地 backward 已经产出这个通信阶段需要的梯度。
+- `backward_finished_event`：该通信阶段的 backward collective 已完成，上游可以继续消费梯度。
+
+但 Naive + ExpertTP 的 dispatch 有一个细节：`hidden_states` 和 `topk_weights` 都经过 TP AllGather，且二者都是带梯度的输入。
+如果实现上把它们拆成两个独立 autograd Function，那么反向会形成两条独立分支：
+
+```text
+dP = TPReduceScatterSum.backward(dO)
+
+dE, dW_full = combine_preprocess.backward(dP)
+
+dH_full = dispatch_postprocess.backward(dE)
+
+dhidden = TPAllGather(hidden_states).backward(dH_full)
+dweight = TPAllGather(topk_weights).backward(dW_full)
+```
+
+其中 `topk_weights` 的本地梯度 `dweight` 不是纯本地计算得到的，而是由
+`TPAllGather(topk_weights).backward()` 在 communication stream 上执行 TP ReduceScatterSum 后得到。
+
+如果没有给 `topk_weights` 上游 autograd 节点单独挂一个等待通信完成的 event，可能出现：
+
+```text
+compute stream: topk_weights 上游 backward 读取 dweight
+comm stream:    TP ReduceScatterSum 仍在写 dweight
+```
+
+这就是跨 stream 读写 race。`hidden_states` 分支的 dispatch backward event 不能证明 `topk_weights` 分支的
+TP ReduceScatterSum 已完成，因为两者是独立 autograd Function，完成顺序由 autograd 调度和 CUDA 队列共同决定。
+
+因此，在当前“每个 TP collective 一个 autograd Function”的实现下：
+
+- 前向 dispatch 内部的中间 event 可以省掉，依靠同一条 communication stream 的 FIFO 顺序。
+- `topk_weights` backward 仍需要自己的完成 event，并让 `topk_weights.grad_fn` 的 pre-hook 等待该 event 后再继续上游 backward。
+
+如果未来把 Naive + ExpertTP dispatch 封装成一个 dispatch-level composite autograd Function，由它同时管理
+`hidden_states` / `topk_ids` / `topk_weights` 的通信和反向，那么可以在这个 composite op 内部统一使用一组 stage-level
+backward event：只有在 hidden 和 topk_weights 两条反向 collective 都已正确排队并完成后，才记录同一个
+`backward_finished_event`。

From d18a3a7d1cb6d9aa95afd9f769ec2eae6ecb772b Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Wed, 20 May 2026 11:37:06 +0000
Subject: [PATCH 14/25] Fix MoE compile config for ExpertTP

---
 tests/utils/test_compile.py         | 69 ++++++++++++++++++++++++++---
 xtuner/v1/model/moe/moe.py          |  7 ++-
 xtuner/v1/model/moe/qwen3_5_text.py |  4 +-
 xtuner/v1/module/dispatcher/base.py | 31 ++++++++++---
 xtuner_ep_dispatcher.md             | 38 ++++++++++++++++
 5 files changed, 135 insertions(+), 14 deletions(-)

diff --git a/tests/utils/test_compile.py b/tests/utils/test_compile.py
index 567a5113a..c91a3450f 100644
--- a/tests/utils/test_compile.py
+++ b/tests/utils/test_compile.py
@@ -1,11 +1,32 @@
-from xtuner.v1.model import Qwen3Dense8BConfig, Qwen3MoE30BA3Config, Qwen3VLMoE30BA3Config, GptOss21BA3P6Config, DeepSeekV3Config, InternVL3P5Dense1BConfig, XTunerBaseModelConfig
-import torch
-from xtuner.v1.utils import get_logger
-from xtuner._testing.utils import LogCapture
 from ast import literal_eval
+import re
+
 import pytest
+import torch
 
-import re
+from xtuner._testing.utils import LogCapture
+from xtuner.v1.model import (
+    DeepSeekV3Config,
+    GptOss21BA3P6Config,
+    InternVL3P5Dense1BConfig,
+    Qwen3Dense8BConfig,
+    Qwen3MoE30BA3Config,
+    Qwen3VLMoE30BA3Config,
+)
+from xtuner.v1.model.moe.moe import MOE_EP_COMPILE_CFG, MOE_NON_EP_COMPILE_CFG, MoE
+from xtuner.v1.model.moe.qwen3_5_text import (
+    MOE_EP_COMPILE_CFG as QWEN35_MOE_EP_COMPILE_CFG,
+    MOE_NON_EP_COMPILE_CFG as QWEN35_MOE_NON_EP_COMPILE_CFG,
+    Qwen3_5_VLTextMoE,
+    Qwen3_5_VLTextMoE35BA3BConfig,
+)
+from xtuner.v1.module.dispatcher.base import (
+    NaiveCombineResult,
+    NaiveDispatchResult,
+    NaivePreCombineResult,
+    NaivePreDispatchResult,
+)
+from xtuner.v1.utils import get_logger
 
 
 logger = get_logger()
@@ -60,3 +81,41 @@ def test_compile_model_exception():
     with pytest.raises(Exception):
         with torch.device("meta"):
             Qwen3Dense8BConfig(compile_cfg={"xtuner.v1.module.decoder_layer.moe_decoder_layer.MoEBlock.fuck": {}}).build()
+
+
+@pytest.mark.parametrize(
+    "ep_size,expert_tp_size,expected_compile_cfg",
+    [
+        (1, 1, MOE_NON_EP_COMPILE_CFG),
+        (2, 1, MOE_EP_COMPILE_CFG),
+        (1, 2, MOE_EP_COMPILE_CFG),
+        (2, 2, MOE_EP_COMPILE_CFG),
+    ],
+)
+def test_moe_compile_cfg_treats_expert_tp_like_ep(ep_size, expert_tp_size, expected_compile_cfg):
+    model = object.__new__(MoE)
+    model.config = Qwen3MoE30BA3Config(ep_size=ep_size, expert_tp_size=expert_tp_size)
+    assert model.default_compile_cfg == expected_compile_cfg
+
+
+@pytest.mark.parametrize(
+    "ep_size,expert_tp_size,expected_compile_cfg",
+    [
+        (1, 1, QWEN35_MOE_NON_EP_COMPILE_CFG),
+        (2, 1, QWEN35_MOE_EP_COMPILE_CFG),
+        (1, 2, QWEN35_MOE_EP_COMPILE_CFG),
+        (2, 2, QWEN35_MOE_EP_COMPILE_CFG),
+    ],
+)
+def test_qwen35_moe_compile_cfg_treats_expert_tp_like_ep(ep_size, expert_tp_size, expected_compile_cfg):
+    model = object.__new__(Qwen3_5_VLTextMoE)
+    model.config = Qwen3_5_VLTextMoE35BA3BConfig(ep_size=ep_size, expert_tp_size=expert_tp_size)
+    assert model.default_compile_cfg == expected_compile_cfg
+
+
+def test_naive_dispatcher_compile_result_typeddicts_have_no_optional_keys():
+    # 中文注释：non-EP 默认会 compile MoEDecoderLayer.forward，Dynamo 不支持 optional-key TypedDict。
+    assert NaivePreDispatchResult.__optional_keys__ == frozenset()
+    assert NaiveDispatchResult.__optional_keys__ == frozenset()
+    assert NaivePreCombineResult.__optional_keys__ == frozenset()
+    assert NaiveCombineResult.__optional_keys__ == frozenset()
diff --git a/xtuner/v1/model/moe/moe.py b/xtuner/v1/model/moe/moe.py
index 41b0b90f0..edf9d0e3a 100644
--- a/xtuner/v1/model/moe/moe.py
+++ b/xtuner/v1/model/moe/moe.py
@@ -162,6 +162,11 @@ def build(self) -> "MoE":
         return MoE(self)
 
 
+def use_moe_ep_compile_cfg(config: MoEConfig) -> bool:
+    # 中文注释：ExpertTP 也会跨 rank 进入 dispatcher 通信段，compile 边界应和 EP 路径一致。
+    return config.ep_size > 1 or config.expert_tp_size > 1
+
+
 class MoE(BaseModel):
     """Transformer decoder consisting of *config.num_hidden_layers* layers.
     Each layer is a [`InternLM3DecoderLayer`]
@@ -1083,7 +1088,7 @@ def fully_shard(
     @property
     @override
     def default_compile_cfg(self) -> dict[str, TorchCompileOption]:
-        if self.config.ep_size > 1:
+        if use_moe_ep_compile_cfg(self.config):
             return MOE_EP_COMPILE_CFG
         else:
             return MOE_NON_EP_COMPILE_CFG
diff --git a/xtuner/v1/model/moe/qwen3_5_text.py b/xtuner/v1/model/moe/qwen3_5_text.py
index 19acfc7a7..aa860ed7d 100644
--- a/xtuner/v1/model/moe/qwen3_5_text.py
+++ b/xtuner/v1/model/moe/qwen3_5_text.py
@@ -10,7 +10,7 @@
     HFSaveCfg,
     TorchCompileOption,
 )
-from xtuner.v1.model.moe.moe import BalancingLossConfig, MoEConfig, ZLossConfig
+from xtuner.v1.model.moe.moe import BalancingLossConfig, MoEConfig, ZLossConfig, use_moe_ep_compile_cfg
 from xtuner.v1.module.attention import GatedDeltaNetConfig, MHAConfig
 from xtuner.v1.module.rope import RopeScalingConfig
 from xtuner.v1.module.router.greedy import GreedyRouterConfig
@@ -191,7 +191,7 @@ def param_to_safetensor(
     @property
     @override
     def default_compile_cfg(self) -> dict[str, TorchCompileOption]:
-        if self.config.ep_size > 1:
+        if use_moe_ep_compile_cfg(self.config):
             return MOE_EP_COMPILE_CFG
         else:
             return MOE_NON_EP_COMPILE_CFG
diff --git a/xtuner/v1/module/dispatcher/base.py b/xtuner/v1/module/dispatcher/base.py
index 94d1f0ed3..1f07ad387 100644
--- a/xtuner/v1/module/dispatcher/base.py
+++ b/xtuner/v1/module/dispatcher/base.py
@@ -187,12 +187,13 @@ class DispacherInterface(
 ): ...
 
 
-class NaivePreDispatchResult(PreDispatchResult, total=False):
+class NaivePreDispatchResult(PreDispatchResult):
+    # 中文注释：这些 key 必须始终存在；torch.compile 不支持 optional-key TypedDict。
     forward_finished_event: torch.cuda.Event | None
     backward_previous_event: torch.cuda.Event | None
 
 
-class NaiveDispatchResult(DispatchResult, total=False):
+class NaiveDispatchResult(DispatchResult):
     topk_ids: torch.Tensor
     tp_size_meta: list[int]
     forward_finished_event: torch.cuda.Event | None
@@ -204,12 +205,12 @@ class NaivePostDispatchResult(PostDispatchResult):
     row_ids_map: torch.Tensor
 
 
-class NaivePreCombineResult(PreCombineResult, total=False):
+class NaivePreCombineResult(PreCombineResult):
     forward_finished_event: torch.cuda.Event | None
     backward_previous_event: torch.cuda.Event | None
 
 
-class NaiveCombineResult(CombineResult, total=False):
+class NaiveCombineResult(CombineResult):
     forward_finished_event: torch.cuda.Event | None
     backward_previous_event: torch.cuda.Event | None
 
@@ -278,6 +279,8 @@ def dispatch_preprocess(
         return NaivePreDispatchResult(
             hidden_states=hidden_states,
             topk_ids=topk_ids,
+            forward_finished_event=None,
+            backward_previous_event=None,
         )
 
     @override
@@ -354,11 +357,19 @@ def dispatch(
                 topk_ids=topk_ids,
                 topk_weights=topk_weights,
                 tp_size_meta=tp_size_meta,
+                forward_finished_event=None,
+                backward_previous_event=None,
+                topk_weights_backward_previous_event=None,
             )
 
         return NaiveDispatchResult(
             hidden_states=pre_dispatched["hidden_states"],
+            topk_ids=pre_dispatched["topk_ids"],
             topk_weights=topk_weights,
+            tp_size_meta=[],
+            forward_finished_event=None,
+            backward_previous_event=None,
+            topk_weights_backward_previous_event=None,
         )
 
     @override
@@ -487,9 +498,17 @@ def combine(
                     pre_combined["hidden_states"],
                     dispatched["tp_size_meta"],
                 )
-                return NaiveCombineResult(hidden_states=hidden_states)
+                return NaiveCombineResult(
+                    hidden_states=hidden_states,
+                    forward_finished_event=None,
+                    backward_previous_event=None,
+                )
 
-            return NaiveCombineResult(hidden_states=pre_combined["hidden_states"])
+            return NaiveCombineResult(
+                hidden_states=pre_combined["hidden_states"],
+                forward_finished_event=None,
+                backward_previous_event=None,
+            )
 
     @override
     def combine_postprocess(
diff --git a/xtuner_ep_dispatcher.md b/xtuner_ep_dispatcher.md
index f7cae1aff..a97a401bd 100644
--- a/xtuner_ep_dispatcher.md
+++ b/xtuner_ep_dispatcher.md
@@ -433,6 +433,44 @@ router_weights: [N, E]
 第二次 `post_dispatched["row_ids_map"] [M_recv]` 是 destination EP rank 上第二次 `permute` 产生的还原 map，
 语义相同（scatter，1D indices 无 topk 展开），只负责 expert 计算后恢复 source-block 顺序，方便反向 all2all。
 
+## torch.compile 与 dispatcher 边界
+
+`FSDPConfig.torch_compile=True` 目前只是一个兼容入口，真正决定 compile 行为的是
+`XTunerBaseModelConfig.compile_cfg`：
+
+- `compile_cfg=None` 或 `True`：使用模型自己的 `default_compile_cfg`。
+- `compile_cfg=False`：关闭 compile。
+- `compile_cfg=dict[...]`：用户显式指定 compile target。
+- `FSDPConfig.torch_compile=False` 会在 trainer 配置解析阶段把 `model_cfg.compile_cfg` 强制设成 `False`；反过来
+  `FSDPConfig.torch_compile=True` 不会强制覆盖用户自定义的 `compile_cfg`。
+
+对 MoE 来说，默认 compile target 会根据 dispatcher 是否包含跨 rank 通信编排分两类：
+
+- `ep_size == 1` 且 `expert_tp_size == 1`：使用 `MOE_NON_EP_COMPILE_CFG`。普通 MoE 会把
+  `MoEDecoderLayer.forward` 作为 compile target，同时也 compile `MoEBlock.forward`、
+  `_pre_moe_forward`、`_shared_experts_forward`、`_post_moe_forward`、dense layer 和 float8 相关函数。
+- `ep_size > 1` 或 `expert_tp_size > 1`：使用 `MOE_EP_COMPILE_CFG`。它从 non-EP 配置复制而来，但显式删除
+  `MoEDecoderLayer.forward`，保留局部计算函数的 compile。
+
+`qwen3_5_text` 的 non-EP 配置也包含 `MoEDecoderLayer.forward`，但该 target 使用 `fullgraph=False`；EP 开启后同样会从
+默认配置中删除顶层 `MoEDecoderLayer.forward`。
+
+这个差异是 dispatcher 边界的核心：EP 或 ExpertTP 开启后，`MoEDecoderLayer.forward` 顶层会承载
+`dispatch_preprocess -> dispatch -> dispatch_postprocess -> expert -> combine_preprocess -> combine -> combine_postprocess`
+的变长通信编排，以及 Domino micro batch 的多输入分支、CUDA event、autograd hook、DeepEP handle 等动态对象。
+这些部分不适合作为稳定的 fullgraph compile 边界，因此当前设计让 dispatcher 编排保持 eager Python，只把相对稳定的本地计算块交给
+`torch.compile`。
+
+这也意味着 compile 不会消除前面描述的 dispatcher host metadata 同步：
+
+- `TorchAll2AllDispatcher` 仍需要在 dispatch 阶段拿到 Python `input_splits` / `output_splits`。
+- `DeepEPDispatcher` 仍可能在库内部等待 receive count，并把 `num_recv_tokens_per_expert_list` 暴露给 Python。
+- TP+EP 路径仍需要 TP size meta 来发起变长 TP AllGather / ReduceScatterSum。
+
+因此，对 Domino EP 来说，compile 的收益主要是缩短 `_pre_moe_forward`、expert block、`_post_moe_forward` 等本地计算段；
+它不能把 dispatcher 的 host 等待变成 GPU-only 异步，也不能改变 2.1 和 DeepEP “Host metadata 同步”小节里的重叠约束。
+如果 host metadata 等待超过另一个 micro batch 能覆盖的计算窗口，真实 overlap 仍会下降。
+
 ## DeepEPDispatcher: DeepEP Buffer dispatch/combine 原理
 
 `DeepEPDispatcher` 仍然暴露和其他 dispatcher 一样的六阶段接口，但它把 EP all2all 的 routing layout、通信 handle

From c37340517d1ddcf9d271592e29e9b335f7b587ea Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 03:24:08 +0000
Subject: [PATCH 15/25] Validate ExpertTP-only training

---
 tests/engine/test_moe_train_engine_tpep.py    | 172 ++++++++++++++++++
 tests/model/test_moe_expert_tp_without_ep.py  |   7 +-
 xtuner/v1/model/moe/moe.py                    |  52 +++---
 .../module/decoder_layer/moe_decoder_layer.py |  12 +-
 .../module/grouped_linear/moe_group_linear.py |  22 +--
 5 files changed, 223 insertions(+), 42 deletions(-)

diff --git a/tests/engine/test_moe_train_engine_tpep.py b/tests/engine/test_moe_train_engine_tpep.py
index 733a15ec2..56b000680 100644
--- a/tests/engine/test_moe_train_engine_tpep.py
+++ b/tests/engine/test_moe_train_engine_tpep.py
@@ -41,6 +41,7 @@
 from xtuner.v1.engine.train_engine import TrainEngine
 from xtuner.v1.loss.ce_loss import CELossConfig
 from xtuner.v1.module.attention import MHAConfig
+from xtuner.v1.module.dispatcher.base import NaiveDispatcher
 from xtuner.v1.module.grouped_linear.moe_group_linear import GroupedLinear
 from xtuner.v1.module.router.greedy import GreedyRouterConfig
 from xtuner.v1.model.base import ModelItem
@@ -294,6 +295,177 @@ def _slice_tpep_bias(grouped_linear: GroupedLinear, full_bias: torch.Tensor) ->
     return expert_bias.reshape(grouped_linear.bias.shape)
 
 
+class TestMoETrainEngineExpertTPOnly(DeterministicDDPTestCase):
+    """Verify ExpertTP-only training matches the non-ExpertTP baseline."""
+
+    @parametrize.parametrize(
+        "device,expert_tp_size",
+        [
+            ("cuda", 2),
+        ],
+    )
+    def test_expert_tp_only_engine_constructs_and_trains(self, device: str, expert_tp_size: int) -> None:
+        pg = self.create_pg(device)
+
+        engine = _build_engine(ep_size=1, expert_tp_size=expert_tp_size)
+        engine.init_model_weights()
+
+        assert engine.model.ep_mesh is not None
+        assert engine.model.expert_tp_mesh is not None
+        assert engine.model.ep_mesh.size() == 1
+        assert engine.model.expert_tp_mesh.size() == expert_tp_size
+        assert engine.model.expert_tp_mesh.mesh_dim_names == (f"{engine.model.config.mesh_prefix}.etp",)
+        assert isinstance(engine.model.layers["0"].dispatcher, NaiveDispatcher)
+
+        input_ids, labels = _make_engine_input(
+            torch.device(device, dist.get_rank() % torch.cuda.device_count()),
+            seed_offset=dist.get_rank(),
+        )
+        loss_cfg = CELossConfig()
+
+        loss_val = _run_train_step_without_clip(engine, loss_cfg, input_ids, labels)
+        grad_norm = engine.clip_grad_norm()
+        engine.step_optimizer(grad_norm)
+
+        assert torch.isfinite(torch.tensor(loss_val))
+        assert torch.isfinite(grad_norm)
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @parametrize.parametrize(
+        "device,expert_tp_size",
+        [
+            ("cuda", 2),
+        ],
+    )
+    def test_expert_tp_only_matches_single_with_distinct_source_slices(
+        self, device: str, expert_tp_size: int
+    ) -> None:
+        pg = self.create_pg(device)
+
+        engine_ref = _build_engine(ep_size=1, expert_tp_size=1)
+        engine_ref.init_model_weights()
+
+        engine_etp = _build_engine(ep_size=1, expert_tp_size=expert_tp_size)
+        engine_etp.init_model_weights()
+        _sync_engine_weights(engine_ref, engine_etp)
+        dist.barrier()
+
+        input_ids, labels = _make_engine_input(
+            torch.device(device, dist.get_rank() % torch.cuda.device_count()),
+            seed_offset=dist.get_rank(),
+        )
+        loss_cfg = CELossConfig()
+
+        loss_etp, _, norm_etp = _run_one_step_with_norm(engine_etp, loss_cfg, input_ids, labels)
+        loss_ref, _, norm_ref = _run_one_step_with_norm(engine_ref, loss_cfg, input_ids, labels)
+
+        torch.testing.assert_close(
+            torch.tensor(loss_etp),
+            torch.tensor(loss_ref),
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        gate_grad_ref = _get_param_grad(engine_ref, "layers.0.gate.weight")
+        gate_grad_etp = _get_param_grad(engine_etp, "layers.0.gate.weight")
+        torch.testing.assert_close(
+            gate_grad_etp,
+            gate_grad_ref,
+            atol=BF16_GEMM_ATOL,
+            rtol=BF16_GEMM_RTOL,
+        )
+
+        for module_suffix, fused_gate_up in (
+            ("layers.0.experts.fused_w1w3", True),
+            ("layers.0.experts.fused_w2", False),
+        ):
+            ref_grad = _get_param_grad(engine_ref, f"{module_suffix}.weight")
+            etp_grad = _get_param_grad(engine_etp, f"{module_suffix}.weight")
+            etp_module = _get_tpep_grouped_linear(engine_etp, module_suffix)
+            expected_etp_grad = _slice_tpep_weight(etp_module, ref_grad, fused_gate_up=fused_gate_up)
+            torch.testing.assert_close(
+                etp_grad,
+                expected_etp_grad,
+                atol=BF16_GEMM_ATOL,
+                rtol=BF16_GEMM_RTOL,
+            )
+
+        torch.testing.assert_close(
+            norm_etp,
+            norm_ref,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @parametrize.parametrize(
+        "device,expert_tp_size",
+        [
+            ("cuda", 2),
+        ],
+    )
+    def test_expert_tp_only_expert_grad_norm_matches_single_with_distinct_source_slices(
+        self, device: str, expert_tp_size: int
+    ) -> None:
+        pg = self.create_pg(device)
+
+        engine_ref = _build_engine(ep_size=1, expert_tp_size=1)
+        engine_ref.init_model_weights()
+
+        engine_etp = _build_engine(ep_size=1, expert_tp_size=expert_tp_size)
+        engine_etp.init_model_weights()
+        _sync_engine_weights(engine_ref, engine_etp)
+        dist.barrier()
+
+        input_ids, labels = _make_engine_input(
+            torch.device(device, dist.get_rank() % torch.cuda.device_count()),
+            seed_offset=dist.get_rank(),
+        )
+        loss_cfg = CELossConfig()
+
+        _run_train_step_without_clip(engine_etp, loss_cfg, input_ids, labels)
+        _run_train_step_without_clip(engine_ref, loss_cfg, input_ids, labels)
+        _zero_non_expert_grads(engine_etp)
+        _zero_non_expert_grads(engine_ref)
+
+        norm_etp = engine_etp.clip_grad_norm(do_clip=False).detach().float().cpu()
+        norm_ref = engine_ref.clip_grad_norm(do_clip=False).detach().float().cpu()
+        torch.testing.assert_close(
+            norm_etp,
+            norm_ref,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @property
+    def world_size(self) -> int:
+        # ExpertTP-only topology: EP=1, TP=2, DP=1.
+        return 2
+
+    @property
+    def destroy_pg_upon_exit(self) -> bool:
+        return False
+
+
 class TestMoETrainEngineTPEP(DeterministicDDPTestCase):
     """Verify EP+TP training matches single-GPU (EP=1, TP=1) forward and backward."""
 
diff --git a/tests/model/test_moe_expert_tp_without_ep.py b/tests/model/test_moe_expert_tp_without_ep.py
index 3993c2dce..94bcc2cc1 100644
--- a/tests/model/test_moe_expert_tp_without_ep.py
+++ b/tests/model/test_moe_expert_tp_without_ep.py
@@ -54,11 +54,14 @@ def test_builds_real_ep_ownership_mesh_for_expert_tp_without_ep(self) -> None:
 
         # 中文注释：不开 EP 但开启 expert TP 时，EP ownership 维度仍然真实存在，只是 size=1。
         assert model.ep_mesh is not None
-        assert model.tp_mesh is not None
+        assert model.expert_tp_mesh is not None
         assert model.ep_mesh.size() == 1
-        assert model.tp_mesh.size() == 2
+        assert model.expert_tp_mesh.size() == 2
+        assert model.expert_tp_mesh.mesh_dim_names == (f"{model.config.mesh_prefix}.etp",)
         assert layer.experts.fused_w1w3.ep_size == 1
         assert layer.experts.fused_w1w3.tp_size == 2
+        assert layer.experts.fused_w1w3.expert_tp_mesh is not None
+        assert layer.experts.fused_w1w3.expert_tp_mesh.mesh_dim_names == (f"{model.config.mesh_prefix}.etp",)
         assert isinstance(layer.dispatcher, NaiveDispatcher)
 
         dist.barrier()
diff --git a/xtuner/v1/model/moe/moe.py b/xtuner/v1/model/moe/moe.py
index edf9d0e3a..642c87c3a 100644
--- a/xtuner/v1/model/moe/moe.py
+++ b/xtuner/v1/model/moe/moe.py
@@ -177,7 +177,7 @@ class MoE(BaseModel):
 
     config: MoEConfig
     ep_mesh: DeviceMesh | None = None
-    tp_mesh: DeviceMesh | None = None
+    expert_tp_mesh: DeviceMesh | None = None
 
     def __init__(self, config: MoEConfig):
         super().__init__(config)
@@ -195,11 +195,11 @@ def __init__(self, config: MoEConfig):
                     mesh_dim_names=(
                         f"{self.config.mesh_prefix}.dp",
                         f"{self.config.mesh_prefix}.ep",
-                        f"{self.config.mesh_prefix}.tp",
+                        f"{self.config.mesh_prefix}.etp",
                     ),
                 )
                 self.ep_mesh = _init_mesh[f"{self.config.mesh_prefix}.ep"]
-                self.tp_mesh = _init_mesh[f"{self.config.mesh_prefix}.tp"]
+                self.expert_tp_mesh = _init_mesh[f"{self.config.mesh_prefix}.etp"]
             else:
                 _init_mesh = init_device_mesh(
                     DEVICE,
@@ -207,10 +207,10 @@ def __init__(self, config: MoEConfig):
                     mesh_dim_names=(f"{self.config.mesh_prefix}.dp", f"{self.config.mesh_prefix}.ep"),
                 )
                 self.ep_mesh = _init_mesh[f"{self.config.mesh_prefix}.ep"]
-                self.tp_mesh = None
+                self.expert_tp_mesh = None
         else:
             self.ep_mesh = None
-            self.tp_mesh = None
+            self.expert_tp_mesh = None
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, type=config.rms_norm_type)
         self.lm_head = LMHead(config.hidden_size, config.vocab_size, bias=False)
@@ -847,7 +847,7 @@ def build_layers(self, config: MoEConfig) -> nn.ModuleDict:
                     layer_idx=layer_idx,
                     dispatcher=config.dispatcher,
                     ep_mesh=self.ep_mesh,
-                    tp_mesh=self.tp_mesh,
+                    expert_tp_mesh=self.expert_tp_mesh,
                 )
                 if self.config.freeze_routers:
                     layers[str(layer_idx)].gate.requires_grad_(False)
@@ -912,7 +912,7 @@ def build_mtp_block(self, config: MoEConfig) -> MTPBlock:
                 layer_idx=config.num_hidden_layers + i,
                 dispatcher=config.dispatcher,
                 ep_mesh=self.ep_mesh,
-                tp_mesh=self.tp_mesh,
+                expert_tp_mesh=self.expert_tp_mesh,
             )
 
             # Wrap decoder layer in MTPLayer
@@ -979,7 +979,10 @@ def fully_shard(
             for param in self.parameters():
                 param.requires_grad = False
 
-        if self.ep_mesh.size() > 1:
+        tp_enabled = self.expert_tp_mesh is not None and self.expert_tp_mesh.size() > 1
+        if self.ep_mesh.size() > 1 or tp_enabled:
+            # 中文注释：不开 EP 但开启 expert TP 时，非 expert 参数仍是 TP rank 间的逻辑副本，
+            # 需要显式放到 Replicate DTensor 上，后续梯度才会跨 expert TP 平均。
             self._replicate_other_params(self)
 
         # Although rotary_emb was already constructed in __init__, it was built on the meta device.
@@ -1104,10 +1107,13 @@ def scale_and_reduce_grad(self):
             if param.grad is None:
                 continue
 
-            ep_enabled = self.ep_mesh is not None and self.ep_mesh.size() > 1
-            # Scale moe parameters
-            if ep_enabled and ".experts" in name:
-                param.grad.div_(self.ep_mesh.size() * self.config.expert_tp_size)  # type: ignore
+            expert_parallel_size = (
+                self.ep_mesh.size() if self.ep_mesh is not None else 1
+            ) * self.config.expert_tp_size
+            # 中文注释：expert 参数会在 EP 和 expert TP 维度上看到全量 token 梯度和，
+            # 需要按参与该 expert 计算的 rank 数平均，才能对齐普通 DP/FSDP baseline。
+            if expert_parallel_size > 1 and ".experts" in name:
+                param.grad.div_(expert_parallel_size)  # type: ignore
                 continue
 
             if isinstance(param, DTensor):
@@ -1164,11 +1170,11 @@ def cal_grad_norm(self, grads: list[DTensor], dtype=torch.float32):
                         raise ValueError(f"Unsupported placement type {placement} in clip_grad_norm")
 
             if self.config.expert_tp_size > 1 and ".experts" in name:
-                assert self.ep_mesh is not None and self.tp_mesh is not None
+                assert self.ep_mesh is not None and self.expert_tp_mesh is not None
                 # expert 参数的 EP / expert TP 分片不是 DTensor placement，
                 # norm square 需要显式跨这两个维度求和，clip 系数才是全局的。
                 dist.all_reduce(local_norm_squared, op=ReduceOp.SUM, group=self.ep_mesh.get_group())
-                dist.all_reduce(local_norm_squared, op=ReduceOp.SUM, group=self.tp_mesh.get_group())
+                dist.all_reduce(local_norm_squared, op=ReduceOp.SUM, group=self.expert_tp_mesh.get_group())
 
             total_norm_squared += local_norm_squared
 
@@ -1192,7 +1198,7 @@ def _init_device_mesh(self, fsdp_config: FSDPConfig):
                     mesh_dim_names=(
                         f"{self.config.mesh_prefix}.fsdp",
                         f"{self.config.mesh_prefix}.ep",
-                        f"{self.config.mesh_prefix}.tp",
+                        f"{self.config.mesh_prefix}.etp",
                     ),
                 )
             else:
@@ -1240,12 +1246,12 @@ def _init_device_mesh(self, fsdp_config: FSDPConfig):
                 self.ep_mesh = model_mesh[f"{self.config.mesh_prefix}.ep"]
 
             if expert_tp_size > 1:
-                new_tp_mesh = model_mesh[f"{self.config.mesh_prefix}.tp"]
-                if self.tp_mesh is not None:
-                    assert new_tp_mesh.mesh_dim_names == self.tp_mesh.mesh_dim_names
-                    assert torch.equal(self.tp_mesh.mesh, new_tp_mesh.mesh)
+                new_expert_tp_mesh = model_mesh[f"{self.config.mesh_prefix}.etp"]
+                if self.expert_tp_mesh is not None:
+                    assert new_expert_tp_mesh.mesh_dim_names == self.expert_tp_mesh.mesh_dim_names
+                    assert torch.equal(self.expert_tp_mesh.mesh, new_expert_tp_mesh.mesh)
                 else:
-                    self.tp_mesh = new_tp_mesh
+                    self.expert_tp_mesh = new_expert_tp_mesh
 
             self.fsdp_mesh = model_mesh[f"{self.config.mesh_prefix}.fsdp"]
         else:
@@ -1278,14 +1284,14 @@ def traverse(module: nn.Module) -> None:
                 assert self.ep_mesh is not None
                 replicate_mesh = self.ep_mesh
                 placements = [Replicate()]
-                if self.tp_mesh is not None and self.tp_mesh.size() > 1:
+                if self.expert_tp_mesh is not None and self.expert_tp_mesh.size() > 1:
                     assert self._world_mesh is not None
                     # 非 expert 参数在 EP 和 expert TP 上都是逻辑副本。
                     # FSDP 只支持一维 TP/Replicate 布局，所以这里先把
                     # EP x expert TP 子网格压平成一个 Replicate 维度。
                     replicate_mesh = self._world_mesh[
-                        (f"{self.config.mesh_prefix}.ep", f"{self.config.mesh_prefix}.tp")
-                    ]._flatten(mesh_dim_name=f"{self.config.mesh_prefix}.ep_tp")
+                        (f"{self.config.mesh_prefix}.ep", f"{self.config.mesh_prefix}.etp")
+                    ]._flatten(mesh_dim_name=f"{self.config.mesh_prefix}.ep_etp")
                 dist_param = nn.Parameter(
                     distribute_tensor(param, replicate_mesh, placements),
                     requires_grad=param.requires_grad,
diff --git a/xtuner/v1/module/decoder_layer/moe_decoder_layer.py b/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
index b0deb154a..7e7bb8c8b 100644
--- a/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
+++ b/xtuner/v1/module/decoder_layer/moe_decoder_layer.py
@@ -150,7 +150,7 @@ def __init__(
         n_routed_experts: int,
         moe_bias: bool = False,
         ep_mesh: DeviceMesh | None = None,
-        tp_mesh: DeviceMesh | None = None,
+        expert_tp_mesh: DeviceMesh | None = None,
         float8_cfg: Float8Config | None = None,
         moe_act_fn_cfg: MoEActFnConfig,
     ):
@@ -167,7 +167,7 @@ def __init__(
             self.num_routed_experts,
             moe_bias=moe_bias,
             ep_mesh=self.ep_mesh,
-            tp_mesh=tp_mesh,
+            expert_tp_mesh=expert_tp_mesh,
             parallel_style="column",
             float8_cfg=float8_cfg,
         )
@@ -177,7 +177,7 @@ def __init__(
             self.num_routed_experts,
             moe_bias=moe_bias,
             ep_mesh=self.ep_mesh,
-            tp_mesh=tp_mesh,
+            expert_tp_mesh=expert_tp_mesh,
             parallel_style="row",
             float8_cfg=float8_cfg,
         )
@@ -220,7 +220,7 @@ def __init__(
         layer_idx: int = 0,
         dispatcher: Literal["deepep", "all2all", "agrs"] | None,
         ep_mesh: DeviceMesh | None = None,
-        tp_mesh: DeviceMesh | None = None,
+        expert_tp_mesh: DeviceMesh | None = None,
     ):
         super().__init__()
         self.ep_mesh = ep_mesh
@@ -274,13 +274,13 @@ def __init__(
             n_routed_experts=n_routed_experts,
             moe_bias=moe_bias,
             ep_mesh=ep_mesh,
-            tp_mesh=tp_mesh,
+            expert_tp_mesh=expert_tp_mesh,
             float8_cfg=float8_cfg,
             moe_act_fn_cfg=moe_act_fn_cfg,
         )
         # TODO: (yehaochen) Maybe should be replaced by build_dispatcher
         process_group = ep_mesh.get_group() if ep_mesh is not None else None
-        tp_group = tp_mesh.get_group() if tp_mesh is not None else None
+        tp_group = expert_tp_mesh.get_group() if expert_tp_mesh is not None else None
         self.dispatcher = build_dispatcher(
             dispatcher=dispatcher,
             n_routed_experts=n_routed_experts,
diff --git a/xtuner/v1/module/grouped_linear/moe_group_linear.py b/xtuner/v1/module/grouped_linear/moe_group_linear.py
index 2887c1958..eb1321988 100644
--- a/xtuner/v1/module/grouped_linear/moe_group_linear.py
+++ b/xtuner/v1/module/grouped_linear/moe_group_linear.py
@@ -22,7 +22,7 @@ def __init__(
         num_routed_experts: int,
         moe_bias: bool = False,
         ep_mesh: DeviceMesh | None = None,
-        tp_mesh: DeviceMesh | None = None,
+        expert_tp_mesh: DeviceMesh | None = None,
         parallel_style: GroupedLinearParallelStyle | None = None,
     ):
         super().__init__()
@@ -31,15 +31,15 @@ def __init__(
         self.num_routed_experts = num_routed_experts
 
         self.ep_mesh = ep_mesh
-        self.tp_mesh = tp_mesh
+        self.expert_tp_mesh = expert_tp_mesh
         self.parallel_style: GroupedLinearParallelStyle | None = parallel_style
         self.ep_size = ep_mesh.size() if ep_mesh is not None else 1
-        self.tp_size = tp_mesh.size() if tp_mesh is not None else 1
+        self.tp_size = expert_tp_mesh.size() if expert_tp_mesh is not None else 1
         self.ep_rank = ep_mesh.get_local_rank() if ep_mesh is not None else 0
-        self.tp_rank = tp_mesh.get_local_rank() if tp_mesh is not None else 0
-        self.tp_enabled = self.tp_mesh is not None and self.tp_size > 1 and self.parallel_style is not None
-        if self.tp_mesh is not None and self.tp_mesh.size() > 1 and self.parallel_style is None:
-            raise ValueError("parallel_style must be set when tp_mesh size is greater than 1.")
+        self.tp_rank = expert_tp_mesh.get_local_rank() if expert_tp_mesh is not None else 0
+        self.tp_enabled = self.expert_tp_mesh is not None and self.tp_size > 1 and self.parallel_style is not None
+        if self.expert_tp_mesh is not None and self.expert_tp_mesh.size() > 1 and self.parallel_style is None:
+            raise ValueError("parallel_style must be set when expert_tp_mesh size is greater than 1.")
         if self.num_routed_experts % self.ep_size != 0:
             raise ValueError(
                 f"num_routed_experts ({self.num_routed_experts}) must be divisible by ep_size ({self.ep_size})."
@@ -106,7 +106,7 @@ def build_grouped_linear(
     num_routed_experts: int,
     moe_bias: bool = False,
     ep_mesh: DeviceMesh | None = None,
-    tp_mesh: DeviceMesh | None = None,
+    expert_tp_mesh: DeviceMesh | None = None,
     parallel_style: GroupedLinearParallelStyle | None = None,
     float8_cfg: Float8Config | None = None,
 ):
@@ -118,12 +118,12 @@ def build_grouped_linear(
             num_routed_experts,
             moe_bias=moe_bias,
             ep_mesh=ep_mesh,
-            tp_mesh=tp_mesh,
+            expert_tp_mesh=expert_tp_mesh,
             parallel_style=parallel_style,
         )
     elif float8_cfg.scaling_granularity_grouped_gemm == ScalingGranularity.TILEWISE:
-        if tp_mesh is not None and tp_mesh.size() > 1:
-            raise NotImplementedError("Tile-wise float8 grouped linear does not support TP sharding yet.")
+        if expert_tp_mesh is not None and expert_tp_mesh.size() > 1:
+            raise NotImplementedError("Tile-wise float8 grouped linear does not support expert TP sharding yet.")
         return TileWiseFloat8GroupedLinear(
             in_features, out_features, num_routed_experts, moe_bias=moe_bias, ep_mesh=ep_mesh
         )

From 92741d038b7c904a97b2e9aae2d8b0ca6bcea56f Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 03:46:11 +0000
Subject: [PATCH 16/25] Add Domino ExpertTP-only engine test

---
 tests/engine/test_moe_train_engine_tpep.py | 206 ++++++++++++++++++++-
 1 file changed, 198 insertions(+), 8 deletions(-)

diff --git a/tests/engine/test_moe_train_engine_tpep.py b/tests/engine/test_moe_train_engine_tpep.py
index 56b000680..65bcf4d1c 100644
--- a/tests/engine/test_moe_train_engine_tpep.py
+++ b/tests/engine/test_moe_train_engine_tpep.py
@@ -98,7 +98,12 @@ def _build_tiny_moe_cfg(ep_size: int = 1, expert_tp_size: int = 1) -> Qwen3MoECo
     )
 
 
-def _build_engine(ep_size: int, expert_tp_size: int, data_tp_size: int = 1) -> TrainEngine:
+def _build_engine(
+    ep_size: int,
+    expert_tp_size: int,
+    data_tp_size: int = 1,
+    intra_layer_micro_batch: int = 1,
+) -> TrainEngine:
     moe_cfg = _build_tiny_moe_cfg(ep_size, expert_tp_size)
     optim_cfg = AdamWConfig()
     fsdp_cfg = FSDPConfig(
@@ -106,7 +111,12 @@ def _build_engine(ep_size: int, expert_tp_size: int, data_tp_size: int = 1) -> T
         tp_size=data_tp_size,
         cpu_offload=False,
     )
-    return TrainEngine(model_cfg=moe_cfg, optim_cfg=optim_cfg, fsdp_cfg=fsdp_cfg)
+    return TrainEngine(
+        model_cfg=moe_cfg,
+        optim_cfg=optim_cfg,
+        fsdp_cfg=fsdp_cfg,
+        intra_layer_micro_batch=intra_layer_micro_batch,
+    )
 
 
 def _make_engine_input(device: torch.device, seed_offset: int = 0) -> tuple[torch.Tensor, torch.Tensor]:
@@ -160,19 +170,122 @@ def _run_train_step_without_clip(
     input_ids: torch.Tensor,
     labels: torch.Tensor,
 ) -> float:
-    seq_ctx = SequenceContext.from_input_ids((input_ids,), device=DEVICE)
-    shifted_labels = labels.to(DEVICE)
+    engine_input = _make_engine_items(loss_cfg, [(input_ids, labels)])
+    step_info = engine.train_step(engine_input)
+    return step_info["logs_info"]["reduced_llm_loss"]
+
+
+def _make_engine_items(
+    loss_cfg: CELossConfig,
+    batches: list[tuple[torch.Tensor, torch.Tensor]],
+) -> list[ModelItem]:
+    loss_ctx_list = []
+    seq_ctx_list = []
+    for input_ids, labels in batches:
+        seq_ctx_list.append(SequenceContext.from_input_ids((input_ids,), device=DEVICE))
+        shifted_labels = labels.to(DEVICE)
+        loss_ctx_list.append(loss_cfg.build(data={"shifted_labels": shifted_labels}, sp_mesh=None))
 
     LossContext = loss_cfg.loss_ctx_cls
-    loss_ctx = loss_cfg.build(data={"shifted_labels": shifted_labels}, sp_mesh=None)
-    loss_ctx_list = LossContext.build_batches([loss_ctx])
-    loss_ctx = loss_ctx_list[0]
+    loss_ctx_list = LossContext.build_batches(loss_ctx_list)
+    return [
+        ModelItem(seq_ctx=seq_ctx, loss_ctx={"lm": loss_ctx})
+        for seq_ctx, loss_ctx in zip(seq_ctx_list, loss_ctx_list)
+    ]
 
-    engine_input = [ModelItem(seq_ctx=seq_ctx, loss_ctx={"lm": loss_ctx})]
+
+def _run_train_step_items_without_clip(
+    engine: TrainEngine,
+    loss_cfg: CELossConfig,
+    batches: list[tuple[torch.Tensor, torch.Tensor]],
+) -> float:
+    engine_input = _make_engine_items(loss_cfg, batches)
     step_info = engine.train_step(engine_input)
     return step_info["logs_info"]["reduced_llm_loss"]
 
 
+def _record_expert_tp_collective_stages(engine: TrainEngine) -> dict[str, list[str]]:
+    stages: dict[str, list[str]] = {
+        "async_op_true": [],
+        "async_all_gather": [],
+        "async_all_gather_metadata": [],
+        "async_reduce_scatter_sum": [],
+    }
+    current_stage: list[str] = []
+
+    for layer in engine.model.layers.values():
+        dispatcher = layer.dispatcher
+        expert_tp = dispatcher._expert_tp
+        if expert_tp is None:
+            continue
+
+        for stage_name in (
+            "dispatch_preprocess",
+            "dispatch",
+            "dispatch_postprocess",
+            "combine_preprocess",
+            "combine",
+            "combine_postprocess",
+        ):
+            original_stage = getattr(dispatcher, stage_name)
+
+            def stage_wrapper(*args, _original_stage=original_stage, _stage_name=stage_name, **kwargs):
+                if kwargs.get("async_op", False):
+                    stages["async_op_true"].append(_stage_name)
+                current_stage.append(_stage_name)
+                try:
+                    return _original_stage(*args, **kwargs)
+                finally:
+                    current_stage.pop()
+
+            setattr(dispatcher, stage_name, stage_wrapper)
+
+        for collective_name in (
+            "async_all_gather",
+            "async_all_gather_metadata",
+            "async_reduce_scatter_sum",
+        ):
+            original_collective = getattr(expert_tp, collective_name)
+
+            def collective_wrapper(
+                *args,
+                _original_collective=original_collective,
+                _collective_name=collective_name,
+                **kwargs,
+            ):
+                stages[_collective_name].append(current_stage[-1] if current_stage else "<outside>")
+                return _original_collective(*args, **kwargs)
+
+            setattr(expert_tp, collective_name, collective_wrapper)
+
+    return stages
+
+
+def _assert_domino_expert_tp_collective_stages(stages: dict[str, list[str]]) -> None:
+    assert set(stages["async_op_true"]) == {
+        "dispatch_preprocess",
+        "dispatch",
+        "dispatch_postprocess",
+        "combine_preprocess",
+        "combine",
+        "combine_postprocess",
+    }
+    assert stages["async_all_gather"]
+    assert stages["async_all_gather_metadata"]
+    assert stages["async_reduce_scatter_sum"]
+    assert set(stages["async_all_gather"]) == {"dispatch"}
+    assert set(stages["async_all_gather_metadata"]) == {"dispatch"}
+    assert set(stages["async_reduce_scatter_sum"]) == {"combine"}
+
+
+def _assert_rank_inputs_are_distinct(batches: list[tuple[torch.Tensor, torch.Tensor]]) -> None:
+    local_input_ids = tuple(tuple(input_ids.detach().cpu().reshape(-1).tolist()) for input_ids, _ in batches)
+    gathered_input_ids: list[tuple[tuple[int, ...], ...] | None] = [None for _ in range(dist.get_world_size())]
+    dist.all_gather_object(gathered_input_ids, local_input_ids)
+    # ExpertTP-only 下每个 TP rank 使用不同样本，避免重复输入掩盖 shard 问题。
+    assert len(set(gathered_input_ids)) == len(gathered_input_ids)
+
+
 def _get_param_grad(engine: TrainEngine, name_suffix: str) -> torch.Tensor:
     for name, param in engine.model.named_parameters():
         if _canonical_name(name).endswith(name_suffix):
@@ -245,6 +358,22 @@ def _sync_engine_weights(engine_ref: TrainEngine, engine_tpep: TrainEngine) -> N
                 _copy_param_from_full(param, ref_full)
 
 
+def _copy_matching_engine_weights(engine_src: TrainEngine, engine_dst: TrainEngine) -> None:
+    """Copy weights between engines that already use the same parameter layout."""
+    src_params = dict(engine_src.model.named_parameters())
+
+    with torch.no_grad():
+        for name, dst_param in engine_dst.model.named_parameters():
+            src_param = src_params[name].detach()
+            if isinstance(dst_param, DTensor):
+                assert isinstance(src_param, DTensor), f"Parameter layout mismatch for {name}"
+                # 两个 engine 的并行布局相同，直接拷贝本 rank 的 DTensor shard。
+                dst_param.copy_(src_param.to(dtype=dst_param.dtype))
+            else:
+                src_tensor = _full_tensor(src_param).to(device=dst_param.device, dtype=dst_param.dtype)
+                dst_param.copy_(src_tensor)
+
+
 def _slice_tpep_weight(grouped_linear: GroupedLinear, full_weight: torch.Tensor, *, fused_gate_up: bool) -> torch.Tensor:
     num_experts = grouped_linear.num_routed_experts
     out_features = grouped_linear.out_features
@@ -456,6 +585,67 @@ def test_expert_tp_only_expert_grad_norm_matches_single_with_distinct_source_sli
         except Exception:
             pass
 
+    @parametrize.parametrize(
+        "device,expert_tp_size",
+        [
+            ("cuda", 2),
+        ],
+    )
+    def test_expert_tp_only_domino_micro_batch_matches_sync_baseline(
+        self, device: str, expert_tp_size: int
+    ) -> None:
+        pg = self.create_pg(device)
+
+        engine_ref = _build_engine(ep_size=1, expert_tp_size=expert_tp_size)
+        engine_ref.init_model_weights()
+
+        engine_domino = _build_engine(
+            ep_size=1,
+            expert_tp_size=expert_tp_size,
+            intra_layer_micro_batch=2,
+        )
+        engine_domino.init_model_weights()
+        _copy_matching_engine_weights(engine_ref, engine_domino)
+        collective_stages = _record_expert_tp_collective_stages(engine_domino)
+        dist.barrier()
+
+        device_obj = torch.device(device, dist.get_rank() % torch.cuda.device_count())
+        batches = [
+            _make_engine_input(device_obj, seed_offset=dist.get_rank() * 2),
+            _make_engine_input(device_obj, seed_offset=dist.get_rank() * 2 + 1),
+        ]
+        _assert_rank_inputs_are_distinct(batches)
+        loss_cfg = CELossConfig()
+
+        loss_domino = _run_train_step_items_without_clip(engine_domino, loss_cfg, batches)
+        norm_domino = engine_domino.clip_grad_norm(do_clip=False).detach().float().cpu()
+
+        loss_ref = _run_train_step_items_without_clip(engine_ref, loss_cfg, batches)
+        norm_ref = engine_ref.clip_grad_norm(do_clip=False).detach().float().cpu()
+
+        _assert_domino_expert_tp_collective_stages(collective_stages)
+        torch.testing.assert_close(
+            torch.tensor(loss_domino),
+            torch.tensor(loss_ref),
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        torch.testing.assert_close(
+            norm_domino,
+            norm_ref,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        assert torch.isfinite(torch.tensor(loss_domino))
+        assert torch.isfinite(norm_domino)
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
     @property
     def world_size(self) -> int:
         # ExpertTP-only topology: EP=1, TP=2, DP=1.

From aa5c9b8a234b052af1eb0ede7722598e210dc54c Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 08:19:57 +0000
Subject: [PATCH 17/25] Share ExpertTP row collectives in All2All dispatcher

---
 CONTEXT.md                                    |  47 +--
 megatron_tp_ep.md                             |   6 +-
 tests/engine/test_moe_train_engine_tpep.py    | 110 +++++-
 .../module/dispatcher/test_noep_expert_tp.py  |   8 +-
 .../test_torch_all2all_shared_expert_tp.py    | 324 ++++++++++++++++++
 .../test_torch_all2all_tpep_async.py          |  18 +-
 xtuner/v1/module/dispatcher/__init__.py       |   9 +-
 xtuner/v1/module/dispatcher/base.py           |  36 +-
 xtuner/v1/module/dispatcher/expert_tp.py      | 199 +++++++----
 xtuner/v1/module/dispatcher/torch_all2all.py  |  88 ++++-
 .../module/dispatcher/torch_all2all_tpep.py   | 208 +++++------
 xtuner_ep_dispatcher.md                       |  48 +--
 xtuner_ep_domino.md                           |  16 +-
 13 files changed, 838 insertions(+), 279 deletions(-)
 create mode 100644 tests/module/dispatcher/test_torch_all2all_shared_expert_tp.py

diff --git a/CONTEXT.md b/CONTEXT.md
index 298b15b1c..e39ca6501 100644
--- a/CONTEXT.md
+++ b/CONTEXT.md
@@ -4,66 +4,67 @@ This context describes the communication language used by XTuner MoE dispatchers
 
 ## Language
 
-**TP ReduceScatterSum**:
+**TP ReduceScatterRowsSum**:
 对同一 TP group 中完整 token 批的 hidden 做 SUM 归约，并只保留当前 TP rank 负责的 token slice 的通信语义。
 _Avoid_: all_reduce + slice
 
-**Variable TP ReduceScatterSum**:
-使用 **TP size meta** 描述不等长 token slice 的 **TP ReduceScatterSum**。
+**Variable TP ReduceScatterRowsSum**:
+使用 **TP rank row counts** 描述不等长 token slice 的 **TP ReduceScatterRowsSum**。
 _Avoid_: equal-only reduce scatter
 
-**TP size meta**:
-每个 expert TP rank 在 TP AllGather 前、当前 dispatcher token 空间中拥有的 token 行数列表，用来描述变长 TP token slice 的拼接和切分边界。
+**TP rank row counts**:
+每个 expert TP rank 在 TP AllGather 前、当前 dispatcher token 空间中拥有的 token 行数列表。代码中叫
+`tp_rank_row_counts`，用来描述变长 TP token slice 的拼接和切分边界。
 _Avoid_: shape hack, split list
 
 **Token-sliced Expert TP**:
-expert MLP 权重按 TP 切分，并让每个 expert TP rank 只保留自己的 token slice；expert 前用 **TP AllGather** 得到完整 token 批，expert 后用 **TP ReduceScatterSum** 回到本 rank 的 token slice。
+expert MLP 权重按 TP 切分，并让每个 expert TP rank 只保留自己的 token slice；expert 前用 **TP AllGather** 得到完整 token 批，expert 后用 **TP ReduceScatterRowsSum** 回到本 rank 的 token slice。
 _Also called_: ExpertTP in dispatcher code
 _Avoid_: replicated-token expert TP
 
 **Domino-compatible ExpertTP**:
-让 **Token-sliced Expert TP** 的 **TP AllGather** 属于 dispatcher dispatch 通信段，让 **TP ReduceScatterSum** 属于 dispatcher combine 通信段，从而能被 Domino micro-batch 流水隐藏的 MoE expert TP 语义。
+让 **Token-sliced Expert TP** 的 **TP AllGather** 属于 dispatcher dispatch 通信段，让 **TP ReduceScatterRowsSum** 属于 dispatcher combine 通信段，从而能被 Domino micro-batch 流水隐藏的 MoE expert TP 语义。
 _Avoid_: attention TP, dense MLP TP
 
 ## Relationships
 
-- **TP AllGather** 的反向通信是 **TP ReduceScatterSum**。
-- **TP ReduceScatterSum** 的反向通信是 **TP AllGather**。
-- **TP size meta** 定义 **TP ReduceScatterSum** 输出给每个 TP rank 的 token slice 边界。
-- **Token-sliced Expert TP** 是 `expert_tp_size > 1` 的默认语义；`ep_size=1` 时 EP AllToAll 退化为空，但 TP AllGather / TP ReduceScatterSum 仍然保留。
-- **Variable TP ReduceScatterSum** 是 routed MoE token-sliced expert TP 下的默认语义；等长 fast path 只是实现优化。
-- **TP ReduceScatterSum** 的实现策略应集中在一个共享核心函数中，避免 combine forward 和 TP AllGather backward 分叉。
-- **TP ReduceScatterSum** 的输出 shape 严格由当前 TP rank 的 **TP size meta** 决定，允许 0 行，不引入 padding 或 capacity。
+- **TP AllGather** 的反向通信是 **TP ReduceScatterRowsSum**。
+- **TP ReduceScatterRowsSum** 的反向通信是 **TP AllGather**。
+- **TP rank row counts** 定义 **TP ReduceScatterRowsSum** 输出给每个 TP rank 的 token slice 边界。
+- **Token-sliced Expert TP** 是 `expert_tp_size > 1` 的默认语义；`ep_size=1` 时 EP AllToAll 退化为空，但 TP AllGather / TP ReduceScatterRowsSum 仍然保留。
+- **Variable TP ReduceScatterRowsSum** 是 routed MoE token-sliced expert TP 下的默认语义；等长 fast path 只是实现优化。
+- **TP ReduceScatterRowsSum** 的实现策略应集中在一个共享核心函数中，避免 combine forward 和 TP AllGather backward 分叉。
+- **TP ReduceScatterRowsSum** 的输出 shape 严格由当前 TP rank 的 **TP rank row counts** 决定，允许 0 行，不引入 padding 或 capacity。
 - 当 `ep_size=1` 且 `expert_tp_size>1` 时，expert ownership 维度仍然存在，只是大小为 1；所有 routed experts 都属于这个唯一 EP rank。
-- 在 Naive routing + **Token-sliced Expert TP** 下，**TP size meta** 记录 source token rows；在 EP routing + **Token-sliced Expert TP** 下，**TP size meta** 记录 EP routing 后的 route-copy rows。
-- **Token-sliced Expert TP** 的异步边界由 TP AllGather 和 **TP ReduceScatterSum** 定义；这个边界不依赖 EP 是否开启。
+- 在 Naive routing + **Token-sliced Expert TP** 下，**TP rank row counts** 记录 source token rows；在 EP routing + **Token-sliced Expert TP** 下，**TP rank row counts** 记录 EP routing 后的 route-copy rows。
+- **Token-sliced Expert TP** 的异步边界由 TP AllGather 和 **TP ReduceScatterRowsSum** 定义；这个边界不依赖 EP 是否开启。
 - 当前支持范围是 Naive routing + **Token-sliced Expert TP** 和 All2All routing + **Token-sliced Expert TP**；DeepEP routing + **Token-sliced Expert TP** 暂不作为目标语义。
 - **Domino-compatible ExpertTP** 只覆盖 MoE routed experts 的 **Token-sliced Expert TP** 通信隐藏，不表示 attention 或 dense MLP 的普通 TP。
 - 进入 routed experts 前，每个 expert TP rank 已经持有不重复的 source token slice；这些 slice 可以来自不同样本，也可以来自同一样本的不同序列片段。
 
 ## Example dialogue
 
-> **Dev:** "combine forward 和 TP AllGather backward 都能叫 **TP ReduceScatterSum** 吗？"
+> **Dev:** "combine forward 和 TP AllGather backward 都能叫 **TP ReduceScatterRowsSum** 吗？"
 > **Domain expert:** "可以。它们都是先跨 TP rank 做 SUM，再只保留当前 rank 的 token slice。具体用 reduce_scatter 还是 all_reduce + slice 是实现细节。"
 
 > **Dev:** "只支持等长 reduce scatter 够吗？"
-> **Domain expert:** "不够。EP routing 后每个 TP rank 的 token 数可能不同，默认要按 **TP size meta** 做 **Variable TP ReduceScatterSum**。"
+> **Domain expert:** "不够。EP routing 后每个 TP rank 的 token 数可能不同，默认要按 **TP rank row counts** 做 **Variable TP ReduceScatterRowsSum**。"
 
 > **Dev:** "等长和变长 reduce scatter 要不要分别写在不同调用点？"
-> **Domain expert:** "不要。调用点只表达 **TP ReduceScatterSum**，共享核心函数内部选择等长 fast path 或变长路径。"
+> **Domain expert:** "不要。调用点只表达 **TP ReduceScatterRowsSum**，共享核心函数内部选择等长 fast path 或变长路径。"
 
 > **Dev:** "如果某个 TP rank 没有 token，要不要 pad 到 1 行或固定容量？"
-> **Domain expert:** "不要。**TP ReduceScatterSum** 输出真实 token slice，0 行就是合法输出。"
+> **Domain expert:** "不要。**TP ReduceScatterRowsSum** 输出真实 token slice，0 行就是合法输出。"
 
 > **Dev:** "不开 EP 只开 expert TP 时，是不是可以让每个 TP rank 都持有完整 token 批，最后做 all-reduce？"
 > **Domain expert:** "不采用这个语义。无 EP expert TP 仍然是 **Token-sliced Expert TP**：前向按 TP token slice 进入 dispatcher，expert 前 all-gather，expert 后 reduce-scatter。"
 
 > **Dev:** "Naive routing + expert TP 时，TP AllGather 是 gather source tokens，还是 gather topK 展开后的 route-copy tokens？"
-> **Domain expert:** "gather source tokens。topK route-copy 展开仍然发生在 expert layout 阶段；expert 输出先 fold 回 source token partial output，再做 **TP ReduceScatterSum**。"
+> **Domain expert:** "gather source tokens。topK route-copy 展开仍然发生在 expert layout 阶段；expert 输出先 fold 回 source token partial output，再做 **TP ReduceScatterRowsSum**。"
 
 > **Dev:** "Naive routing + expert TP 的异步路径要不要和 EP routing + expert TP 使用同一套分段语义？"
-> **Domain expert:** "要。Naive routing 没有 EP AllToAll，但 **TP AllGather** 和 **TP ReduceScatterSum** 仍然是 dispatcher 通信段，异步依赖边界应保持一致。"
+> **Domain expert:** "要。Naive routing 没有 EP AllToAll，但 **TP AllGather** 和 **TP ReduceScatterRowsSum** 仍然是 dispatcher 通信段，异步依赖边界应保持一致。"
 
 ## Flagged ambiguities
 
-- "reduce scatter" 在本上下文中特指 **TP ReduceScatterSum**；不是只做 scatter，也不是不带 SUM 的切分。
+- "reduce scatter" 在本上下文中特指 **TP ReduceScatterRowsSum**；不是只做 scatter，也不是不带 SUM 的切分。
diff --git a/megatron_tp_ep.md b/megatron_tp_ep.md
index e255948fd..2f099abe1 100644
--- a/megatron_tp_ep.md
+++ b/megatron_tp_ep.md
@@ -23,7 +23,7 @@
 
 - `input_splits [EP]`：本 rank 要向各 EP rank 发送多少 token
 - `output_splits [EP]`：本 rank 将从各 EP rank 收到多少 token（仅计我的 TP 切片）
-- `output_splits_tp [TP]`：EP A2A 后，各 TP rank 各持有多少 token（用于后续 AllGather 的不等分）
+- `tp_rank_row_counts [TP]`：EP A2A 后，各 TP rank 各持有多少 token（用于后续 AllGather 的不等分）
 - `num_global_tokens_per_local_expert_cpu`：每个本地专家将处理多少 token（用于 sort_chunks）
 
 ---
@@ -60,7 +60,7 @@ all_to_all(ep_group,
 if self.tp_size > 1:
     global_input_tokens = gather_from_sequence_parallel_region(
         global_input_tokens, group=tp_group,
-        output_split_sizes=output_splits_tp.tolist()
+        output_split_sizes=tp_rank_row_counts.tolist()
     )
 → global_input_tokens [M_total, H]
 ```
@@ -118,7 +118,7 @@ if self.num_local_experts > 1:
 if self.tp_size > 1:
     hidden_states = reduce_scatter_to_sequence_parallel_region(
         hidden_states, group=tp_group,
-        input_split_sizes=output_splits_tp.tolist()
+        input_split_sizes=tp_rank_row_counts.tolist()
     )
 → [M_ep_recv, H]
 ```
diff --git a/tests/engine/test_moe_train_engine_tpep.py b/tests/engine/test_moe_train_engine_tpep.py
index 65bcf4d1c..a00da8dfc 100644
--- a/tests/engine/test_moe_train_engine_tpep.py
+++ b/tests/engine/test_moe_train_engine_tpep.py
@@ -42,6 +42,8 @@
 from xtuner.v1.loss.ce_loss import CELossConfig
 from xtuner.v1.module.attention import MHAConfig
 from xtuner.v1.module.dispatcher.base import NaiveDispatcher
+from xtuner.v1.module.dispatcher.torch_all2all import TorchAll2AllDispatcher
+from xtuner.v1.module.dispatcher.torch_all2all_tpep import TorchAll2AllTPEPDispatcher
 from xtuner.v1.module.grouped_linear.moe_group_linear import GroupedLinear
 from xtuner.v1.module.router.greedy import GreedyRouterConfig
 from xtuner.v1.model.base import ModelItem
@@ -207,9 +209,10 @@ def _run_train_step_items_without_clip(
 def _record_expert_tp_collective_stages(engine: TrainEngine) -> dict[str, list[str]]:
     stages: dict[str, list[str]] = {
         "async_op_true": [],
-        "async_all_gather": [],
-        "async_all_gather_metadata": [],
-        "async_reduce_scatter_sum": [],
+        "async_all_gather_rows": [],
+        "async_all_gather_row_metadata": [],
+        "async_all_gather_per_rank_metadata": [],
+        "async_reduce_scatter_rows_sum": [],
     }
     current_stage: list[str] = []
 
@@ -241,9 +244,10 @@ def stage_wrapper(*args, _original_stage=original_stage, _stage_name=stage_name,
             setattr(dispatcher, stage_name, stage_wrapper)
 
         for collective_name in (
-            "async_all_gather",
-            "async_all_gather_metadata",
-            "async_reduce_scatter_sum",
+            "async_all_gather_rows",
+            "async_all_gather_row_metadata",
+            "async_all_gather_per_rank_metadata",
+            "async_reduce_scatter_rows_sum",
         ):
             original_collective = getattr(expert_tp, collective_name)
 
@@ -270,12 +274,29 @@ def _assert_domino_expert_tp_collective_stages(stages: dict[str, list[str]]) ->
         "combine",
         "combine_postprocess",
     }
-    assert stages["async_all_gather"]
-    assert stages["async_all_gather_metadata"]
-    assert stages["async_reduce_scatter_sum"]
-    assert set(stages["async_all_gather"]) == {"dispatch"}
-    assert set(stages["async_all_gather_metadata"]) == {"dispatch"}
-    assert set(stages["async_reduce_scatter_sum"]) == {"combine"}
+    assert stages["async_all_gather_rows"]
+    assert stages["async_all_gather_row_metadata"]
+    assert stages["async_reduce_scatter_rows_sum"]
+    assert set(stages["async_all_gather_rows"]) == {"dispatch"}
+    assert set(stages["async_all_gather_row_metadata"]) == {"dispatch"}
+    assert set(stages["async_reduce_scatter_rows_sum"]) == {"combine"}
+
+
+def _assert_domino_all2all_expert_tp_collective_stages(stages: dict[str, list[str]]) -> None:
+    assert set(stages["async_op_true"]) == {
+        "dispatch_preprocess",
+        "dispatch",
+        "dispatch_postprocess",
+        "combine_preprocess",
+        "combine",
+        "combine_postprocess",
+    }
+    assert stages["async_all_gather_rows"]
+    assert stages["async_all_gather_per_rank_metadata"]
+    assert stages["async_reduce_scatter_rows_sum"]
+    assert set(stages["async_all_gather_rows"]) == {"dispatch"}
+    assert set(stages["async_all_gather_per_rank_metadata"]) == {"dispatch"}
+    assert set(stages["async_reduce_scatter_rows_sum"]) == {"combine"}
 
 
 def _assert_rank_inputs_are_distinct(batches: list[tuple[torch.Tensor, torch.Tensor]]) -> None:
@@ -892,6 +913,71 @@ def test_tpep_expert_only_grad_norm_matches_single_with_distinct_expert_tp_data(
         except Exception:
             pass
 
+    @parametrize.parametrize(
+        "device,ep_size,expert_tp_size",
+        [
+            ("cuda", 2, 2),
+        ],
+    )
+    def test_tpep_domino_micro_batch_matches_sync_baseline(
+        self, device: str, ep_size: int, expert_tp_size: int
+    ) -> None:
+        pg = self.create_pg(device)
+
+        engine_ref = _build_engine(ep_size=ep_size, expert_tp_size=expert_tp_size)
+        engine_ref.init_model_weights()
+
+        engine_domino = _build_engine(
+            ep_size=ep_size,
+            expert_tp_size=expert_tp_size,
+            intra_layer_micro_batch=2,
+        )
+        engine_domino.init_model_weights()
+        _copy_matching_engine_weights(engine_ref, engine_domino)
+
+        for layer in engine_domino.model.layers.values():
+            assert isinstance(layer.dispatcher, TorchAll2AllDispatcher)
+            assert not isinstance(layer.dispatcher, TorchAll2AllTPEPDispatcher)
+        collective_stages = _record_expert_tp_collective_stages(engine_domino)
+        dist.barrier()
+
+        device_obj = torch.device(device, dist.get_rank() % torch.cuda.device_count())
+        batches = [
+            _make_engine_input(device_obj, seed_offset=dist.get_rank() * 2),
+            _make_engine_input(device_obj, seed_offset=dist.get_rank() * 2 + 1),
+        ]
+        _assert_rank_inputs_are_distinct(batches)
+        loss_cfg = CELossConfig()
+
+        loss_domino = _run_train_step_items_without_clip(engine_domino, loss_cfg, batches)
+        norm_domino = engine_domino.clip_grad_norm(do_clip=False).detach().float().cpu()
+
+        loss_ref = _run_train_step_items_without_clip(engine_ref, loss_cfg, batches)
+        norm_ref = engine_ref.clip_grad_norm(do_clip=False).detach().float().cpu()
+
+        _assert_domino_all2all_expert_tp_collective_stages(collective_stages)
+        torch.testing.assert_close(
+            torch.tensor(loss_domino),
+            torch.tensor(loss_ref),
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        torch.testing.assert_close(
+            norm_domino,
+            norm_ref,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        assert torch.isfinite(torch.tensor(loss_domino))
+        assert torch.isfinite(norm_domino)
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
     @parametrize.parametrize(
         "device,ep_size,expert_tp_size",
         [
diff --git a/tests/module/dispatcher/test_noep_expert_tp.py b/tests/module/dispatcher/test_noep_expert_tp.py
index e119ae7ff..5245efca4 100644
--- a/tests/module/dispatcher/test_noep_expert_tp.py
+++ b/tests/module/dispatcher/test_noep_expert_tp.py
@@ -51,7 +51,7 @@ def _run_dispatcher(
         async_op=async_op,
     )
     # 中文注释：dispatcher 测试不跑真实 row-parallel expert；
-    # 每个 TP rank 提供 1/tp_size 的 partial output，真实 ReduceScatterSum 后应回到 baseline。
+    # 每个 TP rank 提供 1/tp_size 的 partial output，真实 ReduceScatterRowsSum 后应回到 baseline。
     experts_results = post_dispatched["hidden_states"] * expert_scale
     pre_combined = dispatcher.combine_preprocess(
         hidden_states=experts_results,
@@ -123,9 +123,9 @@ def test_sync_path_uses_real_tp_collectives(self) -> None:
             expert_scale=1.0 / world_size,
         )
 
-        all_sizes = [tp_rank + 2 for tp_rank in range(world_size)]
-        slice_start = sum(all_sizes[:rank])
-        slice_end = slice_start + all_sizes[rank]
+        tp_rank_row_counts = [tp_rank + 2 for tp_rank in range(world_size)]
+        slice_start = sum(tp_rank_row_counts[:rank])
+        slice_end = slice_start + tp_rank_row_counts[rank]
 
         torch.testing.assert_close(dispatched["hidden_states"], full_hidden)
         torch.testing.assert_close(dispatched["topk_ids"], full_topk_ids)
diff --git a/tests/module/dispatcher/test_torch_all2all_shared_expert_tp.py b/tests/module/dispatcher/test_torch_all2all_shared_expert_tp.py
new file mode 100644
index 000000000..7c2f7809f
--- /dev/null
+++ b/tests/module/dispatcher/test_torch_all2all_shared_expert_tp.py
@@ -0,0 +1,324 @@
+import unittest
+
+import torch
+import torch.distributed as dist
+
+from xtuner._testing import DeterministicDDPTestCase
+from xtuner.v1.module.dispatcher import build_dispatcher
+from xtuner.v1.module.dispatcher.base import DispacherInterface
+from xtuner.v1.module.dispatcher.torch_all2all import TorchAll2AllDispatcher
+from xtuner.v1.module.dispatcher.torch_all2all_tpep import TorchAll2AllTPEPDispatcher
+
+
+def _build_ep_tp_groups(
+    ep_size: int,
+    tp_size: int,
+) -> tuple[dist.ProcessGroup, dist.ProcessGroup, list[dist.ProcessGroup]]:
+    all_groups = []
+    ep_groups = []
+    tp_groups = []
+    for tp_rank in range(tp_size):
+        group = dist.new_group([ep_rank * tp_size + tp_rank for ep_rank in range(ep_size)], backend="nccl")
+        ep_groups.append(group)
+        all_groups.append(group)
+    for ep_rank in range(ep_size):
+        group = dist.new_group([ep_rank * tp_size + tp_rank for tp_rank in range(tp_size)], backend="nccl")
+        tp_groups.append(group)
+        all_groups.append(group)
+
+    rank = dist.get_rank()
+    return ep_groups[rank % tp_size], tp_groups[rank // tp_size], all_groups
+
+
+def _payload_for_rank(rank: int, device: torch.device) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    rows = rank + 2
+    hidden_size = 8
+    token_ids = torch.arange(sum(i + 2 for i in range(rank)), sum(i + 2 for i in range(rank + 1)), device=device)
+    hidden = token_ids.to(torch.float32).unsqueeze(1) * 10 + torch.arange(hidden_size, device=device)
+    topk_ids = torch.stack((token_ids % 4, (token_ids + 1) % 4), dim=1).to(torch.int64)
+    topk_weights = torch.stack(
+        (
+            torch.full((rows,), 1.0, device=device),
+            torch.full((rows,), 0.2 * (rank + 1), device=device),
+        ),
+        dim=1,
+    )
+    return hidden, topk_ids, topk_weights
+
+
+def _run_dispatcher(
+    dispatcher: DispacherInterface,
+    hidden_states: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    async_op: bool = False,
+):
+    pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden_states, topk_ids=topk_ids, async_op=async_op)
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        decoding=False,
+        async_op=async_op,
+    )
+    post_dispatched = dispatcher.dispatch_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        async_op=async_op,
+    )
+    # 中文注释：dispatcher 级别不跑真实 row-parallel expert，
+    # 两个 TP rank 各提供一半 partial output。
+    experts_results = post_dispatched["hidden_states"] / 2
+    pre_combined = dispatcher.combine_preprocess(
+        hidden_states=experts_results,
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        async_op=async_op,
+    )
+    combined = dispatcher.combine(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        decoding=False,
+        async_op=async_op,
+    )
+    result = dispatcher.combine_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        combined=combined,
+        async_op=async_op,
+    )
+    return result, dispatched, post_dispatched, pre_combined, combined
+
+
+def _record_shared_expert_tp_stages(dispatcher: TorchAll2AllDispatcher) -> dict[str, list[str | int]]:
+    stages: dict[str, list[str | int]] = {
+        "async_op_true": [],
+        "async_all_gather_rows": [],
+        "async_all_gather_per_rank_metadata": [],
+        "async_reduce_scatter_rows_sum": [],
+        "comm_stream": [],
+    }
+    current_stage: list[str] = []
+    expert_tp = dispatcher._expert_tp
+    assert expert_tp is not None
+
+    for stage_name in (
+        "dispatch_preprocess",
+        "dispatch",
+        "dispatch_postprocess",
+        "combine_preprocess",
+        "combine",
+        "combine_postprocess",
+    ):
+        original_stage = getattr(dispatcher, stage_name)
+
+        def stage_wrapper(*args, _original_stage=original_stage, _stage_name=stage_name, **kwargs):
+            if kwargs.get("async_op", False):
+                stages["async_op_true"].append(_stage_name)
+            current_stage.append(_stage_name)
+            try:
+                return _original_stage(*args, **kwargs)
+            finally:
+                current_stage.pop()
+
+        setattr(dispatcher, stage_name, stage_wrapper)
+
+    for collective_name in (
+        "async_all_gather_rows",
+        "async_all_gather_per_rank_metadata",
+        "async_reduce_scatter_rows_sum",
+    ):
+        original_collective = getattr(expert_tp, collective_name)
+
+        def collective_wrapper(
+            *args,
+            _original_collective=original_collective,
+            _collective_name=collective_name,
+            **kwargs,
+        ):
+            stages[_collective_name].append(current_stage[-1] if current_stage else "<outside>")
+            stages["comm_stream"].append(kwargs["comm_stream"].cuda_stream)
+            return _original_collective(*args, **kwargs)
+
+        setattr(expert_tp, collective_name, collective_wrapper)
+
+    return stages
+
+
+def _assert_shared_expert_tp_async_stages(
+    stages: dict[str, list[str | int]],
+    dispatcher: TorchAll2AllDispatcher,
+) -> None:
+    assert set(stages["async_op_true"]) == {
+        "dispatch_preprocess",
+        "dispatch",
+        "dispatch_postprocess",
+        "combine_preprocess",
+        "combine",
+        "combine_postprocess",
+    }
+    assert stages["async_all_gather_rows"] == ["dispatch"]
+    assert stages["async_all_gather_per_rank_metadata"] == ["dispatch"]
+    assert stages["async_reduce_scatter_rows_sum"] == ["combine"]
+    assert set(stages["comm_stream"]) == {dispatcher._comm_stream.cuda_stream}
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "CUDA/NCCL is required for real All2All ExpertTP validation.")
+class TestTorchAll2AllSharedExpertTP(DeterministicDDPTestCase):
+    def test_build_dispatcher_uses_shared_all2all_expert_tp(self) -> None:
+        pg = self.create_pg("cuda")
+        torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+        ep_group, tp_group, all_groups = _build_ep_tp_groups(ep_size=2, tp_size=2)
+
+        dispatcher = build_dispatcher(
+            dispatcher="all2all",
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=tp_group,
+        )
+
+        assert isinstance(dispatcher, TorchAll2AllDispatcher)
+        assert not isinstance(dispatcher, TorchAll2AllTPEPDispatcher)
+        assert dispatcher._expert_tp is not None
+
+        dist.barrier()
+        for group in all_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
+    def test_sync_shared_all2all_matches_legacy_tpep(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+        ep_group, tp_group, all_groups = _build_ep_tp_groups(ep_size=2, tp_size=2)
+
+        shared_dispatcher = build_dispatcher(
+            dispatcher="all2all",
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=tp_group,
+        )
+        legacy_dispatcher = TorchAll2AllTPEPDispatcher(
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=tp_group,
+        )
+
+        local_hidden, local_topk_ids, local_topk_weights = _payload_for_rank(rank, device)
+        shared_hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        shared_topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        shared_result, shared_dispatched, shared_post, shared_pre_combined, shared_combined = _run_dispatcher(
+            shared_dispatcher,
+            shared_hidden_leaf * 1.25,
+            local_topk_ids,
+            shared_topk_weights_leaf * 0.5,
+        )
+        shared_loss = shared_result["hidden_states"].square().sum()
+        shared_loss.backward()
+
+        legacy_hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        legacy_topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        legacy_result, legacy_dispatched, legacy_post, legacy_pre_combined, legacy_combined = _run_dispatcher(
+            legacy_dispatcher,
+            legacy_hidden_leaf * 1.25,
+            local_topk_ids,
+            legacy_topk_weights_leaf * 0.5,
+        )
+        legacy_loss = legacy_result["hidden_states"].square().sum()
+        legacy_loss.backward()
+        torch.cuda.synchronize()
+
+        torch.testing.assert_close(shared_dispatched["hidden_states"], legacy_dispatched["hidden_states"])
+        torch.testing.assert_close(
+            shared_dispatched["tokens_per_expert_group"],
+            legacy_dispatched["tokens_per_expert_group"],
+        )
+        assert shared_dispatched["tp_rank_row_counts"] == legacy_dispatched["tp_rank_row_counts"]
+        torch.testing.assert_close(shared_post["tokens_per_expert"], legacy_post["tokens_per_expert"])
+        torch.testing.assert_close(shared_pre_combined["hidden_states"], legacy_pre_combined["hidden_states"])
+        torch.testing.assert_close(shared_combined["hidden_states"], legacy_combined["hidden_states"])
+        torch.testing.assert_close(shared_result["hidden_states"], legacy_result["hidden_states"])
+        assert shared_hidden_leaf.grad is not None
+        assert legacy_hidden_leaf.grad is not None
+        assert shared_topk_weights_leaf.grad is not None
+        assert legacy_topk_weights_leaf.grad is not None
+        torch.testing.assert_close(shared_hidden_leaf.grad, legacy_hidden_leaf.grad)
+        torch.testing.assert_close(shared_topk_weights_leaf.grad, legacy_topk_weights_leaf.grad)
+
+        dist.barrier()
+        for group in all_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
+    def test_async_shared_all2all_uses_dispatcher_comm_stream(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+        ep_group, tp_group, all_groups = _build_ep_tp_groups(ep_size=2, tp_size=2)
+
+        sync_dispatcher = build_dispatcher(
+            dispatcher="all2all",
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=tp_group,
+        )
+        async_dispatcher = build_dispatcher(
+            dispatcher="all2all",
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=tp_group,
+        )
+        assert isinstance(sync_dispatcher, TorchAll2AllDispatcher)
+        assert isinstance(async_dispatcher, TorchAll2AllDispatcher)
+        stages = _record_shared_expert_tp_stages(async_dispatcher)
+
+        local_hidden, local_topk_ids, local_topk_weights = _payload_for_rank(rank, device)
+        sync_hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        sync_topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        sync_result, *_ = _run_dispatcher(
+            sync_dispatcher,
+            sync_hidden_leaf * 1.25,
+            local_topk_ids,
+            sync_topk_weights_leaf * 0.5,
+        )
+        sync_result["hidden_states"].square().sum().backward()
+
+        async_hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        async_topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        async_result, *_ = _run_dispatcher(
+            async_dispatcher,
+            async_hidden_leaf * 1.25,
+            local_topk_ids,
+            async_topk_weights_leaf * 0.5,
+            async_op=True,
+        )
+        async_result["hidden_states"].square().sum().backward()
+        torch.cuda.synchronize()
+
+        _assert_shared_expert_tp_async_stages(stages, async_dispatcher)
+        torch.testing.assert_close(async_result["hidden_states"], sync_result["hidden_states"])
+        assert sync_hidden_leaf.grad is not None
+        assert async_hidden_leaf.grad is not None
+        assert sync_topk_weights_leaf.grad is not None
+        assert async_topk_weights_leaf.grad is not None
+        torch.testing.assert_close(async_hidden_leaf.grad, sync_hidden_leaf.grad)
+        torch.testing.assert_close(async_topk_weights_leaf.grad, sync_topk_weights_leaf.grad)
+
+        dist.barrier()
+        for group in all_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
+    @property
+    def world_size(self) -> int:
+        return 4
+
+    @property
+    def destroy_pg_upon_exit(self) -> bool:
+        return False
diff --git a/tests/module/dispatcher/test_torch_all2all_tpep_async.py b/tests/module/dispatcher/test_torch_all2all_tpep_async.py
index 9aba9a3f3..cd3832d34 100644
--- a/tests/module/dispatcher/test_torch_all2all_tpep_async.py
+++ b/tests/module/dispatcher/test_torch_all2all_tpep_async.py
@@ -5,8 +5,8 @@
 from xtuner.v1.module.dispatcher import torch_all2all
 from xtuner.v1.module.dispatcher.torch_all2all_tpep import (
     TorchAll2AllTPEPDispatcher,
-    _async_tp_all_gather,
-    _async_tp_reduce_scatter_sum,
+    _async_tp_all_gather_rows,
+    _async_tp_reduce_scatter_rows_sum,
 )
 
 
@@ -78,7 +78,7 @@ def fake_all_gather(chunks, tensor, group=None) -> None:
 
     # 中文注释：TP 通信的归属边界是 dispatch，postprocess 只能看到已经 gather 好的 token。
     assert dispatched["hidden_states"].shape == (64, 128)
-    assert dispatched["output_splits_tp"] == [32, 32]
+    assert dispatched["tp_rank_row_counts"] == [32, 32]
     torch.testing.assert_close(dispatched["hidden_states"][32:], pre_dispatched["hidden_states"] + 10)
 
 
@@ -112,7 +112,7 @@ def fake_reduce_scatter(output, input_list, op=None, group=None) -> None:
         output.copy_(input_list[getattr(group, "rank", 0)])
 
     def fake_all_reduce(tensor, op=None, group=None) -> None:
-        raise AssertionError("TP ReduceScatterSum should not use all_reduce + slice")
+        raise AssertionError("TP ReduceScatterRowsSum should not use all_reduce + slice")
 
     def fake_all_gather(chunks, tensor, group=None) -> None:
         chunks[0].copy_(tensor)
@@ -201,9 +201,9 @@ def fake_all_reduce(tensor, op=None, group=None) -> None:
     backward_finished_event = torch.cuda.Event()
     forward_previous_event.record()
 
-    out = _async_tp_all_gather(
+    out = _async_tp_all_gather_rows(
         hidden,
-        all_sizes=[2, 2],
+        tp_rank_row_counts=[2, 2],
         tp_group=group,  # type: ignore[arg-type]
         forward_previous_event=forward_previous_event,
         forward_finished_event=forward_finished_event,
@@ -240,7 +240,7 @@ def fake_reduce_scatter(output, input_list, op=None, group=None) -> None:
         output.copy_(input_list[getattr(group, "rank", 0)])
 
     def fake_all_reduce(tensor, op=None, group=None) -> None:
-        raise AssertionError("TP ReduceScatterSum should use reduce_scatter")
+        raise AssertionError("TP ReduceScatterRowsSum should use reduce_scatter")
 
     def fake_all_gather(chunks, tensor, group=None) -> None:
         calls.append(("all_gather", _stream_id()))
@@ -259,9 +259,9 @@ def fake_all_gather(chunks, tensor, group=None) -> None:
     backward_finished_event = torch.cuda.Event()
     forward_previous_event.record()
 
-    out = _async_tp_reduce_scatter_sum(
+    out = _async_tp_reduce_scatter_rows_sum(
         hidden,
-        all_sizes=[1, 3],
+        tp_rank_row_counts=[1, 3],
         tp_group=group,  # type: ignore[arg-type]
         forward_previous_event=forward_previous_event,
         forward_finished_event=forward_finished_event,
diff --git a/xtuner/v1/module/dispatcher/__init__.py b/xtuner/v1/module/dispatcher/__init__.py
index 914a88acc..f763be549 100644
--- a/xtuner/v1/module/dispatcher/__init__.py
+++ b/xtuner/v1/module/dispatcher/__init__.py
@@ -64,17 +64,10 @@ def build_dispatcher(
         )  # type: ignore
     elif dispatcher == "all2all":
         assert ep_group is not None, "TorchAll2AllDispatcher requires a non-null ep_group."
-        if tp_group is not None and tp_group.size() > 1:
-            return TorchAll2AllTPEPDispatcher(
-                n_routed_experts=n_routed_experts,
-                ep_group=ep_group,
-                tp_group=tp_group,
-                training_dtype=training_dtype,
-                generate_dtype=generate_dtype,
-            )  # type: ignore[return-value]
         return TorchAll2AllDispatcher(
             n_routed_experts=n_routed_experts,
             process_group=ep_group,
+            tp_group=tp_group,
             training_dtype=training_dtype,
             generate_dtype=generate_dtype,
         )  # type: ignore[return-value]
diff --git a/xtuner/v1/module/dispatcher/base.py b/xtuner/v1/module/dispatcher/base.py
index 1f07ad387..29b2df60c 100644
--- a/xtuner/v1/module/dispatcher/base.py
+++ b/xtuner/v1/module/dispatcher/base.py
@@ -195,7 +195,7 @@ class NaivePreDispatchResult(PreDispatchResult):
 
 class NaiveDispatchResult(DispatchResult):
     topk_ids: torch.Tensor
-    tp_size_meta: list[int]
+    tp_rank_row_counts: list[int]
     forward_finished_event: torch.cuda.Event | None
     backward_previous_event: torch.cuda.Event | None
     topk_weights_backward_previous_event: torch.cuda.Event | None
@@ -302,7 +302,7 @@ def dispatch(
             assert backward_finished_event is not None, "Use async_op=True for dispatch_preprocess!"
             assert self._comm_stream is not None
 
-            tp_size_meta = self._expert_tp.gather_size_meta(pre_dispatched["hidden_states"])
+            tp_rank_row_counts = self._expert_tp.gather_tp_rank_row_counts(pre_dispatched["hidden_states"])
             # 中文注释：dispatch 内部的 TP AllGather 都排在同一个 comm stream，
             # 互相不需要 event 串行化；只在 dispatch 阶段边界记录最终完成事件。
             forward_finished_event = torch.cuda.Event()
@@ -312,25 +312,25 @@ def dispatch(
             if topk_weights.grad_fn is not None:
                 topk_weights.grad_fn.register_prehook(_get_backward_pre_hook(topk_weights_backward_finished_event))
 
-            hidden_states = self._expert_tp.async_all_gather(
+            hidden_states = self._expert_tp.async_all_gather_rows(
                 pre_dispatched["hidden_states"],
-                all_sizes=tp_size_meta,
+                tp_rank_row_counts=tp_rank_row_counts,
                 forward_previous_event=forward_previous_event,
                 forward_finished_event=None,
                 backward_previous_event=hidden_backward_previous_event,
                 backward_finished_event=backward_finished_event,
                 comm_stream=self._comm_stream,
             )
-            topk_ids = self._expert_tp.async_all_gather_metadata(
+            topk_ids = self._expert_tp.async_all_gather_row_metadata(
                 pre_dispatched["topk_ids"],
-                all_sizes=tp_size_meta,
+                tp_rank_row_counts=tp_rank_row_counts,
                 forward_previous_event=None,
                 forward_finished_event=None,
                 comm_stream=self._comm_stream,
             )
-            topk_weights = self._expert_tp.async_all_gather(
+            topk_weights = self._expert_tp.async_all_gather_rows(
                 topk_weights,
-                all_sizes=tp_size_meta,
+                tp_rank_row_counts=tp_rank_row_counts,
                 forward_previous_event=None,
                 forward_finished_event=forward_finished_event,
                 backward_previous_event=topk_weights_backward_previous_event,
@@ -342,21 +342,21 @@ def dispatch(
                 hidden_states=hidden_states,
                 topk_ids=topk_ids,
                 topk_weights=topk_weights,
-                tp_size_meta=tp_size_meta,
+                tp_rank_row_counts=tp_rank_row_counts,
                 forward_finished_event=forward_finished_event,
                 backward_previous_event=hidden_backward_previous_event,
                 topk_weights_backward_previous_event=topk_weights_backward_previous_event,
             )
 
         if self._expert_tp is not None:
-            hidden_states, tp_size_meta = self._expert_tp.all_gather(pre_dispatched["hidden_states"])
-            topk_ids = self._expert_tp.all_gather_metadata(pre_dispatched["topk_ids"], tp_size_meta)
-            topk_weights = self._expert_tp.all_gather_metadata(topk_weights, tp_size_meta)
+            hidden_states, tp_rank_row_counts = self._expert_tp.all_gather_rows(pre_dispatched["hidden_states"])
+            topk_ids = self._expert_tp.all_gather_row_metadata(pre_dispatched["topk_ids"], tp_rank_row_counts)
+            topk_weights = self._expert_tp.all_gather_row_metadata(topk_weights, tp_rank_row_counts)
             return NaiveDispatchResult(
                 hidden_states=hidden_states,
                 topk_ids=topk_ids,
                 topk_weights=topk_weights,
-                tp_size_meta=tp_size_meta,
+                tp_rank_row_counts=tp_rank_row_counts,
                 forward_finished_event=None,
                 backward_previous_event=None,
                 topk_weights_backward_previous_event=None,
@@ -366,7 +366,7 @@ def dispatch(
             hidden_states=pre_dispatched["hidden_states"],
             topk_ids=pre_dispatched["topk_ids"],
             topk_weights=topk_weights,
-            tp_size_meta=[],
+            tp_rank_row_counts=[],
             forward_finished_event=None,
             backward_previous_event=None,
             topk_weights_backward_previous_event=None,
@@ -479,9 +479,9 @@ def combine(
 
                     forward_finished_event = torch.cuda.Event()
                     backward_previous_event = torch.cuda.Event()
-                    hidden_states = self._expert_tp.async_reduce_scatter_sum(
+                    hidden_states = self._expert_tp.async_reduce_scatter_rows_sum(
                         pre_combined["hidden_states"],
-                        all_sizes=dispatched["tp_size_meta"],
+                        tp_rank_row_counts=dispatched["tp_rank_row_counts"],
                         forward_previous_event=forward_previous_event,
                         forward_finished_event=forward_finished_event,
                         backward_previous_event=backward_previous_event,
@@ -494,9 +494,9 @@ def combine(
                         backward_previous_event=backward_previous_event,
                     )
 
-                hidden_states = self._expert_tp.reduce_scatter_sum(
+                hidden_states = self._expert_tp.reduce_scatter_rows_sum(
                     pre_combined["hidden_states"],
-                    dispatched["tp_size_meta"],
+                    dispatched["tp_rank_row_counts"],
                 )
                 return NaiveCombineResult(
                     hidden_states=hidden_states,
diff --git a/xtuner/v1/module/dispatcher/expert_tp.py b/xtuner/v1/module/dispatcher/expert_tp.py
index 3d5b4b5ef..c0652b455 100644
--- a/xtuner/v1/module/dispatcher/expert_tp.py
+++ b/xtuner/v1/module/dispatcher/expert_tp.py
@@ -14,91 +14,95 @@ def _record_stream(value: Any, stream: torch.cuda.Stream) -> None:
             _record_stream(item, stream)
 
 
-def _tp_all_gather_forward_impl(
+def _tp_all_gather_rows_forward_impl(
     tensor: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
     tensor = tensor.contiguous()
-    chunks = [torch.empty((size, *tensor.shape[1:]), dtype=tensor.dtype, device=tensor.device) for size in all_sizes]
+    chunks = [
+        torch.empty((size, *tensor.shape[1:]), dtype=tensor.dtype, device=tensor.device) for size in tp_rank_row_counts
+    ]
     dist.all_gather(chunks, tensor, group=tp_group)
     return torch.cat(chunks, dim=0), tensor, chunks
 
 
-def _tp_reduce_scatter_sum_impl(
+def _tp_reduce_scatter_rows_sum_impl(
     tensor: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_rank: int,
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
     tensor = tensor.contiguous()
-    assert tensor.shape[0] == sum(all_sizes), "TP ReduceScatterSum input rows must match TP size meta."
+    assert tensor.shape[0] == sum(tp_rank_row_counts), (
+        "TP ReduceScatterRowsSum input rows must match tp_rank_row_counts."
+    )
 
-    out = tensor.new_empty((all_sizes[tp_rank], *tensor.shape[1:]))
+    out = tensor.new_empty((tp_rank_row_counts[tp_rank], *tensor.shape[1:]))
     if tensor.shape[0] == 0:
         # 中文注释：所有 TP rank 都没有 token 时没有通信量，直接返回当前 rank 的 0 行 slice。
         return out, tensor, []
 
-    if all(size == all_sizes[0] for size in all_sizes):
+    if all(size == tp_rank_row_counts[0] for size in tp_rank_row_counts):
         dist.reduce_scatter_tensor(out, tensor, op=dist.ReduceOp.SUM, group=tp_group)
         return out, tensor, []
 
-    input_chunks = list(torch.split(tensor, all_sizes, dim=0))
+    input_chunks = list(torch.split(tensor, tp_rank_row_counts, dim=0))
     dist.reduce_scatter(out, input_chunks, op=dist.ReduceOp.SUM, group=tp_group)
     return out, tensor, input_chunks
 
 
-def _tp_all_gather_backward_impl(
+def _tp_all_gather_rows_backward_impl(
     grad: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_rank: int,
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
-    return _tp_reduce_scatter_sum_impl(grad, all_sizes, tp_rank, tp_group)
+    return _tp_reduce_scatter_rows_sum_impl(grad, tp_rank_row_counts, tp_rank, tp_group)
 
 
-def _tp_reduce_scatter_sum_backward_impl(
+def _tp_reduce_scatter_rows_sum_backward_impl(
     grad_slice: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
     grad_slice = grad_slice.contiguous()
     chunks = [
         torch.empty((size, *grad_slice.shape[1:]), dtype=grad_slice.dtype, device=grad_slice.device)
-        for size in all_sizes
+        for size in tp_rank_row_counts
     ]
     dist.all_gather(chunks, grad_slice, group=tp_group)
     return torch.cat(chunks, dim=0), grad_slice, chunks
 
 
-class _TPAllGather(torch.autograd.Function):
+class _TPAllGatherRows(torch.autograd.Function):
     @staticmethod
     def forward(
         ctx: Any,
         tensor: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         tp_group: dist.ProcessGroup,
         tp_size: int,
         tp_rank: int,
     ) -> torch.Tensor:
-        gathered, _, _ = _tp_all_gather_forward_impl(tensor, all_sizes, tp_group)
-        ctx.all_sizes = all_sizes
+        gathered, _, _ = _tp_all_gather_rows_forward_impl(tensor, tp_rank_row_counts, tp_group)
+        ctx.tp_rank_row_counts = tp_rank_row_counts
         ctx.tp_group = tp_group
         ctx.tp_rank = tp_rank
         return gathered
 
     @staticmethod
     def backward(ctx: Any, grad: torch.Tensor) -> tuple[torch.Tensor, None, None, None, None]:
-        grad_input, _, _ = _tp_all_gather_backward_impl(grad, ctx.all_sizes, ctx.tp_rank, ctx.tp_group)
+        grad_input, _, _ = _tp_all_gather_rows_backward_impl(grad, ctx.tp_rank_row_counts, ctx.tp_rank, ctx.tp_group)
         return grad_input, None, None, None, None
 
 
-class _AsyncTPAllGather(torch.autograd.Function):
+class _AsyncTPAllGatherRows(torch.autograd.Function):
     @staticmethod
     def forward(
         ctx: Any,
         tensor: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         tp_group: dist.ProcessGroup,
         tp_size: int,
         tp_rank: int,
@@ -111,13 +115,14 @@ def forward(
         with torch.cuda.stream(comm_stream):
             if forward_previous_event is not None:
                 comm_stream.wait_event(forward_previous_event)
-            gathered, tensor_for_comm, chunks = _tp_all_gather_forward_impl(tensor, all_sizes, tp_group)
-            # 中文注释：异步路径只增加 stream/event 管理，collective 核心逻辑和同步路径一致。
+            gathered, tensor_for_comm, chunks = _tp_all_gather_rows_forward_impl(tensor, tp_rank_row_counts, tp_group)
+            # 中文注释：异步路径只增加 stream/event 管理；
+            # collective 核心逻辑和同步路径一致。
             _record_stream((tensor_for_comm, chunks, gathered), comm_stream)
             if forward_finished_event is not None:
                 forward_finished_event.record(comm_stream)
 
-        ctx.all_sizes = all_sizes
+        ctx.tp_rank_row_counts = tp_rank_row_counts
         ctx.tp_group = tp_group
         ctx.tp_rank = tp_rank
         ctx.backward_previous_event = backward_previous_event
@@ -135,9 +140,9 @@ def backward(
         with torch.cuda.stream(ctx.comm_stream):
             ctx.comm_stream.wait_event(ctx.backward_previous_event)
             ctx.comm_stream.wait_event(grad_ready_event)
-            grad_input, grad_for_comm, chunks = _tp_all_gather_backward_impl(
+            grad_input, grad_for_comm, chunks = _tp_all_gather_rows_backward_impl(
                 grad,
-                ctx.all_sizes,
+                ctx.tp_rank_row_counts,
                 ctx.tp_rank,
                 ctx.tp_group,
             )
@@ -147,33 +152,33 @@ def backward(
         return grad_input, None, None, None, None, None, None, None, None, None
 
 
-class _TPReduceScatterSum(torch.autograd.Function):
+class _TPReduceScatterRowsSum(torch.autograd.Function):
     @staticmethod
     def forward(
         ctx: Any,
         tensor: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         tp_group: dist.ProcessGroup,
         tp_size: int,
         tp_rank: int,
     ) -> torch.Tensor:
-        out, _, _ = _tp_reduce_scatter_sum_impl(tensor, all_sizes, tp_rank, tp_group)
-        ctx.all_sizes = all_sizes
+        out, _, _ = _tp_reduce_scatter_rows_sum_impl(tensor, tp_rank_row_counts, tp_rank, tp_group)
+        ctx.tp_rank_row_counts = tp_rank_row_counts
         ctx.tp_group = tp_group
         return out
 
     @staticmethod
     def backward(ctx: Any, grad_slice: torch.Tensor) -> tuple[torch.Tensor, None, None, None, None]:
-        full_grad, _, _ = _tp_reduce_scatter_sum_backward_impl(grad_slice, ctx.all_sizes, ctx.tp_group)
+        full_grad, _, _ = _tp_reduce_scatter_rows_sum_backward_impl(grad_slice, ctx.tp_rank_row_counts, ctx.tp_group)
         return full_grad, None, None, None, None
 
 
-class _AsyncTPReduceScatterSum(torch.autograd.Function):
+class _AsyncTPReduceScatterRowsSum(torch.autograd.Function):
     @staticmethod
     def forward(
         ctx: Any,
         tensor: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         tp_group: dist.ProcessGroup,
         tp_size: int,
         tp_rank: int,
@@ -185,12 +190,18 @@ def forward(
     ) -> torch.Tensor:
         with torch.cuda.stream(comm_stream):
             comm_stream.wait_event(forward_previous_event)
-            out, tensor_for_comm, chunks = _tp_reduce_scatter_sum_impl(tensor, all_sizes, tp_rank, tp_group)
-            # 中文注释：TP ReduceScatterSum 属于 combine 通信段，输出事件交给 combine_postprocess 等待。
+            out, tensor_for_comm, chunks = _tp_reduce_scatter_rows_sum_impl(
+                tensor,
+                tp_rank_row_counts,
+                tp_rank,
+                tp_group,
+            )
+            # 中文注释：TP ReduceScatterRowsSum 属于 combine 通信段；
+            # 输出事件交给 combine_postprocess 等待。
             _record_stream((tensor_for_comm, chunks, out), comm_stream)
             forward_finished_event.record(comm_stream)
 
-        ctx.all_sizes = all_sizes
+        ctx.tp_rank_row_counts = tp_rank_row_counts
         ctx.tp_group = tp_group
         ctx.backward_previous_event = backward_previous_event
         ctx.backward_finished_event = backward_finished_event
@@ -207,9 +218,9 @@ def backward(
         with torch.cuda.stream(ctx.comm_stream):
             ctx.comm_stream.wait_event(ctx.backward_previous_event)
             ctx.comm_stream.wait_event(grad_ready_event)
-            full_grad, grad_slice_for_comm, chunks = _tp_reduce_scatter_sum_backward_impl(
+            full_grad, grad_slice_for_comm, chunks = _tp_reduce_scatter_rows_sum_backward_impl(
                 grad_slice,
-                ctx.all_sizes,
+                ctx.tp_rank_row_counts,
                 ctx.tp_group,
             )
             _record_stream((grad_slice_for_comm, chunks, full_grad), ctx.comm_stream)
@@ -226,35 +237,64 @@ def __init__(self, tp_group: dist.ProcessGroup) -> None:
         self._tp_group = tp_group
         self._tp_size = tp_group.size()
 
-    def gather_size_meta(self, tensor: torch.Tensor) -> list[int]:
+    @property
+    def size(self) -> int:
+        return self._tp_size
+
+    def gather_tp_rank_row_counts(self, tensor: torch.Tensor, stream: torch.cuda.Stream | None = None) -> list[int]:
         if self._tp_size == 1:
             return [tensor.shape[0]]
 
-        local_size = tensor.new_tensor([tensor.shape[0]], dtype=torch.long)
-        all_sizes_t = tensor.new_empty([self._tp_size], dtype=torch.long)
-        dist.all_gather_into_tensor(all_sizes_t, local_size, group=self._tp_group)
-        return [int(size) for size in all_sizes_t.tolist()]
-
-    def all_gather(self, tensor: torch.Tensor, all_sizes: list[int] | None = None) -> tuple[torch.Tensor, list[int]]:
+        if stream is None:
+            local_size = tensor.new_tensor([tensor.shape[0]], dtype=torch.long)
+            tp_rank_row_counts_t = tensor.new_empty([self._tp_size], dtype=torch.long)
+            dist.all_gather_into_tensor(tp_rank_row_counts_t, local_size, group=self._tp_group)
+        else:
+            # 中文注释：行数要转成 Python list；单独 stream 避免同步
+            # dispatcher comm stream 上的大 tensor 通信。
+            with torch.cuda.stream(stream):
+                local_size = tensor.new_tensor([tensor.shape[0]], dtype=torch.long)
+                tp_rank_row_counts_t = tensor.new_empty([self._tp_size], dtype=torch.long)
+                dist.all_gather_into_tensor(tp_rank_row_counts_t, local_size, group=self._tp_group)
+                _record_stream((local_size, tp_rank_row_counts_t), stream)
+            stream.synchronize()
+        return [int(size) for size in tp_rank_row_counts_t.tolist()]
+
+    def all_gather_rows(
+        self,
+        tensor: torch.Tensor,
+        tp_rank_row_counts: list[int] | None = None,
+    ) -> tuple[torch.Tensor, list[int]]:
         if self._tp_size == 1:
             return tensor, [tensor.shape[0]]
 
-        if all_sizes is None:
-            all_sizes = self.gather_size_meta(tensor)
+        if tp_rank_row_counts is None:
+            tp_rank_row_counts = self.gather_tp_rank_row_counts(tensor)
 
         tp_rank = dist.get_rank(group=self._tp_group)
-        gathered = _TPAllGather.apply(tensor, all_sizes, self._tp_group, self._tp_size, tp_rank)
-        return gathered, all_sizes
+        gathered = _TPAllGatherRows.apply(tensor, tp_rank_row_counts, self._tp_group, self._tp_size, tp_rank)
+        return gathered, tp_rank_row_counts
 
-    def all_gather_metadata(self, tensor: torch.Tensor, all_sizes: list[int]) -> torch.Tensor:
-        # 中文注释：topk_ids/topk_weights 和 hidden 使用同一份 TP size meta，保证 source token 对齐。
-        gathered, _ = self.all_gather(tensor, all_sizes)
+    def all_gather_row_metadata(self, tensor: torch.Tensor, tp_rank_row_counts: list[int]) -> torch.Tensor:
+        # 中文注释：topk_ids/topk_weights 和 hidden 使用同一份
+        # tp_rank_row_counts，保证 source token 对齐。
+        gathered, _ = self.all_gather_rows(tensor, tp_rank_row_counts)
         return gathered
 
-    def async_all_gather(
+    def all_gather_per_rank_metadata(self, tensor: torch.Tensor) -> torch.Tensor:
+        # 中文注释：tokens_per_expert_group 这类固定形状 meta
+        # 不沿 token 维变长，使用独立 gather。
+        if self._tp_size == 1:
+            return tensor.unsqueeze(0)
+
+        gathered = tensor.new_empty((self._tp_size, *tensor.shape))
+        dist.all_gather_into_tensor(gathered, tensor.contiguous(), group=self._tp_group)
+        return gathered
+
+    def async_all_gather_rows(
         self,
         tensor: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         forward_previous_event: torch.cuda.Event | None,
         forward_finished_event: torch.cuda.Event | None,
         backward_previous_event: torch.cuda.Event,
@@ -267,9 +307,9 @@ def async_all_gather(
             return tensor
 
         tp_rank = dist.get_rank(group=self._tp_group)
-        return _AsyncTPAllGather.apply(
+        return _AsyncTPAllGatherRows.apply(
             tensor,
-            all_sizes,
+            tp_rank_row_counts,
             self._tp_group,
             self._tp_size,
             tp_rank,
@@ -280,10 +320,10 @@ def async_all_gather(
             comm_stream,
         )
 
-    def async_all_gather_metadata(
+    def async_all_gather_row_metadata(
         self,
         tensor: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         forward_previous_event: torch.cuda.Event | None,
         forward_finished_event: torch.cuda.Event | None,
         comm_stream: torch.cuda.Stream,
@@ -296,23 +336,50 @@ def async_all_gather_metadata(
         with torch.cuda.stream(comm_stream):
             if forward_previous_event is not None:
                 comm_stream.wait_event(forward_previous_event)
-            gathered, tensor_for_comm, chunks = _tp_all_gather_forward_impl(tensor, all_sizes, self._tp_group)
+            gathered, tensor_for_comm, chunks = _tp_all_gather_rows_forward_impl(
+                tensor,
+                tp_rank_row_counts,
+                self._tp_group,
+            )
             _record_stream((tensor_for_comm, chunks, gathered), comm_stream)
             if forward_finished_event is not None:
                 forward_finished_event.record(comm_stream)
         return gathered
 
-    def reduce_scatter_sum(self, tensor: torch.Tensor, all_sizes: list[int]) -> torch.Tensor:
+    def async_all_gather_per_rank_metadata(
+        self,
+        tensor: torch.Tensor,
+        forward_previous_event: torch.cuda.Event | None,
+        forward_finished_event: torch.cuda.Event | None,
+        comm_stream: torch.cuda.Stream,
+    ) -> torch.Tensor:
+        if self._tp_size == 1:
+            if forward_finished_event is not None:
+                forward_finished_event.record()
+            return tensor.unsqueeze(0)
+
+        gathered = tensor.new_empty((self._tp_size, *tensor.shape))
+        with torch.cuda.stream(comm_stream):
+            if forward_previous_event is not None:
+                comm_stream.wait_event(forward_previous_event)
+            tensor_for_comm = tensor.contiguous()
+            dist.all_gather_into_tensor(gathered, tensor_for_comm, group=self._tp_group)
+            _record_stream((tensor_for_comm, gathered), comm_stream)
+            if forward_finished_event is not None:
+                forward_finished_event.record(comm_stream)
+        return gathered
+
+    def reduce_scatter_rows_sum(self, tensor: torch.Tensor, tp_rank_row_counts: list[int]) -> torch.Tensor:
         if self._tp_size == 1:
             return tensor
 
         tp_rank = dist.get_rank(group=self._tp_group)
-        return _TPReduceScatterSum.apply(tensor, all_sizes, self._tp_group, self._tp_size, tp_rank)
+        return _TPReduceScatterRowsSum.apply(tensor, tp_rank_row_counts, self._tp_group, self._tp_size, tp_rank)
 
-    def async_reduce_scatter_sum(
+    def async_reduce_scatter_rows_sum(
         self,
         tensor: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         forward_previous_event: torch.cuda.Event,
         forward_finished_event: torch.cuda.Event,
         backward_previous_event: torch.cuda.Event,
@@ -324,9 +391,9 @@ def async_reduce_scatter_sum(
             return tensor
 
         tp_rank = dist.get_rank(group=self._tp_group)
-        return _AsyncTPReduceScatterSum.apply(
+        return _AsyncTPReduceScatterRowsSum.apply(
             tensor,
-            all_sizes,
+            tp_rank_row_counts,
             self._tp_group,
             self._tp_size,
             tp_rank,
diff --git a/xtuner/v1/module/dispatcher/torch_all2all.py b/xtuner/v1/module/dispatcher/torch_all2all.py
index ba1d021e6..6cfd844ad 100644
--- a/xtuner/v1/module/dispatcher/torch_all2all.py
+++ b/xtuner/v1/module/dispatcher/torch_all2all.py
@@ -19,6 +19,7 @@
     PreCombineResult,
     PreDispatchResult,
 )
+from .expert_tp import ExpertTP
 
 
 if get_device() == "npu":
@@ -51,6 +52,7 @@ class TorchAll2AllDispatchResult(DispatchResult):
     tokens_per_expert_group: torch.Tensor
     input_splits: list[int]
     output_splits: list[int]
+    tp_rank_row_counts: list[int]
     forward_finished_event: torch.cuda.Event | None
     backward_previous_event: torch.cuda.Event | None
 
@@ -285,6 +287,7 @@ class TorchAll2AllDispatcher(
     ]
 ):
     _comm_stream = None
+    _tp_row_count_stream: torch.cuda.Stream | None = None
     _process_group: dist.ProcessGroup
 
     def __init__(
@@ -292,6 +295,7 @@ def __init__(
         *,
         n_routed_experts: int,
         process_group: torch.distributed.ProcessGroup,
+        tp_group: torch.distributed.ProcessGroup | None = None,
         training_dtype: Literal["fp8", "bf16"] = "bf16",
         generate_dtype: Literal["fp8", "bf16"] = "bf16",
     ):
@@ -314,6 +318,10 @@ def __init__(
         )
         if TorchAll2AllDispatcher._comm_stream is None:
             TorchAll2AllDispatcher._comm_stream = cast(torch.cuda.Stream, torch.cuda.Stream(device=DEVICE))
+        self._expert_tp = ExpertTP(tp_group) if tp_group is not None and tp_group.size() > 1 else None
+        if self._expert_tp is not None and TorchAll2AllDispatcher._tp_row_count_stream is None:
+            TorchAll2AllDispatcher._tp_row_count_stream = torch.cuda.Stream(device=DEVICE)
+        self._tp_row_count_stream = TorchAll2AllDispatcher._tp_row_count_stream
         # if training_dtype == "fp8":
         #     raise NotImplementedError
 
@@ -368,6 +376,10 @@ def dispatch(
                 self._n_routed_experts,
                 self._process_group,
             )
+            tp_rank_row_counts = [hidden_states.shape[0]]
+            if self._expert_tp is not None:
+                hidden_states, tp_rank_row_counts = self._expert_tp.all_gather_rows(hidden_states)
+                tokens_per_expert_group = self._expert_tp.all_gather_per_rank_metadata(tokens_per_expert_group)
             if decoding:
                 raise NotImplementedError
             else:
@@ -377,6 +389,7 @@ def dispatch(
                     tokens_per_expert_group=cast(torch.Tensor, tokens_per_expert_group),
                     input_splits=cast(list[int], input_splits),
                     output_splits=cast(list[int], output_splits),
+                    tp_rank_row_counts=tp_rank_row_counts,
                     forward_finished_event=None,
                     backward_previous_event=None,
                 )
@@ -400,6 +413,36 @@ def dispatch(
                 self._comm_stream,
                 self._process_group,
             )
+            tp_rank_row_counts = [hidden_states.shape[0]]
+            if self._expert_tp is not None:
+                comm_stream = cast(torch.cuda.Stream, self._comm_stream)
+                assert self._tp_row_count_stream is not None
+                # 中文注释：只同步 TP 变长 tp_rank_row_counts；
+                # hidden/counts TP 通信继续排在 dispatcher comm stream。
+                tp_rank_row_counts = self._expert_tp.gather_tp_rank_row_counts(
+                    hidden_states,
+                    stream=self._tp_row_count_stream,
+                )
+                tp_hidden_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
+                tp_counts_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
+                tp_backward_previous_event = cast(torch.cuda.Event, torch.cuda.Event())
+                hidden_states = self._expert_tp.async_all_gather_rows(
+                    hidden_states,
+                    tp_rank_row_counts=tp_rank_row_counts,
+                    forward_previous_event=forward_finished_event,
+                    forward_finished_event=tp_hidden_finished_event,
+                    backward_previous_event=tp_backward_previous_event,
+                    backward_finished_event=backward_finished_event,
+                    comm_stream=comm_stream,
+                )
+                tokens_per_expert_group = self._expert_tp.async_all_gather_per_rank_metadata(
+                    tokens_per_expert_group,
+                    forward_previous_event=tp_hidden_finished_event,
+                    forward_finished_event=tp_counts_finished_event,
+                    comm_stream=comm_stream,
+                )
+                forward_finished_event = tp_counts_finished_event
+                backward_previous_event = tp_backward_previous_event
             if decoding:
                 raise NotImplementedError
             else:
@@ -409,6 +452,7 @@ def dispatch(
                     tokens_per_expert_group=tokens_per_expert_group,
                     input_splits=cast(list[int], input_splits),
                     output_splits=cast(list[int], output_splits),
+                    tp_rank_row_counts=tp_rank_row_counts,
                     backward_previous_event=backward_previous_event,
                     forward_finished_event=forward_finished_event,
                 )
@@ -427,9 +471,20 @@ def dispatch_postprocess(
             self.wait_comm_stream(dispatched["forward_finished_event"])
 
         tokens_per_expert_group = dispatched["tokens_per_expert_group"]
-        token_counts = tokens_per_expert_group.ravel()
+        token_counts = tokens_per_expert_group.ravel().to(torch.long)
+        if self._expert_tp is not None:
+            local_expert_ids = self._expert_ids_per_ep_rank.repeat(self._expert_tp.size)
+            output_size = dispatched["hidden_states"].shape[0]
+            tokens_per_expert = tokens_per_expert_group.sum(dim=(0, 1))
+        else:
+            local_expert_ids = self._expert_ids_per_ep_rank
+            output_size = sum(dispatched["output_splits"])
+            tokens_per_expert = tokens_per_expert_group.sum(dim=0)
+
         global_input_tokens_local_experts_indices = torch.repeat_interleave(
-            self._expert_ids_per_ep_rank, token_counts, output_size=sum(dispatched["output_splits"])
+            local_expert_ids,
+            token_counts,
+            output_size=output_size,
         )
 
         # The dispatch result is already permuted, so we can return it directly.
@@ -437,7 +492,6 @@ def dispatch_postprocess(
             dispatched["hidden_states"],
             global_input_tokens_local_experts_indices.to(torch.int32),
         )
-        tokens_per_expert = tokens_per_expert_group.sum(dim=0)
 
         if async_op:
             assert dispatched["backward_previous_event"] is not None, "Please use `async_op=True` for dispatch!"
@@ -513,8 +567,14 @@ def combine(
         decoding: bool = False,
     ) -> CombineResult:
         if not async_op:
+            hidden_states_for_combine = pre_combined["hidden_states"]
+            if self._expert_tp is not None:
+                hidden_states_for_combine = self._expert_tp.reduce_scatter_rows_sum(
+                    hidden_states_for_combine,
+                    dispatched["tp_rank_row_counts"],
+                )
             hidden_states = all_to_all_single_autograd(
-                pre_combined["hidden_states"],
+                hidden_states_for_combine,
                 input_split_sizes=dispatched["output_splits"],
                 output_split_sizes=dispatched["input_splits"],
                 group=self._process_group,
@@ -530,8 +590,26 @@ def combine(
             assert forward_previous_event is not None, "Please use `async_op=True` for combine_preprocess!"
             assert backward_finished_event is not None, "Please use `async_op=True` for combine_preprocess!"
 
+            hidden_states_for_combine = pre_combined["hidden_states"]
+            if self._expert_tp is not None:
+                tp_forward_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
+                tp_backward_previous_event = cast(torch.cuda.Event, torch.cuda.Event())
+                # 中文注释：TP ReduceScatterRowsSum 属于 combine 通信段，
+                # EP combine 等 TP 输出事件后再发起。
+                hidden_states_for_combine = self._expert_tp.async_reduce_scatter_rows_sum(
+                    hidden_states_for_combine,
+                    tp_rank_row_counts=dispatched["tp_rank_row_counts"],
+                    forward_previous_event=forward_previous_event,
+                    forward_finished_event=tp_forward_finished_event,
+                    backward_previous_event=tp_backward_previous_event,
+                    backward_finished_event=backward_finished_event,
+                    comm_stream=cast(torch.cuda.Stream, self._comm_stream),
+                )
+                forward_previous_event = tp_forward_finished_event
+                backward_finished_event = tp_backward_previous_event
+
             hidden_states = _async_combine(
-                pre_combined["hidden_states"],
+                hidden_states_for_combine,
                 dispatched["output_splits"],
                 dispatched["input_splits"],
                 forward_previous_event,
diff --git a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
index 1774fd708..e084938b2 100644
--- a/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
+++ b/xtuner/v1/module/dispatcher/torch_all2all_tpep.py
@@ -7,7 +7,7 @@
     dispatch_postprocess: permute by local expert (for grouped GEMM)
     [Expert GEMM]       : column-parallel gate/up + row-parallel down projection
     combine_preprocess  : unpermute back to TP-AllGather order
-    combine             : TP ReduceScatterSum → EP AlltoAll reverse
+    combine             : TP ReduceScatterRowsSum → EP AlltoAll reverse
     combine_postprocess : unpermute with topk_weights → [N_local, H] per TP rank
 
 Design rationale (mirrors Megatron MoEAlltoAllTokenDispatcher with TP+EP):
@@ -15,7 +15,7 @@
     parallelism.
   - TP AllGather before experts gives every TP rank the same token batch for its local
     expert weight shard.
-  - TP ReduceScatterSum after the row-parallel down projection sums partial hidden states
+  - TP ReduceScatterRowsSum after the row-parallel down projection sums partial hidden states
     across TP ranks, then returns each rank's original token slice.
 """
 
@@ -45,19 +45,17 @@
 class TorchAll2AllTPEPDispatchResult(TorchAll2AllDispatchResult):
     """Dispatch result after EP AlltoAll and TP AllGather.
 
-    ``output_splits_tp`` records the pre-AllGather token count per TP rank.  The
+    ``tp_rank_row_counts`` records the pre-AllGather token count per TP rank.  The
     later combine phase uses it to restore this TP rank's slice after the
     row-parallel expert output is summed.
 
-    中文注释：TP size meta 指的就是 ``output_splits_tp``。例如 ``tp_size=2``，
+    中文注释：``tp_rank_row_counts`` 是每个 TP rank 在 AllGather 前的行数。例如 ``tp_size=2``，
     EP dispatch 后 TP rank0 的 hidden 是 ``[3, H]``，rank1 是 ``[5, H]``，
-    两个 rank 都会拿到 ``output_splits_tp=[3, 5]``。TP AllGather 用它把
+    两个 rank 都会拿到 ``tp_rank_row_counts=[3, 5]``。TP AllGather 用它把
     变长 hidden 拼成 ``[8, H]``，combine 再按相同边界切回本 rank 的
     ``[3, H]`` 或 ``[5, H]``。
     """
 
-    output_splits_tp: list[int]
-
 
 class TorchAll2AllTPEPPostDispatchResult(TorchAll2AllPostDispatchResult): ...
 
@@ -70,74 +68,79 @@ def _record_stream(value: Any, stream: torch.cuda.Stream) -> None:
             _record_stream(item, stream)
 
 
-def _tp_all_gather_forward_impl(
+def _tp_all_gather_rows_forward_impl(
     hidden: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
     """Run TP AllGather forward and return tensors whose lifetime may need
     recording."""
     hidden = hidden.contiguous()
-    chunks = [torch.empty(s, hidden.shape[1], dtype=hidden.dtype, device=hidden.device) for s in all_sizes]
+    chunks = [torch.empty(s, hidden.shape[1], dtype=hidden.dtype, device=hidden.device) for s in tp_rank_row_counts]
     dist.all_gather(chunks, hidden, group=tp_group)
     return torch.cat(chunks, dim=0), hidden, chunks
 
 
-def _tp_all_gather_backward_impl(
+def _tp_all_gather_rows_backward_impl(
     grad: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_rank: int,
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
-    return _tp_reduce_scatter_sum_impl(grad, all_sizes, tp_rank, tp_group)
+    return _tp_reduce_scatter_rows_sum_impl(grad, tp_rank_row_counts, tp_rank, tp_group)
 
 
-def _tp_reduce_scatter_sum_impl(
+def _tp_reduce_scatter_rows_sum_impl(
     hidden: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_rank: int,
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
-    """Run TP ReduceScatterSum and return tensors whose lifetime may need
+    """Run TP ReduceScatterRowsSum and return tensors whose lifetime may need
     recording."""
     hidden = hidden.contiguous()
-    assert hidden.shape[0] == sum(all_sizes), "TP ReduceScatterSum input rows must match TP size meta."
+    assert hidden.shape[0] == sum(tp_rank_row_counts), (
+        "TP ReduceScatterRowsSum input rows must match tp_rank_row_counts."
+    )
 
-    out = hidden.new_empty((all_sizes[tp_rank], *hidden.shape[1:]))
+    out = hidden.new_empty((tp_rank_row_counts[tp_rank], *hidden.shape[1:]))
     if hidden.shape[0] == 0:
         # 中文注释：所有 TP rank 都没有 token 时没有实际通信量，直接返回合法的 0 行 slice。
         return out, hidden, []
 
-    if all(size == all_sizes[0] for size in all_sizes):
+    if all(size == tp_rank_row_counts[0] for size in tp_rank_row_counts):
         dist.reduce_scatter_tensor(out, hidden, op=dist.ReduceOp.SUM, group=tp_group)
         return out, hidden, []
 
-    input_chunks = list(torch.split(hidden, all_sizes, dim=0))
+    input_chunks = list(torch.split(hidden, tp_rank_row_counts, dim=0))
     dist.reduce_scatter(out, input_chunks, op=dist.ReduceOp.SUM, group=tp_group)
     return out, hidden, input_chunks
 
 
-def _tp_reduce_scatter_sum_forward_impl(
+def _tp_reduce_scatter_rows_sum_forward_impl(
     hidden: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_rank: int,
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
-    return _tp_reduce_scatter_sum_impl(hidden, all_sizes, tp_rank, tp_group)
+    return _tp_reduce_scatter_rows_sum_impl(hidden, tp_rank_row_counts, tp_rank, tp_group)
 
 
-def _tp_reduce_scatter_sum_backward_impl(
+def _tp_reduce_scatter_rows_sum_backward_impl(
     grad_slice: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_group: dist.ProcessGroup,
 ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]:
     grad_slice = grad_slice.contiguous()
-    chunks = [torch.empty(s, grad_slice.shape[1], dtype=grad_slice.dtype, device=grad_slice.device) for s in all_sizes]
+    chunks = [
+        torch.empty(s, grad_slice.shape[1], dtype=grad_slice.dtype, device=grad_slice.device)
+        for s in tp_rank_row_counts
+    ]
     dist.all_gather(chunks, grad_slice, group=tp_group)
     return torch.cat(chunks, dim=0), grad_slice, chunks
 
 
-class _TPAllGather(torch.autograd.Function):
+class _TPAllGatherRows(torch.autograd.Function):
     """TP AllGather with autograd support.
 
     Forward : ``all_gather`` across the TP group, concatenating along the token dim.
@@ -148,16 +151,16 @@ class _TPAllGather(torch.autograd.Function):
     def forward(
         ctx: Any,
         hidden: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         tp_group: dist.ProcessGroup,
         tp_size: int,
         tp_rank: int,
     ) -> torch.Tensor:
-        gathered, _, _ = _tp_all_gather_forward_impl(hidden, all_sizes, tp_group)
+        gathered, _, _ = _tp_all_gather_rows_forward_impl(hidden, tp_rank_row_counts, tp_group)
         ctx.tp_group = tp_group
         ctx.tp_size = tp_size
         ctx.tp_rank = tp_rank
-        ctx.all_sizes = all_sizes
+        ctx.tp_rank_row_counts = tp_rank_row_counts
         return gathered
 
     @staticmethod
@@ -165,11 +168,11 @@ def backward(
         ctx: Any,
         grad: torch.Tensor,
     ) -> tuple[torch.Tensor, None, None, None, None]:
-        grad_input, _, _ = _tp_all_gather_backward_impl(grad, ctx.all_sizes, ctx.tp_rank, ctx.tp_group)
+        grad_input, _, _ = _tp_all_gather_rows_backward_impl(grad, ctx.tp_rank_row_counts, ctx.tp_rank, ctx.tp_group)
         return grad_input, None, None, None, None
 
 
-class _AsyncTPAllGather(torch.autograd.Function):
+class _AsyncTPAllGatherRows(torch.autograd.Function):
     """TP AllGather on dispatcher comm stream.
 
     Forward : wait for the previous event, then all-gather token slices.
@@ -181,7 +184,7 @@ class _AsyncTPAllGather(torch.autograd.Function):
     def forward(
         ctx: Any,
         hidden: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         tp_group: dist.ProcessGroup,
         tp_size: int,
         tp_rank: int,
@@ -193,16 +196,17 @@ def forward(
     ) -> torch.Tensor:
         with torch.cuda.stream(comm_stream):
             comm_stream.wait_event(forward_previous_event)
-            gathered, hidden_for_comm, chunks = _tp_all_gather_forward_impl(hidden, all_sizes, tp_group)
+            gathered, hidden_for_comm, chunks = _tp_all_gather_rows_forward_impl(hidden, tp_rank_row_counts, tp_group)
 
-            # 中文注释：同步/异步共用 TP AllGather 核心逻辑；异步只额外管理 stream/event 生命周期。
+            # 中文注释：同步/异步共用 TP AllGather 核心逻辑；
+            # 异步只额外管理 stream/event 生命周期。
             _record_stream((hidden_for_comm, chunks, gathered), comm_stream)
             forward_finished_event.record(comm_stream)
 
         ctx.tp_group = tp_group
         ctx.tp_size = tp_size
         ctx.tp_rank = tp_rank
-        ctx.all_sizes = all_sizes
+        ctx.tp_rank_row_counts = tp_rank_row_counts
         ctx.backward_previous_event = backward_previous_event
         ctx.backward_finished_event = backward_finished_event
         ctx.comm_stream = comm_stream
@@ -215,9 +219,9 @@ def backward(
     ) -> tuple[torch.Tensor, None, None, None, None, None, None, None, None, None]:
         with torch.cuda.stream(ctx.comm_stream):
             ctx.comm_stream.wait_event(ctx.backward_previous_event)
-            grad_input, grad_for_comm, chunks = _tp_all_gather_backward_impl(
+            grad_input, grad_for_comm, chunks = _tp_all_gather_rows_backward_impl(
                 grad,
-                ctx.all_sizes,
+                ctx.tp_rank_row_counts,
                 ctx.tp_rank,
                 ctx.tp_group,
             )
@@ -228,8 +232,8 @@ def backward(
         return grad_input, None, None, None, None, None, None, None, None, None
 
 
-class _TPReduceScatterSum(torch.autograd.Function):
-    """TP ReduceScatterSum with autograd support.
+class _TPReduceScatterRowsSum(torch.autograd.Function):
+    """TP ReduceScatterRowsSum with autograd support.
 
     Forward : ``reduce_scatter`` (SUM) to this TP rank's local token slice.
     Backward: ``all_gather`` the gradient slices to reconstruct the full gradient tensor,
@@ -240,16 +244,16 @@ class _TPReduceScatterSum(torch.autograd.Function):
     def forward(
         ctx: Any,
         hidden: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         tp_group: dist.ProcessGroup,
         tp_size: int,
         tp_rank: int,
     ) -> torch.Tensor:
-        out, _, _ = _tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
+        out, _, _ = _tp_reduce_scatter_rows_sum_forward_impl(hidden, tp_rank_row_counts, tp_rank, tp_group)
         ctx.tp_group = tp_group
         ctx.tp_size = tp_size
         ctx.tp_rank = tp_rank
-        ctx.all_sizes = all_sizes
+        ctx.tp_rank_row_counts = tp_rank_row_counts
         return out
 
     @staticmethod
@@ -257,18 +261,18 @@ def backward(
         ctx: Any,
         grad_slice: torch.Tensor,
     ) -> tuple[torch.Tensor, None, None, None, None]:
-        full_grad, _, _ = _tp_reduce_scatter_sum_backward_impl(grad_slice, ctx.all_sizes, ctx.tp_group)
+        full_grad, _, _ = _tp_reduce_scatter_rows_sum_backward_impl(grad_slice, ctx.tp_rank_row_counts, ctx.tp_group)
         return full_grad, None, None, None, None
 
 
-class _AsyncTPReduceScatterSum(torch.autograd.Function):
-    """TP ReduceScatterSum on dispatcher comm stream."""
+class _AsyncTPReduceScatterRowsSum(torch.autograd.Function):
+    """TP ReduceScatterRowsSum on dispatcher comm stream."""
 
     @staticmethod
     def forward(
         ctx: Any,
         hidden: torch.Tensor,
-        all_sizes: list[int],
+        tp_rank_row_counts: list[int],
         tp_group: dist.ProcessGroup,
         tp_size: int,
         tp_rank: int,
@@ -280,7 +284,12 @@ def forward(
     ) -> torch.Tensor:
         with torch.cuda.stream(comm_stream):
             comm_stream.wait_event(forward_previous_event)
-            out, hidden_for_comm, chunks = _tp_reduce_scatter_sum_forward_impl(hidden, all_sizes, tp_rank, tp_group)
+            out, hidden_for_comm, chunks = _tp_reduce_scatter_rows_sum_forward_impl(
+                hidden,
+                tp_rank_row_counts,
+                tp_rank,
+                tp_group,
+            )
 
             # 中文注释：同步/异步共用 TP ReduceScatter 核心逻辑；异步只额外管理 stream/event。
             _record_stream((hidden_for_comm, chunks, out), comm_stream)
@@ -289,7 +298,7 @@ def forward(
         ctx.tp_group = tp_group
         ctx.tp_size = tp_size
         ctx.tp_rank = tp_rank
-        ctx.all_sizes = all_sizes
+        ctx.tp_rank_row_counts = tp_rank_row_counts
         ctx.backward_previous_event = backward_previous_event
         ctx.backward_finished_event = backward_finished_event
         ctx.comm_stream = comm_stream
@@ -302,9 +311,9 @@ def backward(
     ) -> tuple[torch.Tensor, None, None, None, None, None, None, None, None, None]:
         with torch.cuda.stream(ctx.comm_stream):
             ctx.comm_stream.wait_event(ctx.backward_previous_event)
-            full_grad, grad_slice_for_comm, chunks = _tp_reduce_scatter_sum_backward_impl(
+            full_grad, grad_slice_for_comm, chunks = _tp_reduce_scatter_rows_sum_backward_impl(
                 grad_slice,
-                ctx.all_sizes,
+                ctx.tp_rank_row_counts,
                 ctx.tp_group,
             )
 
@@ -314,7 +323,7 @@ def backward(
         return full_grad, None, None, None, None, None, None, None, None, None
 
 
-def _tp_all_gather_sizes(
+def _tp_gather_tp_rank_row_counts(
     hidden: torch.Tensor,
     tp_group: dist.ProcessGroup,
     stream: torch.cuda.Stream | None = None,
@@ -327,24 +336,24 @@ def _tp_all_gather_sizes(
 
     if stream is None:
         local_size = hidden.new_tensor([hidden.shape[0]], dtype=torch.long)
-        all_sizes_t = hidden.new_empty([tp_size], dtype=torch.long)
-        dist.all_gather_into_tensor(all_sizes_t, local_size, group=tp_group)
+        tp_rank_row_counts_t = hidden.new_empty([tp_size], dtype=torch.long)
+        dist.all_gather_into_tensor(tp_rank_row_counts_t, local_size, group=tp_group)
     else:
         # 中文注释：尺寸通信不依赖计算流，避免为了取 Python list 等待前面的 compute kernel。
         with torch.cuda.stream(stream):
             local_size = hidden.new_tensor([hidden.shape[0]], dtype=torch.long)
-            all_sizes_t = hidden.new_empty([tp_size], dtype=torch.long)
-            dist.all_gather_into_tensor(all_sizes_t, local_size, group=tp_group)
+            tp_rank_row_counts_t = hidden.new_empty([tp_size], dtype=torch.long)
+            dist.all_gather_into_tensor(tp_rank_row_counts_t, local_size, group=tp_group)
             local_size.record_stream(stream)
-            all_sizes_t.record_stream(stream)
+            tp_rank_row_counts_t.record_stream(stream)
         stream.synchronize()
-    return [int(s) for s in all_sizes_t.tolist()]
+    return [int(s) for s in tp_rank_row_counts_t.tolist()]
 
 
-def _tp_all_gather(
+def _tp_all_gather_rows(
     hidden: torch.Tensor,
     tp_group: dist.ProcessGroup,
-    all_sizes: list[int] | None = None,
+    tp_rank_row_counts: list[int] | None = None,
 ) -> tuple[torch.Tensor, list[int]]:
     """All-gather ``hidden`` across the TP group and return the gathered tensor
     plus per-rank sizes."""
@@ -353,16 +362,16 @@ def _tp_all_gather(
         return hidden, [hidden.shape[0]]
 
     tp_rank = dist.get_rank(group=tp_group)
-    if all_sizes is None:
-        all_sizes = _tp_all_gather_sizes(hidden, tp_group)
+    if tp_rank_row_counts is None:
+        tp_rank_row_counts = _tp_gather_tp_rank_row_counts(hidden, tp_group)
 
-    gathered = _TPAllGather.apply(hidden, all_sizes, tp_group, tp_size, tp_rank)
-    return gathered, all_sizes
+    gathered = _TPAllGatherRows.apply(hidden, tp_rank_row_counts, tp_group, tp_size, tp_rank)
+    return gathered, tp_rank_row_counts
 
 
-def _async_tp_all_gather(
+def _async_tp_all_gather_rows(
     hidden: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_group: dist.ProcessGroup,
     forward_previous_event: torch.cuda.Event,
     forward_finished_event: torch.cuda.Event,
@@ -377,9 +386,9 @@ def _async_tp_all_gather(
         return hidden
 
     tp_rank = dist.get_rank(group=tp_group)
-    return _AsyncTPAllGather.apply(
+    return _AsyncTPAllGatherRows.apply(
         hidden,
-        all_sizes,
+        tp_rank_row_counts,
         tp_group,
         tp_size,
         tp_rank,
@@ -391,9 +400,9 @@ def _async_tp_all_gather(
     )
 
 
-def _tp_reduce_scatter_sum(
+def _tp_reduce_scatter_rows_sum(
     hidden: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_group: dist.ProcessGroup,
 ) -> torch.Tensor:
     """Sum-reduce-scatter ``hidden`` across the TP group, returning this rank's
@@ -403,12 +412,12 @@ def _tp_reduce_scatter_sum(
         return hidden
 
     tp_rank = dist.get_rank(group=tp_group)
-    return _TPReduceScatterSum.apply(hidden, all_sizes, tp_group, tp_size, tp_rank)
+    return _TPReduceScatterRowsSum.apply(hidden, tp_rank_row_counts, tp_group, tp_size, tp_rank)
 
 
-def _async_tp_reduce_scatter_sum(
+def _async_tp_reduce_scatter_rows_sum(
     hidden: torch.Tensor,
-    all_sizes: list[int],
+    tp_rank_row_counts: list[int],
     tp_group: dist.ProcessGroup,
     forward_previous_event: torch.cuda.Event,
     forward_finished_event: torch.cuda.Event,
@@ -416,16 +425,16 @@ def _async_tp_reduce_scatter_sum(
     backward_finished_event: torch.cuda.Event,
     comm_stream: torch.cuda.Stream,
 ) -> torch.Tensor:
-    """Async TP ReduceScatterSum wrapper used by Domino TP+EP path."""
+    """Async TP ReduceScatterRowsSum wrapper used by Domino TP+EP path."""
     tp_size = tp_group.size()
     if tp_size == 1:
         forward_finished_event.record()
         return hidden
 
     tp_rank = dist.get_rank(group=tp_group)
-    return _AsyncTPReduceScatterSum.apply(
+    return _AsyncTPReduceScatterRowsSum.apply(
         hidden,
-        all_sizes,
+        tp_rank_row_counts,
         tp_group,
         tp_size,
         tp_rank,
@@ -437,12 +446,12 @@ def _async_tp_reduce_scatter_sum(
     )
 
 
-def _tp_all_gather_tokens_per_expert_group(
+def _tp_all_gather_per_rank_metadata(
     tokens_per_expert_group: torch.Tensor,
     tp_group: dist.ProcessGroup,
 ) -> torch.Tensor:
     """Gather per-TP expert counts in the same TP-rank order as
-    ``_tp_all_gather``."""
+    ``_tp_all_gather_rows``."""
     tp_size = tp_group.size()
     if tp_size == 1:
         return tokens_per_expert_group.unsqueeze(0)
@@ -452,7 +461,7 @@ def _tp_all_gather_tokens_per_expert_group(
     return gathered
 
 
-def _async_tp_all_gather_tokens_per_expert_group(
+def _async_tp_all_gather_per_rank_metadata(
     tokens_per_expert_group: torch.Tensor,
     tp_group: dist.ProcessGroup,
     forward_previous_event: torch.cuda.Event,
@@ -479,7 +488,7 @@ def _async_tp_all_gather_tokens_per_expert_group(
 
 class TorchAll2AllTPEPDispatcher(TorchAll2AllDispatcher):
     """TP+EP dispatcher: wraps ``TorchAll2AllDispatcher`` with TP AllGather and
-    ReduceScatterSum.
+    ReduceScatterRowsSum.
 
     Keeps ``dispatch_preprocess`` and ``combine_postprocess`` from the EP-only
     base class, and moves the TP collectives into the communication methods
@@ -493,10 +502,10 @@ class TorchAll2AllTPEPDispatcher(TorchAll2AllDispatcher):
         generate_dtype (str): Dtype for generation, ``"bf16"`` or ``"fp8"``.
     """
 
-    # 中文注释：_tp_meta_stream 只跑 output_splits_tp 这类小的尺寸 all_gather。
+    # 中文注释：_tp_row_count_stream 只跑 tp_rank_row_counts 这类小的尺寸 all_gather。
     # 尺寸结果要同步回 Python list；如果复用 _comm_stream，会连同前面排队的大块
     # EP AllToAll 一起等完，削弱 Domino 隐藏 TP/EP 通信的效果。
-    _tp_meta_stream: torch.cuda.Stream | None = None
+    _tp_row_count_stream: torch.cuda.Stream | None = None
 
     def __init__(
         self,
@@ -515,9 +524,9 @@ def __init__(
         )
         self._tp_group = tp_group
         self._tp_size = tp_group.size()
-        if TorchAll2AllTPEPDispatcher._tp_meta_stream is None:
-            TorchAll2AllTPEPDispatcher._tp_meta_stream = torch.cuda.Stream()
-        self._tp_meta_stream = TorchAll2AllTPEPDispatcher._tp_meta_stream
+        if TorchAll2AllTPEPDispatcher._tp_row_count_stream is None:
+            TorchAll2AllTPEPDispatcher._tp_row_count_stream = torch.cuda.Stream()
+        self._tp_row_count_stream = TorchAll2AllTPEPDispatcher._tp_row_count_stream
 
     @override
     def dispatch(
@@ -539,20 +548,21 @@ def dispatch(
             assert ep_dispatched["forward_finished_event"] is not None, "Use async_op=True for dispatch!"
             assert ep_dispatched["backward_previous_event"] is not None, "Use async_op=True for dispatch!"
             comm_stream = cast(torch.cuda.Stream, self._comm_stream)
-            # 中文注释：只同步变长 all_gather 的尺寸；大块 TP hidden 通信放到 comm stream 中隐藏。
-            # 这里刻意使用 _tp_meta_stream，避免为了拿 output_splits_tp 的 Python list
+            # 中文注释：只同步变长 all_gather 的尺寸；
+            # 大块 TP hidden 通信放到 comm stream 中隐藏。
+            # 这里刻意使用 _tp_row_count_stream，避免为了拿 tp_rank_row_counts 的 Python list
             # 去同步 _comm_stream 上已经排队的 EP hidden AllToAll。
-            output_splits_tp = _tp_all_gather_sizes(
+            tp_rank_row_counts = _tp_gather_tp_rank_row_counts(
                 ep_dispatched["hidden_states"],
                 self._tp_group,
-                stream=self._tp_meta_stream,
+                stream=self._tp_row_count_stream,
             )
             tp_hidden_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
             tp_counts_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
             tp_backward_previous_event = cast(torch.cuda.Event, torch.cuda.Event())
-            hidden_states = _async_tp_all_gather(
+            hidden_states = _async_tp_all_gather_rows(
                 ep_dispatched["hidden_states"],
-                all_sizes=output_splits_tp,
+                tp_rank_row_counts=tp_rank_row_counts,
                 tp_group=self._tp_group,
                 forward_previous_event=ep_dispatched["forward_finished_event"],
                 forward_finished_event=tp_hidden_finished_event,
@@ -560,7 +570,7 @@ def dispatch(
                 backward_finished_event=ep_dispatched["backward_previous_event"],
                 comm_stream=comm_stream,
             )
-            tokens_per_expert_group = _async_tp_all_gather_tokens_per_expert_group(
+            tokens_per_expert_group = _async_tp_all_gather_per_rank_metadata(
                 ep_dispatched["tokens_per_expert_group"],
                 tp_group=self._tp_group,
                 forward_previous_event=tp_hidden_finished_event,
@@ -570,11 +580,11 @@ def dispatch(
             forward_finished_event = tp_counts_finished_event
             backward_previous_event = tp_backward_previous_event
         else:
-            hidden_states, output_splits_tp = _tp_all_gather(
+            hidden_states, tp_rank_row_counts = _tp_all_gather_rows(
                 ep_dispatched["hidden_states"],
                 tp_group=self._tp_group,
             )
-            tokens_per_expert_group = _tp_all_gather_tokens_per_expert_group(
+            tokens_per_expert_group = _tp_all_gather_per_rank_metadata(
                 ep_dispatched["tokens_per_expert_group"],
                 tp_group=self._tp_group,
             )
@@ -592,7 +602,7 @@ def dispatch(
             output_splits=ep_dispatched["output_splits"],
             forward_finished_event=forward_finished_event,
             backward_previous_event=backward_previous_event,
-            output_splits_tp=output_splits_tp,
+            tp_rank_row_counts=tp_rank_row_counts,
         )
 
     @override
@@ -704,9 +714,9 @@ def combine(
             tp_forward_finished_event = cast(torch.cuda.Event, torch.cuda.Event())
             tp_backward_previous_event = cast(torch.cuda.Event, torch.cuda.Event())
             # 中文注释：TP ReduceScatter 属于 combine 通信段，EP combine 等它完成后再发起。
-            hidden_states = _async_tp_reduce_scatter_sum(
+            hidden_states = _async_tp_reduce_scatter_rows_sum(
                 pre_combined["hidden_states"],
-                all_sizes=tpep_dispatched["output_splits_tp"],
+                tp_rank_row_counts=tpep_dispatched["tp_rank_row_counts"],
                 tp_group=self._tp_group,
                 forward_previous_event=forward_previous_event,
                 forward_finished_event=tp_forward_finished_event,
@@ -720,9 +730,9 @@ def combine(
                 forward_finished_event=tp_forward_finished_event,
             )
         else:
-            hidden_states = _tp_reduce_scatter_sum(
+            hidden_states = _tp_reduce_scatter_rows_sum(
                 pre_combined["hidden_states"],
-                all_sizes=tpep_dispatched["output_splits_tp"],
+                tp_rank_row_counts=tpep_dispatched["tp_rank_row_counts"],
                 tp_group=self._tp_group,
             )
             pre_combined_for_ep = TorchAll2AllPreCombineResult(
diff --git a/xtuner_ep_dispatcher.md b/xtuner_ep_dispatcher.md
index a97a401bd..72b6568f6 100644
--- a/xtuner_ep_dispatcher.md
+++ b/xtuner_ep_dispatcher.md
@@ -465,7 +465,7 @@ router_weights: [N, E]
 
 - `TorchAll2AllDispatcher` 仍需要在 dispatch 阶段拿到 Python `input_splits` / `output_splits`。
 - `DeepEPDispatcher` 仍可能在库内部等待 receive count，并把 `num_recv_tokens_per_expert_list` 暴露给 Python。
-- TP+EP 路径仍需要 TP size meta 来发起变长 TP AllGather / ReduceScatterSum。
+- TP+EP 路径仍需要 `tp_rank_row_counts` 来发起变长 TP AllGather / ReduceScatterRowsSum。
 
 因此，对 Domino EP 来说，compile 的收益主要是缩短 `_pre_moe_forward`、expert block、`_post_moe_forward` 等本地计算段；
 它不能把 dispatcher 的 host 等待变成 GPU-only 异步，也不能改变 2.1 和 DeepEP “Host metadata 同步”小节里的重叠约束。
@@ -620,18 +620,18 @@ num_recv_tokens_per_expert_list, handle, event
 
 当前 `build_dispatcher(dispatcher="deepep", tp_group=...)` 会直接构造 `DeepEPDispatcher`，`tp_group` 没有接入
 DeepEP dispatcher。也就是说，XTuner 当前的 DeepEP 路径是 EP dispatcher，不包含 `TorchAll2AllTPEPDispatcher`
-那套 TP AllGather / TP ReduceScatterSum 通信段。DeepEP + ExpertTP 如果要成为 Domino-compatible ExpertTP，需要
-额外设计 DeepEP dispatch 后的 TP AllGather、combine 前的 TP ReduceScatterSum，以及相应的 `topk_weights`
+那套 TP AllGather / TP ReduceScatterRowsSum 通信段。DeepEP + ExpertTP 如果要成为 Domino-compatible ExpertTP，需要
+额外设计 DeepEP dispatch 后的 TP AllGather、combine 前的 TP ReduceScatterRowsSum，以及相应的 `topk_weights`
 event 语义；这部分见 `xtuner_etp.md`。
 
-## TP+EP 中 ReduceScatterSum 与 padding/capacity 取舍
+## TP+EP 中 ReduceScatterRowsSum 与 padding/capacity 取舍
 
 `TorchAll2AllTPEPDispatcher` 在 EP dispatch 之后会额外做 TP AllGather，在 combine 阶段会做 TP
-ReduceScatterSum。这里的 **TP ReduceScatterSum** 是语义名：对同一 TP group 中完整 token 批的 hidden 做
+ReduceScatterRowsSum。这里的 **TP ReduceScatterRowsSum** 是语义名：对同一 TP group 中完整 token 批的 hidden 做
 SUM 归约，并只保留当前 TP rank 负责的 token slice。它同时出现在两个方向：
 
-- combine forward：row-parallel expert output 先做 TP ReduceScatterSum，再进入 EP combine all2all。
-- TP AllGather backward：AllGather 的反向也是 TP ReduceScatterSum。
+- combine forward：row-parallel expert output 先做 TP ReduceScatterRowsSum，再进入 EP combine all2all。
+- TP AllGather backward：AllGather 的反向也是 TP ReduceScatterRowsSum。
 
 TP+EP MoE routing 后，同一个 EP rank 上的不同 TP rank 不一定收到相同数量的 token。以 `tp_size=2` 为例：
 
@@ -640,15 +640,15 @@ EP dispatch 后：
   TP rank0 hidden: [3, H]
   TP rank1 hidden: [5, H]
 
-TP size meta:
-  output_splits_tp = [3, 5]
+TP rank row counts:
+  tp_rank_row_counts = [3, 5]
 
 TP AllGather 后每个 TP rank 都看到：
   gathered hidden: [8, H] = rank0 rows [0:3] | rank1 rows [3:8]
 ```
 
-expert 的 row-parallel down projection 后，两个 TP rank 都有 `[8, H]` 的 partial hidden。TP ReduceScatterSum 需要
-对这两个 `[8, H]` 做 SUM，并按同一个 TP size meta 切回：
+expert 的 row-parallel down projection 后，两个 TP rank 都有 `[8, H]` 的 partial hidden。TP ReduceScatterRowsSum 需要
+对这两个 `[8, H]` 做 SUM，并按同一个 `tp_rank_row_counts` 切回：
 
 ```text
 TP rank0 output: rows [0:3] -> [3, H]
@@ -656,21 +656,21 @@ TP rank1 output: rows [3:8] -> [5, H]
 ```
 
 因此当前设计选择是：**优先实现真正的变长 `reduce_scatter`，不引入 padding/capacity**。dispatcher 已经有
-`output_splits_tp` 作为 TP size meta，正好可以作为变长 reduce scatter 的 split 边界：
+`tp_rank_row_counts` 正好可以作为变长 reduce scatter 的 split 边界：
 
 ```python
-input_tensor_list = list(torch.split(hidden.contiguous(), output_splits_tp, dim=0))
+input_tensor_list = list(torch.split(hidden.contiguous(), tp_rank_row_counts, dim=0))
 output = torch.empty_like(input_tensor_list[tp_rank])
 dist.reduce_scatter(output, input_tensor_list, op=dist.ReduceOp.SUM, group=tp_group)
 ```
 
-当 `output_splits_tp` 全部相等时，可以在共享核心函数内部走等长 fast path：
+当 `tp_rank_row_counts` 全部相等时，可以在共享核心函数内部走等长 fast path：
 
 ```python
 dist.reduce_scatter_tensor(output, hidden.contiguous(), op=dist.ReduceOp.SUM, group=tp_group)
 ```
 
-但这只是实现优化，不改变 dispatcher 对外的 TP size meta 语义。真正的 ReduceScatterSum 实现应集中在一个共享核心
+但这只是实现优化，不改变 dispatcher 对外的 `tp_rank_row_counts` 语义。真正的 ReduceScatterRowsSum 实现应集中在一个共享核心
 函数中，避免 combine forward 和 TP AllGather backward 分叉。
 
 ### 为什么不先做 padding/capacity
@@ -679,31 +679,31 @@ padding 和 capacity 带来的收益不同，需要分开看：
 
 - **padding 的收益** 是把一次变长 collective 包装成等长 collective。通信前把每个 TP rank 的真实 slice pad 到同一
   长度，通信时就可以使用 `reduce_scatter_tensor` / `all_gather_into_tensor` 这类 tensor fast path。若 capacity
-  仍由本 step 的 `max(output_splits_tp)` 动态决定，padding 只减少大块 hidden collective 的 variable-list
-  split 开销，不能消除 TP size meta 的 CPU 同步。
+  仍由本 step 的 `max(tp_rank_row_counts)` 动态决定，padding 只减少大块 hidden collective 的 variable-list
+  split 开销，不能消除 `tp_rank_row_counts` 的 CPU 同步。
 - **固定 capacity 的收益** 是让这个等长长度跨 step 稳定下来。只有 capacity 是配置值或静态上界时，shape 才稳定，
   大块通信 shape 才能从本 step 的 Python split list 中解耦，后续也才更容易做 CUDA graph、buffer 复用或通信
   buffer 预分配。
 - **对 Domino 的影响** 主要来自 host CPU split metadata 同步。只做动态 padding 时，host 仍要拿到
-  `output_splits_tp` 来决定 pad/unpad 边界和本步 capacity，因此这个同步点仍然存在；固定 capacity 才可能减少
+  `tp_rank_row_counts` 来决定 pad/unpad 边界和本步 capacity，因此这个同步点仍然存在；固定 capacity 才可能减少
   运行时 shape 决策，并把大块通信从 split-list 发起路径中移出。这和前面 EP All2All 的 host metadata 同步问题
   类似：host 等 split list 时，已经 enqueue 到 GPU 的另一个 micro batch 计算仍可继续，但 host 不能继续
   enqueue 后续本地算子和通信；如果等待时间超过可覆盖窗口，会压缩 Domino 的真实 overlap。
 
-因此，如果只是每步动态取 `capacity = max(output_splits_tp)`，它仍然需要 TP size meta 的 CPU 同步，只能减少
-variable collective 的 split-list 开销，不能获得固定 shape / CUDA graph，也不能消除 TP size meta 对 Domino
+因此，如果只是每步动态取 `capacity = max(tp_rank_row_counts)`，它仍然需要 `tp_rank_row_counts` 的 CPU 同步，只能减少
+variable collective 的 split-list 开销，不能获得固定 shape / CUDA graph，也不能消除 `tp_rank_row_counts` 对 Domino
 host enqueue 的影响。
 
 但它会把问题从通信层扩散到 layout 层。至少有两种做法：
 
 1. **通信内部 padding，通信后立刻 unpad。**
 
-   例如 TP size meta 是 `[3, 5]`，capacity 取 `5`。AllGather 前把 rank0 的 `[3, H]` pad 到 `[5, H]`，
+   例如 `tp_rank_row_counts` 是 `[3, 5]`，capacity 取 `5`。AllGather 前把 rank0 的 `[3, H]` pad 到 `[5, H]`，
    rank1 保持 `[5, H]`；等长 AllGather 得到 `[10, H]` 后再按真实 sizes compact 回 `[8, H]`。ReduceScatter
    则需要先按 `[3, 5]` 切分、分别 pad 到 `[5, H]`，concat 成 `[10, H]` 后走 `reduce_scatter_tensor`，
    最后再 unpad 成当前 rank 的真实 `[3, H]` 或 `[5, H]`。
 
-   这个方案不改变 expert 看到的 token 数，但增加 pad/unpad copy，并且仍然需要 TP size meta。收益要靠 benchmark
+   这个方案不改变 expert 看到的 token 数，但增加 pad/unpad copy，并且仍然需要 `tp_rank_row_counts`。收益要靠 benchmark
    证明。
 
 2. **端到端 capacity，让 padding token 进入 expert layout。**
@@ -714,5 +714,5 @@ host enqueue 的影响。
 
    这会把改动扩散到 routing、expert layout、postprocess/combine，不适合作为替换 `all_reduce + slice` 的第一步。
 
-因此当前阶段的目标是局部替换：用真正的 TP ReduceScatterSum 取代 `all_reduce + slice`，输出 shape 严格按照
-`output_splits_tp[tp_rank]` 分配，允许 0 行，不做 padding/capacity。
+因此当前阶段的目标是局部替换：用真正的 TP ReduceScatterRowsSum 取代 `all_reduce + slice`，输出 shape 严格按照
+`tp_rank_row_counts[tp_rank]` 分配，允许 0 行，不做 padding/capacity。
diff --git a/xtuner_ep_domino.md b/xtuner_ep_domino.md
index dd20e419b..e9f7747f8 100644
--- a/xtuner_ep_domino.md
+++ b/xtuner_ep_domino.md
@@ -588,23 +588,23 @@ compute stream 中剥离出来，让它们尽可能和另一个 micro batch 的
    `[M_total, hidden]`。
 2. `dispatch_postprocess`：只做本地按 local expert 排序，给 grouped GEMM 使用。
 3. `combine_preprocess`：只做本地 unpermute，把 expert 输出恢复到 TP AllGather 顺序。
-4. `combine`：先做 TP ReduceScatterSum，恢复每个 TP rank 自己的 `[M_ep_recv, hidden]`，再进入 EP combine all2all。
+4. `combine`：先做 TP ReduceScatterRowsSum，恢复每个 TP rank 自己的 `[M_ep_recv, hidden]`，再进入 EP combine all2all。
 
 专家权重本身由 `GroupedLinear` 按 TP 切分：
 
 - `fused_w1w3` 是 column parallel。
 - `fused_w2` 是 row parallel。
 
-当前 TPEP dispatcher 在 `async_op=True` 时也把 TP AllGather / ReduceScatterSum 接入同一套事件链：
+当前 TPEP dispatcher 在 `async_op=True` 时也把 TP AllGather / ReduceScatterRowsSum 接入同一套事件链：
 
 - `dispatch` 中，TP AllGather 在 dispatcher 的 comm stream 上等待 EP dispatch 完成事件；compute stream 只在
   `dispatch_postprocess` 做本地排序前等待 TP AllGather 完成。
-- `combine` 中，TP ReduceScatterSum 在 comm stream 上等待 `combine_preprocess` 的本地 unpermute 完成事件；
-  后续 EP combine 再等待 TP ReduceScatterSum 完成事件。
-- 反向中，TP AllGather / ReduceScatterSum 对应的反向 collective 也在 comm stream 上执行，并通过 autograd hook
+- `combine` 中，TP ReduceScatterRowsSum 在 comm stream 上等待 `combine_preprocess` 的本地 unpermute 完成事件；
+  后续 EP combine 再等待 TP ReduceScatterRowsSum 完成事件。
+- 反向中，TP AllGather / ReduceScatterRowsSum 对应的反向 collective 也在 comm stream 上执行，并通过 autograd hook
   把等待点放在梯度真正被消费的位置。
-- `TP ReduceScatterSum` 使用真正的 reduce-scatter 语义：等长 token slice 走 `reduce_scatter_tensor` fast path，
-  变长 token slice 按 TP size meta 切成 `input_list` 后走 `reduce_scatter`。这避免了 `all_reduce` 后再丢弃非本
+- `TP ReduceScatterRowsSum` 使用真正的 reduce-scatter 语义：等长 token slice 走 `reduce_scatter_tensor` fast path，
+  变长 token slice 按 `tp_rank_row_counts` 切成 `input_list` 后走 `reduce_scatter`。这避免了 `all_reduce` 后再丢弃非本
   rank slice 的额外通信和写入。
 
 因此 TP+EP 下的 Domino 流水不再只覆盖 EP dispatch/combine；TP collectives 也可以和另一个 micro batch 的
@@ -620,7 +620,7 @@ XTuner 当前 Domino EP 实现可以概括为：
   micro-batch forward。
 - 层级 `MoEDecoderLayer._micro_batch_forward` 负责重新排列单层内两个 micro batch 的 attention/gate、EP
   dispatch、expert、combine、shared expert、postprocess。
-- dispatcher 的 `async_op=True` 负责把 EP all2all 以及 TP+EP 中的 TP AllGather / ReduceScatterSum 放到独立
+- dispatcher 的 `async_op=True` 负责把 EP all2all 以及 TP+EP 中的 TP AllGather / ReduceScatterRowsSum 放到独立
   comm stream，并用 CUDA event 和 autograd hook 维持正确依赖。
 - 前向重叠需要按 event 判断：`D0` 可覆盖 `A1/Dpre1`，`D1` 可覆盖 `E0/Cpre0`，`C0/C1` 可覆盖后续
   compute；但每个 micro batch 在 `dispatch_postprocess` / `combine_postprocess` 消费通信结果前仍会等待。

From 941d83ef6a34d016d38784928d623c8fa6782758 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 10:05:24 +0000
Subject: [PATCH 18/25] add deepep doc and validate scripts

---
 .dev_scripts/run_validate_xtuner_deepep_md.sh |  31 ++
 .dev_scripts/validate_xtuner_deepep_md.py     | 445 ++++++++++++++++++
 CONTEXT.md                                    |  31 +-
 xtuner_ep_dispatcher.md                       | 379 +++++++++++----
 4 files changed, 800 insertions(+), 86 deletions(-)
 create mode 100755 .dev_scripts/run_validate_xtuner_deepep_md.sh
 create mode 100644 .dev_scripts/validate_xtuner_deepep_md.py

diff --git a/.dev_scripts/run_validate_xtuner_deepep_md.sh b/.dev_scripts/run_validate_xtuner_deepep_md.sh
new file mode 100755
index 000000000..47bb83fc7
--- /dev/null
+++ b/.dev_scripts/run_validate_xtuner_deepep_md.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# 默认使用用户指定的 pt29_sg59 环境；需要切换时可在命令前覆盖 CONDA_ENV。
+CONDA_ENV="${CONDA_ENV:-pt29_sg59}"
+source $(conda info --base)/etc/profile.d/conda.sh
+conda activate "${CONDA_ENV}"
+
+export XTUNER_EP_DEBUG="${XTUNER_EP_DEBUG:-1}"
+
+# xtuner_ep_dispatcher.md 的 DeepEP 示例固定为 EP=2；默认额外验证 4 份 DP replica。
+EP_SIZE="${EP_SIZE:-2}"
+DP_SIZE="${DP_SIZE:-4}"
+NPROC_PER_NODE="${NPROC_PER_NODE:-$((EP_SIZE * DP_SIZE))}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
+MASTER_PORT="${MASTER_PORT:-29532}"
+
+# 显式使用当前仓库代码，避免导入 conda 环境或其他目录下安装的 xtuner。
+export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
+export CUDA_VISIBLE_DEVICES
+export EP_SIZE
+export DP_SIZE
+
+cd "${REPO_ROOT}"
+torchrun \
+  --nproc-per-node="${NPROC_PER_NODE}" \
+  --master-port="${MASTER_PORT}" \
+  .dev_scripts/validate_xtuner_deepep_md.py
diff --git a/.dev_scripts/validate_xtuner_deepep_md.py b/.dev_scripts/validate_xtuner_deepep_md.py
new file mode 100644
index 000000000..5bc13eb3c
--- /dev/null
+++ b/.dev_scripts/validate_xtuner_deepep_md.py
@@ -0,0 +1,445 @@
+"""验证 xtuner_ep_dispatcher.md 中 DeepEP 前向示例的中间顺序。
+
+运行方式：
+    EP_SIZE=2 DP_SIZE=4 torchrun --nproc-per-node=8 .dev_scripts/validate_xtuner_deepep_md.py
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import init_device_mesh
+
+
+EP_SIZE = 2
+DEFAULT_DP_SIZE = 4
+N_ROUTED_EXPERTS = 6
+EXPERTS_PER_RANK = 3
+EXPERT_OUTPUT_SCALE = 100.0
+HIDDEN_SIZE = 128
+
+
+@dataclass(frozen=True)
+class RankCase:
+    token_values: tuple[float, ...]
+    topk_ids: tuple[tuple[int, int], ...]
+    topk_weights: tuple[tuple[float, float], ...]
+
+
+@dataclass(frozen=True)
+class RankExpected:
+    input_hidden: tuple[float, ...]
+    topk_ids: tuple[tuple[int, int], ...]
+    pre_hidden: tuple[float, ...]
+    dispatch_hidden: tuple[float, ...]
+    dispatch_topk_ids: tuple[int, ...]
+    dispatch_topk_weights: tuple[float, ...]
+    num_recv_tokens_per_expert_list: tuple[int, ...]
+    post_hidden: tuple[float, ...]
+    post_row_ids_map: tuple[int, ...]
+    tokens_per_expert: tuple[float, ...]
+    experts_out: tuple[float, ...]
+    pre_combine_hidden: tuple[float, ...]
+    combine_hidden: tuple[float, ...]
+    post_combine_hidden: tuple[float, ...]
+
+
+@dataclass(frozen=True)
+class ParallelInfo:
+    global_rank: int
+    dp_rank: int
+    ep_rank: int
+    device: torch.device
+    ep_group: dist.ProcessGroup
+
+
+CASES: dict[int, RankCase] = {
+    0: RankCase(
+        token_values=(10.0, 11.0, 12.0, 13.0),
+        topk_ids=((0, 4), (3, 1), (2, 5), (4, 0)),
+        topk_weights=((0.25, 0.75), (0.4, 0.6), (0.7, 0.3), (0.8, 0.2)),
+    ),
+    1: RankCase(
+        token_values=(20.0, 21.0, 22.0, 23.0),
+        topk_ids=((1, 3), (4, 2), (5, 0), (3, 1)),
+        topk_weights=((0.2, 0.8), (0.5, 0.5), (0.9, 0.1), (0.35, 0.65)),
+    ),
+}
+
+
+EXPECTED: dict[int, RankExpected] = {
+    0: RankExpected(
+        input_hidden=(10.0, 11.0, 12.0, 13.0),
+        topk_ids=((0, 4), (3, 1), (2, 5), (4, 0)),
+        pre_hidden=(10.0, 11.0, 12.0, 13.0),
+        dispatch_hidden=(10.0, 11.0, 12.0, 13.0, 20.0, 21.0, 22.0, 23.0),
+        dispatch_topk_ids=(0, -1, -1, 1, 2, -1, -1, 0, 1, -1, -1, 2, -1, 0, -1, 1),
+        dispatch_topk_weights=(0.25, 0.0, 0.0, 0.6, 0.7, 0.0, 0.0, 0.2, 0.2, 0.0, 0.0, 0.5, 0.0, 0.1, 0.0, 0.65),
+        num_recv_tokens_per_expert_list=(3, 3, 2),
+        post_hidden=(10.0, 13.0, 22.0, 11.0, 20.0, 23.0, 12.0, 21.0),
+        post_row_ids_map=(0, -1, 6, -1, 4, -1, -1, -1, -1, 3, -1, 1, -1, 7, 2, 5),
+        tokens_per_expert=(3.0, 3.0, 2.0),
+        experts_out=(10.0, 13.0, 22.0, 111.0, 120.0, 123.0, 212.0, 221.0),
+        pre_combine_hidden=(2.5, 66.6, 148.4, 2.6, 24.0, 110.5, 2.2, 79.95),
+        combine_hidden=(310.0, 191.0, 302.0, 333.0),
+        post_combine_hidden=(310.0, 191.0, 302.0, 333.0),
+    ),
+    1: RankExpected(
+        input_hidden=(20.0, 21.0, 22.0, 23.0),
+        topk_ids=((1, 3), (4, 2), (5, 0), (3, 1)),
+        pre_hidden=(20.0, 21.0, 22.0, 23.0),
+        dispatch_hidden=(10.0, 11.0, 12.0, 13.0, 20.0, 21.0, 22.0, 23.0),
+        dispatch_topk_ids=(-1, 1, 0, -1, -1, 2, 1, -1, -1, 0, 1, -1, 2, -1, 0, -1),
+        dispatch_topk_weights=(0.0, 0.75, 0.4, 0.0, 0.0, 0.3, 0.8, 0.0, 0.0, 0.8, 0.5, 0.0, 0.9, 0.0, 0.35, 0.0),
+        num_recv_tokens_per_expert_list=(3, 3, 2),
+        post_hidden=(11.0, 20.0, 23.0, 10.0, 13.0, 21.0, 12.0, 22.0),
+        post_row_ids_map=(-1, 0, -1, 4, -1, 5, 7, 2, 3, -1, 6, -1, 1, -1, -1, -1),
+        tokens_per_expert=(3.0, 3.0, 2.0),
+        experts_out=(311.0, 320.0, 323.0, 410.0, 413.0, 421.0, 512.0, 522.0),
+        pre_combine_hidden=(307.5, 124.4, 153.6, 330.4, 256.0, 210.5, 469.8, 113.05),
+        combine_hidden=(280.0, 321.0, 472.0, 193.0),
+        post_combine_hidden=(280.0, 321.0, 472.0, 193.0),
+    ),
+}
+
+
+def main() -> None:
+    try:
+        parallel_info = _init_distributed()
+        snapshots = _run_xtuner_deepep_case(parallel_info)
+        _validate(parallel_info, snapshots)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+def _init_distributed() -> ParallelInfo:
+    if not torch.cuda.is_available():
+        raise RuntimeError("DeepEPDispatcher 当前依赖 CUDA，请在 GPU 上用 torchrun 运行。")
+
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group(backend="nccl")
+
+    ep_size = _get_env_int("EP_SIZE", EP_SIZE)
+    dp_size = _get_env_int("DP_SIZE", DEFAULT_DP_SIZE)
+    world_size = dist.get_world_size()
+    if ep_size != EP_SIZE:
+        raise RuntimeError("xtuner_ep_dispatcher.md 的 DeepEP 示例固定为 EP=2。")
+    if world_size != ep_size * dp_size:
+        raise RuntimeError(
+            f"当前配置要求 world_size = EP_SIZE * DP_SIZE = {ep_size * dp_size}，实际为 {world_size}。"
+        )
+
+    # 与 MoE 初始化保持一致：mesh_shape=(dp, ep)，EP 组为连续 rank 对。
+    ep_mesh = init_device_mesh(
+        "cuda",
+        (dp_size, ep_size),
+        mesh_dim_names=("dp", "ep"),
+    )["ep"]
+
+    global_rank = dist.get_rank()
+    return ParallelInfo(
+        global_rank=global_rank,
+        dp_rank=global_rank // ep_size,
+        ep_rank=ep_mesh.get_local_rank(),
+        device=torch.device("cuda", local_rank),
+        ep_group=ep_mesh.get_group(),
+    )
+
+
+@torch.no_grad()
+def _run_xtuner_deepep_case(parallel_info: ParallelInfo) -> dict[str, Any]:
+    DeepEPDispatcher = _import_deepep_dispatcher()
+
+    case = CASES[parallel_info.ep_rank]
+    hidden_states = torch.zeros(
+        (len(case.token_values), HIDDEN_SIZE),
+        dtype=torch.bfloat16,
+        device=parallel_info.device,
+    )
+    hidden_states[:, 0] = torch.tensor(case.token_values, dtype=torch.bfloat16, device=parallel_info.device)
+    topk_ids = torch.tensor(case.topk_ids, dtype=torch.long, device=parallel_info.device)
+    topk_weights = torch.tensor(case.topk_weights, dtype=torch.float32, device=parallel_info.device)
+
+    dispatcher = DeepEPDispatcher(
+        n_routed_experts=N_ROUTED_EXPERTS,
+        training_dtype="bf16",
+        process_group=parallel_info.ep_group,
+    )
+
+    # 对应文档 1：DeepEP source 侧不做 route-copy 展开，只保留原始 token。
+    pre_dispatched = dispatcher.dispatch_preprocess(hidden_states=hidden_states, topk_ids=topk_ids)
+
+    # 对应文档 2：DeepEP dispatch 按 token->rank 发送 hidden、local topk ids 和 topk weights。
+    dispatched = dispatcher.dispatch(
+        pre_dispatched=pre_dispatched,
+        topk_weights=topk_weights,
+        decoding=False,
+    )
+
+    # 对应文档 3：receiver rank 内按 recv_topk_idx 展开成 local expert grouped 顺序。
+    post_dispatched = dispatcher.dispatch_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+    )
+
+    # 用 expert id 改写输出，确保 DeepEP 在 combine 前的 topK 加权折叠也被验证。
+    experts_out = _mock_local_experts(
+        hidden_states=post_dispatched["hidden_states"],
+        tokens_per_expert=post_dispatched["tokens_per_expert"],
+        ep_rank=parallel_info.ep_rank,
+    )
+
+    # 对应文档 5：expert rank 上先用 recv_topk_weights 做加权折叠，回到 dispatch 后的 source-token 顺序。
+    pre_combined = dispatcher.combine_preprocess(
+        hidden_states=experts_out,
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        decoding=False,
+    )
+
+    # 对应文档 6：DeepEP combine 复用 dispatch handle，把 partial output 送回 source rank 并 SUM。
+    combined = dispatcher.combine(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        decoding=False,
+    )
+
+    # DeepEP 的 topK 加权已经在 combine_preprocess 完成；这里主要是等待 event 并返回 hidden。
+    post_combined = dispatcher.combine_postprocess(
+        pre_dispatched=pre_dispatched,
+        dispatched=dispatched,
+        post_dispatched=post_dispatched,
+        pre_combined=pre_combined,
+        combined=combined,
+    )
+
+    return {
+        "input_hidden": hidden_states,
+        "topk_ids": topk_ids,
+        "pre_hidden": pre_dispatched["hidden_states"],
+        "pre_topk_ids": pre_dispatched["topk_ids"],
+        "dispatch_hidden": dispatched["hidden_states"],
+        "dispatch_topk_ids": dispatched["topk_ids"],
+        "dispatch_topk_weights": dispatched["topk_weights"],
+        "num_recv_tokens_per_expert_list": dispatched["num_recv_tokens_per_expert_list"],
+        "post_hidden": post_dispatched["hidden_states"],
+        "post_row_ids_map": post_dispatched["row_ids_map"],
+        "tokens_per_expert": post_dispatched["tokens_per_expert"],
+        "experts_out": experts_out,
+        "pre_combine_hidden": pre_combined["hidden_states"],
+        "combine_hidden": combined["hidden_states"],
+        "post_combine_hidden": post_combined["hidden_states"],
+    }
+
+
+def _import_deepep_dispatcher() -> Any:
+    try:
+        from xtuner.v1.module.dispatcher.deepep import DeepEPDispatcher
+    except Exception as exc:
+        raise RuntimeError(
+            "DeepEPDispatcher 导入失败，请确认当前 conda 环境中的 deep_ep/deep_ep_cpp "
+            f"与 CUDA/PyTorch ABI 匹配。原始错误：{exc}"
+        ) from exc
+    return DeepEPDispatcher
+
+
+def _mock_local_experts(
+    *,
+    hidden_states: torch.Tensor,
+    tokens_per_expert: torch.Tensor,
+    ep_rank: int,
+) -> torch.Tensor:
+    local_expert_ids = torch.arange(EXPERTS_PER_RANK, dtype=torch.float32, device=hidden_states.device)
+    local_expert_ids = torch.repeat_interleave(local_expert_ids, tokens_per_expert.to(torch.long))
+    global_expert_ids = ep_rank * EXPERTS_PER_RANK + local_expert_ids
+    experts_out = hidden_states.to(torch.float32)
+    experts_out[:, 0] += global_expert_ids * EXPERT_OUTPUT_SCALE
+    return experts_out.to(hidden_states.dtype)
+
+
+def _validate(parallel_info: ParallelInfo, snapshots: dict[str, Any]) -> None:
+    expected = EXPECTED[parallel_info.ep_rank]
+    error: AssertionError | None = None
+
+    try:
+        if os.getenv("XTUNER_EP_DEBUG", "0") == "1":
+            _print_snapshots(parallel_info, snapshots)
+        _assert_tensor_close(parallel_info, "pre_hidden", snapshots["pre_hidden"], expected.pre_hidden, first_col=True)
+        _assert_tensor_close(parallel_info, "pre_topk_ids", snapshots["pre_topk_ids"], _flatten(expected.topk_ids))
+        _assert_tensor_close(
+            parallel_info,
+            "dispatch_hidden",
+            snapshots["dispatch_hidden"],
+            expected.dispatch_hidden,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "dispatch_topk_ids",
+            snapshots["dispatch_topk_ids"],
+            expected.dispatch_topk_ids,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "dispatch_topk_weights",
+            snapshots["dispatch_topk_weights"],
+            expected.dispatch_topk_weights,
+            atol=1e-4,
+        )
+        _assert_list_equal(
+            parallel_info,
+            "num_recv_tokens_per_expert_list",
+            snapshots["num_recv_tokens_per_expert_list"],
+            expected.num_recv_tokens_per_expert_list,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "post_hidden",
+            snapshots["post_hidden"],
+            expected.post_hidden,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "post_row_ids_map",
+            snapshots["post_row_ids_map"],
+            expected.post_row_ids_map,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "tokens_per_expert",
+            snapshots["tokens_per_expert"],
+            expected.tokens_per_expert,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "experts_out",
+            snapshots["experts_out"],
+            expected.experts_out,
+            atol=3.0,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "pre_combine_hidden",
+            snapshots["pre_combine_hidden"],
+            expected.pre_combine_hidden,
+            atol=3.0,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "combine_hidden",
+            snapshots["combine_hidden"],
+            expected.combine_hidden,
+            atol=3.0,
+            first_col=True,
+        )
+        _assert_tensor_close(
+            parallel_info,
+            "post_combine_hidden",
+            snapshots["post_combine_hidden"],
+            expected.post_combine_hidden,
+            atol=3.0,
+            first_col=True,
+        )
+    except AssertionError as exc:
+        error = exc
+
+    failed = torch.tensor([int(error is not None)], dtype=torch.int32, device=parallel_info.device)
+    dist.all_reduce(failed, op=dist.ReduceOp.SUM)
+
+    if failed.item() != 0:
+        if error is not None:
+            raise error
+        raise AssertionError("其他 rank 的 xtuner_ep_dispatcher.md DeepEP 示例校验失败。")
+
+    if parallel_info.global_rank == 0:
+        print("xtuner_ep_dispatcher.md EP=2 DP=4 DeepEP 示例校验通过。")
+
+
+def _assert_tensor_close(
+    parallel_info: ParallelInfo,
+    name: str,
+    actual: torch.Tensor,
+    expected: tuple[float, ...] | tuple[int, ...],
+    *,
+    atol: float = 0.0,
+    first_col: bool = False,
+) -> None:
+    # 文档只跟踪 activation 行来源，不展开 H 维；脚本用第一列承载 token 标识。
+    actual_1d = actual.detach()
+    if first_col and actual_1d.dim() > 1:
+        actual_1d = actual_1d[:, 0]
+    actual_1d = actual_1d.reshape(-1).to(torch.float32)
+    expected_tensor = torch.tensor(expected, dtype=torch.float32, device=actual.device)
+    try:
+        torch.testing.assert_close(actual_1d, expected_tensor, rtol=0.0, atol=atol)
+    except AssertionError as exc:
+        raise AssertionError(
+            f"global_rank={parallel_info.global_rank}, dp_rank={parallel_info.dp_rank}, "
+            f"ep_rank={parallel_info.ep_rank} 的 {name} 不符合 xtuner_ep_dispatcher.md DeepEP 示例："
+            f"actual={actual_1d.cpu().tolist()}, expected={expected_tensor.cpu().tolist()}"
+        ) from exc
+
+
+def _assert_list_equal(parallel_info: ParallelInfo, name: str, actual: list[int], expected: tuple[int, ...]) -> None:
+    if actual != list(expected):
+        raise AssertionError(
+            f"global_rank={parallel_info.global_rank}, dp_rank={parallel_info.dp_rank}, "
+            f"ep_rank={parallel_info.ep_rank} 的 {name} 不符合 xtuner_ep_dispatcher.md DeepEP 示例："
+            f"actual={actual}, expected={expected}"
+        )
+
+
+def _flatten(values: tuple[tuple[int, int], ...]) -> tuple[int, ...]:
+    return tuple(item for row in values for item in row)
+
+
+def _get_env_int(name: str, default: int) -> int:
+    value = os.getenv(name)
+    if value is None:
+        return default
+    return int(value)
+
+
+def _print_snapshots(parallel_info: ParallelInfo, snapshots: dict[str, Any]) -> None:
+    hidden_names = {
+        "input_hidden",
+        "pre_hidden",
+        "dispatch_hidden",
+        "post_hidden",
+        "experts_out",
+        "pre_combine_hidden",
+        "combine_hidden",
+        "post_combine_hidden",
+    }
+    for name, value in snapshots.items():
+        if isinstance(value, torch.Tensor):
+            tensor = value.detach()
+            if name in hidden_names and tensor.dim() > 1:
+                tensor = tensor[:, 0]
+            print(
+                f"[global_rank={parallel_info.global_rank} dp_rank={parallel_info.dp_rank} "
+                f"ep_rank={parallel_info.ep_rank}] {name}: {tensor.reshape(-1).cpu().tolist()}",
+                flush=True,
+            )
+        else:
+            print(
+                f"[global_rank={parallel_info.global_rank} dp_rank={parallel_info.dp_rank} "
+                f"ep_rank={parallel_info.ep_rank}] {name}: {value}",
+                flush=True,
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/CONTEXT.md b/CONTEXT.md
index e39ca6501..4a1aa1dc5 100644
--- a/CONTEXT.md
+++ b/CONTEXT.md
@@ -26,6 +26,10 @@ _Avoid_: replicated-token expert TP
 让 **Token-sliced Expert TP** 的 **TP AllGather** 属于 dispatcher dispatch 通信段，让 **TP ReduceScatterRowsSum** 属于 dispatcher combine 通信段，从而能被 Domino micro-batch 流水隐藏的 MoE expert TP 语义。
 _Avoid_: attention TP, dense MLP TP
 
+**Expert-side topK folding**:
+在拥有 routed expert 的 rank 上，使用收到的 topK weights 将同一 source token 的多个 expert output 加权合并成一行 partial output。
+_Avoid_: source-side DeepEP folding
+
 ## Relationships
 
 - **TP AllGather** 的反向通信是 **TP ReduceScatterRowsSum**。
@@ -36,9 +40,29 @@ _Avoid_: attention TP, dense MLP TP
 - **TP ReduceScatterRowsSum** 的实现策略应集中在一个共享核心函数中，避免 combine forward 和 TP AllGather backward 分叉。
 - **TP ReduceScatterRowsSum** 的输出 shape 严格由当前 TP rank 的 **TP rank row counts** 决定，允许 0 行，不引入 padding 或 capacity。
 - 当 `ep_size=1` 且 `expert_tp_size>1` 时，expert ownership 维度仍然存在，只是大小为 1；所有 routed experts 都属于这个唯一 EP rank。
-- 在 Naive routing + **Token-sliced Expert TP** 下，**TP rank row counts** 记录 source token rows；在 EP routing + **Token-sliced Expert TP** 下，**TP rank row counts** 记录 EP routing 后的 route-copy rows。
+- 在 Naive routing + **Token-sliced Expert TP** 下，**TP rank row counts** 记录 source token rows。
+- 在 All2All routing + **Token-sliced Expert TP** 下，**TP rank row counts** 记录 EP AllToAll 后的 route-copy rows。
+- 在 DeepEP routing + **Token-sliced Expert TP** 下，**TP rank row counts** 记录 DeepEP dispatch 后的 received source-token rows；local expert route-copy rows 由 DeepEP 的 received topK ids 展开得到。
 - **Token-sliced Expert TP** 的异步边界由 TP AllGather 和 **TP ReduceScatterRowsSum** 定义；这个边界不依赖 EP 是否开启。
-- 当前支持范围是 Naive routing + **Token-sliced Expert TP** 和 All2All routing + **Token-sliced Expert TP**；DeepEP routing + **Token-sliced Expert TP** 暂不作为目标语义。
+- 当前支持范围是 Naive routing、All2All routing、DeepEP routing 与 **Token-sliced Expert TP** 的组合。
+- DeepEP routing + **Token-sliced Expert TP** 保留 **Expert-side topK folding**：DeepEP dispatch 后 TP AllGather hidden、topK ids 和 topK weights；expert output 先按 gathered topK weights 折叠，再做 **TP ReduceScatterRowsSum** 和 DeepEP combine。
+- DeepEP routing + **Token-sliced Expert TP** 的 dispatch TP 段必须使用同一份 **TP rank row counts** 对齐 AllGather hidden、received topK ids 和 received topK weights。
+- DeepEP routing + **Token-sliced Expert TP** 中，received topK ids 是无梯度 row metadata；它参与 TP AllGather 只为保持与 hidden/topK weights 的行顺序一致。
+- DeepEP routing + **Token-sliced Expert TP** 的 TP AllGather 属于 dispatcher `dispatch` 阶段；`dispatch_postprocess` 只消费 gathered 数据并构造 local expert layout。
+- DeepEP routing + **Token-sliced Expert TP** 的 **TP ReduceScatterRowsSum** 属于 dispatcher `combine` 阶段；`combine_preprocess` 只做 **Expert-side topK folding**。
+- DeepEP routing + **Token-sliced Expert TP** 的 grouped GEMM `tokens_per_expert` 来自各 TP rank 的 DeepEP `num_recv_tokens_per_expert_list` 聚合求和；重新扫描 gathered topK ids 只适合作为校验。
+- DeepEP routing + **Token-sliced Expert TP** 中，DeepEP 原始 `num_recv_tokens_per_expert_list` 字段不随 ExpertTP 开启而改变；TP 聚合后的计数只作为 grouped GEMM 的 `tokens_per_expert`。
+- DeepEP routing + **Token-sliced Expert TP** 中，同一 EP rank 内每个 expert TP rank 都对完整 gathered expert input 运行自己的 expert weight shard，输出在 `combine` 阶段通过 **TP ReduceScatterRowsSum** 求和并切回本 TP rank token slice。
+- DeepEP routing + **Token-sliced Expert TP** 必须保留 `async_op=True` 语义；hidden 和 topK weights 的反向通信完成前，不能让上游 backward 消费对应梯度。
+- DeepEP routing + **Token-sliced Expert TP** 的 async backward 边界必须同时覆盖 hidden 分支和 topK weights 分支；`topk_weights.grad_fn` 需要等待 TP weights ReduceScatterRowsSum 与 DeepEP dispatch backward 完成后再继续上游 router backward。
+- DeepEP routing + **Token-sliced Expert TP** 必须支持 topK weights 有梯度和无梯度两种输入；有梯度路径是验证重点。
+- DeepEP routing + **Token-sliced Expert TP** 的 DeepEP `EventOverlap` 与 TP `torch.cuda.Event` 衔接只属于 `DeepEPDispatcher` 内部适配；共享 **Token-sliced Expert TP** helper 不依赖 DeepEP 类型。
+- `dispatcher="deepep"` 在 `expert_tp_size>1` 时仍表示 `DeepEPDispatcher`，由同一个 dispatcher 根据 `tp_group` 接入 **Token-sliced Expert TP**，不引入新的 dispatcher 名称。
+- DeepEP routing + **Token-sliced Expert TP** 的验证应覆盖 dispatcher 六阶段 public API 的真实 forward/backward 路径、模型级 MoE 接线路径和 Domino micro-batch async staging，而不是只验证内部 helper。
+- **Token-sliced Expert TP** 的 TP group 必须位于同一个 expert ownership 内；在 `(fsdp, ep, etp)` mesh 中，同一 TP group 的 ranks 共享相同 EP rank，只在 expert TP rank 上不同。
+- DeepEP routing + **Token-sliced Expert TP** 的首个支持目标是训练 forward/backward；`decoding=True` 仍不属于支持范围。
+- DeepEP routing + **Token-sliced Expert TP** 的首个支持目标只要求 BF16 训练通信路径；FP8 DeepEP 通信 dtype 不属于该目标。
+- `tp_group=None` 时，DeepEP routing 不启用 **Token-sliced Expert TP**，行为必须保持原有 DeepEP-only 语义。
 - **Domino-compatible ExpertTP** 只覆盖 MoE routed experts 的 **Token-sliced Expert TP** 通信隐藏，不表示 attention 或 dense MLP 的普通 TP。
 - 进入 routed experts 前，每个 expert TP rank 已经持有不重复的 source token slice；这些 slice 可以来自不同样本，也可以来自同一样本的不同序列片段。
 
@@ -65,6 +89,9 @@ _Avoid_: attention TP, dense MLP TP
 > **Dev:** "Naive routing + expert TP 的异步路径要不要和 EP routing + expert TP 使用同一套分段语义？"
 > **Domain expert:** "要。Naive routing 没有 EP AllToAll，但 **TP AllGather** 和 **TP ReduceScatterRowsSum** 仍然是 dispatcher 通信段，异步依赖边界应保持一致。"
 
+> **Dev:** "DeepEP + expert TP 的 **TP rank row counts** 是 route-copy 行数吗？"
+> **Domain expert:** "不是。DeepEP dispatch 收到的是 source-token rows；route-copy/local expert 展开发生在 `dispatch_postprocess`，所以 **TP rank row counts** 记录 received source-token rows。"
+
 ## Flagged ambiguities
 
 - "reduce scatter" 在本上下文中特指 **TP ReduceScatterRowsSum**；不是只做 scatter，也不是不带 SUM 的切分。
diff --git a/xtuner_ep_dispatcher.md b/xtuner_ep_dispatcher.md
index 72b6568f6..a176aab40 100644
--- a/xtuner_ep_dispatcher.md
+++ b/xtuner_ep_dispatcher.md
@@ -433,7 +433,99 @@ router_weights: [N, E]
 第二次 `post_dispatched["row_ids_map"] [M_recv]` 是 destination EP rank 上第二次 `permute` 产生的还原 map，
 语义相同（scatter，1D indices 无 topk 展开），只负责 expert 计算后恢复 source-block 顺序，方便反向 all2all。
 
-## torch.compile 与 dispatcher 边界
+## TP+EP 中 ReduceScatterRowsSum 与 padding/capacity 取舍
+
+`TorchAll2AllTPEPDispatcher` 在 EP dispatch 之后会额外做 TP AllGather，在 combine 阶段会做 TP
+ReduceScatterRowsSum。这里的 **TP ReduceScatterRowsSum** 是语义名：对同一 TP group 中完整 token 批的 hidden 做
+SUM 归约，并只保留当前 TP rank 负责的 token slice。它同时出现在两个方向：
+
+- combine forward：row-parallel expert output 先做 TP ReduceScatterRowsSum，再进入 EP combine all2all。
+- TP AllGather backward：AllGather 的反向也是 TP ReduceScatterRowsSum。
+
+TP+EP MoE routing 后，同一个 EP rank 上的不同 TP rank 不一定收到相同数量的 token。以 `tp_size=2` 为例：
+
+```text
+EP dispatch 后：
+  TP rank0 hidden: [3, H]
+  TP rank1 hidden: [5, H]
+
+TP rank row counts:
+  tp_rank_row_counts = [3, 5]
+
+TP AllGather 后每个 TP rank 都看到：
+  gathered hidden: [8, H] = rank0 rows [0:3] | rank1 rows [3:8]
+```
+
+expert 的 row-parallel down projection 后，两个 TP rank 都有 `[8, H]` 的 partial hidden。TP ReduceScatterRowsSum 需要
+对这两个 `[8, H]` 做 SUM，并按同一个 `tp_rank_row_counts` 切回：
+
+```text
+TP rank0 output: rows [0:3] -> [3, H]
+TP rank1 output: rows [3:8] -> [5, H]
+```
+
+因此当前设计选择是：**优先实现真正的变长 `reduce_scatter`，不引入 padding/capacity**。dispatcher 已经有
+`tp_rank_row_counts` 正好可以作为变长 reduce scatter 的 split 边界：
+
+```python
+input_tensor_list = list(torch.split(hidden.contiguous(), tp_rank_row_counts, dim=0))
+output = torch.empty_like(input_tensor_list[tp_rank])
+dist.reduce_scatter(output, input_tensor_list, op=dist.ReduceOp.SUM, group=tp_group)
+```
+
+当 `tp_rank_row_counts` 全部相等时，可以在共享核心函数内部走等长 fast path：
+
+```python
+dist.reduce_scatter_tensor(output, hidden.contiguous(), op=dist.ReduceOp.SUM, group=tp_group)
+```
+
+但这只是实现优化，不改变 dispatcher 对外的 `tp_rank_row_counts` 语义。真正的 ReduceScatterRowsSum 实现应集中在一个共享核心
+函数中，避免 combine forward 和 TP AllGather backward 分叉。
+
+### 为什么不先做 padding/capacity
+
+padding 和 capacity 带来的收益不同，需要分开看：
+
+- **padding 的收益** 是把一次变长 collective 包装成等长 collective。通信前把每个 TP rank 的真实 slice pad 到同一
+  长度，通信时就可以使用 `reduce_scatter_tensor` / `all_gather_into_tensor` 这类 tensor fast path。若 capacity
+  仍由本 step 的 `max(tp_rank_row_counts)` 动态决定，padding 只减少大块 hidden collective 的 variable-list
+  split 开销，不能消除 `tp_rank_row_counts` 的 CPU 同步。
+- **固定 capacity 的收益** 是让这个等长长度跨 step 稳定下来。只有 capacity 是配置值或静态上界时，shape 才稳定，
+  大块通信 shape 才能从本 step 的 Python split list 中解耦，后续也才更容易做 CUDA graph、buffer 复用或通信
+  buffer 预分配。
+- **对 Domino 的影响** 主要来自 host CPU split metadata 同步。只做动态 padding 时，host 仍要拿到
+  `tp_rank_row_counts` 来决定 pad/unpad 边界和本步 capacity，因此这个同步点仍然存在；固定 capacity 才可能减少
+  运行时 shape 决策，并把大块通信从 split-list 发起路径中移出。这和前面 EP All2All 的 host metadata 同步问题
+  类似：host 等 split list 时，已经 enqueue 到 GPU 的另一个 micro batch 计算仍可继续，但 host 不能继续
+  enqueue 后续本地算子和通信；如果等待时间超过可覆盖窗口，会压缩 Domino 的真实 overlap。
+
+因此，如果只是每步动态取 `capacity = max(tp_rank_row_counts)`，它仍然需要 `tp_rank_row_counts` 的 CPU 同步，只能减少
+variable collective 的 split-list 开销，不能获得固定 shape / CUDA graph，也不能消除 `tp_rank_row_counts` 对 Domino
+host enqueue 的影响。
+
+但它会把问题从通信层扩散到 layout 层。至少有两种做法：
+
+1. **通信内部 padding，通信后立刻 unpad。**
+
+   例如 `tp_rank_row_counts` 是 `[3, 5]`，capacity 取 `5`。AllGather 前把 rank0 的 `[3, H]` pad 到 `[5, H]`，
+   rank1 保持 `[5, H]`；等长 AllGather 得到 `[10, H]` 后再按真实 sizes compact 回 `[8, H]`。ReduceScatter
+   则需要先按 `[3, 5]` 切分、分别 pad 到 `[5, H]`，concat 成 `[10, H]` 后走 `reduce_scatter_tensor`，
+   最后再 unpad 成当前 rank 的真实 `[3, H]` 或 `[5, H]`。
+
+   这个方案不改变 expert 看到的 token 数，但增加 pad/unpad copy，并且仍然需要 `tp_rank_row_counts`。收益要靠 benchmark
+   证明。
+
+2. **端到端 capacity，让 padding token 进入 expert layout。**
+
+   这种方案会让 `[tp_size * capacity, H]` 直接进入 `dispatch_postprocess` 和 grouped GEMM。它需要定义 padding
+   token 的 expert 归属、`tokens_per_expert` 是否包含 padding、grouped GEMM 是否计算 padding、combine 如何剔除
+   padding，以及 `row_ids_map` / `topk_weights` 如何保证 padding 不影响真实 token。
+
+   这会把改动扩散到 routing、expert layout、postprocess/combine，不适合作为替换 `all_reduce + slice` 的第一步。
+
+因此当前阶段的目标是局部替换：用真正的 TP ReduceScatterRowsSum 取代 `all_reduce + slice`，输出 shape 严格按照
+`tp_rank_row_counts[tp_rank]` 分配，允许 0 行，不做 padding/capacity。
+# torch.compile 与 dispatcher 边界
 
 `FSDPConfig.torch_compile=True` 目前只是一个兼容入口，真正决定 compile 行为的是
 `XTunerBaseModelConfig.compile_cfg`：
@@ -471,6 +563,7 @@ router_weights: [N, E]
 它不能把 dispatcher 的 host 等待变成 GPU-only 异步，也不能改变 2.1 和 DeepEP “Host metadata 同步”小节里的重叠约束。
 如果 host metadata 等待超过另一个 micro batch 能覆盖的计算窗口，真实 overlap 仍会下降。
 
+# DeepEPDispatcher
 ## DeepEPDispatcher: DeepEP Buffer dispatch/combine 原理
 
 `DeepEPDispatcher` 仍然暴露和其他 dispatcher 一样的六阶段接口，但它把 EP all2all 的 routing layout、通信 handle
@@ -589,130 +682,248 @@ DeepEP 的反向复用相反方向的通信原语：
 这解释了为什么 DeepEP dispatch 是一个 composite autograd op：它的 forward 同时产生 `recv_x` 和
 `recv_topk_weights`，backward 也同时返回 `x` 和 `topk_weights` 的梯度。
 
-### Host metadata 同步
+## DeepEPDispatcher 前向示例
 
-DeepEP 不像 `TorchAll2AllDispatcher` 那样在 XTuner 代码里显式执行：
+继续使用前面 All2All 示例里的配置和 routing：
 
-```python
-to(device=torch.device("cpu")).tolist()
+```text
+EP = 2
+E_local = 3
+E = 6
+K = 2
+每个 EP rank 本地 N = 4 个 token
+
+ep0 owns global expert 0,1,2
+ep1 owns global expert 3,4,5
+
+ep0 source tokens: A0 A1 A2 A3
+ep1 source tokens: B0 B1 B2 B3
 ```
 
-但它仍然存在 host 可见的 metadata 准备点。DeepEP 的 legacy Buffer API 文档和 XTuner 包装都注明：dispatch 内部不知道
-当前 rank 会收到多少 token，因此 CPU 会等待 GPU signal，拿到 receive count 后才能继续。XTuner 代码中的表现是
-`Buffer.dispatch` 返回 Python list：
+routing 仍然是：
 
-```python
-num_recv_tokens_per_expert_list, handle, event
+```text
+ep0 topk_ids:
+A0 -> [0, 4]
+A1 -> [3, 1]
+A2 -> [2, 5]
+A3 -> [4, 0]
+
+ep1 topk_ids:
+B0 -> [1, 3]
+B1 -> [4, 2]
+B2 -> [5, 0]
+B3 -> [3, 1]
 ```
 
-`dispatch_postprocess` 必须用这个 list 计算 `num_out_tokens` 和 `tokens_per_expert`。因此 DeepEP 也不是完全无 host
-同步；只是同步被 DeepEP 的 layout/dispatch handle 机制封装在库内部，不是 PyTorch split-size list 的
-`.tolist()` 同步。
+为了把 weighted combine 写成具体数字，取验证脚本里的 `topk_weights`：
 
-对 Domino EP 来说，两者的影响边界一致：
+```text
+ep0 weights:
+A0 -> [0.25, 0.75]
+A1 -> [0.40, 0.60]
+A2 -> [0.70, 0.30]
+A3 -> [0.80, 0.20]
 
-- 已经 enqueue 到 GPU 的另一个 micro batch 计算不会被 host 同步打断。
-- host 等 metadata 时无法继续 enqueue 后续本地算子和通信。
-- 如果 metadata 等待短于可覆盖的另一个 micro batch 计算，重叠效果基本保留。
-- 如果 metadata 等待更长，`xtuner_ep_domino.md` 7.3 中的理想时间线会被压缩，真实重叠比例下降。
+ep1 weights:
+B0 -> [0.20, 0.80]
+B1 -> [0.50, 0.50]
+B2 -> [0.90, 0.10]
+B3 -> [0.35, 0.65]
+```
 
-### 当前支持边界
+### 1. `dispatch_preprocess`: 不做本地 route-copy 展开
 
-当前 `build_dispatcher(dispatcher="deepep", tp_group=...)` 会直接构造 `DeepEPDispatcher`，`tp_group` 没有接入
-DeepEP dispatcher。也就是说，XTuner 当前的 DeepEP 路径是 EP dispatcher，不包含 `TorchAll2AllTPEPDispatcher`
-那套 TP AllGather / TP ReduceScatterRowsSum 通信段。DeepEP + ExpertTP 如果要成为 Domino-compatible ExpertTP，需要
-额外设计 DeepEP dispatch 后的 TP AllGather、combine 前的 TP ReduceScatterRowsSum，以及相应的 `topk_weights`
-event 语义；这部分见 `xtuner_etp.md`。
+DeepEP 不像 `TorchAll2AllDispatcher` 那样先在 source rank 本地把 token 展开成 `[N*K, H]` 并按 global expert 排序。
+`dispatch_preprocess` 只保留原始 token，并把 `topk_ids` 转成 `int64`：
 
-## TP+EP 中 ReduceScatterRowsSum 与 padding/capacity 取舍
+```text
+pre_dispatched["hidden_states"]: [N, H] = [4, H]
+pre_dispatched["topk_ids"]:      [N, K] = [4, 2]
+```
 
-`TorchAll2AllTPEPDispatcher` 在 EP dispatch 之后会额外做 TP AllGather，在 combine 阶段会做 TP
-ReduceScatterRowsSum。这里的 **TP ReduceScatterRowsSum** 是语义名：对同一 TP group 中完整 token 批的 hidden 做
-SUM 归约，并只保留当前 TP rank 负责的 token slice。它同时出现在两个方向：
+### 2. `dispatch`: 每个目标 EP rank 收一份 source token
 
-- combine forward：row-parallel expert output 先做 TP ReduceScatterRowsSum，再进入 EP combine all2all。
-- TP AllGather backward：AllGather 的反向也是 TP ReduceScatterRowsSum。
+DeepEP 的 layout 先判断每个 token 是否需要发送到某个 EP rank：只要 token 的任意 topK expert 在该 rank，本 token 就向该
+rank 发送一行 hidden。也就是说，通信粒度是 **token 到 rank**，不是一开始就按 expert 展开成 route-copy。
 
-TP+EP MoE routing 后，同一个 EP rank 上的不同 TP rank 不一定收到相同数量的 token。以 `tp_size=2` 为例：
+本例中每个 token 都正好有一个 expert 在 `ep0`、一个 expert 在 `ep1`，所以两个目标 rank 都收到 8 行 source token：
 
 ```text
-EP dispatch 后：
-  TP rank0 hidden: [3, H]
-  TP rank1 hidden: [5, H]
+dispatched row: 0  1  2  3 | 4  5  6  7
+source token:   A0 A1 A2 A3| B0 B1 B2 B3
+```
 
-TP rank row counts:
-  tp_rank_row_counts = [3, 5]
+DeepEP 同时把 global expert id 转成当前 receiver rank 的 local expert id；不属于当前 rank 的 topK slot 写成 `-1`，
+对应 weight 写成 `0`。
 
-TP AllGather 后每个 TP rank 都看到：
-  gathered hidden: [8, H] = rank0 rows [0:3] | rank1 rows [3:8]
+`ep0` 收到：
+
+```text
+recv_topk_idx row:  0      1      2      3    | 4      5      6      7
+source token:       A0     A1     A2     A3   | B0     B1     B2     B3
+recv_topk_idx:      [0,-1] [-1,1] [2,-1] [-1,0] [1,-1] [-1,2] [-1,0] [-1,1]
+recv_topk_weights:  [.25,0] [0,.60] [.70,0] [0,.20] [.20,0] [0,.50] [0,.10] [0,.65]
 ```
 
-expert 的 row-parallel down projection 后，两个 TP rank 都有 `[8, H]` 的 partial hidden。TP ReduceScatterRowsSum 需要
-对这两个 `[8, H]` 做 SUM，并按同一个 `tp_rank_row_counts` 切回：
+`ep1` 收到：
 
 ```text
-TP rank0 output: rows [0:3] -> [3, H]
-TP rank1 output: rows [3:8] -> [5, H]
+recv_topk_idx row:  0      1      2      3    | 4      5      6      7
+source token:       A0     A1     A2     A3   | B0     B1     B2     B3
+recv_topk_idx:      [-1,1] [0,-1] [-1,2] [1,-1] [-1,0] [1,-1] [2,-1] [0,-1]
+recv_topk_weights:  [0,.75] [.40,0] [0,.30] [.80,0] [0,.80] [.50,0] [.90,0] [.35,0]
 ```
 
-因此当前设计选择是：**优先实现真正的变长 `reduce_scatter`，不引入 padding/capacity**。dispatcher 已经有
-`tp_rank_row_counts` 正好可以作为变长 reduce scatter 的 split 边界：
+两边的 local expert token 数都是：
 
-```python
-input_tensor_list = list(torch.split(hidden.contiguous(), tp_rank_row_counts, dim=0))
-output = torch.empty_like(input_tensor_list[tp_rank])
-dist.reduce_scatter(output, input_tensor_list, op=dist.ReduceOp.SUM, group=tp_group)
+```text
+num_recv_tokens_per_expert_list = [3, 3, 2]
 ```
 
-当 `tp_rank_row_counts` 全部相等时，可以在共享核心函数内部走等长 fast path：
+### 3. `dispatch_postprocess`: receiver rank 内展开并按 local expert 分组
+
+`dispatch_postprocess` 对 `recv_topk_idx` 做本地 `permute`。这一步才真正把收到的 token 展开成 local expert 的
+route-copy，并丢掉 `-1` slot。
+
+对 `ep0`：
+
+```text
+post row:        0  1  2 | 3  4  5 | 6  7
+token copy:      A0 A3 B2| A1 B0 B3| A2 B1
+local expert id: 0  0  0 | 1  1  1 | 2  2
+row_ids_map:     [0,-1,6,-1,4,-1,-1,-1,-1,3,-1,1,-1,7,2,5]
+```
+
+对 `ep1`：
+
+```text
+post row:        0  1  2 | 3  4  5 | 6  7
+token copy:      A1 B0 B3| A0 A3 B1| A2 B2
+local expert id: 0  0  0 | 1  1  1 | 2  2
+row_ids_map:     [-1,0,-1,4,-1,5,7,2,3,-1,6,-1,1,-1,-1,-1]
+```
+
+这里的 `row_ids_map` 长度是 `M_recv*K`，因为它对应的是带 `-1` 的 `recv_topk_idx` flat 空间；`-1` slot 在
+`row_ids_map` 里也保持为 `-1`。这和 All2All 例子中 destination rank 第二次 `permute` 的 `[M_recv]` map 不同。
+
+### 4. local experts grouped GEMM
+
+假设为了便于观察，每个 expert 输出第一列为：
+
+```text
+out(token, global_expert_id) = token_value + global_expert_id * 100
+```
+
+那么 `ep0` grouped GEMM 输出：
+
+```text
+post row:        0  1  2 | 3   4   5  | 6   7
+token copy:      A0 A3 B2| A1  B0  B3 | A2  B1
+global expert:   0  0  0 | 1   1   1  | 2   2
+experts_out:     10 13 22| 111 120 123| 212 221
+```
+
+`ep1` grouped GEMM 输出：
+
+```text
+post row:        0   1   2  | 3   4   5  | 6   7
+token copy:      A1  B0  B3 | A0  A3  B1 | A2  B2
+global expert:   3   3   3  | 4   4   4  | 5   5
+experts_out:     311 320 323| 410 413 421| 512 522
+```
+
+### 5. `combine_preprocess`: expert rank 上先做 topK 加权折叠
+
+DeepEP 已经把 `topk_weights` 发送到了 expert rank，所以 `combine_preprocess` 会在 receiver rank 本地执行：
 
 ```python
-dist.reduce_scatter_tensor(output, hidden.contiguous(), op=dist.ReduceOp.SUM, group=tp_group)
+hidden_states = unpermute(experts_out, row_ids_map, probs=recv_topk_weights)
 ```
 
-但这只是实现优化，不改变 dispatcher 对外的 `tp_rank_row_counts` 语义。真正的 ReduceScatterRowsSum 实现应集中在一个共享核心
-函数中，避免 combine forward 和 TP AllGather backward 分叉。
+输出回到 `dispatch` 后的 source-token 顺序 `[A0 A1 A2 A3 | B0 B1 B2 B3]`，但每行已经只包含当前 EP rank 负责的
+expert 加权结果。
 
-### 为什么不先做 padding/capacity
+`ep0`：
 
-padding 和 capacity 带来的收益不同，需要分开看：
+```text
+pre_combined row: 0    1     2     3   | 4   5     6   7
+source token:     A0   A1    A2    A3  | B0  B1    B2  B3
+weighted output:  2.5  66.6  148.4 2.6 | 24  110.5 2.2 79.95
+```
 
-- **padding 的收益** 是把一次变长 collective 包装成等长 collective。通信前把每个 TP rank 的真实 slice pad 到同一
-  长度，通信时就可以使用 `reduce_scatter_tensor` / `all_gather_into_tensor` 这类 tensor fast path。若 capacity
-  仍由本 step 的 `max(tp_rank_row_counts)` 动态决定，padding 只减少大块 hidden collective 的 variable-list
-  split 开销，不能消除 `tp_rank_row_counts` 的 CPU 同步。
-- **固定 capacity 的收益** 是让这个等长长度跨 step 稳定下来。只有 capacity 是配置值或静态上界时，shape 才稳定，
-  大块通信 shape 才能从本 step 的 Python split list 中解耦，后续也才更容易做 CUDA graph、buffer 复用或通信
-  buffer 预分配。
-- **对 Domino 的影响** 主要来自 host CPU split metadata 同步。只做动态 padding 时，host 仍要拿到
-  `tp_rank_row_counts` 来决定 pad/unpad 边界和本步 capacity，因此这个同步点仍然存在；固定 capacity 才可能减少
-  运行时 shape 决策，并把大块通信从 split-list 发起路径中移出。这和前面 EP All2All 的 host metadata 同步问题
-  类似：host 等 split list 时，已经 enqueue 到 GPU 的另一个 micro batch 计算仍可继续，但 host 不能继续
-  enqueue 后续本地算子和通信；如果等待时间超过可覆盖窗口，会压缩 Domino 的真实 overlap。
+`ep1`：
 
-因此，如果只是每步动态取 `capacity = max(tp_rank_row_counts)`，它仍然需要 `tp_rank_row_counts` 的 CPU 同步，只能减少
-variable collective 的 split-list 开销，不能获得固定 shape / CUDA graph，也不能消除 `tp_rank_row_counts` 对 Domino
-host enqueue 的影响。
+```text
+pre_combined row: 0     1      2     3    | 4   5     6     7
+source token:     A0    A1     A2    A3   | B0  B1    B2    B3
+weighted output:  307.5 124.4  153.6 330.4| 256 210.5 469.8 113.05
+```
 
-但它会把问题从通信层扩散到 layout 层。至少有两种做法：
+### 6. `combine`: 使用 DeepEP handle 送回 source rank 并 SUM
 
-1. **通信内部 padding，通信后立刻 unpad。**
+DeepEP combine 复用 dispatch 返回的 `handle`，把这些已经加权的 partial output 送回原 source rank，并对同一个 source
+token 来自不同 EP rank 的 partial output 做 SUM。
 
-   例如 `tp_rank_row_counts` 是 `[3, 5]`，capacity 取 `5`。AllGather 前把 rank0 的 `[3, H]` pad 到 `[5, H]`，
-   rank1 保持 `[5, H]`；等长 AllGather 得到 `[10, H]` 后再按真实 sizes compact 回 `[8, H]`。ReduceScatter
-   则需要先按 `[3, 5]` 切分、分别 pad 到 `[5, H]`，concat 成 `[10, H]` 后走 `reduce_scatter_tensor`，
-   最后再 unpad 成当前 rank 的真实 `[3, H]` 或 `[5, H]`。
+source `ep0` 收回：
 
-   这个方案不改变 expert 看到的 token 数，但增加 pad/unpad copy，并且仍然需要 `tp_rank_row_counts`。收益要靠 benchmark
-   证明。
+```text
+A0 final = 2.5   + 307.5 = 310
+A1 final = 66.6  + 124.4 = 191
+A2 final = 148.4 + 153.6 = 302
+A3 final = 2.6   + 330.4 = 333
+```
 
-2. **端到端 capacity，让 padding token 进入 expert layout。**
+source `ep1` 收回：
 
-   这种方案会让 `[tp_size * capacity, H]` 直接进入 `dispatch_postprocess` 和 grouped GEMM。它需要定义 padding
-   token 的 expert 归属、`tokens_per_expert` 是否包含 padding、grouped GEMM 是否计算 padding、combine 如何剔除
-   padding，以及 `row_ids_map` / `topk_weights` 如何保证 padding 不影响真实 token。
+```text
+B0 final = 24    + 256    = 280
+B1 final = 110.5 + 210.5  = 321
+B2 final = 2.2   + 469.8  = 472
+B3 final = 79.95 + 113.05 = 193
+```
 
-   这会把改动扩散到 routing、expert layout、postprocess/combine，不适合作为替换 `all_reduce + slice` 的第一步。
+因此 DeepEP 的：
 
-因此当前阶段的目标是局部替换：用真正的 TP ReduceScatterRowsSum 取代 `all_reduce + slice`，输出 shape 严格按照
-`tp_rank_row_counts[tp_rank]` 分配，允许 0 行，不做 padding/capacity。
+```text
+combined["hidden_states"]: [N, H] = [4, H]
+post_combined["hidden_states"]: [N, H] = [4, H]
+```
+
+`combine_postprocess` 不再像 All2All 那样使用 source rank 的 `row_id_map` 和 `topk_weights` 做本地 topK 加权合并；DeepEP 的
+topK 加权已经在 `combine_preprocess` 完成，`combine_postprocess` 主要负责 event 等待和返回 hidden。
+
+## Host metadata 同步
+
+DeepEP 不像 `TorchAll2AllDispatcher` 那样在 XTuner 代码里显式执行：
+
+```python
+to(device=torch.device("cpu")).tolist()
+```
+
+但它仍然存在 host 可见的 metadata 准备点。DeepEP 的 legacy Buffer API 文档和 XTuner 包装都注明：dispatch 内部不知道
+当前 rank 会收到多少 token，因此 CPU 会等待 GPU signal，拿到 receive count 后才能继续。XTuner 代码中的表现是
+`Buffer.dispatch` 返回 Python list：
+
+```python
+num_recv_tokens_per_expert_list, handle, event
+```
+
+`dispatch_postprocess` 必须用这个 list 计算 `num_out_tokens` 和 `tokens_per_expert`。因此 DeepEP 也不是完全无 host
+同步；只是同步被 DeepEP 的 layout/dispatch handle 机制封装在库内部，不是 PyTorch split-size list 的
+`.tolist()` 同步。
+
+对 Domino EP 来说，两者的影响边界一致：
+
+- 已经 enqueue 到 GPU 的另一个 micro batch 计算不会被 host 同步打断。
+- host 等 metadata 时无法继续 enqueue 后续本地算子和通信。
+- 如果 metadata 等待短于可覆盖的另一个 micro batch 计算，重叠效果基本保留。
+- 如果 metadata 等待更长，`xtuner_ep_domino.md` 7.3 中的理想时间线会被压缩，真实重叠比例下降。
+
+## 当前支持边界
+
+当前 `build_dispatcher(dispatcher="deepep", tp_group=...)` 会直接构造 `DeepEPDispatcher`，`tp_group` 没有接入
+DeepEP dispatcher。也就是说，XTuner 当前的 DeepEP 路径是 EP dispatcher，不包含 `TorchAll2AllTPEPDispatcher`
+那套 TP AllGather / TP ReduceScatterRowsSum 通信段。DeepEP + ExpertTP 如果要成为 Domino-compatible ExpertTP，需要
+额外设计 DeepEP dispatch 后的 TP AllGather、combine 前的 TP ReduceScatterRowsSum，以及相应的 `topk_weights`
+event 语义；这部分见 `xtuner_etp.md`。

From bf7af7ab79b087b8aa590f540cf62a43f2ae4b2d Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 13:30:33 +0000
Subject: [PATCH 19/25] Add sync DeepEP ExpertTP dispatcher path

---
 .../dispatcher/test_deepep_expert_tp.py       | 182 ++++++++++++++++++
 xtuner/v1/module/dispatcher/__init__.py       |   1 +
 xtuner/v1/module/dispatcher/deepep.py         |  64 +++++-
 3 files changed, 239 insertions(+), 8 deletions(-)
 create mode 100644 tests/module/dispatcher/test_deepep_expert_tp.py

diff --git a/tests/module/dispatcher/test_deepep_expert_tp.py b/tests/module/dispatcher/test_deepep_expert_tp.py
new file mode 100644
index 000000000..8839a122d
--- /dev/null
+++ b/tests/module/dispatcher/test_deepep_expert_tp.py
@@ -0,0 +1,182 @@
+import os
+import unittest
+
+import torch
+import torch.distributed as dist
+from torch.testing._comparison import default_tolerances
+
+from xtuner._testing import DeterministicDDPTestCase
+from xtuner.v1.module.dispatcher import build_dispatcher
+from xtuner.v1.module.dispatcher.deepep import DeepEPDispatcher
+
+
+BF16_RTOL, BF16_ATOL = default_tolerances(torch.bfloat16)
+FLOAT32_RTOL, FLOAT32_ATOL = default_tolerances(torch.float32)
+
+
+def _source_payload(rank: int, device: torch.device) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    rows = rank + 2
+    hidden_size = 128
+    token_markers = torch.arange(rows, device=device, dtype=torch.float32) + rank * 10
+    hidden = token_markers.unsqueeze(1) + torch.arange(hidden_size, device=device, dtype=torch.float32) / 100
+    topk_ids = torch.tensor([0, 1, 2, 3], device=device, dtype=torch.int64).expand(rows, -1).contiguous()
+    slot_offsets = torch.tensor([0.1, 0.2, 0.3, 0.4], device=device, dtype=torch.float32)
+    topk_weights = token_markers.unsqueeze(1) / 1000 + slot_offsets
+    return hidden.to(torch.bfloat16), topk_ids, topk_weights
+
+
+def _build_ep_tp_groups(ep_size: int, tp_size: int, backend: str = "nccl"):
+    ep_groups = [
+        dist.new_group([ep_rank * tp_size + tp_rank for ep_rank in range(ep_size)], backend=backend)
+        for tp_rank in range(tp_size)
+    ]
+    tp_groups = [
+        dist.new_group([ep_rank * tp_size + tp_rank for tp_rank in range(tp_size)], backend=backend)
+        for ep_rank in range(ep_size)
+    ]
+    return ep_groups, tp_groups
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "CUDA/NCCL is required for real DeepEP ExpertTP validation.")
+class TestDeepEPExpertTPDispatcher(DeterministicDDPTestCase):
+    def test_sync_path_uses_deepep_received_source_rows_for_expert_tp(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+
+        ep_size = 2
+        tp_size = 2
+        ep_rank = rank // tp_size
+        tp_rank = rank % tp_size
+        ep_groups, tp_groups = _build_ep_tp_groups(ep_size, tp_size)
+        ep_group = ep_groups[tp_rank]
+        tp_group = tp_groups[ep_rank]
+
+        dispatcher = build_dispatcher(
+            dispatcher="deepep",
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=tp_group,
+        )
+        assert isinstance(dispatcher, DeepEPDispatcher)
+
+        local_hidden, local_topk_ids, local_topk_weights = _source_payload(rank, device)
+        hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+
+        pre_dispatched = dispatcher.dispatch_preprocess(
+            hidden_states=hidden_leaf,
+            topk_ids=local_topk_ids,
+        )
+        dispatched = dispatcher.dispatch(
+            pre_dispatched=pre_dispatched,
+            topk_weights=topk_weights_leaf,
+            decoding=False,
+        )
+
+        # 中文注释：DeepEP + ExpertTP 的 TP row counts 描述 DeepEP dispatch
+        # 收到的 source-token rows，不是 topK 展开后的 route-copy rows。
+        expected_tp_rank_row_counts = [
+            sum(ep * tp_size + expected_tp_rank + 2 for ep in range(ep_size))
+            for expected_tp_rank in range(tp_size)
+        ]
+        assert dispatched["tp_rank_row_counts"] == expected_tp_rank_row_counts
+        assert dispatched["hidden_states"].shape[0] == sum(expected_tp_rank_row_counts)
+        assert dispatched["topk_ids"].shape[0] == sum(expected_tp_rank_row_counts)
+        assert dispatched["topk_weights"].shape[0] == sum(expected_tp_rank_row_counts)
+
+        token_markers = dispatched["hidden_states"][:, 0].float()
+        expected_gathered_weights = token_markers.unsqueeze(1) / 1000 + torch.tensor(
+            [0.1, 0.2, 0.3, 0.4],
+            device=device,
+            dtype=torch.float32,
+        )
+        valid_topk_slots = dispatched["topk_ids"] >= 0
+        torch.testing.assert_close(
+            dispatched["topk_weights"][valid_topk_slots],
+            expected_gathered_weights[valid_topk_slots],
+        )
+
+        post_dispatched = dispatcher.dispatch_postprocess(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+        )
+
+        raw_counts_by_tp_rank: list[list[int] | None] = [None for _ in range(tp_size)]
+        dist.all_gather_object(raw_counts_by_tp_rank, dispatched["num_recv_tokens_per_expert_list"], group=tp_group)
+        expected_tokens_per_expert = torch.tensor(
+            raw_counts_by_tp_rank,
+            dtype=torch.long,
+            device=device,
+        ).sum(dim=0)
+        torch.testing.assert_close(post_dispatched["tokens_per_expert"], expected_tokens_per_expert)
+        assert dispatched["num_recv_tokens_per_expert_list"] == raw_counts_by_tp_rank[tp_rank]
+        assert int(post_dispatched["tokens_per_expert"].sum().item()) > sum(dispatched["tp_rank_row_counts"])
+
+        # 中文注释：dispatcher 测试不模拟真实 row-parallel expert 权重；
+        # 每个 ExpertTP rank 产出 1/tp_size partial，combine 的 ReduceScatterRowsSum
+        # 应恢复完整 expert output 后再交给 DeepEP combine。
+        expert_output = post_dispatched["hidden_states"] / tp_size
+        pre_combined = dispatcher.combine_preprocess(
+            hidden_states=expert_output,
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+        )
+        assert pre_combined["hidden_states"].shape[0] == sum(expected_tp_rank_row_counts)
+
+        combined = dispatcher.combine(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            pre_combined=pre_combined,
+            decoding=False,
+        )
+        assert combined["hidden_states"].shape == local_hidden.shape
+
+        post_combined = dispatcher.combine_postprocess(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            pre_combined=pre_combined,
+            combined=combined,
+        )
+
+        expected_output = (
+            hidden_leaf.detach().float() * topk_weights_leaf.detach().sum(dim=1, keepdim=True)
+        ).to(post_combined["hidden_states"].dtype)
+        torch.testing.assert_close(
+            post_combined["hidden_states"],
+            expected_output,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        post_combined["hidden_states"].float().sum().backward()
+        assert hidden_leaf.grad is not None
+        assert topk_weights_leaf.grad is not None
+        expected_hidden_grad = topk_weights_leaf.detach().sum(dim=1, keepdim=True).expand_as(hidden_leaf)
+        expected_hidden_grad = expected_hidden_grad.to(hidden_leaf.grad.dtype)
+        expected_topk_grad = hidden_leaf.detach().float().sum(dim=1, keepdim=True).expand_as(topk_weights_leaf)
+        torch.testing.assert_close(
+            hidden_leaf.grad,
+            expected_hidden_grad,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        torch.testing.assert_close(
+            topk_weights_leaf.grad,
+            expected_topk_grad,
+            atol=FLOAT32_ATOL,
+            rtol=FLOAT32_RTOL,
+        )
+
+        dist.barrier()
+        for group in ep_groups + tp_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
+    @property
+    def world_size(self) -> int:
+        return int(os.getenv("XTUNER_TEST_WORLD_SIZE", "4"))
diff --git a/xtuner/v1/module/dispatcher/__init__.py b/xtuner/v1/module/dispatcher/__init__.py
index f763be549..5bd340260 100644
--- a/xtuner/v1/module/dispatcher/__init__.py
+++ b/xtuner/v1/module/dispatcher/__init__.py
@@ -59,6 +59,7 @@ def build_dispatcher(
         return DeepEPDispatcher(
             n_routed_experts=n_routed_experts,
             process_group=ep_group,
+            tp_group=tp_group,
             training_dtype=training_dtype,
             generate_dtype=generate_dtype,
         )  # type: ignore
diff --git a/xtuner/v1/module/dispatcher/deepep.py b/xtuner/v1/module/dispatcher/deepep.py
index 00c769701..aa85efbdf 100644
--- a/xtuner/v1/module/dispatcher/deepep.py
+++ b/xtuner/v1/module/dispatcher/deepep.py
@@ -26,6 +26,7 @@
     PreCombineResult,
     PreDispatchResult,
 )
+from .expert_tp import ExpertTP
 
 
 if get_device() == "npu":
@@ -50,6 +51,8 @@ class DeepEPDispatchResult(DispatchResult):
     handle: DeepEPHandle
     topk_ids: torch.Tensor
     num_recv_tokens_per_expert_list: list[int]
+    num_recv_tokens_per_expert_group: torch.Tensor
+    tp_rank_row_counts: list[int]
     forward_finished_event: EventOverlap | None
 
 
@@ -258,6 +261,7 @@ def __init__(
         *,
         n_routed_experts: int,
         process_group: torch.distributed.ProcessGroup,
+        tp_group: torch.distributed.ProcessGroup | None = None,
         training_dtype: Literal["fp8", "bf16"] = "bf16",
         generate_dtype: Literal["fp8", "bf16"] = "bf16",
     ):
@@ -273,6 +277,7 @@ def __init__(
             "Process group must be provided for `DeepEPDispatcher`. "
             "If you are training a MoE model, it means that `expert parallel` is not enabled in the config."
         )
+        self._expert_tp = ExpertTP(tp_group) if tp_group is not None and tp_group.size() > 1 else None
 
     @override
     def dispatch_preprocess(
@@ -313,6 +318,9 @@ def dispatch(
         async_op: bool = False,
         decoding: bool = False,
     ) -> DeepEPDispatchResult:
+        if async_op and self._expert_tp is not None:
+            raise NotImplementedError("DeepEP + ExpertTP async dispatcher path is tracked separately.")
+
         (
             dispatched_hidden_states,
             dispatched_topk_idx,
@@ -336,12 +344,39 @@ def dispatch(
         else:
             forward_finished_event = event
 
+        tp_rank_row_counts = [cast(HiddenStates, dispatched_hidden_states).shape[0]]
+        num_recv_tokens_per_expert = torch.tensor(
+            num_recv_tokens_per_expert_list,
+            dtype=torch.long,
+            device=dispatched_topk_weights.device,
+        )
+        num_recv_tokens_per_expert_group = num_recv_tokens_per_expert.unsqueeze(0)
+        if self._expert_tp is not None:
+            # 中文注释：DeepEP dispatch 后的 hidden/topK 仍处于 received source-token row 空间；
+            # 这里的 TP rank row counts 记录 source-token rows，不记录 topK 展开后的 route-copy rows。
+            dispatched_hidden_states = cast(HiddenStates, dispatched_hidden_states)
+            tp_rank_row_counts = self._expert_tp.gather_tp_rank_row_counts(dispatched_hidden_states)
+            dispatched_hidden_states, _ = self._expert_tp.all_gather_rows(
+                dispatched_hidden_states,
+                tp_rank_row_counts,
+            )
+            dispatched_topk_idx = self._expert_tp.all_gather_row_metadata(dispatched_topk_idx, tp_rank_row_counts)
+            dispatched_topk_weights, _ = self._expert_tp.all_gather_rows(
+                dispatched_topk_weights,
+                tp_rank_row_counts,
+            )
+            num_recv_tokens_per_expert_group = self._expert_tp.all_gather_per_rank_metadata(
+                num_recv_tokens_per_expert,
+            )
+
         ret = DeepEPDispatchResult(
             hidden_states=cast(HiddenStates, dispatched_hidden_states),
             topk_weights=dispatched_topk_weights,
             topk_ids=dispatched_topk_idx,
             handle=dispatch_handle,
             num_recv_tokens_per_expert_list=num_recv_tokens_per_expert_list,
+            num_recv_tokens_per_expert_group=num_recv_tokens_per_expert_group,
+            tp_rank_row_counts=tp_rank_row_counts,
             forward_finished_event=forward_finished_event,
         )
         return ret
@@ -359,8 +394,17 @@ def dispatch_postprocess(
             assert dispatched["forward_finished_event"] is not None, "Please use `async_op=True` for dispatch!"
             dispatched["forward_finished_event"].current_stream_wait()
 
-        num_recv_tokens_per_expert_list = dispatched["num_recv_tokens_per_expert_list"]
-        num_out_tokens = sum(dispatched["num_recv_tokens_per_expert_list"])
+        if self._expert_tp is not None:
+            tokens_per_expert = dispatched["num_recv_tokens_per_expert_group"].sum(dim=0).to(torch.long)
+            num_out_tokens = int(tokens_per_expert.sum().item())
+        else:
+            num_recv_tokens_per_expert_list = dispatched["num_recv_tokens_per_expert_list"]
+            num_out_tokens = sum(num_recv_tokens_per_expert_list)
+            tokens_per_expert = torch.tensor(
+                num_recv_tokens_per_expert_list,
+                dtype=torch.long,
+                device=dispatched["topk_weights"].device,
+            )
         recv_topk_idx_numel = dispatched["topk_ids"].numel()
         num_neg_one_idx = recv_topk_idx_numel - num_out_tokens
 
@@ -370,11 +414,6 @@ def dispatch_postprocess(
             num_out_tokens=num_out_tokens,
             num_negative_one_in_indices=num_neg_one_idx,
         )
-        tokens_per_expert = torch.tensor(
-            num_recv_tokens_per_expert_list,
-            dtype=torch.long,
-            device=dispatched["topk_weights"].device,
-        )
 
         if decoding:
             raise NotImplementedError
@@ -444,8 +483,17 @@ def combine(
         else:
             backward_previous_event = None
 
+        hidden_states_for_combine = pre_combined["hidden_states"]
+        if self._expert_tp is not None:
+            # 中文注释：combine 阶段先把各 ExpertTP rank 的 expert partial output 做
+            # TP ReduceScatterRowsSum，回到当前 rank 的 DeepEP received source-token rows。
+            hidden_states_for_combine = self._expert_tp.reduce_scatter_rows_sum(
+                hidden_states_for_combine,
+                dispatched["tp_rank_row_counts"],
+            )
+
         combined_hidden_states, event = _async_combine(
-            pre_combined["hidden_states"],
+            hidden_states_for_combine,
             self._n_routed_experts,
             dispatched["handle"],
             self._process_group,

From 9bb255d6a6b83b626d4135aeeed702a514a20864 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 14:20:16 +0000
Subject: [PATCH 20/25] Add async DeepEP ExpertTP dispatcher path

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dispatcher/test_deepep_expert_tp.py       | 216 ++++++++++++++
 xtuner/v1/module/dispatcher/deepep.py         | 272 +++++++++++++++---
 2 files changed, 454 insertions(+), 34 deletions(-)

diff --git a/tests/module/dispatcher/test_deepep_expert_tp.py b/tests/module/dispatcher/test_deepep_expert_tp.py
index 8839a122d..41d1eab0f 100644
--- a/tests/module/dispatcher/test_deepep_expert_tp.py
+++ b/tests/module/dispatcher/test_deepep_expert_tp.py
@@ -177,6 +177,222 @@ def test_sync_path_uses_deepep_received_source_rows_for_expert_tp(self) -> None:
             dist.destroy_process_group(group)
         dist.destroy_process_group(pg)
 
+    def test_async_path_matches_sync_output_and_gradients(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+
+        ep_size = 2
+        tp_size = 2
+        ep_rank = rank // tp_size
+        tp_rank = rank % tp_size
+        ep_groups, tp_groups = _build_ep_tp_groups(ep_size, tp_size)
+        ep_group = ep_groups[tp_rank]
+        tp_group = tp_groups[ep_rank]
+
+        dispatcher = build_dispatcher(
+            dispatcher="deepep",
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=tp_group,
+        )
+
+        local_hidden, local_topk_ids, local_topk_weights = _source_payload(rank, device)
+
+        sync_hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        sync_topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        sync_result = self._run_public_api(
+            dispatcher=dispatcher,
+            hidden_states=sync_hidden_leaf * 1.25,
+            topk_ids=local_topk_ids,
+            topk_weights=sync_topk_weights_leaf * 0.5,
+            tp_size=tp_size,
+            async_op=False,
+        )
+        sync_result["hidden_states"].float().sum().backward()
+
+        async_hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        async_topk_weights_leaf = local_topk_weights.detach().clone().requires_grad_(True)
+        async_result = self._run_public_api(
+            dispatcher=dispatcher,
+            hidden_states=async_hidden_leaf * 1.25,
+            topk_ids=local_topk_ids,
+            topk_weights=async_topk_weights_leaf * 0.5,
+            tp_size=tp_size,
+            async_op=True,
+        )
+        async_result["hidden_states"].float().sum().backward()
+        torch.cuda.synchronize()
+
+        torch.testing.assert_close(
+            async_result["hidden_states"],
+            sync_result["hidden_states"],
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        assert sync_hidden_leaf.grad is not None
+        assert async_hidden_leaf.grad is not None
+        assert sync_topk_weights_leaf.grad is not None
+        assert async_topk_weights_leaf.grad is not None
+        torch.testing.assert_close(
+            async_hidden_leaf.grad,
+            sync_hidden_leaf.grad,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        torch.testing.assert_close(
+            async_topk_weights_leaf.grad,
+            sync_topk_weights_leaf.grad,
+            atol=FLOAT32_ATOL,
+            rtol=FLOAT32_RTOL,
+        )
+
+        dist.barrier()
+        for group in ep_groups + tp_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
+    def test_async_path_accepts_topk_weights_without_gradients(self) -> None:
+        pg = self.create_pg("cuda")
+        rank = dist.get_rank()
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+        device = torch.device("cuda", rank % torch.cuda.device_count())
+
+        ep_size = 2
+        tp_size = 2
+        ep_rank = rank // tp_size
+        tp_rank = rank % tp_size
+        ep_groups, tp_groups = _build_ep_tp_groups(ep_size, tp_size)
+        ep_group = ep_groups[tp_rank]
+        tp_group = tp_groups[ep_rank]
+
+        dispatcher = build_dispatcher(
+            dispatcher="deepep",
+            n_routed_experts=4,
+            ep_group=ep_group,
+            tp_group=tp_group,
+        )
+
+        local_hidden, local_topk_ids, local_topk_weights = _source_payload(rank, device)
+        hidden_leaf = local_hidden.detach().clone().requires_grad_(True)
+        topk_weights = local_topk_weights.detach().clone()
+        assert topk_weights.requires_grad is False
+
+        pre_dispatched = dispatcher.dispatch_preprocess(
+            hidden_states=hidden_leaf,
+            topk_ids=local_topk_ids,
+            async_op=True,
+        )
+        dispatched = dispatcher.dispatch(
+            pre_dispatched=pre_dispatched,
+            topk_weights=topk_weights,
+            decoding=False,
+            async_op=True,
+        )
+
+        expected_tp_rank_row_counts = [
+            sum(ep * tp_size + expected_tp_rank + 2 for ep in range(ep_size))
+            for expected_tp_rank in range(tp_size)
+        ]
+        assert dispatched["tp_rank_row_counts"] == expected_tp_rank_row_counts
+        # 中文注释：async dispatch 返回时已经处于 TP AllGather 后的完整 received source-token row 空间。
+        assert dispatched["hidden_states"].shape[0] == sum(expected_tp_rank_row_counts)
+
+        post_dispatched = dispatcher.dispatch_postprocess(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            async_op=True,
+        )
+        pre_combined = dispatcher.combine_preprocess(
+            hidden_states=post_dispatched["hidden_states"] / tp_size,
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            async_op=True,
+        )
+        assert pre_combined["hidden_states"].shape[0] == sum(expected_tp_rank_row_counts)
+        combined = dispatcher.combine(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            pre_combined=pre_combined,
+            decoding=False,
+            async_op=True,
+        )
+        # 中文注释：TP ReduceScatterRowsSum 属于 combine，combine 后回到本 rank 的 local received rows。
+        assert combined["hidden_states"].shape == local_hidden.shape
+        result = dispatcher.combine_postprocess(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            pre_combined=pre_combined,
+            combined=combined,
+            async_op=True,
+        )
+
+        result["hidden_states"].float().sum().backward()
+        torch.cuda.synchronize()
+        assert hidden_leaf.grad is not None
+
+        dist.barrier()
+        for group in ep_groups + tp_groups:
+            dist.destroy_process_group(group)
+        dist.destroy_process_group(pg)
+
+    def _run_public_api(
+        self,
+        *,
+        dispatcher,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        tp_size: int,
+        async_op: bool,
+    ) -> dict[str, torch.Tensor]:
+        pre_dispatched = dispatcher.dispatch_preprocess(
+            hidden_states=hidden_states,
+            topk_ids=topk_ids,
+            async_op=async_op,
+        )
+        dispatched = dispatcher.dispatch(
+            pre_dispatched=pre_dispatched,
+            topk_weights=topk_weights,
+            decoding=False,
+            async_op=async_op,
+        )
+        post_dispatched = dispatcher.dispatch_postprocess(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            async_op=async_op,
+        )
+        # 中文注释：测试 dispatcher public API，不模拟真实 row-parallel expert；
+        # 每个 ExpertTP rank 产出 1/tp_size partial，combine 应归约回完整输出。
+        expert_output = post_dispatched["hidden_states"] / tp_size
+        pre_combined = dispatcher.combine_preprocess(
+            hidden_states=expert_output,
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            async_op=async_op,
+        )
+        combined = dispatcher.combine(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            pre_combined=pre_combined,
+            decoding=False,
+            async_op=async_op,
+        )
+        return dispatcher.combine_postprocess(
+            pre_dispatched=pre_dispatched,
+            dispatched=dispatched,
+            post_dispatched=post_dispatched,
+            pre_combined=pre_combined,
+            combined=combined,
+            async_op=async_op,
+        )
+
     @property
     def world_size(self) -> int:
         return int(os.getenv("XTUNER_TEST_WORLD_SIZE", "4"))
diff --git a/xtuner/v1/module/dispatcher/deepep.py b/xtuner/v1/module/dispatcher/deepep.py
index aa85efbdf..19e425001 100644
--- a/xtuner/v1/module/dispatcher/deepep.py
+++ b/xtuner/v1/module/dispatcher/deepep.py
@@ -54,6 +54,10 @@ class DeepEPDispatchResult(DispatchResult):
     num_recv_tokens_per_expert_group: torch.Tensor
     tp_rank_row_counts: list[int]
     forward_finished_event: EventOverlap | None
+    backward_previous_event: torch.cuda.Event | None
+    hidden_backward_finished_event: torch.cuda.Event | None
+    topk_weights_backward_previous_event: torch.cuda.Event | None
+    topk_weights_backward_finished_event: torch.cuda.Event | None
 
 
 class DeepEPPostDispatchResult(PostDispatchResult):
@@ -63,6 +67,7 @@ class DeepEPPostDispatchResult(PostDispatchResult):
 class DeepEPPreCombineResult(PreCombineResult):
     backward_previous_event: EventOverlap | None
     forward_finished_event: EventOverlap | None
+    tp_backward_finished_event: torch.cuda.Event | None
 
 
 class DeepEPCombineResult(CombineResult):
@@ -87,6 +92,9 @@ def forward(
         group: dist.ProcessGroup,
         forward_previous_event: EventOverlap | None = None,
         backward_finished_event: EventOverlap | None = None,
+        hidden_backward_previous_event: torch.cuda.Event | None = None,
+        topk_weights_backward_previous_event: torch.cuda.Event | None = None,
+        topk_weights_backward_finished_event: EventOverlap | None = None,
     ) -> tuple[
         torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         torch.Tensor,
@@ -119,6 +127,9 @@ def forward(
         ctx.group = group
         ctx.num_experts = num_experts
         ctx.backward_finished_event = backward_finished_event
+        ctx.hidden_backward_previous_event = hidden_backward_previous_event
+        ctx.topk_weights_backward_previous_event = topk_weights_backward_previous_event
+        ctx.topk_weights_backward_finished_event = topk_weights_backward_finished_event
         return (
             recv_x,
             recv_topk_idx,
@@ -135,16 +146,28 @@ def backward(  # type: ignore[invalid-override]
         grad_recv_topk_idx: torch.Tensor,
         grad_recv_topk_weights: torch.Tensor,
         *args,
-    ) -> tuple[torch.Tensor, None, torch.Tensor | None, None, None, None, None, None, None]:
+    ) -> tuple[torch.Tensor, None, torch.Tensor | None, None, None, None, None, None, None, None]:
         # load saved comm handle
         handle = ctx.saved_tensors
+        if ctx.is_async:
+            # 中文注释：DeepEP backward 只能等待 EventOverlap；ExpertTP backward 完成事件
+            # 是 torch.cuda.Event，因此桥接为当前 stream 上的 DeepEP previous_event。
+            if ctx.hidden_backward_previous_event is not None:
+                torch.cuda.current_stream().wait_event(ctx.hidden_backward_previous_event)
+            if ctx.topk_weights_backward_previous_event is not None:
+                torch.cuda.current_stream().wait_event(ctx.topk_weights_backward_previous_event)
+            previous_event = buffer_capture()
+        else:
+            previous_event = buffer_capture()
         combined_grad_x, combined_grad_recv_topk_weights, event = dispatch_backward(
-            grad_recv_x, grad_recv_topk_weights, ctx.num_experts, handle, ctx.group, buffer_capture()
+            grad_recv_x, grad_recv_topk_weights, ctx.num_experts, handle, ctx.group, previous_event
         )
         if not ctx.is_async:
             event.current_stream_wait()
         else:
             ctx.backward_finished_event.event = event.event
+            if ctx.topk_weights_backward_finished_event is not None:
+                ctx.topk_weights_backward_finished_event.event = event.event
         return (
             combined_grad_x,
             None,
@@ -155,6 +178,7 @@ def backward(  # type: ignore[invalid-override]
             None,
             None,
             None,
+            None,
         )
 
 
@@ -172,6 +196,7 @@ def forward(
         forward_previous_event: EventOverlap | None = None,
         backward_previous_event: EventOverlap | None = None,
         backward_finished_event: EventOverlap | None = None,
+        backward_finished_torch_event: torch.cuda.Event | None = None,
     ) -> tuple[torch.Tensor, EventOverlap]:
         if not (
             (forward_previous_event is None) == (backward_finished_event is None) == (backward_previous_event is None)
@@ -197,12 +222,13 @@ def forward(
         ctx.num_experts = num_experts
         ctx.backward_finished_event = backward_finished_event
         ctx.backward_previous_event = backward_previous_event
+        ctx.backward_finished_torch_event = backward_finished_torch_event
         return combined_x, event
 
     @staticmethod
     def backward(  # type: ignore[invalid-override]
         ctx, grad_combined_x: torch.Tensor, *args
-    ) -> tuple[torch.Tensor | tuple[torch.Tensor, torch.Tensor], None, None, None, None, None, None]:
+    ) -> tuple[torch.Tensor | tuple[torch.Tensor, torch.Tensor], None, None, None, None, None, None, None]:
         # load saved comm handle
         handle = ctx.saved_tensors
         if not ctx.is_async:
@@ -216,7 +242,12 @@ def backward(  # type: ignore[invalid-override]
             event.current_stream_wait()
         else:
             ctx.backward_finished_event.event = event.event
-        return grad_x, None, None, None, None, None, None
+            if ctx.backward_finished_torch_event is not None:
+                # 中文注释：TP ReduceScatterRowsSum backward 用 torch.cuda.Event
+                # 等 DeepEP combine backward 完成；桥接逻辑留在 DeepEPDispatcher 内部。
+                event.current_stream_wait()
+                ctx.backward_finished_torch_event.record()
+        return grad_x, None, None, None, None, None, None, None
 
 
 _async_combine = copy_method_signature(DeepEPCombine.forward)(DeepEPCombine.apply)
@@ -243,6 +274,45 @@ def _backward_hook(*_):
     return _backward_hook
 
 
+def get_torch_backward_pre_hook(
+    backward_previous_event: torch.cuda.Event,
+    name: str | None = None,
+    debug: bool = False,
+):
+    def _backward_pre_hook(*_):
+        if debug:
+            logger.info(f"[{name}] backward pre hook")
+        torch.cuda.current_stream().wait_event(backward_previous_event)
+
+    return _backward_pre_hook
+
+
+def get_torch_backward_hook(
+    backward_finished_event: torch.cuda.Event,
+    name: str | None = None,
+    debug: bool = False,
+):
+    def _backward_hook(*_):
+        if debug:
+            logger.info(f"[{name}] backward hook")
+        backward_finished_event.record()
+
+    return _backward_hook
+
+
+def _torch_event_after_event_overlap(event: EventOverlap | None) -> torch.cuda.Event:
+    if event is not None:
+        event.current_stream_wait()
+    torch_event = torch.cuda.Event()
+    torch_event.record()
+    return torch_event
+
+
+def _event_overlap_after_torch_event(event: torch.cuda.Event) -> EventOverlap:
+    torch.cuda.current_stream().wait_event(event)
+    return buffer_capture()
+
+
 class DeepEPDispatcher(
     GenericDispatcher[
         DeepEPPreDispatchResult,
@@ -278,6 +348,8 @@ def __init__(
             "If you are training a MoE model, it means that `expert parallel` is not enabled in the config."
         )
         self._expert_tp = ExpertTP(tp_group) if tp_group is not None and tp_group.size() > 1 else None
+        if self._expert_tp is not None and DeepEPDispatcher._comm_stream is None:
+            DeepEPDispatcher._comm_stream = torch.cuda.Stream(device=DEVICE)
 
     @override
     def dispatch_preprocess(
@@ -318,8 +390,25 @@ def dispatch(
         async_op: bool = False,
         decoding: bool = False,
     ) -> DeepEPDispatchResult:
+        hidden_backward_previous_event = None
+        hidden_backward_finished_event = None
+        topk_weights_backward_previous_event = None
+        topk_weights_backward_finished_event = None
+        topk_weights_backward_finished_overlap = None
         if async_op and self._expert_tp is not None:
-            raise NotImplementedError("DeepEP + ExpertTP async dispatcher path is tracked separately.")
+            hidden_backward_previous_event = torch.cuda.Event()
+            hidden_backward_finished_event = torch.cuda.Event()
+            topk_weights_backward_previous_event = torch.cuda.Event()
+            topk_weights_backward_finished_event = torch.cuda.Event()
+            if topk_weights.grad_fn is not None:
+                topk_weights_backward_finished_overlap = EventOverlap(None)
+                topk_weights.grad_fn.register_prehook(
+                    get_backward_pre_hook(
+                        backward_previous_event=topk_weights_backward_finished_overlap,
+                        name="DeepEPDispatcher.dispatch.topk_weights",
+                        debug=XTUNER_DISPATCHER_DEBUG,
+                    )
+                )
 
         (
             dispatched_hidden_states,
@@ -336,6 +425,9 @@ def dispatch(
             self._process_group,
             pre_dispatched["forward_finished_event"],
             pre_dispatched["backward_previous_event"],
+            hidden_backward_finished_event,
+            topk_weights_backward_finished_event,
+            topk_weights_backward_finished_overlap,
         )
 
         if not async_op:
@@ -356,18 +448,60 @@ def dispatch(
             # 这里的 TP rank row counts 记录 source-token rows，不记录 topK 展开后的 route-copy rows。
             dispatched_hidden_states = cast(HiddenStates, dispatched_hidden_states)
             tp_rank_row_counts = self._expert_tp.gather_tp_rank_row_counts(dispatched_hidden_states)
-            dispatched_hidden_states, _ = self._expert_tp.all_gather_rows(
-                dispatched_hidden_states,
-                tp_rank_row_counts,
-            )
-            dispatched_topk_idx = self._expert_tp.all_gather_row_metadata(dispatched_topk_idx, tp_rank_row_counts)
-            dispatched_topk_weights, _ = self._expert_tp.all_gather_rows(
-                dispatched_topk_weights,
-                tp_rank_row_counts,
-            )
-            num_recv_tokens_per_expert_group = self._expert_tp.all_gather_per_rank_metadata(
-                num_recv_tokens_per_expert,
-            )
+            if async_op:
+                assert self._comm_stream is not None
+                assert hidden_backward_previous_event is not None
+                assert hidden_backward_finished_event is not None
+                assert topk_weights_backward_previous_event is not None
+                assert topk_weights_backward_finished_event is not None
+
+                deepep_finished_event = _torch_event_after_event_overlap(event)
+                tp_counts_finished_event = torch.cuda.Event()
+                dispatched_hidden_states = self._expert_tp.async_all_gather_rows(
+                    dispatched_hidden_states,
+                    tp_rank_row_counts=tp_rank_row_counts,
+                    forward_previous_event=deepep_finished_event,
+                    forward_finished_event=None,
+                    backward_previous_event=hidden_backward_previous_event,
+                    backward_finished_event=hidden_backward_finished_event,
+                    comm_stream=self._comm_stream,
+                )
+                dispatched_topk_idx = self._expert_tp.async_all_gather_row_metadata(
+                    dispatched_topk_idx,
+                    tp_rank_row_counts=tp_rank_row_counts,
+                    forward_previous_event=None,
+                    forward_finished_event=None,
+                    comm_stream=self._comm_stream,
+                )
+                dispatched_topk_weights = self._expert_tp.async_all_gather_rows(
+                    dispatched_topk_weights,
+                    tp_rank_row_counts=tp_rank_row_counts,
+                    forward_previous_event=None,
+                    forward_finished_event=None,
+                    backward_previous_event=topk_weights_backward_previous_event,
+                    backward_finished_event=topk_weights_backward_finished_event,
+                    comm_stream=self._comm_stream,
+                )
+                num_recv_tokens_per_expert_group = self._expert_tp.async_all_gather_per_rank_metadata(
+                    num_recv_tokens_per_expert,
+                    forward_previous_event=None,
+                    forward_finished_event=tp_counts_finished_event,
+                    comm_stream=self._comm_stream,
+                )
+                forward_finished_event = _event_overlap_after_torch_event(tp_counts_finished_event)
+            else:
+                dispatched_hidden_states, _ = self._expert_tp.all_gather_rows(
+                    dispatched_hidden_states,
+                    tp_rank_row_counts,
+                )
+                dispatched_topk_idx = self._expert_tp.all_gather_row_metadata(dispatched_topk_idx, tp_rank_row_counts)
+                dispatched_topk_weights, _ = self._expert_tp.all_gather_rows(
+                    dispatched_topk_weights,
+                    tp_rank_row_counts,
+                )
+                num_recv_tokens_per_expert_group = self._expert_tp.all_gather_per_rank_metadata(
+                    num_recv_tokens_per_expert,
+                )
 
         ret = DeepEPDispatchResult(
             hidden_states=cast(HiddenStates, dispatched_hidden_states),
@@ -378,6 +512,10 @@ def dispatch(
             num_recv_tokens_per_expert_group=num_recv_tokens_per_expert_group,
             tp_rank_row_counts=tp_rank_row_counts,
             forward_finished_event=forward_finished_event,
+            backward_previous_event=hidden_backward_previous_event,
+            hidden_backward_finished_event=hidden_backward_finished_event,
+            topk_weights_backward_previous_event=topk_weights_backward_previous_event,
+            topk_weights_backward_finished_event=topk_weights_backward_finished_event,
         )
         return ret
 
@@ -414,6 +552,17 @@ def dispatch_postprocess(
             num_out_tokens=num_out_tokens,
             num_negative_one_in_indices=num_neg_one_idx,
         )
+        if async_op and self._expert_tp is not None:
+            backward_previous_event = dispatched["backward_previous_event"]
+            assert backward_previous_event is not None, "Please use `async_op=True` for dispatch!"
+            if permuted_hidden_states.grad_fn is not None:
+                permuted_hidden_states.grad_fn.register_hook(
+                    get_torch_backward_hook(
+                        backward_previous_event,
+                        name="DeepEPDispatcher.dispatch_postprocess",
+                        debug=XTUNER_DISPATCHER_DEBUG,
+                    )
+                )
 
         if decoding:
             raise NotImplementedError
@@ -442,19 +591,43 @@ def combine_preprocess(
         )
 
         if async_op:
-            backward_previous_event = EventOverlap(None)
             forward_finished_event = buffer_capture()
-            if hidden_states.grad_fn is not None:
-                hidden_states.grad_fn.register_prehook(
-                    get_backward_pre_hook(
-                        backward_previous_event=backward_previous_event,
-                        name="TorchAll2AllDispatcher.combine_preprocess",
-                        debug=XTUNER_DISPATCHER_DEBUG,
+            tp_backward_finished_event = None
+            if self._expert_tp is not None:
+                backward_previous_event = None
+                tp_backward_finished_event = torch.cuda.Event()
+                if hidden_states.grad_fn is not None:
+                    hidden_states.grad_fn.register_prehook(
+                        get_torch_backward_pre_hook(
+                            backward_previous_event=tp_backward_finished_event,
+                            name="DeepEPDispatcher.combine_preprocess",
+                            debug=XTUNER_DISPATCHER_DEBUG,
+                        )
+                    )
+                    topk_weights_backward_previous_event = dispatched["topk_weights_backward_previous_event"]
+                    if topk_weights_backward_previous_event is not None:
+                        hidden_states.grad_fn.register_hook(
+                            get_torch_backward_hook(
+                                topk_weights_backward_previous_event,
+                                name="DeepEPDispatcher.combine_preprocess.topk_weights",
+                                debug=XTUNER_DISPATCHER_DEBUG,
+                            )
+                        )
+            else:
+                backward_previous_event = EventOverlap(None)
+                tp_backward_finished_event = None
+                if hidden_states.grad_fn is not None:
+                    hidden_states.grad_fn.register_prehook(
+                        get_backward_pre_hook(
+                            backward_previous_event=backward_previous_event,
+                            name="TorchAll2AllDispatcher.combine_preprocess",
+                            debug=XTUNER_DISPATCHER_DEBUG,
+                        )
                     )
-                )
         else:
             backward_previous_event = None
             forward_finished_event = None
+            tp_backward_finished_event = None
 
         if decoding:
             raise NotImplementedError
@@ -463,6 +636,7 @@ def combine_preprocess(
                 hidden_states=hidden_states,
                 forward_finished_event=forward_finished_event,
                 backward_previous_event=backward_previous_event,
+                tp_backward_finished_event=tp_backward_finished_event,
             )
 
     @override
@@ -479,27 +653,57 @@ def combine(
         if async_op:
             backward_previous_event = EventOverlap(None)
             assert pre_combined["forward_finished_event"] is not None, "Please use `async_op=True` for combine!"
-            pre_combined["forward_finished_event"].current_stream_wait()
         else:
             backward_previous_event = None
 
         hidden_states_for_combine = pre_combined["hidden_states"]
         if self._expert_tp is not None:
-            # 中文注释：combine 阶段先把各 ExpertTP rank 的 expert partial output 做
-            # TP ReduceScatterRowsSum，回到当前 rank 的 DeepEP received source-token rows。
-            hidden_states_for_combine = self._expert_tp.reduce_scatter_rows_sum(
-                hidden_states_for_combine,
-                dispatched["tp_rank_row_counts"],
-            )
+            if async_op:
+                assert self._comm_stream is not None
+                assert pre_combined["tp_backward_finished_event"] is not None
+                tp_forward_previous_event = _torch_event_after_event_overlap(pre_combined["forward_finished_event"])
+                tp_forward_finished_event = torch.cuda.Event()
+                deepep_backward_finished_event = torch.cuda.Event()
+                # 中文注释：TP ReduceScatterRowsSum 属于 combine 通信段；
+                # DeepEP combine 只等待 TP 输出事件，不直接接触 ExpertTP 内部事件类型。
+                hidden_states_for_combine = self._expert_tp.async_reduce_scatter_rows_sum(
+                    hidden_states_for_combine,
+                    tp_rank_row_counts=dispatched["tp_rank_row_counts"],
+                    forward_previous_event=tp_forward_previous_event,
+                    forward_finished_event=tp_forward_finished_event,
+                    backward_previous_event=deepep_backward_finished_event,
+                    backward_finished_event=pre_combined["tp_backward_finished_event"],
+                    comm_stream=self._comm_stream,
+                )
+                forward_previous_event = _event_overlap_after_torch_event(tp_forward_finished_event)
+                deepep_backward_finished_overlap = EventOverlap(None)
+            else:
+                # 中文注释：combine 阶段先把各 ExpertTP rank 的 expert partial output 做
+                # TP ReduceScatterRowsSum，回到当前 rank 的 DeepEP received source-token rows。
+                hidden_states_for_combine = self._expert_tp.reduce_scatter_rows_sum(
+                    hidden_states_for_combine,
+                    dispatched["tp_rank_row_counts"],
+                )
+                forward_previous_event = pre_combined["forward_finished_event"]
+                deepep_backward_finished_event = None
+                deepep_backward_finished_overlap = pre_combined["backward_previous_event"]
+        else:
+            forward_previous_event = pre_combined["forward_finished_event"]
+            if async_op:
+                assert forward_previous_event is not None, "Please use `async_op=True` for combine!"
+                forward_previous_event.current_stream_wait()
+            deepep_backward_finished_event = None
+            deepep_backward_finished_overlap = pre_combined["backward_previous_event"]
 
         combined_hidden_states, event = _async_combine(
             hidden_states_for_combine,
             self._n_routed_experts,
             dispatched["handle"],
             self._process_group,
-            pre_combined["forward_finished_event"],
+            forward_previous_event,
             backward_previous_event,
-            pre_combined["backward_previous_event"],
+            deepep_backward_finished_overlap,
+            deepep_backward_finished_event,
         )
         if not async_op:
             event.current_stream_wait()

From 877182c317a3acfee8586cabe8d04015726ba590 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 14:22:59 +0000
Subject: [PATCH 21/25] Add DeepEP ExpertTP TrainEngine equivalence test

---
 .../test_moe_train_engine_deepep_expert_tp.py | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 tests/engine/test_moe_train_engine_deepep_expert_tp.py

diff --git a/tests/engine/test_moe_train_engine_deepep_expert_tp.py b/tests/engine/test_moe_train_engine_deepep_expert_tp.py
new file mode 100644
index 000000000..3b0984f1f
--- /dev/null
+++ b/tests/engine/test_moe_train_engine_deepep_expert_tp.py
@@ -0,0 +1,140 @@
+from __future__ import annotations
+
+import os
+import unittest
+from typing import Literal
+
+# 本测试关注 DeepEP + ExpertTP 的真实 grouped-GEMM 训练路径；
+# 与既有 engine ExpertTP 测试一致，用 Cutlass 后端规避本地 Triton TMA 兼容性差异。
+os.environ.setdefault("XTUNER_USE_CUTLASS_GROUP_GEMM", "1")
+
+import torch
+import torch.distributed as dist
+from mmengine.utils import is_installed
+from torch.testing._comparison import default_tolerances
+
+from xtuner._testing import DeterministicDDPTestCase
+from xtuner.v1.config import AdamWConfig, FSDPConfig
+from xtuner.v1.engine.train_engine import TrainEngine
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.module.dispatcher.deepep import DeepEPDispatcher
+from xtuner.v1.module.dispatcher.torch_all2all import TorchAll2AllDispatcher
+
+from .test_moe_train_engine_tpep import (
+    _build_tiny_moe_cfg,
+    _copy_matching_engine_weights,
+    _get_param_grad,
+    _make_engine_input,
+    _run_one_step_with_norm,
+)
+
+BF16_RTOL, BF16_ATOL = default_tolerances(torch.bfloat16)
+
+
+def _build_engine(
+    *,
+    dispatcher: Literal["all2all", "deepep"],
+    ep_size: int,
+    expert_tp_size: int,
+) -> TrainEngine:
+    moe_cfg = _build_tiny_moe_cfg(ep_size=ep_size, expert_tp_size=expert_tp_size)
+    moe_cfg.dispatcher = dispatcher
+    optim_cfg = AdamWConfig()
+    fsdp_cfg = FSDPConfig(
+        ep_size=ep_size,
+        cpu_offload=False,
+    )
+    return TrainEngine(
+        model_cfg=moe_cfg,
+        optim_cfg=optim_cfg,
+        fsdp_cfg=fsdp_cfg,
+    )
+
+
+@unittest.skipIf(
+    not torch.cuda.is_available() or not is_installed("deep_ep"),
+    "CUDA/NCCL and DeepEP are required for real DeepEP ExpertTP TrainEngine validation.",
+)
+class TestMoETrainEngineDeepEPExpertTP(DeterministicDDPTestCase):
+    def test_deepep_matches_all2all_with_same_expert_tp_topology(self) -> None:
+        pg = self.create_pg("cuda")
+
+        ep_size = 2
+        expert_tp_size = 2
+        engine_all2all = _build_engine(
+            dispatcher="all2all",
+            ep_size=ep_size,
+            expert_tp_size=expert_tp_size,
+        )
+        engine_all2all.init_model_weights()
+
+        engine_deepep = _build_engine(
+            dispatcher="deepep",
+            ep_size=ep_size,
+            expert_tp_size=expert_tp_size,
+        )
+        engine_deepep.init_model_weights()
+        _copy_matching_engine_weights(engine_all2all, engine_deepep)
+        dist.barrier()
+
+        assert isinstance(engine_all2all.model.layers["0"].dispatcher, TorchAll2AllDispatcher)
+        assert isinstance(engine_deepep.model.layers["0"].dispatcher, DeepEPDispatcher)
+        assert engine_all2all.model.ep_mesh is not None
+        assert engine_deepep.model.ep_mesh is not None
+        assert engine_all2all.model.expert_tp_mesh is not None
+        assert engine_deepep.model.expert_tp_mesh is not None
+        assert engine_all2all.model.ep_mesh.size() == engine_deepep.model.ep_mesh.size() == ep_size
+        assert (
+            engine_all2all.model.expert_tp_mesh.size()
+            == engine_deepep.model.expert_tp_mesh.size()
+            == expert_tp_size
+        )
+        assert type(engine_all2all.optimizer) is type(engine_deepep.optimizer)
+        assert len(engine_all2all.optimizer.param_groups) == len(engine_deepep.optimizer.param_groups)
+        assert [
+            len(group["params"]) for group in engine_all2all.optimizer.param_groups
+        ] == [len(group["params"]) for group in engine_deepep.optimizer.param_groups]
+
+        device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
+        input_ids, labels = _make_engine_input(device=device, seed_offset=dist.get_rank())
+        loss_cfg = CELossConfig()
+
+        loss_deepep, _, norm_deepep = _run_one_step_with_norm(engine_deepep, loss_cfg, input_ids, labels)
+        loss_all2all, _, norm_all2all = _run_one_step_with_norm(engine_all2all, loss_cfg, input_ids, labels)
+
+        torch.testing.assert_close(
+            torch.tensor(loss_deepep),
+            torch.tensor(loss_all2all),
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        gate_grad_deepep = _get_param_grad(engine_deepep, "layers.0.gate.weight")
+        gate_grad_all2all = _get_param_grad(engine_all2all, "layers.0.gate.weight")
+        torch.testing.assert_close(
+            gate_grad_deepep,
+            gate_grad_all2all,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        torch.testing.assert_close(
+            norm_deepep,
+            norm_all2all,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    @property
+    def world_size(self) -> int:
+        return 4
+
+    @property
+    def destroy_pg_upon_exit(self) -> bool:
+        return False

From 8df2f1b0809d46cc2bd780d1f36a7527e6fccc8a Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 14:32:02 +0000
Subject: [PATCH 22/25] Add DeepEP ExpertTP single-model baseline test

---
 .../test_moe_train_engine_deepep_expert_tp.py | 133 +++++++++++++++++-
 1 file changed, 130 insertions(+), 3 deletions(-)

diff --git a/tests/engine/test_moe_train_engine_deepep_expert_tp.py b/tests/engine/test_moe_train_engine_deepep_expert_tp.py
index 3b0984f1f..d8f878fff 100644
--- a/tests/engine/test_moe_train_engine_deepep_expert_tp.py
+++ b/tests/engine/test_moe_train_engine_deepep_expert_tp.py
@@ -24,11 +24,28 @@
     _build_tiny_moe_cfg,
     _copy_matching_engine_weights,
     _get_param_grad,
+    _get_tpep_grouped_linear,
     _make_engine_input,
     _run_one_step_with_norm,
+    _run_train_step_without_clip,
+    _slice_tpep_weight,
+    _sync_engine_weights,
+    _zero_non_expert_grads,
 )
 
 BF16_RTOL, BF16_ATOL = default_tolerances(torch.bfloat16)
+BF16_GRAD_ATOL = BF16_ATOL * 2
+
+
+def _assert_bf16_training_close(actual: torch.Tensor, expected: torch.Tensor) -> None:
+    # 中文注释：梯度矩阵经过 grouped-GEMM 与 TP/EP 规约，近 0 元素会出现极小累加顺序差异；
+    # 这里仍以 torch.testing 的 bf16 默认精度为基准，只给梯度绝对误差留 2 倍余量。
+    torch.testing.assert_close(
+        actual.to(torch.bfloat16),
+        expected.to(torch.bfloat16),
+        atol=BF16_GRAD_ATOL,
+        rtol=BF16_RTOL,
+    )
 
 
 def _build_engine(
@@ -111,15 +128,125 @@ def test_deepep_matches_all2all_with_same_expert_tp_topology(self) -> None:
 
         gate_grad_deepep = _get_param_grad(engine_deepep, "layers.0.gate.weight")
         gate_grad_all2all = _get_param_grad(engine_all2all, "layers.0.gate.weight")
+        _assert_bf16_training_close(gate_grad_deepep, gate_grad_all2all)
         torch.testing.assert_close(
-            gate_grad_deepep,
-            gate_grad_all2all,
+            norm_deepep,
+            norm_all2all,
             atol=BF16_ATOL,
             rtol=BF16_RTOL,
         )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    def test_deepep_expert_tp_matches_single_model_baseline(self) -> None:
+        pg = self.create_pg("cuda")
+
+        ep_size = 2
+        expert_tp_size = 2
+        engine_ref = _build_engine(
+            dispatcher="all2all",
+            ep_size=1,
+            expert_tp_size=1,
+        )
+        engine_ref.init_model_weights()
+
+        engine_deepep = _build_engine(
+            dispatcher="deepep",
+            ep_size=ep_size,
+            expert_tp_size=expert_tp_size,
+        )
+        engine_deepep.init_model_weights()
+        _sync_engine_weights(engine_ref, engine_deepep)
+        dist.barrier()
+
+        assert isinstance(engine_deepep.model.layers["0"].dispatcher, DeepEPDispatcher)
+        assert engine_deepep.model.ep_mesh is not None
+        assert engine_deepep.model.expert_tp_mesh is not None
+        assert engine_deepep.model.ep_mesh.size() == ep_size
+        assert engine_deepep.model.expert_tp_mesh.size() == expert_tp_size
+
+        device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
+        input_ids, labels = _make_engine_input(device=device, seed_offset=dist.get_rank())
+        loss_cfg = CELossConfig()
+
+        loss_deepep, _, norm_deepep = _run_one_step_with_norm(engine_deepep, loss_cfg, input_ids, labels)
+        loss_ref, _, norm_ref = _run_one_step_with_norm(engine_ref, loss_cfg, input_ids, labels)
+
+        torch.testing.assert_close(
+            torch.tensor(loss_deepep),
+            torch.tensor(loss_ref),
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        gate_grad_deepep = _get_param_grad(engine_deepep, "layers.0.gate.weight")
+        gate_grad_ref = _get_param_grad(engine_ref, "layers.0.gate.weight")
+        _assert_bf16_training_close(gate_grad_deepep, gate_grad_ref)
+
+        for module_suffix, fused_gate_up in (
+            ("layers.0.experts.fused_w1w3", True),
+            ("layers.0.experts.fused_w2", False),
+        ):
+            ref_grad = _get_param_grad(engine_ref, f"{module_suffix}.weight")
+            deepep_grad = _get_param_grad(engine_deepep, f"{module_suffix}.weight")
+            deepep_module = _get_tpep_grouped_linear(engine_deepep, module_suffix)
+            expected_deepep_grad = _slice_tpep_weight(deepep_module, ref_grad, fused_gate_up=fused_gate_up)
+            _assert_bf16_training_close(deepep_grad, expected_deepep_grad)
+
         torch.testing.assert_close(
             norm_deepep,
-            norm_all2all,
+            norm_ref,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
+    def test_deepep_expert_tp_expert_only_grad_norm_matches_single_model_baseline(self) -> None:
+        pg = self.create_pg("cuda")
+
+        ep_size = 2
+        expert_tp_size = 2
+        engine_ref = _build_engine(
+            dispatcher="all2all",
+            ep_size=1,
+            expert_tp_size=1,
+        )
+        engine_ref.init_model_weights()
+
+        engine_deepep = _build_engine(
+            dispatcher="deepep",
+            ep_size=ep_size,
+            expert_tp_size=expert_tp_size,
+        )
+        engine_deepep.init_model_weights()
+        _sync_engine_weights(engine_ref, engine_deepep)
+        dist.barrier()
+
+        device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
+        input_ids, labels = _make_engine_input(device=device, seed_offset=dist.get_rank())
+        loss_cfg = CELossConfig()
+
+        _run_train_step_without_clip(engine_deepep, loss_cfg, input_ids, labels)
+        _run_train_step_without_clip(engine_ref, loss_cfg, input_ids, labels)
+        # 中文注释：expert-only norm 单独验证 EP 和 ExpertTP shard 的 norm-square 汇总语义。
+        _zero_non_expert_grads(engine_deepep)
+        _zero_non_expert_grads(engine_ref)
+        expert_norm_deepep = engine_deepep.clip_grad_norm(do_clip=False).detach().float().cpu()
+        expert_norm_ref = engine_ref.clip_grad_norm(do_clip=False).detach().float().cpu()
+        torch.testing.assert_close(
+            expert_norm_deepep,
+            expert_norm_ref,
             atol=BF16_ATOL,
             rtol=BF16_RTOL,
         )

From 515d6dca00c5aace8a475cbb5f3ec6e6f531fd62 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 14:38:06 +0000
Subject: [PATCH 23/25] Add DeepEP ExpertTP Domino micro-batch test

---
 .../test_moe_train_engine_deepep_expert_tp.py | 157 ++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/tests/engine/test_moe_train_engine_deepep_expert_tp.py b/tests/engine/test_moe_train_engine_deepep_expert_tp.py
index d8f878fff..b8a691a67 100644
--- a/tests/engine/test_moe_train_engine_deepep_expert_tp.py
+++ b/tests/engine/test_moe_train_engine_deepep_expert_tp.py
@@ -26,6 +26,7 @@
     _get_param_grad,
     _get_tpep_grouped_linear,
     _make_engine_input,
+    _run_train_step_items_without_clip,
     _run_one_step_with_norm,
     _run_train_step_without_clip,
     _slice_tpep_weight,
@@ -53,6 +54,7 @@ def _build_engine(
     dispatcher: Literal["all2all", "deepep"],
     ep_size: int,
     expert_tp_size: int,
+    intra_layer_micro_batch: int = 1,
 ) -> TrainEngine:
     moe_cfg = _build_tiny_moe_cfg(ep_size=ep_size, expert_tp_size=expert_tp_size)
     moe_cfg.dispatcher = dispatcher
@@ -65,6 +67,101 @@ def _build_engine(
         model_cfg=moe_cfg,
         optim_cfg=optim_cfg,
         fsdp_cfg=fsdp_cfg,
+        intra_layer_micro_batch=intra_layer_micro_batch,
+    )
+
+
+def _record_deepep_expert_tp_collective_stages(
+    engine: TrainEngine,
+) -> tuple[dict[str, list[str]], list[tuple[str, tuple[int, ...], bool]]]:
+    stages: dict[str, list[str]] = {
+        "async_op_true": [],
+        "async_all_gather_rows": [],
+        "async_all_gather_row_metadata": [],
+        "async_all_gather_per_rank_metadata": [],
+        "async_reduce_scatter_rows_sum": [],
+    }
+    row_gather_inputs: list[tuple[str, tuple[int, ...], bool]] = []
+    current_stage: list[str] = []
+
+    for layer in engine.model.layers.values():
+        dispatcher = layer.dispatcher
+        assert isinstance(dispatcher, DeepEPDispatcher)
+        expert_tp = dispatcher._expert_tp
+        assert expert_tp is not None
+
+        for stage_name in (
+            "dispatch_preprocess",
+            "dispatch",
+            "dispatch_postprocess",
+            "combine_preprocess",
+            "combine",
+            "combine_postprocess",
+        ):
+            original_stage = getattr(dispatcher, stage_name)
+
+            def stage_wrapper(*args, _original_stage=original_stage, _stage_name=stage_name, **kwargs):
+                if kwargs.get("async_op", False):
+                    stages["async_op_true"].append(_stage_name)
+                current_stage.append(_stage_name)
+                try:
+                    return _original_stage(*args, **kwargs)
+                finally:
+                    current_stage.pop()
+
+            setattr(dispatcher, stage_name, stage_wrapper)
+
+        for collective_name in (
+            "async_all_gather_rows",
+            "async_all_gather_row_metadata",
+            "async_all_gather_per_rank_metadata",
+            "async_reduce_scatter_rows_sum",
+        ):
+            original_collective = getattr(expert_tp, collective_name)
+
+            def collective_wrapper(
+                *args,
+                _original_collective=original_collective,
+                _collective_name=collective_name,
+                **kwargs,
+            ):
+                stage = current_stage[-1] if current_stage else "<outside>"
+                stages[_collective_name].append(stage)
+                if _collective_name == "async_all_gather_rows":
+                    tensor = args[0]
+                    row_gather_inputs.append((stage, tuple(tensor.shape[1:]), tensor.requires_grad))
+                return _original_collective(*args, **kwargs)
+
+            setattr(expert_tp, collective_name, collective_wrapper)
+
+    return stages, row_gather_inputs
+
+
+def _assert_domino_deepep_expert_tp_collective_stages(
+    stages: dict[str, list[str]],
+    row_gather_inputs: list[tuple[str, tuple[int, ...], bool]],
+) -> None:
+    assert set(stages["async_op_true"]) == {
+        "dispatch_preprocess",
+        "dispatch",
+        "dispatch_postprocess",
+        "combine_preprocess",
+        "combine",
+        "combine_postprocess",
+    }
+    assert stages["async_all_gather_rows"]
+    assert stages["async_all_gather_row_metadata"]
+    assert stages["async_all_gather_per_rank_metadata"]
+    assert stages["async_reduce_scatter_rows_sum"]
+    assert set(stages["async_all_gather_rows"]) == {"dispatch"}
+    assert set(stages["async_all_gather_row_metadata"]) == {"dispatch"}
+    assert set(stages["async_all_gather_per_rank_metadata"]) == {"dispatch"}
+    assert set(stages["async_reduce_scatter_rows_sum"]) == {"combine"}
+    # 中文注释：shape=(2,) 且 requires_grad=True 的 dispatch-stage row gather
+    # 对应 router topK weights 的可微 ExpertTP gather 路径。
+    assert any(
+        stage == "dispatch" and shape == (2,) and requires_grad
+        for stage, shape, requires_grad in row_gather_inputs
     )
 
 
@@ -258,6 +355,66 @@ def test_deepep_expert_tp_expert_only_grad_norm_matches_single_model_baseline(se
         except Exception:
             pass
 
+    def test_deepep_expert_tp_domino_micro_batch_matches_sync_baseline(self) -> None:
+        pg = self.create_pg("cuda")
+
+        ep_size = 2
+        expert_tp_size = 2
+        engine_ref = _build_engine(
+            dispatcher="deepep",
+            ep_size=ep_size,
+            expert_tp_size=expert_tp_size,
+        )
+        engine_ref.init_model_weights()
+
+        engine_domino = _build_engine(
+            dispatcher="deepep",
+            ep_size=ep_size,
+            expert_tp_size=expert_tp_size,
+            intra_layer_micro_batch=2,
+        )
+        engine_domino.init_model_weights()
+        _copy_matching_engine_weights(engine_ref, engine_domino)
+        stages, row_gather_inputs = _record_deepep_expert_tp_collective_stages(engine_domino)
+        dist.barrier()
+
+        device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
+        batches = [
+            _make_engine_input(device=device, seed_offset=dist.get_rank() * 2),
+            _make_engine_input(device=device, seed_offset=dist.get_rank() * 2 + 1),
+        ]
+        loss_cfg = CELossConfig()
+
+        loss_domino = _run_train_step_items_without_clip(engine_domino, loss_cfg, batches)
+        norm_domino = engine_domino.clip_grad_norm(do_clip=False).detach().float().cpu()
+        gate_grad_domino = _get_param_grad(engine_domino, "layers.0.gate.weight")
+
+        loss_ref = _run_train_step_items_without_clip(engine_ref, loss_cfg, batches)
+        norm_ref = engine_ref.clip_grad_norm(do_clip=False).detach().float().cpu()
+        gate_grad_ref = _get_param_grad(engine_ref, "layers.0.gate.weight")
+
+        _assert_domino_deepep_expert_tp_collective_stages(stages, row_gather_inputs)
+        torch.testing.assert_close(
+            torch.tensor(loss_domino),
+            torch.tensor(loss_ref),
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        torch.testing.assert_close(
+            norm_domino,
+            norm_ref,
+            atol=BF16_ATOL,
+            rtol=BF16_RTOL,
+        )
+        _assert_bf16_training_close(gate_grad_domino, gate_grad_ref)
+
+        dist.barrier()
+        torch.cuda.empty_cache()
+        try:
+            dist.destroy_process_group(pg)
+        except Exception:
+            pass
+
     @property
     def world_size(self) -> int:
         return 4

From 262a95f985c1f6d3a24ca0002bb2afa160fd8077 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Thu, 21 May 2026 14:47:52 +0000
Subject: [PATCH 24/25] Document DeepEP ExpertTP forward example

---
 ...idate_dispatcher_documentation_examples.py | 411 ++++++++++++++++++
 xtuner_ep_dispatcher.md                       | 161 ++++++-
 2 files changed, 567 insertions(+), 5 deletions(-)
 create mode 100644 ci/scripts/validate_dispatcher_documentation_examples.py

diff --git a/ci/scripts/validate_dispatcher_documentation_examples.py b/ci/scripts/validate_dispatcher_documentation_examples.py
new file mode 100644
index 000000000..9eb75e47f
--- /dev/null
+++ b/ci/scripts/validate_dispatcher_documentation_examples.py
@@ -0,0 +1,411 @@
+from __future__ import annotations
+
+import argparse
+import json
+from fractions import Fraction
+from typing import Any
+
+
+K = 2
+EP_EXPERTS = {
+    0: (0, 1, 2),
+    1: (3, 4, 5),
+}
+SOURCE_TOKENS = {
+    "ep0": ("A0", "A1", "A2", "A3"),
+    "ep1": ("B0", "B1", "B2", "B3"),
+}
+TOKEN_VALUE = {
+    "A0": 10,
+    "A1": 11,
+    "A2": 12,
+    "A3": 13,
+    "B0": 20,
+    "B1": 21,
+    "B2": 22,
+    "B3": 23,
+}
+TOPK_IDS = {
+    "A0": (0, 4),
+    "A1": (3, 1),
+    "A2": (2, 5),
+    "A3": (4, 0),
+    "B0": (1, 3),
+    "B1": (4, 2),
+    "B2": (5, 0),
+    "B3": (3, 1),
+}
+TOPK_WEIGHTS = {
+    "A0": (Fraction(1, 4), Fraction(3, 4)),
+    "A1": (Fraction(2, 5), Fraction(3, 5)),
+    "A2": (Fraction(7, 10), Fraction(3, 10)),
+    "A3": (Fraction(4, 5), Fraction(1, 5)),
+    "B0": (Fraction(1, 5), Fraction(4, 5)),
+    "B1": (Fraction(1, 2), Fraction(1, 2)),
+    "B2": (Fraction(9, 10), Fraction(1, 10)),
+    "B3": (Fraction(7, 20), Fraction(13, 20)),
+}
+FORWARD_ORDER = [
+    "DeepEP dispatch receives source-token rows",
+    "TP AllGather hidden, topK ids, and topK weights",
+    "dispatch_postprocess builds local route-copy layout",
+    "local experts produce ExpertTP partial outputs",
+    "combine_preprocess performs Expert-side topK folding",
+    "TP ReduceScatterRowsSum returns each TP rank source-token slice",
+    "DeepEP combine sends reduced source-token rows back",
+]
+
+
+def _number(value: Fraction | int) -> int | float:
+    if isinstance(value, int) or value.denominator == 1:
+        return int(value)
+    return float(value)
+
+
+def _numbers(values: list[Fraction] | tuple[Fraction, ...]) -> list[int | float]:
+    return [_number(value) for value in values]
+
+
+def _matrix_numbers(values: list[list[Fraction]]) -> list[list[int | float]]:
+    return [_numbers(row) for row in values]
+
+
+def _local_expert_id(global_expert: int, ep_rank: int) -> int:
+    if global_expert not in EP_EXPERTS[ep_rank]:
+        return -1
+    return global_expert - min(EP_EXPERTS[ep_rank])
+
+
+def _source_preprocess(source_rank: str) -> dict[str, Any]:
+    tokens = SOURCE_TOKENS[source_rank]
+    flat_copies = []
+    for slot in range(K):
+        for token_index, token in enumerate(tokens):
+            flat_copies.append(
+                {
+                    "flat_pos": slot * len(tokens) + token_index,
+                    "source_rank": source_rank,
+                    "source_row": token_index,
+                    "token": token,
+                    "global_expert": TOPK_IDS[token][slot],
+                    "topk_slot": slot,
+                }
+            )
+
+    sorted_copies = sorted(flat_copies, key=lambda row: (row["global_expert"], row["source_row"]))
+    row_id_map = [-1] * len(flat_copies)
+    for sorted_row, copy in enumerate(sorted_copies):
+        row_id_map[copy["flat_pos"]] = sorted_row
+
+    return {
+        "tokens": [row["token"] for row in sorted_copies],
+        "global_experts": [row["global_expert"] for row in sorted_copies],
+        "row_id_map": row_id_map,
+        "rows": sorted_copies,
+    }
+
+
+def _all2all_dispatch_rows(preprocessed_sources: dict[str, dict[str, Any]], target_ep_rank: int) -> list[dict[str, Any]]:
+    rows = []
+    for source_rank in ("ep0", "ep1"):
+        for row in preprocessed_sources[source_rank]["rows"]:
+            if row["global_expert"] not in EP_EXPERTS[target_ep_rank]:
+                continue
+            rows.append(
+                {
+                    **row,
+                    "target_ep_rank": target_ep_rank,
+                    "local_expert": _local_expert_id(row["global_expert"], target_ep_rank),
+                }
+            )
+    return rows
+
+
+def _permute_route_rows_by_local_expert(rows: list[dict[str, Any]]) -> dict[str, Any]:
+    sorted_input_indices = sorted(range(len(rows)), key=lambda index: (rows[index]["local_expert"], index))
+    row_ids_map = [-1] * len(rows)
+    post_rows = []
+    for post_row, input_index in enumerate(sorted_input_indices):
+        row_ids_map[input_index] = post_row
+        post_rows.append(rows[input_index])
+
+    tokens_per_expert = [0, 0, 0]
+    for row in post_rows:
+        tokens_per_expert[row["local_expert"]] += 1
+
+    return {
+        "tokens": [row["token"] for row in post_rows],
+        "local_experts": [row["local_expert"] for row in post_rows],
+        "row_ids_map": row_ids_map,
+        "tokens_per_expert": tokens_per_expert,
+        "rows": post_rows,
+    }
+
+
+def _received_rows_for_ep(ep_rank: int) -> list[dict[str, Any]]:
+    rows = []
+    all_tokens = SOURCE_TOKENS["ep0"] + SOURCE_TOKENS["ep1"]
+    for token in all_tokens:
+        topk_ids = []
+        topk_weights = []
+        for slot, global_expert in enumerate(TOPK_IDS[token]):
+            local_expert = _local_expert_id(global_expert, ep_rank)
+            topk_ids.append(local_expert)
+            topk_weights.append(TOPK_WEIGHTS[token][slot] if local_expert >= 0 else Fraction(0))
+        if any(expert >= 0 for expert in topk_ids):
+            rows.append(
+                {
+                    "token": token,
+                    "hidden": TOKEN_VALUE[token],
+                    "topk_ids": topk_ids,
+                    "topk_weights": topk_weights,
+                }
+            )
+    return rows
+
+
+def _local_route_copy_layout(received_rows: list[dict[str, Any]]) -> dict[str, Any]:
+    route_copies = []
+    row_count = len(received_rows)
+    for slot in range(K):
+        for received_row, row in enumerate(received_rows):
+            local_expert = row["topk_ids"][slot]
+            if local_expert < 0:
+                continue
+            route_copies.append(
+                {
+                    "flat_pos": slot * row_count + received_row,
+                    "received_row": received_row,
+                    "topk_slot": slot,
+                    "token": row["token"],
+                    "hidden": row["hidden"],
+                    "local_expert": local_expert,
+                    "topk_weight": row["topk_weights"][slot],
+                }
+            )
+
+    # 中文注释：DeepEP dispatch 收到的是 source-token rows；这里才展开成 expert route-copy rows。
+    post_rows = sorted(route_copies, key=lambda row: (row["local_expert"], row["received_row"]))
+    row_ids_map = [-1] * (row_count * K)
+    for post_row, row in enumerate(post_rows):
+        row_ids_map[row["flat_pos"]] = post_row
+
+    tokens_per_expert = [0, 0, 0]
+    for row in post_rows:
+        tokens_per_expert[row["local_expert"]] += 1
+
+    return {
+        "tokens": [row["token"] for row in post_rows],
+        "local_experts": [row["local_expert"] for row in post_rows],
+        "row_ids_map": row_ids_map,
+        "tokens_per_expert": tokens_per_expert,
+        "rows": post_rows,
+    }
+
+
+def _fold_topk(
+    *,
+    route_outputs: list[Fraction],
+    row_ids_map: list[int],
+    received_rows: list[dict[str, Any]],
+) -> list[Fraction]:
+    row_count = len(received_rows)
+    folded = [Fraction(0) for _ in range(row_count)]
+    for flat_pos, post_row in enumerate(row_ids_map):
+        if post_row < 0:
+            continue
+        slot = flat_pos // row_count
+        received_row = flat_pos % row_count
+        folded[received_row] += route_outputs[post_row] * received_rows[received_row]["topk_weights"][slot]
+    return folded
+
+
+def validate_all2all_example() -> dict[str, Any]:
+    preprocessed = {source_rank: _source_preprocess(source_rank) for source_rank in SOURCE_TOKENS}
+    assert preprocessed["ep0"]["row_id_map"] == [0, 4, 3, 6, 5, 2, 7, 1]
+    assert preprocessed["ep1"]["row_id_map"] == [1, 6, 7, 5, 4, 3, 0, 2]
+
+    dispatched_ep0 = _all2all_dispatch_rows(preprocessed, target_ep_rank=0)
+    dispatched_ep1 = _all2all_dispatch_rows(preprocessed, target_ep_rank=1)
+    assert [row["token"] for row in dispatched_ep0] == ["A0", "A3", "A1", "A2", "B2", "B0", "B3", "B1"]
+    assert [row["token"] for row in dispatched_ep1] == ["A1", "A0", "A3", "A2", "B0", "B3", "B1", "B2"]
+
+    post_ep0 = _permute_route_rows_by_local_expert(dispatched_ep0)
+    post_ep1 = _permute_route_rows_by_local_expert(dispatched_ep1)
+    assert post_ep0["tokens"] == ["A0", "A3", "B2", "A1", "B0", "B3", "A2", "B1"]
+    assert post_ep1["tokens"] == ["A1", "B0", "B3", "A0", "A3", "B1", "A2", "B2"]
+    assert post_ep0["row_ids_map"] == [0, 1, 3, 6, 2, 4, 5, 7]
+    assert post_ep1["row_ids_map"] == [0, 3, 4, 6, 1, 2, 5, 7]
+
+    return {
+        "passed": True,
+        "ep0_dispatch_rows": [row["token"] for row in dispatched_ep0],
+        "ep1_dispatch_rows": [row["token"] for row in dispatched_ep1],
+        "ep0_tokens_per_expert": post_ep0["tokens_per_expert"],
+        "ep1_tokens_per_expert": post_ep1["tokens_per_expert"],
+    }
+
+
+def validate_deepep_example() -> dict[str, Any]:
+    received_by_ep = {ep_rank: _received_rows_for_ep(ep_rank) for ep_rank in EP_EXPERTS}
+    layouts = {ep_rank: _local_route_copy_layout(rows) for ep_rank, rows in received_by_ep.items()}
+    assert layouts[0]["tokens"] == ["A0", "A3", "B2", "A1", "B0", "B3", "A2", "B1"]
+    assert layouts[1]["tokens"] == ["A1", "B0", "B3", "A0", "A3", "B1", "A2", "B2"]
+    assert layouts[0]["tokens_per_expert"] == [3, 3, 2]
+    assert layouts[1]["tokens_per_expert"] == [3, 3, 2]
+
+    pre_combined_by_ep: dict[int, list[Fraction]] = {}
+    for ep_rank, layout in layouts.items():
+        route_outputs = [
+            Fraction(row["hidden"] + (row["local_expert"] + min(EP_EXPERTS[ep_rank])) * 100)
+            for row in layout["rows"]
+        ]
+        pre_combined_by_ep[ep_rank] = _fold_topk(
+            route_outputs=route_outputs,
+            row_ids_map=layout["row_ids_map"],
+            received_rows=received_by_ep[ep_rank],
+        )
+
+    expected_ep0 = [
+        Fraction(5, 2),
+        Fraction(333, 5),
+        Fraction(742, 5),
+        Fraction(13, 5),
+        Fraction(24),
+        Fraction(221, 2),
+        Fraction(11, 5),
+        Fraction(1599, 20),
+    ]
+    expected_ep1 = [
+        Fraction(615, 2),
+        Fraction(622, 5),
+        Fraction(768, 5),
+        Fraction(1652, 5),
+        Fraction(256),
+        Fraction(421, 2),
+        Fraction(2349, 5),
+        Fraction(2261, 20),
+    ]
+    assert pre_combined_by_ep[0] == expected_ep0
+    assert pre_combined_by_ep[1] == expected_ep1
+
+    source_ep0 = [pre_combined_by_ep[0][i] + pre_combined_by_ep[1][i] for i in range(4)]
+    source_ep1 = [pre_combined_by_ep[0][i] + pre_combined_by_ep[1][i] for i in range(4, 8)]
+    assert source_ep0 == [Fraction(310), Fraction(191), Fraction(302), Fraction(333)]
+    assert source_ep1 == [Fraction(280), Fraction(321), Fraction(472), Fraction(193)]
+
+    return {
+        "passed": True,
+        "ep0_pre_combined": _numbers(pre_combined_by_ep[0]),
+        "ep1_pre_combined": _numbers(pre_combined_by_ep[1]),
+        "source_outputs": {
+            "ep0": _numbers(source_ep0),
+            "ep1": _numbers(source_ep1),
+        },
+    }
+
+
+def validate_deepep_expert_tp_example() -> dict[str, Any]:
+    received_rows_by_tp_rank = [
+        [
+            {"token": "S0", "hidden": 10, "topk_ids": [0, 1], "topk_weights": [Fraction(1, 4), Fraction(3, 4)]},
+            {"token": "S1", "hidden": 20, "topk_ids": [2, -1], "topk_weights": [Fraction(3, 5), Fraction(0)]},
+            {"token": "S2", "hidden": 30, "topk_ids": [-1, 0], "topk_weights": [Fraction(0), Fraction(2, 5)]},
+        ],
+        [
+            {"token": "S3", "hidden": 40, "topk_ids": [1, 2], "topk_weights": [Fraction(3, 10), Fraction(7, 10)]},
+            {"token": "S4", "hidden": 50, "topk_ids": [-1, 1], "topk_weights": [Fraction(0), Fraction(1, 2)]},
+        ],
+    ]
+    tp_rank_row_counts = [len(rows) for rows in received_rows_by_tp_rank]
+    gathered_rows = [row for rows in received_rows_by_tp_rank for row in rows]
+    gathered_topk_ids = [row["topk_ids"] for row in gathered_rows]
+    gathered_topk_weights = [row["topk_weights"] for row in gathered_rows]
+
+    assert tp_rank_row_counts == [3, 2]
+    assert gathered_topk_ids == [[0, 1], [2, -1], [-1, 0], [1, 2], [-1, 1]]
+
+    layout = _local_route_copy_layout(gathered_rows)
+    assert len(gathered_rows) == 5
+    assert len(layout["rows"]) == 7
+    assert layout["tokens_per_expert"] == [2, 3, 2]
+    assert layout["tokens"] == ["S0", "S2", "S0", "S3", "S4", "S1", "S3"]
+
+    # 中文注释：两个 ExpertTP rank 分别给出 row-parallel partial；
+    # 先在 expert 侧按 topK fold，再由 ReduceScatterRowsSum 求和并切回 source-token slice。
+    tp0_route_outputs = [Fraction(row["hidden"]) for row in layout["rows"]]
+    tp1_route_outputs = [Fraction(row["local_expert"] * 100) for row in layout["rows"]]
+    tp0_folded = _fold_topk(
+        route_outputs=tp0_route_outputs,
+        row_ids_map=layout["row_ids_map"],
+        received_rows=gathered_rows,
+    )
+    tp1_folded = _fold_topk(
+        route_outputs=tp1_route_outputs,
+        row_ids_map=layout["row_ids_map"],
+        received_rows=gathered_rows,
+    )
+    folded_sum = [left + right for left, right in zip(tp0_folded, tp1_folded)]
+    assert tp0_folded == [Fraction(10), Fraction(12), Fraction(12), Fraction(40), Fraction(25)]
+    assert tp1_folded == [Fraction(75), Fraction(120), Fraction(0), Fraction(170), Fraction(50)]
+    assert folded_sum == [Fraction(85), Fraction(132), Fraction(12), Fraction(210), Fraction(75)]
+
+    reduce_scatter_rows_sum = {
+        "tp0": folded_sum[: tp_rank_row_counts[0]],
+        "tp1": folded_sum[tp_rank_row_counts[0] :],
+    }
+    assert reduce_scatter_rows_sum == {
+        "tp0": [Fraction(85), Fraction(132), Fraction(12)],
+        "tp1": [Fraction(210), Fraction(75)],
+    }
+
+    return {
+        "passed": True,
+        "forward_order": FORWARD_ORDER,
+        "tp_rank_row_counts": tp_rank_row_counts,
+        "gathered_hidden": [row["hidden"] for row in gathered_rows],
+        "gathered_topk_ids": gathered_topk_ids,
+        "gathered_topk_weights": _matrix_numbers(gathered_topk_weights),
+        "route_copy_tokens": layout["tokens"],
+        "route_copy_local_experts": layout["local_experts"],
+        "row_ids_map": layout["row_ids_map"],
+        "tokens_per_expert": layout["tokens_per_expert"],
+        "folded_partials": {
+            "tp0": _numbers(tp0_folded),
+            "tp1": _numbers(tp1_folded),
+        },
+        "folded_sum": _numbers(folded_sum),
+        "reduce_scatter_rows_sum": {
+            "tp0": _numbers(reduce_scatter_rows_sum["tp0"]),
+            "tp1": _numbers(reduce_scatter_rows_sum["tp1"]),
+        },
+        "deepep_combine_inputs": {
+            "tp0": _numbers(reduce_scatter_rows_sum["tp0"]),
+            "tp1": _numbers(reduce_scatter_rows_sum["tp1"]),
+        },
+    }
+
+
+def validate_all() -> dict[str, Any]:
+    return {
+        "all2all": validate_all2all_example(),
+        "deepep": validate_deepep_example(),
+        "deepep_expert_tp": validate_deepep_expert_tp_example(),
+    }
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Validate dispatcher documentation examples.")
+    parser.add_argument("--json", action="store_true", help="Print machine-readable validation results.")
+    args = parser.parse_args()
+
+    payload = validate_all()
+    if args.json:
+        print(json.dumps(payload, sort_keys=True))
+    else:
+        print("dispatcher documentation examples: ok")
+        print("validated: all2all, deepep, deepep_expert_tp")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xtuner_ep_dispatcher.md b/xtuner_ep_dispatcher.md
index a176aab40..a6d3b5d38 100644
--- a/xtuner_ep_dispatcher.md
+++ b/xtuner_ep_dispatcher.md
@@ -893,6 +893,156 @@ post_combined["hidden_states"]: [N, H] = [4, H]
 `combine_postprocess` 不再像 All2All 那样使用 source rank 的 `row_id_map` 和 `topk_weights` 做本地 topK 加权合并；DeepEP 的
 topK 加权已经在 `combine_preprocess` 完成，`combine_postprocess` 主要负责 event 等待和返回 hidden。
 
+## DeepEP + ExpertTP 前向示例
+
+DeepEP + ExpertTP 和 All2All + ExpertTP 的关键区别是 row space 不同：
+
+- **DeepEP received source-token rows** 是 DeepEP dispatch 后收到的 source token 行；TP AllGather 和
+  TP ReduceScatterRowsSum 都按这个行空间记录 `tp_rank_row_counts`。
+- **All2All route-copy rows** 是 All2All dispatch 前已经按 topK 展开的 expert copy 行；DeepEP 不在 dispatch 前展开，
+  只在 `dispatch_postprocess` 里根据 received topK ids 构造本地 route-copy layout。
+
+下面只看一个 DeepEP receiver EP rank 内的 ExpertTP group。这个 EP rank 拥有 local expert `0,1,2`，`K=2`，
+`expert_tp_size=2`。数值只表示 hidden 的第一列，真实实现里是 `[rows, hidden]`。
+
+DeepEP dispatch 后，各 TP rank 先各自拿到本 rank 的 received source-token rows：
+
+```text
+tp0 received source-token rows:
+row:          0        1        2
+source token: S0       S1       S2
+hidden:       10       20       30
+topk_ids:     [0,1]    [2,-1]   [-1,0]
+topk_weights: [.25,.75] [.60,0] [0,.40]
+
+tp1 received source-token rows:
+row:          0        1
+source token: S3       S4
+hidden:       40       50
+topk_ids:     [1,2]    [-1,1]
+topk_weights: [.30,.70] [0,.50]
+```
+
+所以这里的：
+
+```text
+TP rank row counts = [3, 2]
+dispatched["tp_rank_row_counts"] = [3, 2]
+```
+
+这个 `[3, 2]` 描述的是 DeepEP received source-token rows，不是 topK 展开后的 route-copy rows。
+
+### 1. `dispatch`: TP AllGather received source-token rows
+
+`DeepEPDispatcher.dispatch` 在 DeepEP dispatch 后，对 hidden、received topK ids、received topK weights 使用同一份
+`tp_rank_row_counts` 做 TP AllGather，保证三者行顺序一致：
+
+```text
+gathered received rows: S0 S1 S2 | S3 S4
+
+gathered_hidden:
+[10, 20, 30, 40, 50]
+
+gathered_topk_ids:
+[[0, 1], [2, -1], [-1, 0], [1, 2], [-1, 1]]
+
+gathered_topk_weights:
+[[.25, .75], [.60, 0], [0, .40], [.30, .70], [0, .50]]
+```
+
+此时每个 ExpertTP rank 都看到 5 行 source token；还没有变成 7 行 route-copy。
+
+### 2. `dispatch_postprocess`: 构造本地 route-copy layout
+
+`dispatch_postprocess` 消费 gathered `topk_ids`，丢掉 `-1` slot，并在 receiver rank 内按 local expert 分组：
+
+```text
+post row:          0   1 | 2   3   4 | 5   6
+source copy:       S0  S2| S0  S3  S4| S1  S3
+local expert id:   0   0 | 1   1   1 | 2   2
+row_ids_map:       [0, 5, -1, 3, -1, 2, -1, 1, 6, 4]
+tokens_per_expert = [2, 3, 2]
+```
+
+`row_ids_map` 的长度是 `M_recv * K = 5 * 2 = 10`，对应 topk-slot-first 的 received source-token flat 空间。
+有效 route-copy 行数是 `sum(tokens_per_expert) = 7`，它和前面的 received source-token rows 是两个不同的行空间。
+
+### 3. local experts grouped GEMM
+
+为了让数字可检查，示例把两个 ExpertTP rank 的 row-parallel partial output 写成：
+
+```text
+tp0 partial out(source, expert) = hidden
+tp1 partial out(source, expert) = local_expert_id * 100
+```
+
+因此两个 TP rank 在相同 route-copy layout 上分别得到：
+
+```text
+post row:             0   1 | 2    3    4 | 5    6
+source copy:          S0  S2| S0   S3   S4| S1   S3
+local expert id:      0   0 | 1    1    1 | 2    2
+tp0 expert partial:   10  30| 10   40   50| 20   40
+tp1 expert partial:   0   0 | 100  100  100| 200  200
+```
+
+### 4. `combine_preprocess`: Expert-side topK folding
+
+DeepEP 把 topK weights 发到了 expert rank，所以 topK 加权合并发生在 expert side：
+
+```text
+tp0 folded source rows:
+S0 = 10*.25 + 10*.75 = 10
+S1 = 20*.60          = 12
+S2 = 30*.40          = 12
+S3 = 40*.30 + 40*.70 = 40
+S4 = 50*.50          = 25
+
+tp1 folded source rows:
+S0 = 0*.25   + 100*.75 = 75
+S1 = 200*.60           = 120
+S2 = 0*.40             = 0
+S3 = 100*.30 + 200*.70 = 170
+S4 = 100*.50           = 50
+```
+
+这一步输出仍然是 gathered received source-token row space：
+
+```text
+pre_combined tp0 partial: [10, 12, 12, 40, 25]
+pre_combined tp1 partial: [75, 120, 0, 170, 50]
+```
+
+### 5. `combine`: TP ReduceScatterRowsSum 后再 DeepEP combine
+
+`combine` 先执行 TP ReduceScatterRowsSum。它先对两个 TP rank 的 folded partial 做 SUM，再按同一份
+`TP rank row counts = [3, 2]` 切回每个 TP rank 的 received source-token slice：
+
+```text
+SUM over ExpertTP ranks:
+[85, 132, 12, 210, 75]
+
+TP ReduceScatterRowsSum output:
+tp0 rows [0:3] -> [85, 132, 12]
+tp1 rows [3:5] -> [210, 75]
+```
+
+DeepEP combine 在 TP ReduceScatterRowsSum 之后运行；它消费的是每个 TP rank 自己的 reduced source-token rows，
+不是 gathered 5 行，也不是 route-copy 7 行：
+
+```text
+DeepEP combine input on tp0: S0=85, S1=132, S2=12
+DeepEP combine input on tp1: S3=210, S4=75
+```
+
+这个 forward order 和上面的期望输出由脚本校验：
+
+```bash
+python ci/scripts/validate_dispatcher_documentation_examples.py
+```
+
+脚本同时校验本文件里的 All2All 和 DeepEP-only 文档例子，避免 DeepEP + ExpertTP 示例更新时破坏已有例子的行空间推导。
+
 ## Host metadata 同步
 
 DeepEP 不像 `TorchAll2AllDispatcher` 那样在 XTuner 代码里显式执行：
@@ -922,8 +1072,9 @@ num_recv_tokens_per_expert_list, handle, event
 
 ## 当前支持边界
 
-当前 `build_dispatcher(dispatcher="deepep", tp_group=...)` 会直接构造 `DeepEPDispatcher`，`tp_group` 没有接入
-DeepEP dispatcher。也就是说，XTuner 当前的 DeepEP 路径是 EP dispatcher，不包含 `TorchAll2AllTPEPDispatcher`
-那套 TP AllGather / TP ReduceScatterRowsSum 通信段。DeepEP + ExpertTP 如果要成为 Domino-compatible ExpertTP，需要
-额外设计 DeepEP dispatch 后的 TP AllGather、combine 前的 TP ReduceScatterRowsSum，以及相应的 `topk_weights`
-event 语义；这部分见 `xtuner_etp.md`。
+当前 `build_dispatcher(dispatcher="deepep", tp_group=...)` 仍然构造 `DeepEPDispatcher`。`tp_group=None` 时保持
+DeepEP-only 语义；`tp_group` 大小大于 1 时，`DeepEPDispatcher` 在 DeepEP dispatch 后接入 TP AllGather，并在
+DeepEP combine 前接入 TP ReduceScatterRowsSum。
+
+这个支持目标覆盖 BF16 训练 forward/backward 和 Domino-compatible ExpertTP 的 dispatcher 通信边界；`decoding=True`
+和 FP8 DeepEP 通信仍不属于当前范围。

From ab7e5fafbbea3b4da0180fb7e4155888a30520f8 Mon Sep 17 00:00:00 2001
From: zhaopenghao <henryzhao1989@163.com>
Date: Fri, 22 May 2026 02:56:19 +0000
Subject: [PATCH 25/25] Add decoding and fp8 checks in DeepEPDispatcher

---
 xtuner/v1/module/dispatcher/deepep.py | 53 ++++++++++++++-------------
 xtuner/v1/ops/comm/deepep_op.py       | 23 +++++++++---
 2 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/xtuner/v1/module/dispatcher/deepep.py b/xtuner/v1/module/dispatcher/deepep.py
index 19e425001..56fa54db3 100644
--- a/xtuner/v1/module/dispatcher/deepep.py
+++ b/xtuner/v1/module/dispatcher/deepep.py
@@ -313,6 +313,11 @@ def _event_overlap_after_torch_event(event: torch.cuda.Event) -> EventOverlap:
     return buffer_capture()
 
 
+def _raise_if_decoding(decoding: bool) -> None:
+    if decoding:
+        raise NotImplementedError("DeepEPDispatcher does not support decoding=True.")
+
+
 class DeepEPDispatcher(
     GenericDispatcher[
         DeepEPPreDispatchResult,
@@ -348,6 +353,9 @@ def __init__(
             "If you are training a MoE model, it means that `expert parallel` is not enabled in the config."
         )
         self._expert_tp = ExpertTP(tp_group) if tp_group is not None and tp_group.size() > 1 else None
+        if self._expert_tp is not None and (training_dtype == "fp8" or generate_dtype == "fp8"):
+            # TODO: 待测试 fp8
+            raise NotImplementedError("FP8 DeepEP communication is not supported for DeepEP + ExpertTP.")
         if self._expert_tp is not None and DeepEPDispatcher._comm_stream is None:
             DeepEPDispatcher._comm_stream = torch.cuda.Stream(device=DEVICE)
 
@@ -390,6 +398,7 @@ def dispatch(
         async_op: bool = False,
         decoding: bool = False,
     ) -> DeepEPDispatchResult:
+        _raise_if_decoding(decoding)
         hidden_backward_previous_event = None
         hidden_backward_finished_event = None
         topk_weights_backward_previous_event = None
@@ -528,6 +537,7 @@ def dispatch_postprocess(
         async_op: bool = False,
         decoding: bool = False,
     ) -> DeepEPPostDispatchResult:
+        _raise_if_decoding(decoding)
         if async_op:
             assert dispatched["forward_finished_event"] is not None, "Please use `async_op=True` for dispatch!"
             dispatched["forward_finished_event"].current_stream_wait()
@@ -564,14 +574,11 @@ def dispatch_postprocess(
                     )
                 )
 
-        if decoding:
-            raise NotImplementedError
-        else:
-            return DeepEPPostDispatchResult(
-                hidden_states=permuted_hidden_states,
-                row_ids_map=row_ids_map,
-                tokens_per_expert=tokens_per_expert,
-            )
+        return DeepEPPostDispatchResult(
+            hidden_states=permuted_hidden_states,
+            row_ids_map=row_ids_map,
+            tokens_per_expert=tokens_per_expert,
+        )
 
     @override
     def combine_preprocess(
@@ -584,6 +591,7 @@ def combine_preprocess(
         async_op: bool = False,
         decoding: bool = False,
     ) -> DeepEPPreCombineResult:
+        _raise_if_decoding(decoding)
         hidden_states = unpermute(
             hidden_states,
             post_dispatched["row_ids_map"],
@@ -629,15 +637,12 @@ def combine_preprocess(
             forward_finished_event = None
             tp_backward_finished_event = None
 
-        if decoding:
-            raise NotImplementedError
-        else:
-            return DeepEPPreCombineResult(
-                hidden_states=hidden_states,
-                forward_finished_event=forward_finished_event,
-                backward_previous_event=backward_previous_event,
-                tp_backward_finished_event=tp_backward_finished_event,
-            )
+        return DeepEPPreCombineResult(
+            hidden_states=hidden_states,
+            forward_finished_event=forward_finished_event,
+            backward_previous_event=backward_previous_event,
+            tp_backward_finished_event=tp_backward_finished_event,
+        )
 
     @override
     def combine(
@@ -650,6 +655,7 @@ def combine(
         async_op: bool = False,
         decoding: bool = False,
     ) -> CombineResult:
+        _raise_if_decoding(decoding)
         if async_op:
             backward_previous_event = EventOverlap(None)
             assert pre_combined["forward_finished_event"] is not None, "Please use `async_op=True` for combine!"
@@ -708,14 +714,11 @@ def combine(
         if not async_op:
             event.current_stream_wait()
 
-        if not decoding:
-            return DeepEPCombineResult(
-                hidden_states=combined_hidden_states,
-                forward_finished_event=event,
-                backward_previous_event=backward_previous_event,
-            )
-        else:
-            raise NotImplementedError
+        return DeepEPCombineResult(
+            hidden_states=combined_hidden_states,
+            forward_finished_event=event,
+            backward_previous_event=backward_previous_event,
+        )
 
     @override
     def combine_postprocess(
diff --git a/xtuner/v1/ops/comm/deepep_op.py b/xtuner/v1/ops/comm/deepep_op.py
index 575d1ea26..b83559567 100644
--- a/xtuner/v1/ops/comm/deepep_op.py
+++ b/xtuner/v1/ops/comm/deepep_op.py
@@ -85,22 +85,35 @@ def get_low_latency_buffer(
     if _buffer is None:
         # NOTES: for best performance, the QP number **must** be equal to the number of the local experts
         assert num_experts % group.size() == 0
-        # _buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes)
+        num_qps_per_rank = max(num_experts // group.size(), Buffer.num_sms // 2)
         _buffer = Buffer(
             group,
             num_nvl_bytes,
             num_rdma_bytes,
             low_latency_mode=True,
-            num_qps_per_rank=max(num_experts // group.size(), Buffer.num_sms // 2),
+            num_qps_per_rank=num_qps_per_rank,
         )
         logger.info(
-            f"{num_nvl_bytes}, {_buffer.num_nvl_bytes}, {num_max_dispatch_tokens_per_rank}, {hidden}, {num_experts}, {group.size()}"
+            "[DeepEP low-latency] allocated buffer: "
+            f"num_nvl_bytes={num_nvl_bytes} (allocated={_buffer.num_nvl_bytes}), "
+            f"num_rdma_bytes={num_rdma_bytes} (allocated={_buffer.num_rdma_bytes}), "
+            f"num_max_dispatch_tokens_per_rank={num_max_dispatch_tokens_per_rank}, "
+            f"hidden={hidden}, num_experts={num_experts}, ep_group_size={group.size()}, "
+            f"num_qps_per_rank={num_qps_per_rank}"
         )
     else:
         assert num_nvl_bytes <= _buffer.num_nvl_bytes, (
-            f"{num_nvl_bytes}, {_buffer.num_nvl_bytes}, {num_max_dispatch_tokens_per_rank}, {hidden}, {num_experts}, {group.size()}"
+            "[DeepEP low-latency] NVL buffer too small: "
+            f"required={num_nvl_bytes}, allocated={_buffer.num_nvl_bytes}, "
+            f"num_max_dispatch_tokens_per_rank={num_max_dispatch_tokens_per_rank}, "
+            f"hidden={hidden}, num_experts={num_experts}, ep_group_size={group.size()}"
+        )
+        assert num_rdma_bytes <= _buffer.num_rdma_bytes, (
+            "[DeepEP low-latency] RDMA buffer too small: "
+            f"required={num_rdma_bytes}, allocated={_buffer.num_rdma_bytes}, "
+            f"num_max_dispatch_tokens_per_rank={num_max_dispatch_tokens_per_rank}, "
+            f"hidden={hidden}, num_experts={num_experts}, ep_group_size={group.size()}"
         )
-        assert num_rdma_bytes <= _buffer.num_rdma_bytes
     return _buffer