Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
6918c37
init xtuner ep doc
jayhenry Apr 28, 2026
b044513
fix row_id_map in dispatch_preprocess
jayhenry Apr 28, 2026
4c0147f
feat(dispatcher): add torch all2all TP/EP dispatcher and TP+EP docs v…
jayhenry Apr 28, 2026
111f35e
add tp ep demo case with moe block
jayhenry Apr 28, 2026
fa8cabd
add tp_mesh into moe model and decoder layer
jayhenry Apr 28, 2026
30c3fd9
add more background docs for parallel training
jayhenry May 12, 2026
6c18915
[WIP] GroupedLinear support real TP shard; but loss grad is wrong now
jayhenry May 12, 2026
71c40ae
[Fix] ETP calculates correct loss grad; Tighten the numerical precisi…
jayhenry May 13, 2026
c2638b0
Enhance TP/EP dispatcher with async operations for Domino
jayhenry May 13, 2026
802d6d0
Enhance documentation on host metadata synchronization in variable-le…
jayhenry May 13, 2026
3e5bf67
Refactor TPEP TP collectives
jayhenry May 14, 2026
419134f
Support Naive ExpertTP without EP
jayhenry May 14, 2026
a3ecc11
Support async Naive ExpertTP events
jayhenry May 15, 2026
d18a3a7
Fix MoE compile config for ExpertTP
jayhenry May 20, 2026
c373405
Validate ExpertTP-only training
jayhenry May 21, 2026
92741d0
Add Domino ExpertTP-only engine test
jayhenry May 21, 2026
aa5c9b8
Share ExpertTP row collectives in All2All dispatcher
jayhenry May 21, 2026
941d83e
add deepep doc and validate scripts
jayhenry May 21, 2026
bf7af7a
Add sync DeepEP ExpertTP dispatcher path
jayhenry May 21, 2026
9bb255d
Add async DeepEP ExpertTP dispatcher path
jayhenry May 21, 2026
877182c
Add DeepEP ExpertTP TrainEngine equivalence test
jayhenry May 21, 2026
8df2f1b
Add DeepEP ExpertTP single-model baseline test
jayhenry May 21, 2026
515d6dc
Add DeepEP ExpertTP Domino micro-batch test
jayhenry May 21, 2026
262a95f
Document DeepEP ExpertTP forward example
jayhenry May 21, 2026
ab7e5fa
Add decoding and fp8 checks in DeepEPDispatcher
jayhenry May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .dev_scripts/run_test_moe_train_engine_tpep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Run the EP+TP training unit test.
# Requires 4 GPUs (EP=2 * TP=2 * DP=1).
#
# Optional environment overrides (defaults in parentheses):
#   CONDA_ENV                      conda environment to activate (fla)
#   XTUNER_USE_CUTLASS_GROUP_GEMM  exported to the test process (1)
#   CUDA_VISIBLE_DEVICES           GPUs visible to the test process (0,1,2,3)
#   MASTER_PORT                    exported to the test process (29533)
set -euo pipefail

# Locate the repository root relative to this script so it can be run from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

CONDA_ENV="${CONDA_ENV:-fla}"
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate "${CONDA_ENV}"

XTUNER_USE_CUTLASS_GROUP_GEMM="${XTUNER_USE_CUTLASS_GROUP_GEMM:-1}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
MASTER_PORT="${MASTER_PORT:-29533}"

# Put this checkout first on PYTHONPATH; the ':+' form avoids a trailing ':'
# (which Python would treat as the current directory) when PYTHONPATH is unset.
export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
export CUDA_VISIBLE_DEVICES
export XTUNER_USE_CUTLASS_GROUP_GEMM
# Without an export the default above stays a plain shell variable and never
# reaches pytest or the processes it spawns.
# NOTE(review): presumably the test's distributed setup reads MASTER_PORT - confirm.
export MASTER_PORT

cd "${REPO_ROOT}"
python -m pytest \
  tests/engine/test_moe_train_engine_tpep.py \
  -v \
  -x \
  --no-header
33 changes: 33 additions & 0 deletions .dev_scripts/run_validate_moeblock_tpep_vs_single.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Launch .dev_scripts/validate_moeblock_tpep_vs_single.py under torchrun.
#
# Optional environment overrides and their defaults:
#   CONDA_ENV=fla
#   EP_SIZE=2  TP_SIZE=2  DP_SIZE=1
#   NPROC_PER_NODE=EP_SIZE*TP_SIZE*DP_SIZE
#   CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#   MASTER_PORT=29532
#   XTUNER_USE_CUTLASS_GROUP_GEMM=1
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Activate the conda environment before anything else; it is "fla" unless the
# caller overrides CONDA_ENV on the command line.
: "${CONDA_ENV:=fla}"
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate "${CONDA_ENV}"

# Parallel layout defaults to EP=2, TP=2, DP=1; each value is kept if already set.
: "${EP_SIZE:=2}"
: "${TP_SIZE:=2}"
: "${DP_SIZE:=1}"
# The product is only computed when NPROC_PER_NODE was not supplied.
: "${NPROC_PER_NODE:=$((EP_SIZE * TP_SIZE * DP_SIZE))}"
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${MASTER_PORT:=29532}"
: "${XTUNER_USE_CUTLASS_GROUP_GEMM:=1}"

# Put this checkout first on PYTHONPATH so the repository's xtuner is imported
# rather than a copy installed in the conda environment or elsewhere.
if [[ -n "${PYTHONPATH:-}" ]]; then
  PYTHONPATH="${REPO_ROOT}:${PYTHONPATH}"
else
  PYTHONPATH="${REPO_ROOT}"
fi
export PYTHONPATH
export CUDA_VISIBLE_DEVICES EP_SIZE TP_SIZE DP_SIZE XTUNER_USE_CUTLASS_GROUP_GEMM

cd "${REPO_ROOT}"
launch_args=(
  --nproc-per-node="${NPROC_PER_NODE}"
  --master-port="${MASTER_PORT}"
  .dev_scripts/validate_moeblock_tpep_vs_single.py
)
torchrun "${launch_args[@]}"
31 changes: 31 additions & 0 deletions .dev_scripts/run_validate_xtuner_deepep_md.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Launch .dev_scripts/validate_xtuner_deepep_md.py under torchrun.
#
# Optional environment overrides (defaults in parentheses):
#   CONDA_ENV (pt29_sg59), XTUNER_EP_DEBUG (1), EP_SIZE (2), DP_SIZE (4),
#   NPROC_PER_NODE (EP_SIZE * DP_SIZE), CUDA_VISIBLE_DEVICES (0-7),
#   MASTER_PORT (29532)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Use the pt29_sg59 environment by default; override CONDA_ENV on the command
# line to switch.
CONDA_ENV="${CONDA_ENV:-pt29_sg59}"
# Quoted so a conda base path containing whitespace is not word-split.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate "${CONDA_ENV}"

export XTUNER_EP_DEBUG="${XTUNER_EP_DEBUG:-1}"

# The DeepEP example in xtuner_ep_dispatcher.md is fixed at EP=2; by default
# 4 DP replicas are validated in addition.
EP_SIZE="${EP_SIZE:-2}"
DP_SIZE="${DP_SIZE:-4}"
NPROC_PER_NODE="${NPROC_PER_NODE:-$((EP_SIZE * DP_SIZE))}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
MASTER_PORT="${MASTER_PORT:-29532}"

# Explicitly use this checkout so that an xtuner installed in the conda
# environment or another directory is not imported instead.
export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
export CUDA_VISIBLE_DEVICES
export EP_SIZE
export DP_SIZE

cd "${REPO_ROOT}"
torchrun \
  --nproc-per-node="${NPROC_PER_NODE}" \
  --master-port="${MASTER_PORT}" \
  .dev_scripts/validate_xtuner_deepep_md.py
31 changes: 31 additions & 0 deletions .dev_scripts/run_validate_xtuner_ep_md.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Launch .dev_scripts/validate_xtuner_ep_md.py under torchrun.
#
# Optional environment overrides (defaults in parentheses):
#   CONDA_ENV (fla), EP_SIZE (2), DP_SIZE (4),
#   NPROC_PER_NODE (EP_SIZE * DP_SIZE), CUDA_VISIBLE_DEVICES (0-7),
#   MASTER_PORT (29531)
# XTUNER_EP_DEBUG is always exported as 1.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Use the fla environment by default; override CONDA_ENV on the command line
# to switch.
CONDA_ENV="${CONDA_ENV:-fla}"
# Quoted so a conda base path containing whitespace is not word-split.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate "${CONDA_ENV}"

export XTUNER_EP_DEBUG=1

# The example in xtuner_ep.md is fixed at EP=2; by default 4 DP replicas are
# validated in addition.
EP_SIZE="${EP_SIZE:-2}"
DP_SIZE="${DP_SIZE:-4}"
NPROC_PER_NODE="${NPROC_PER_NODE:-$((EP_SIZE * DP_SIZE))}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
MASTER_PORT="${MASTER_PORT:-29531}"

# Explicitly use this checkout so that an xtuner installed in the conda
# environment or another directory is not imported instead.
export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
export CUDA_VISIBLE_DEVICES
export EP_SIZE
export DP_SIZE

cd "${REPO_ROOT}"
torchrun \
  --nproc-per-node="${NPROC_PER_NODE}" \
  --master-port="${MASTER_PORT}" \
  .dev_scripts/validate_xtuner_ep_md.py
33 changes: 33 additions & 0 deletions .dev_scripts/run_validate_xtuner_tpep_md.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Launch .dev_scripts/validate_xtuner_tpep_md.py under torchrun.
#
# Optional environment overrides (defaults in parentheses):
#   CONDA_ENV (fla), EP_SIZE (2), TP_SIZE (2), DP_SIZE (1),
#   NPROC_PER_NODE (EP_SIZE * TP_SIZE * DP_SIZE), CUDA_VISIBLE_DEVICES (0-7),
#   MASTER_PORT (29531)
# XTUNER_TPEP_DEBUG is always exported as 1.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Use the fla environment by default; override CONDA_ENV on the command line
# to switch.
CONDA_ENV="${CONDA_ENV:-fla}"
# Quoted so a conda base path containing whitespace is not word-split.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate "${CONDA_ENV}"

export XTUNER_TPEP_DEBUG=1

# Default layout is EP=2, TP=2, DP=1, i.e. 4 processes; each value can be
# overridden from the environment.
EP_SIZE="${EP_SIZE:-2}"
TP_SIZE="${TP_SIZE:-2}"
DP_SIZE="${DP_SIZE:-1}"
NPROC_PER_NODE="${NPROC_PER_NODE:-$((EP_SIZE * TP_SIZE * DP_SIZE))}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
MASTER_PORT="${MASTER_PORT:-29531}"

# Explicitly use this checkout so that an xtuner installed in the conda
# environment or another directory is not imported instead.
export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
export CUDA_VISIBLE_DEVICES
export EP_SIZE
export TP_SIZE
export DP_SIZE

cd "${REPO_ROOT}"
torchrun \
  --nproc-per-node="${NPROC_PER_NODE}" \
  --master-port="${MASTER_PORT}" \
  .dev_scripts/validate_xtuner_tpep_md.py
Loading