2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@
<p align="center">
<img src="https://img.shields.io/badge/python-3.11-5be.svg">
<img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
-<a href="https://github.com/NVIDIA/Megatron-LM/"><img src="https://img.shields.io/badge/megatron--core-%E2%89%A50.12-76B900.svg"></a>
+<a href="https://github.com/NVIDIA/Megatron-LM/"><img src="https://img.shields.io/badge/megatron--core-%E2%89%A50.15-76B900.svg"></a>
<!-- <a href="https://mcore-bridge.readthedocs.io/en/latest/"><img src="https://img.shields.io/badge/docs-latest-blue.svg"></a> -->
<a href="https://pypi.org/project/mcore-bridge/"><img src="https://badge.fury.io/py/mcore-bridge.svg"></a>
<a href="https://github.com/modelscope/mcore-bridge/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/mcore-bridge"></a>
2 changes: 1 addition & 1 deletion README_zh.md
@@ -20,7 +20,7 @@
<p align="center">
<img src="https://img.shields.io/badge/python-3.11-5be.svg">
<img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
-<a href="https://github.com/NVIDIA/Megatron-LM/"><img src="https://img.shields.io/badge/megatron--core-%E2%89%A50.12-76B900.svg"></a>
+<a href="https://github.com/NVIDIA/Megatron-LM/"><img src="https://img.shields.io/badge/megatron--core-%E2%89%A50.15-76B900.svg"></a>
<!-- <a href="https://mcore-bridge.readthedocs.io/en/latest/"><img src="https://img.shields.io/badge/docs-latest-blue.svg"></a> -->
<a href="https://pypi.org/project/mcore-bridge/"><img src="https://badge.fury.io/py/mcore-bridge.svg"></a>
<a href="https://github.com/modelscope/mcore-bridge/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/mcore-bridge"></a>
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
-megatron-core>=0.12
+megatron-core>=0.15
modelscope
peft>=0.11,<0.20
safetensors
16 changes: 2 additions & 14 deletions src/mcore_bridge/bridge/gpt_bridge.py
@@ -1,13 +1,11 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
import math
-import megatron.core
import re
import torch
import torch.distributed as dist
import torch.nn.functional as F
from contextlib import contextmanager
from megatron.core import mpu
-from packaging import version
from peft import PeftModel
from peft.utils import ModulesToSaveWrapper
from tqdm import tqdm
@@ -22,8 +20,6 @@

logger = get_logger()

-mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
-
EP_PP_SIZE = None
EP_PP_GROUP = None
EP_PP_RANK = None
@@ -60,7 +56,6 @@ def __init__(self, config: ModelConfig):
self.model_type = config.hf_model_type
self.llm_model_type = config.llm_model_type
self.is_multimodal = config.is_multimodal
-self.mcore_014 = version.parse(megatron.core.__version__) >= version.parse('0.14.0rc0')
self.module_mapping = config.model_meta.visual_cls.module_mapping if self.is_multimodal else {}
self.tp_size = self.config.tensor_model_parallel_size
self.pp_size = self.config.pipeline_model_parallel_size
@@ -130,9 +125,6 @@ def _get_tp_split_dim(self, mg_key: Optional[str]) -> Optional[int]:
}
if self.config.task_type in {'causal_lm', 'generative_reranker'}:
dim0_keys.add('output_layer')
-if not self.mcore_014:
-    # https://github.com/NVIDIA/Megatron-LM/commit/720c8b40d8e7e2de1dd303d792f29093101c5e72
-    dim0_keys.update({'linear_q_down_proj', 'linear_kv_down_proj'})
# RowLinear
dim1_keys = {'out_proj', 'linear_proj', 'linear_fc2'}
if 'lora_A' not in mg_key and 'lora_B' not in mg_key:
@@ -1679,12 +1671,8 @@ def _convert(self, mg_models, hf_state_dict, hf_prefix: str, to_mcore: bool, tqd
hf_state_dict = {}
mg_models = iter(mg_models)
mg_model = next(mg_models)
-if mcore_013:
-    is_pp_first_stage = mpu.is_pipeline_first_stage(ignore_virtual=False, vp_stage=mg_model.vp_stage)
-    is_pp_last_stage = mpu.is_pipeline_last_stage(ignore_virtual=False, vp_stage=mg_model.vp_stage)
-else:
-    is_pp_first_stage = mpu.is_pipeline_first_stage()
-    is_pp_last_stage = mpu.is_pipeline_last_stage()
+is_pp_first_stage = mpu.is_pipeline_first_stage(ignore_virtual=False, vp_stage=mg_model.vp_stage)
+is_pp_last_stage = mpu.is_pipeline_last_stage(ignore_virtual=False, vp_stage=mg_model.vp_stage)
if not to_mcore or is_pp_first_stage:
hf_state_dict.update(self._convert_pre_process(mg_model, hf_state_dict, '', to_mcore))
if to_mcore:
12 changes: 1 addition & 11 deletions src/mcore_bridge/model/gpt_model.py
@@ -1,7 +1,6 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
import copy
import math
-import megatron.core
import os
import torch
import torch.nn.functional as F
@@ -20,7 +19,6 @@
from megatron.core.transformer.multi_token_prediction import MTPLossAutoScaler, MTPLossLoggingHelper, roll_tensor
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.utils import WrappedTensor, deprecate_inference_params
-from packaging import version
from typing import Optional, Tuple

from mcore_bridge.config import ModelConfig
@@ -30,8 +28,6 @@

logger = get_logger()

-mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
-

class OutputLayerLinear(TELinear):

@@ -79,12 +75,6 @@ def __init__(
config.mscale_all_dim = hf_rope_scaling['mscale_all_dim']
config.rotary_scaling_factor = hf_rope_scaling['factor']
self.hf_rope_scaling = hf_rope_scaling
-if mcore_013:
-    kwargs = {'vp_stage': vp_stage}
-else:
-    self.vp_stage = vp_stage
-    assert vp_stage is None, 'megatron-core==0.12 does not support vp_stage'
-    kwargs = {}
super().__init__(
config,
transformer_layer_spec,
@@ -96,7 +86,7 @@ def __init__(
position_embedding_type=config.position_embedding_type,
rotary_base=config.rotary_base,
mtp_block_spec=mtp_block_spec,
-**kwargs,
+vp_stage=vp_stage,
)
if config.multi_latent_attention:
self.rotary_pos_emb = RotaryEmbedding(
4 changes: 0 additions & 4 deletions src/mcore_bridge/model/gpts/glm4.py
@@ -1,13 +1,11 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
-import megatron.core
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from megatron.core.extensions.transformer_engine import TENorm
from megatron.core.transformer import transformer_layer
from megatron.core.transformer.attention import SelfAttention
from megatron.core.transformer.mlp import MLP, apply_swiglu_sharded_factory
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.utils import sharded_state_dict_default
-from packaging import version
from typing import Optional

from mcore_bridge.bridge import GPTBridge
@@ -16,8 +14,6 @@
from ..constant import ModelType
from ..register import ModelLoader, ModelMeta, register_model

-mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
-

class Glm4SelfAttention(SelfAttention):

4 changes: 0 additions & 4 deletions src/mcore_bridge/model/gpts/minimax_m2.py
@@ -1,12 +1,10 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
-import megatron.core
from megatron.core import mpu
from megatron.core.tensor_parallel.mappings import (gather_from_tensor_model_parallel_region,
scatter_to_tensor_model_parallel_region)
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.spec_utils import build_module
-from packaging import version
from typing import Optional

from mcore_bridge.bridge import GPTBridge
@@ -15,8 +13,6 @@
from ..constant import ModelType
from ..register import ModelLoader, ModelMeta, register_model

-mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
-

class MinimaxM2SelfAttention(SelfAttention):

7 changes: 1 addition & 6 deletions src/mcore_bridge/model/gpts/olmoe.py
@@ -1,4 +1,3 @@
-import megatron.core
import torch
import torch.distributed as dist
from copy import deepcopy
@@ -9,7 +8,6 @@
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules, get_num_layers_to_build
from megatron.core.transformer.transformer_layer import get_transformer_layer_offset
-from packaging import version
from typing import Optional

from mcore_bridge.bridge import GPTBridge
@@ -19,8 +17,6 @@
from ..constant import ModelType
from ..register import ModelLoader, ModelMeta, register_model

-mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
-

class OLMoESelfAttention(SelfAttentionBase):

@@ -78,13 +74,12 @@ def get_olmoe_decoder_block_spec(
) -> TransformerBlockSubmodules:
"""GPT block spec."""
layer_norm_impl = TENorm
-kwargs = {'use_kitchen': config.use_kitchen} if mcore_013 else {}
moe_layer_spec = get_gpt_layer_with_transformer_engine_spec(
num_experts=config.num_moe_experts,
moe_grouped_gemm=config.moe_grouped_gemm,
qk_layernorm=True,
multi_latent_attention=False,
-**kwargs,
+use_kitchen=config.use_kitchen,
)
layer_specs = []
for _ in range(config.num_layers):
29 changes: 6 additions & 23 deletions src/mcore_bridge/model/gpts/qwen3_next.py
@@ -1,5 +1,4 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
-import megatron.core
import torch
from copy import deepcopy
from megatron.core.extensions.transformer_engine import TEColumnParallelLinear, _get_extra_te_kwargs
@@ -16,7 +15,6 @@
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
from megatron.core.utils import deprecate_inference_params, is_fa_min_version
-from packaging import version
from transformers.utils import is_torch_npu_available
from typing import Optional, Tuple, Union

@@ -27,8 +25,6 @@
from ..constant import ModelType
from ..register import ModelLoader, ModelMeta, register_model

-mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
-mcore_015 = version.parse(megatron.core.__version__) >= version.parse('0.15.0rc0')
try:
from flashattn_hopper.flash_attn_interface import _flash_attn_forward
from flashattn_hopper.flash_attn_interface import flash_attn_with_kvcache as flash_attn3_with_kvcache
@@ -102,10 +98,6 @@ class Qwen3NextSelfAttention(SelfAttention):
def __init__(self, config: ModelConfig, submodules: SelfAttentionSubmodules, *args, **kwargs):
super(SelfAttention, self).__init__(config, submodules, *args, attention_type='self', **kwargs)
kwargs = {}
-if mcore_015:
-    kwargs['tp_group'] = self.pg_collection.tp
-elif mcore_013:
-    kwargs['tp_group'] = self.model_comm_pgs.tp
self.linear_qkv = build_module(
submodules.linear_qkv,
self.config.hidden_size,
@@ -117,6 +109,7 @@ def __init__(self, config: ModelConfig, submodules: SelfAttentionSubmodules, *ar
skip_bias_add=False,
is_expert=False,
tp_comm_buffer_name='qkv',
+tp_group=self.pg_collection.tp,
**kwargs,
)

@@ -253,7 +246,7 @@ def nvtx_range_push(*args, **kwargs):
if (in_decode_mode and self.config.enable_cuda_graph and inference_context.is_static_batching()):
raise ValueError('CUDA graphs must use flash decode with static batching!')

-result = self._adjust_key_value_for_inference(
+query, key, value, rotary_pos_emb, attn_mask_type, block_table = self._adjust_key_value_for_inference(
inference_context,
query,
key,
@@ -263,10 +256,6 @@
rotary_pos_sin,
sequence_len_offset,
)
-if mcore_013:
-    query, key, value, rotary_pos_emb, attn_mask_type, block_table = result
-else:
-    query, key, value, rotary_pos_emb, attn_mask_type = result

if packed_seq_params is not None:
query = query.squeeze(1)
@@ -277,11 +266,6 @@
# ================================================
# relative positional embedding (rotary embedding)
# ================================================
-kwargs = {}
-if mcore_015:
-    kwargs['cp_group'] = self.pg_collection.cp
-elif mcore_013:
-    kwargs['cp_group'] = self.model_comm_pgs.cp
nvtx_range_push(suffix='rotary_pos_emb')
if rotary_pos_emb is not None and not self.config.flash_decode:
q_pos_emb, k_pos_emb = rotary_pos_emb
@@ -306,11 +290,11 @@
q_pos_emb,
config=self.config,
cu_seqlens=cu_seqlens_q,
-**kwargs,
+cp_group=self.pg_collection.cp,
)
else:
-query = inference_context.apply_rotary_emb_query(query, q_pos_emb, self.config, cu_seqlens_q,
-    **kwargs)
+query = inference_context.apply_rotary_emb_query(
+    query, q_pos_emb, self.config, cu_seqlens_q, cp_group=self.pg_collection.cp)
if k_pos_emb is not None:
key = apply_rotary_pos_emb(
key,
@@ -561,13 +545,12 @@ def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):

# Use Zero-Centered RMSNorm to match HuggingFace exactly (no +1/-1 conversion needed)
layer_norm_impl = Qwen3NextRMSNorm
-kwargs = {'use_kitchen': config.use_kitchen} if mcore_013 else {}
moe_layer_spec = get_gpt_layer_with_transformer_engine_spec(
num_experts=config.num_moe_experts,
moe_grouped_gemm=config.moe_grouped_gemm,
qk_layernorm=config.qk_layernorm,
multi_latent_attention=config.multi_latent_attention,
-**kwargs,
+use_kitchen=config.use_kitchen,
)
layer_specs = []
for is_linear_attention in self.config.linear_attention_freq:
8 changes: 2 additions & 6 deletions src/mcore_bridge/model/mm_gpt_model.py
@@ -7,15 +7,12 @@
from megatron.core.tensor_parallel import VocabParallelEmbedding, reduce_scatter_to_sequence_parallel_region
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec
-from packaging import version

from mcore_bridge.config import ModelConfig
from mcore_bridge.utils import split_cp_inputs

from .gpt_model import GPTModel

-mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
-

class MultimodalGPTModel(MegatronModule):

@@ -60,9 +57,8 @@ def forward(_self, input_):
res = split_cp_inputs(res, getattr(packed_seq_params, 'cu_seqlens_q', None), 1)
if reduce_scatter_embeddings:
res = res.transpose(0, 1).contiguous()
-group_kwargs = {'group': _self.tp_group} if mcore_013 else {}
-res = reduce_scatter_to_sequence_parallel_region(res, **
-    group_kwargs) / self.config.tensor_model_parallel_size
+res = reduce_scatter_to_sequence_parallel_region(
+    res, group=_self.tp_group) / self.config.tensor_model_parallel_size
return res

VocabParallelEmbedding.forward = forward
4 changes: 0 additions & 4 deletions src/mcore_bridge/model/mm_gpts/llama4.py
@@ -1,10 +1,8 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
-import megatron.core
import torch
from copy import deepcopy
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.transformer_layer import get_transformer_layer_offset
-from packaging import version
from transformers import PretrainedConfig
from typing import Optional

@@ -14,8 +12,6 @@
from ..register import ModelLoader, ModelMeta, register_model
from .utils import HuggingFaceVit

-mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
-

class Llama4Vit(HuggingFaceVit):
module_mapping = {'multi_modal_projector': 'multi_modal_projector', 'vision_model': 'vision_model'}