forked from deepseek-ai/DeepEP
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup.py
More file actions
191 lines (160 loc) · 7.49 KB
/
Copy pathsetup.py
File metadata and controls
191 lines (160 loc) · 7.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import ast
import re
import os
import subprocess
import setuptools
import importlib
from pathlib import Path
from setuptools.command.build_py import build_py
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# `import importlib` alone does not guarantee that the `importlib.util` submodule is loaded,
# so import it explicitly instead of relying on another package having imported it first
import importlib.util

# Absolute directory that contains this `setup.py`
current_dir = os.path.dirname(os.path.realpath(__file__))

# Names of the environment variables that `CustomBuildPy` persists into the generated `deep_ep/envs.py`
persistent_env_names = ('EP_JIT_CACHE_DIR', 'EP_JIT_PRINT_COMPILER_COMMAND', 'EP_NUM_TOPK_IDX_BITS', 'EP_NCCL_ROOT_DIR')

# Load discover module without triggering `deep_ep.__init__`
find_pkgs_spec = importlib.util.spec_from_file_location(
    'find_pkgs',
    os.path.join(current_dir, 'deep_ep', 'utils', 'find_pkgs.py'),
)
find_pkgs = importlib.util.module_from_spec(find_pkgs_spec)
find_pkgs_spec.loader.exec_module(find_pkgs)
# Wheel specific: the wheels only include the SO name of the host library `libnvshmem_host.so.X`
def get_nvshmem_host_lib_name(base_dir):
    """Return the file name of the NVSHMEM host library under ``<base_dir>/lib``.

    The ``lib`` directory is searched recursively for files matching
    ``libnvshmem_host.so.*``. When several files match, the lexicographically
    smallest name is returned, so the result does not depend on the order in
    which the filesystem lists the entries.

    Args:
        base_dir: root directory of the NVSHMEM installation.

    Returns:
        The bare file name (no directory part) of the matching library.

    Raises:
        ModuleNotFoundError: if no matching file exists.
    """
    lib_dir = Path(base_dir).joinpath('lib')
    # `rglob` yields matches in arbitrary order, so sort to make the choice deterministic
    candidates = sorted(file.name for file in lib_dir.rglob('libnvshmem_host.so.*'))
    if not candidates:
        raise ModuleNotFoundError('libnvshmem_host.so not found')
    return candidates[0]
def get_package_version():
    """Return the package version string ``<public version><local revision>``.

    The public version is the literal assigned to ``__version__`` in
    ``deep_ep/__init__.py`` next to this file. The local revision is
    ``+<short git hash of HEAD>`` when ``git status --porcelain`` reports a
    clean working directory, and ``+local`` when the working directory has
    uncommitted changes or any of the git queries fails.

    Returns:
        The combined version string.

    Raises:
        RuntimeError: if ``deep_ep/__init__.py`` has no ``__version__`` assignment.
    """
    with open(Path(current_dir) / 'deep_ep' / '__init__.py', 'r') as f:
        version_match = re.search(r'^__version__\s*=\s*(.*)$', f.read(), re.MULTILINE)
    if version_match is None:
        raise RuntimeError('Unable to find `__version__` in `deep_ep/__init__.py`')
    public_version = ast.literal_eval(version_match.group(1))

    # Fallback used for a dirty working directory and for any git failure
    revision = '+local'
    # Best effort: any failure keeps the `+local` fallback, but `Exception` (unlike a bare
    # `except:`) still lets `KeyboardInterrupt` and `SystemExit` propagate
    # noinspection PyBroadException
    try:
        status_cmd = ['git', 'status', '--porcelain']
        status_output = subprocess.check_output(status_cmd).decode('ascii').strip()
        if status_output:
            # An explicit branch instead of `assert False`, which would be stripped under `python -O`
            print(f'Warning: Git working directory is not clean. Uncommitted changes:\n{status_output}')
        else:
            cmd = ['git', 'rev-parse', '--short', 'HEAD']
            revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
    except Exception:
        revision = '+local'
    return f'{public_version}{revision}'
class CustomBuildPy(build_py):
    """`build_py` command that also generates `deep_ep/envs.py` in the build directory."""

    def run(self):
        """Generate `envs.py`, then run the regular `build_py` step."""
        # Make clusters' cache setting default into `envs.py`
        self.generate_default_envs()

        # Finally, run the regular build
        build_py.run(self)

    def generate_default_envs(self):
        """Write `deep_ep/envs.py` under `self.build_lib`.

        The generated module defines a `persistent_envs` dict holding the
        build-time value of every name in `persistent_env_names` that is
        currently set in the environment.
        """
        lines = [
            '# Pre-installed environment variables\n',
            'persistent_envs = dict()\n',
        ]
        # noinspection PyShadowingNames
        for name in persistent_env_names:
            if name in os.environ:
                # `!r` emits a valid Python string literal even when the value contains quotes,
                # backslashes or newlines; raw interpolation would produce broken or injected code
                lines.append(f'persistent_envs[{name!r}] = {os.environ[name]!r}\n')

        # Create temporary build directory
        build_include_dir = os.path.join(self.build_lib, 'deep_ep')
        os.makedirs(build_include_dir, exist_ok=True)
        with open(os.path.join(build_include_dir, 'envs.py'), 'w') as f:
            f.write(''.join(lines))
# Build entry point: collect sources, include/library paths and compiler/linker flags from the
# discovered NVSHMEM/NCCL roots and the environment, print a summary, then call `setuptools.setup`
if __name__ == '__main__':
    # TODO: make NVSHMEM and legacy optional
    nvshmem_root_dir = find_pkgs.find_nvshmem_root()
    nccl_root_dir = find_pkgs.find_nccl_root()

    # `128,2417` is used to suppress warnings of `fmt`
    cxx_flags = ['-O3', '-Wno-deprecated-declarations', '-Wno-unused-variable', '-Wno-sign-compare', '-Wno-reorder', '-Wno-attributes']
    nvcc_flags = ['-O3', '-Xcompiler', '-O3', '--extended-lambda', '--diag-suppress=128,2417']
    sources = ['csrc/python_api.cpp', 'csrc/kernels/legacy/layout.cu', 'csrc/kernels/legacy/intranode.cu']
    include_dirs = [f'{current_dir}/deep_ep/include',
                    f'{current_dir}/third-party/fmt/include',
                    '/usr/local/cuda/include/cccl']
    library_dirs = []
    nvcc_dlink = []
    extra_link_args = ['-lcuda']

    # NVSHMEM flags
    sources.extend(['csrc/kernels/legacy/internode.cu', 'csrc/kernels/legacy/internode_ll.cu', 'csrc/kernels/backend/nvshmem.cu'])
    include_dirs.extend([f'{nvshmem_root_dir}/include'])
    library_dirs.extend([f'{nvshmem_root_dir}/lib'])
    nvcc_dlink.extend(['-dlink', f'-L{nvshmem_root_dir}/lib', '-lnvshmem_device'])
    extra_link_args.extend(['-l:libnvshmem_host.so', '-l:libnvshmem_device.a', f'-Wl,-rpath,{nvshmem_root_dir}/lib'])

    # NCCL flags
    # NOTE(review): `{nccl_root_dir}/lib` is only added as an rpath, not to `library_dirs`, so
    # `-l:libnccl.so` presumably has to be found on the linker's existing search path - confirm
    sources.extend(['csrc/kernels/backend/nccl.cu'])
    include_dirs.extend([f'{nccl_root_dir}/include'])
    extra_link_args.extend(['-l:libnccl.so', f'-Wl,-rpath,{nccl_root_dir}/lib'])

    # CUDA driver sources
    sources.extend(['csrc/kernels/backend/cuda_driver.cu'])

    # TODO: remove these
    if int(os.getenv('DISABLE_SM90_FEATURES', 0)):
        # Prefer A100
        os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '8.0')

        # Disable some SM90 features: FP8, launch methods, and TMA
        cxx_flags.append('-DDISABLE_SM90_FEATURES')
        nvcc_flags.append('-DDISABLE_SM90_FEATURES')

        # Disable internode and low-latency kernels
        # Raised explicitly (not `assert False`) so the build also aborts under `python -O`
        raise NotImplementedError('Not implemented')
    else:
        # Prefer H800 series
        os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '9.0')

        # CUDA 12 flags
        nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])

    # Disable LD/ST tricks, as some CUDA version does not support `.L1::no_allocate`
    if os.environ['TORCH_CUDA_ARCH_LIST'].strip() != '9.0':
        # Explicit check instead of `assert`, which is stripped under `python -O`
        if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) != 1:
            raise ValueError('`DISABLE_AGGRESSIVE_PTX_INSTRS` must be 1 (or unset) when `TORCH_CUDA_ARCH_LIST` is not `9.0`')
        os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'

    # Disable aggressive PTX instructions
    if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', '1')):
        cxx_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
        nvcc_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')

    # Legacy environment name
    if 'TOPK_IDX_BITS' in os.environ:
        # Explicit check instead of `assert`, which is stripped under `python -O`
        if 'EP_NUM_TOPK_IDX_BITS' in os.environ:
            raise ValueError('`TOPK_IDX_BITS` (legacy name) and `EP_NUM_TOPK_IDX_BITS` must not both be set')
        os.environ['EP_NUM_TOPK_IDX_BITS'] = os.environ['TOPK_IDX_BITS']

    # Bits of `topk_idx.dtype`, choices are 32 and 64
    if 'EP_NUM_TOPK_IDX_BITS' in os.environ:
        num_topk_idx_bits = int(os.environ['EP_NUM_TOPK_IDX_BITS'])
        cxx_flags.append(f'-DEP_NUM_TOPK_IDX_BITS={num_topk_idx_bits}')
        nvcc_flags.append(f'-DEP_NUM_TOPK_IDX_BITS={num_topk_idx_bits}')

    # Put them together
    extra_compile_args = {
        'cxx': cxx_flags,
        'nvcc': nvcc_flags,
    }
    if nvcc_dlink:
        extra_compile_args['nvcc_dlink'] = nvcc_dlink

    # Summary
    print('Build summary:')
    print(f' > Sources: {sources}')
    print(f' > Includes: {include_dirs}')
    print(f' > Libraries: {library_dirs}')
    print(f' > Compilation flags: {extra_compile_args}')
    print(f' > Link flags: {extra_link_args}')
    print(f' > Arch list: {os.environ["TORCH_CUDA_ARCH_LIST"]}')
    print(f' > NVSHMEM path: {nvshmem_root_dir}')
    print(f' > NCCL path: {nccl_root_dir}')

    # Print persistent env variables
    persistent_envs = [(name, os.environ[name]) for name in persistent_env_names if name in os.environ]
    if persistent_envs:
        print(' > Persistent envs:')
        for k, v in persistent_envs:
            print(f' > {k}: {v}')
    print()

    setuptools.setup(
        name='deep_ep',
        version=get_package_version(),
        packages=setuptools.find_packages(include=['deep_ep', 'deep_ep.*']),
        package_data={
            'deep_ep': [
                'include/deep_ep/**/*',
            ]
        },
        ext_modules=[
            CUDAExtension(name='deep_ep._C',
                          include_dirs=include_dirs,
                          library_dirs=library_dirs,
                          sources=sources,
                          extra_compile_args=extra_compile_args,
                          extra_link_args=extra_link_args)
        ],
        cmdclass={
            'build_ext': BuildExtension,
            'build_py': CustomBuildPy
        }
    )