Relocate compiled extension modules into per-backend packages.

Native modules that were installed at the top level of the `aphrodite`
package (_C, _moe_C, _rocm_C, _flashmla_C, cumem_allocator) are now installed
under aphrodite/extensions/cuda, aphrodite/extensions/rocm or
aphrodite/extensions/cpu. The CMake install destinations, the extension names
registered in setup.py and the Python import sites are updated to match, and
empty package skeletons (README.md, .gitkeep, __init__.py) are added for the
three backends. The patch also switches the ROCm build wheel index from
rocm6.3 to rocm6.4 and drops the upper bound on ray in requirements/rocm.txt.

NOTE(review): the DESTINATION generator expression used for the _C and _moe_C
targets was restored from a garbled "$,<cuda path>,<rocm path>>" residue; the
STREQUAL condition on APHRODITE_GPU_LANG is the presumed original. Confirm
against the upstream commit.
NOTE(review): indentation inside the hunks was reconstructed from a
whitespace-collapsed copy; hunk line counts match the @@ headers, but leading
whitespace should be verified before applying.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c555a5ed44..05fd752dd2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,9 +221,9 @@ if(APHRODITE_GPU_LANG STREQUAL "CUDA")
   message(STATUS "Enabling cumem allocator extension.")
   # link against cuda driver library
   list(APPEND CUMEM_LIBS CUDA::cuda_driver)
-  define_gpu_extension_target(
-    cumem_allocator
-    DESTINATION aphrodite
+define_gpu_extension_target(
+  cumem_allocator
+  DESTINATION aphrodite/extensions/cuda
     LANGUAGE CXX
     SOURCES ${APHRODITE_CUMEM_EXT_SRC}
     LIBRARIES ${CUMEM_LIBS}
@@ -804,7 +804,7 @@ endif()
 message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
   _C
-  DESTINATION aphrodite
+  DESTINATION $<IF:$<STREQUAL:${APHRODITE_GPU_LANG},CUDA>,aphrodite/extensions/cuda,aphrodite/extensions/rocm>
   LANGUAGE ${APHRODITE_GPU_LANG}
   SOURCES ${APHRODITE_EXT_SRC}
   COMPILE_FLAGS ${APHRODITE_GPU_FLAGS}
@@ -918,7 +918,7 @@ endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
   _moe_C
-  DESTINATION aphrodite
+  DESTINATION $<IF:$<STREQUAL:${APHRODITE_GPU_LANG},CUDA>,aphrodite/extensions/cuda,aphrodite/extensions/rocm>
   LANGUAGE ${APHRODITE_GPU_LANG}
   SOURCES ${APHRODITE_MOE_EXT_SRC}
   COMPILE_FLAGS ${APHRODITE_GPU_FLAGS}
@@ -939,7 +939,7 @@ if(APHRODITE_GPU_LANG STREQUAL "HIP")
 
   define_gpu_extension_target(
     _rocm_C
-    DESTINATION aphrodite
+    DESTINATION aphrodite/extensions/rocm
     LANGUAGE ${APHRODITE_GPU_LANG}
     SOURCES ${APHRODITE_ROCM_EXT_SRC}
     COMPILE_FLAGS ${APHRODITE_GPU_FLAGS}
diff --git a/aphrodite/_custom_ops.py b/aphrodite/_custom_ops.py
index 5a324ab90c..418f5d46ae 100644
--- a/aphrodite/_custom_ops.py
+++ b/aphrodite/_custom_ops.py
@@ -10,14 +10,30 @@
 
 if not current_platform.is_tpu() and not current_platform.is_xpu():
     try:
-        import aphrodite._C
+        if current_platform.is_cuda():
+            import aphrodite.extensions.cuda._C  # noqa: F401
+        elif current_platform.is_rocm():
+            import aphrodite.extensions.rocm._C  # noqa: F401
+            # Also register ROCm-specific ops if present
+            with contextlib.suppress(ImportError):
+                import aphrodite.extensions.rocm._rocm_C  # noqa: F401
+        elif current_platform.is_cpu():
+            import aphrodite.extensions.cpu._C  # noqa: F401
+        else:
+            # Other platforms not handled here
+            pass
     except ImportError as e:
-        logger.warning("Failed to import from aphrodite._C with {!r}", e)
+        logger.warning("Failed to import platform-specific _C with {!r}", e)
 
 supports_moe_ops = False
-with contextlib.suppress(ImportError):
-    import aphrodite._moe_C  # noqa: F401
-    supports_moe_ops = True
+if current_platform.is_cuda():
+    with contextlib.suppress(ImportError):
+        import aphrodite.extensions.cuda._moe_C  # noqa: F401
+        supports_moe_ops = True
+elif current_platform.is_rocm():
+    with contextlib.suppress(ImportError):
+        import aphrodite.extensions.rocm._moe_C  # noqa: F401
+        supports_moe_ops = True
 
 if TYPE_CHECKING:
 
diff --git a/aphrodite/attention/ops/flashmla.py b/aphrodite/attention/ops/flashmla.py
index b8d4ee3d49..292708ab17 100644
--- a/aphrodite/attention/ops/flashmla.py
+++ b/aphrodite/attention/ops/flashmla.py
@@ -7,7 +7,7 @@
 
 if current_platform.is_cuda():
     try:
-        import aphrodite._flashmla_C  # noqa: F401
+        import aphrodite.extensions.cuda._flashmla_C  # noqa: F401
         _flashmla_C_AVAILABLE = True
     except ImportError:
         _flashmla_C_AVAILABLE = False
@@ -24,8 +24,9 @@ def is_flashmla_supported() -> Tuple[bool, Optional[str]]:
     if current_platform.get_device_capability()[0] != 9:
         return False, "FlashMLA is only supported on Hopper devices."
     if not _flashmla_C_AVAILABLE:
-        return False, "aphrodite._flashmla_C is not available, likely was not "\
-            "compiled due to insufficient nvcc version or a supported arch "\
+        return False, "aphrodite.extensions.cuda._flashmla_C is not " \
+            "available, likely was not compiled due to insufficient nvcc " \
+            "version or a supported arch "\
             "(only sm90a currently) was not in the list of target arches to "\
             "compile for."
     return True, None
diff --git a/aphrodite/extensions/README.md b/aphrodite/extensions/README.md
new file mode 100644
index 0000000000..931f300a5b
--- /dev/null
+++ b/aphrodite/extensions/README.md
@@ -0,0 +1 @@
+This directory contains the compiled binaries for the Aphrodite extensions. By default, it's empty and will be populated when building the Aphrodite extensions.
\ No newline at end of file
diff --git a/aphrodite/extensions/cpu/.gitkeep b/aphrodite/extensions/cpu/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/aphrodite/extensions/cpu/__init__.py b/aphrodite/extensions/cpu/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/aphrodite/extensions/cuda/.gitkeep b/aphrodite/extensions/cuda/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/aphrodite/extensions/cuda/__init__.py b/aphrodite/extensions/cuda/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/aphrodite/extensions/rocm/.gitkeep b/aphrodite/extensions/rocm/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/aphrodite/extensions/rocm/__init__.py b/aphrodite/extensions/rocm/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/aphrodite/platforms/cuda.py b/aphrodite/platforms/cuda.py
index 1ffaad20e8..607f476cab 100644
--- a/aphrodite/platforms/cuda.py
+++ b/aphrodite/platforms/cuda.py
@@ -14,7 +14,7 @@
 from typing_extensions import ParamSpec
 
 # import custom ops, trigger op registration
-import aphrodite._C  # noqa
+import aphrodite.extensions.cuda._C  # noqa
 import aphrodite.common.envs as envs
 from aphrodite.common.logger import log_once
 from aphrodite.utils import cuda_device_count_stateless, import_pynvml
diff --git a/aphrodite/platforms/rocm.py b/aphrodite/platforms/rocm.py
index 66abae8453..8bf102f0c6 100644
--- a/aphrodite/platforms/rocm.py
+++ b/aphrodite/platforms/rocm.py
@@ -26,15 +26,17 @@
     logger.warning("Failed to import from amdsmi with {}", e)
 
 try:
-    import aphrodite._C  # noqa: F401
+    import aphrodite.extensions.rocm._C  # noqa: F401
 except ImportError as e:
-    logger.warning("Failed to import from aphrodite._C with {}", e)
+    logger.warning(
+        "Failed to import from aphrodite.extensions.rocm._C with {}", e)
 
 # import custom ops, trigger op registration
 try:
-    import aphrodite._rocm_C  # noqa: F401
+    import aphrodite.extensions.rocm._rocm_C  # noqa: F401
 except ImportError as e:
-    logger.warning("Failed to import from aphrodite._rocm_C with {}", e)
+    logger.warning(
+        "Failed to import from aphrodite.extensions.rocm._rocm_C with {}", e)
 
 # Models not supported by ROCm.
 _ROCM_UNSUPPORTED_MODELS: list[str] = []
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 0a7e19d79b..7cc9d837d8 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -290,7 +290,7 @@ message(STATUS "CPU extension source files: ${APHRODITE_EXT_SRC}")
 
 define_gpu_extension_target(
     _C
-    DESTINATION aphrodite
+    DESTINATION aphrodite/extensions/cpu
     LANGUAGE CXX
     SOURCES ${APHRODITE_EXT_SRC}
     LIBRARIES ${LIBS}
diff --git a/cmake/external_project/flashmla.cmake b/cmake/external_project/flashmla.cmake
index 25268d0080..8dc4f1323b 100644
--- a/cmake/external_project/flashmla.cmake
+++ b/cmake/external_project/flashmla.cmake
@@ -52,7 +52,7 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
 
     define_gpu_extension_target(
         _flashmla_C
-        DESTINATION aphrodite
+        DESTINATION aphrodite/extensions/cuda
         LANGUAGE ${APHRODITE_GPU_LANG}
         SOURCES ${FlashMLA_SOURCES}
         COMPILE_FLAGS ${APHRODITE_GPU_FLAGS}
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index affe562c24..421879ae73 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -1,7 +1,7 @@
 # Common dependencies
 -r common.txt
 
---extra-index-url https://download.pytorch.org/whl/rocm6.3
+--extra-index-url https://download.pytorch.org/whl/rocm6.4
 torch==2.8.0
 torchvision==0.23.0
 torchaudio==2.8.0
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 7038c9024c..94180dd61c 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
 boto3
 botocore
 datasets
-ray>=2.10.0,<2.45.0
+ray>=2.10.0
 peft
 pytest-asyncio
 tensorizer==2.10.1
diff --git a/setup.py b/setup.py
index 90e44364e5..c27bd567bb 100644
--- a/setup.py
+++ b/setup.py
@@ -334,8 +334,11 @@ def build_extensions(self) -> None:
         targets = []
 
         def target_name(s: str) -> str:
-            return s.removeprefix("aphrodite.").removeprefix(
-                "aphrodite_flash_attn.")
+            return s.removeprefix("aphrodite.extensions.cuda.") \
+                .removeprefix("aphrodite.extensions.rocm.") \
+                .removeprefix("aphrodite.extensions.cpu.") \
+                .removeprefix("aphrodite.") \
+                .removeprefix("aphrodite_flash_attn.")
 
         # Build all the extensions
         for ext in self.extensions:
@@ -633,11 +636,15 @@ def _read_requirements(filename: str) -> list[str]:
 
 # Skip building extensions if using precompiled binaries
 if not envs.APHRODITE_USE_PRECOMPILED:
-    if _is_cuda() or _is_hip():
-        ext_modules.append(CMakeExtension(name="aphrodite._moe_C"))
+    # MoE extension per backend
+    if _is_cuda():
+        ext_modules.append(CMakeExtension(name="aphrodite.extensions.cuda._moe_C"))
+    elif _is_hip():
+        ext_modules.append(CMakeExtension(name="aphrodite.extensions.rocm._moe_C"))
 
+    # ROCm-specific extension
     if _is_hip():
-        ext_modules.append(CMakeExtension(name="aphrodite._rocm_C"))
+        ext_modules.append(CMakeExtension(name="aphrodite.extensions.rocm._rocm_C"))
 
     if _is_cuda():
         if not envs.APHRODITE_DISABLE_FLASH_ATTN_COMPILE:
@@ -656,11 +663,17 @@ def _read_requirements(filename: str) -> list[str]:
         if envs.APHRODITE_USE_PRECOMPILED or \
                 get_nvcc_cuda_version() >= Version("12.3"):
             ext_modules.append(
-                CMakeExtension(name="aphrodite._flashmla_C", optional=True))
-        ext_modules.append(CMakeExtension(name="aphrodite.cumem_allocator"))
+                CMakeExtension(name="aphrodite.extensions.cuda._flashmla_C", optional=True))
+        ext_modules.append(CMakeExtension(name="aphrodite.extensions.cuda.cumem_allocator"))
 
+    # Core extension per backend
     if _build_custom_ops():
-        ext_modules.append(CMakeExtension(name="aphrodite._C"))
+        if _is_cuda():
+            ext_modules.append(CMakeExtension(name="aphrodite.extensions.cuda._C"))
+        elif _is_hip():
+            ext_modules.append(CMakeExtension(name="aphrodite.extensions.rocm._C"))
+        elif _is_cpu():
+            ext_modules.append(CMakeExtension(name="aphrodite.extensions.cpu._C"))
 
 package_data = {
     "aphrodite": [