From 61b841eba82b60fe469848593a3ed92a62c91cc0 Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Tue, 17 Mar 2026 11:17:10 -0500 Subject: [PATCH 01/10] PR ready --- CMakeLists.txt | 5 ++ README.md | 81 ++++++++++++++----- graphbolt/CMakeLists.txt | 8 +- graphbolt/src/cnumpy.h | 33 +++++++- .../cuda/cooperative_minibatching_utils.cu | 4 - .../src/cuda/extension/gpu_graph_cache.cu | 4 - .../cuda/extension/unique_and_compact_map.cu | 23 +----- graphbolt/src/cuda/isin.cu | 4 + .../hip/hip_extensions/amd_warp_primitives.h | 2 + python/dgl/_ffi/_cython/function.pxi | 2 +- 10 files changed, 110 insertions(+), 56 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a4eed08b58f..2243293cae13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,6 +206,11 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") set(USE_LIBXSMM OFF) endif() +if(USE_HIP AND USE_LIBXSMM) + message(STATUS "Disabling LIBXSMM for HIP builds (incompatible with lld linker).") + set(USE_LIBXSMM OFF) +endif() + # Source file lists file(GLOB DGL_SRC src/*.cc diff --git a/README.md b/README.md index a2dc005c30ac..e97565f5da40 100644 --- a/README.md +++ b/README.md @@ -35,27 +35,64 @@ docker run \ - PyTorch >= 2.6.0 ```bash -# get the code +# setup your environment to make ROCm and PyTorch visible +python3 -m ven dgl_venv +source dgl_venv/bin/activate +pip3 install Cython==3.0.12 scipy pandas tqdm pytest pydantic +export AMDGPU_TARGETS=gfx942 // change this as needed +export PYTORCH_ROCM_ARCH=$AMDGPU_TARGETS +export ROCM_PATH=/opt/rocm // change this as needed +export PATH=$ROCM_PATH/bin:$PATH + +# note, the above might be set with: +module load rocm pytorch +# on a system where modules are used. + +# install dependencies +export LIBHIPCXX_BRANCH=therock-7.11 // change as needed +git clone -b $LIBHIPCXX_BRANCH https://github.com/ROCm/libhipcxx.git + +cd libhipcxx && mkdir build && cd build + +export LIBHIPCXX_PATH=$HOME/libhipcxx_install // set to user desired path + +cmake -DCMAKE_PREFIX_PATH=$LIBHIPCXX_PATH -DCMAKE_CXX_COMPILER=`which amdclang++` -DCMAKE_C_COMPILER=`which amdclang` -DCMAKE_HIP_ARCHITECTURES=$AMDGPU_TARGETS -DGPU_TARGETS=$AMDGPU_TARGETS -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=OFF -DUSE_LIBXSMM=OFF .. + +cd ../.. + +export libhipcxx_DIR=$LIBHIPCXX_PATH + +export HIPCOLL_BRANCH=release/rocmds-25.10 // change as needed +git clone -b $HIPCOLL_BRANCH https://github.com/ROCm/hipCollections.git + +export HIPCOLL_PATH=$HOME/hipcoll_install // set to user defined path + +cd hipColections && mkdir build && cd build + +cmake -DCMAKE_INSTALL_PREFIX=$HIPCOLL_PATH -DINSTALL_CUCO=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_EXAMPLES=OFF -DCMAKE_CXX_COMPILER=`which amdclang++` .. + +export hipCCL_DIR=$HIPCOLL_PATH + +cd ../.. + git clone --recurse-submodules https://github.com/ROCm/dgl -cd dgl -# install graphbolt dependencies -mkdir -p deps -cd deps -bash ../script/install_graphbolt_deps.sh -cd .. +export DGL_PATH=$HOME/dgl_install + +cd dgl && mkdir build && cd build -# build dgl -cmake --preset rocm -cmake --build build +cmake -DCMAKE_INSTALL_PREFIX=$DGL_PATH -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=$AMDGPU_TARGETS -DGPU_TARGETS=gfx942 -DCMAKE_CXX_COMPILER=`which amdclang++` -DCMAKE_C_COMPILER=`which amdclang` -DCMAKE_PREFIX_PATH=${hipCCL_DIR}/lib/cmake/cuco .. -# install the python bindings (should be done inside a virtual env) -cd python -python -m pip install -e . -python -m build --wheel +cd ../python + +pip3 install . --target=$DGL_PATH --no-deps --no-build-isolation +export PYTHONPATH=$DGL_PATH:$PYTHONPATH + +cd ../.. # Check installation with python tests bash tests/scripts/task_unit_test_rocm.sh pytorch gpu +deactivate ```

@@ -71,6 +108,14 @@ bash tests/scripts/task_unit_test_rocm.sh pytorch gpu [Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ) +[![Twitter](https://img.shields.io/twitter/follow/DGLGraph?style=social)](https://twitter.com/GraphDeep) + +[Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ) + +[![Twitter](https://img.shields.io/twitter/follow/DGLGraph?style=social)](https://twitter.com/GraphDeep) + +[Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ) + DGL is an easy-to-use, high performance and scalable Python package for deep learning on graphs. DGL is framework agnostic, meaning if a deep graph model is a component of an end-to-end application, the rest of the logics can be implemented in any major frameworks, such as PyTorch, Apache MXNet or TensorFlow.

@@ -378,11 +423,3 @@ If you use DGL in a scientific publication, we would appreciate citations to the journal={arXiv preprint arXiv:1909.01315} } ``` - -## The Team - -DGL is developed and maintained by [NYU, NYU Shanghai, AWS Shanghai AI Lab, and AWS MXNet Science Team](https://www.dgl.ai/pages/about.html). - -## License - -DGL uses Apache License 2.0. diff --git a/graphbolt/CMakeLists.txt b/graphbolt/CMakeLists.txt index 689774244285..4b99fed8b43a 100644 --- a/graphbolt/CMakeLists.txt +++ b/graphbolt/CMakeLists.txt @@ -205,8 +205,12 @@ if(USE_CUDA OR USE_HIP) set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES LINKER_LANGUAGE HIP) endif() - # Enables libcudacxx/libhipcxx for gpu_cache. - target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION) + # Enables libcudacxx/libhipcxx for gpu_cache. + # On HIP, cuda::binary_semaphore is unsupported, so we skip the libcudacxx + # code paths and use the fallback (plain int mutex, raw pointers). + if(NOT USE_HIP) + target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION) + endif() include_directories(AFTER "../third_party/HugeCTR/gpu_cache/include") message(STATUS "Build graphbolt extension with HugeCTR GPU embedding cache.") diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index 793116580963..fbd1daeb14dd 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -17,7 +17,11 @@ #include #include #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else #include +#endif #include #include #include @@ -46,9 +50,36 @@ struct io_uring_queue_destroyer { /** * @brief Disk Numpy Fetecher class. */ + +#ifdef __HIP_PLATFORM_AMD__ +template +class counting_semaphore_impl { + public: + explicit counting_semaphore_impl(int initial) : count_(initial) {} + static constexpr int max() { return MaxCount; } + void acquire() { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { return count_ > 0; }); + --count_; + } + void release(int n = 1) { + std::lock_guard lock(mutex_); + count_ += n; + for (int i = 0; i < n; ++i) cv_.notify_one(); + } + private: + std::mutex mutex_; + std::condition_variable cv_; + int count_; +}; +#endif + class OnDiskNpyArray : public torch::CustomClassHolder { - // No user will need more than 1024 io_uring queues. +#ifdef __HIP_PLATFORM_AMD__ + using counting_semaphore_t = counting_semaphore_impl<1024>; +#else using counting_semaphore_t = ::cuda::std::counting_semaphore<1024>; +#endif public: static constexpr int kGroupSize = 256; diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu index 969296597475..6bd0f1005875 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -54,11 +54,7 @@ torch::Tensor RankAssignment( THRUST_CALL( transform, nodes_ptr, nodes_ptr + nodes.numel(), part_ids_ptr, -#ifdef GRAPHBOLT_USE_HIP - ::proclaim_return_type -#else ::cuda::proclaim_return_type -#endif ( [rank = static_cast(rank), world_size = static_cast( diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu index 46732e0aa8f4..3519f01842c2 100644 --- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu @@ -23,11 +23,7 @@ #include #ifdef GRAPHBOLT_USE_HIP -#include #include -namespace cuda { -using stream_ref = cuco::cuda_stream_ref; -} #define C10_CUDA_KERNEL_LAUNCH_CHECK C10_HIP_KERNEL_LAUNCH_CHECK #else #include diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index b305247824d1..7c72be8c0b79 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -25,10 +25,6 @@ #ifdef GRAPHBOLT_USE_HIP #include -#include -namespace cuda{ - using stream_ref = cuco::cuda_stream_ref; -} #define C10_CUDA_KERNEL_LAUNCH_CHECK C10_HIP_KERNEL_LAUNCH_CHECK #else #include @@ -209,11 +205,7 @@ UniqueAndCompactBatchedHashMapBased( cub::ArgIndexInputIterator index_it(indexes.data_ptr()); auto input_it = thrust::make_transform_iterator( index_it, - #ifdef GRAPHBOLT_USE_HIP - ::proclaim_return_type - #else ::cuda::proclaim_return_type - #endif <::cuda::std::tuple>( [=, map = map.ref(cuco::find)] __device__(auto it) -> ::cuda::std::tuple { @@ -247,11 +239,7 @@ UniqueAndCompactBatchedHashMapBased( auto unique_ids_offsets_dev_ptr = unique_ids_offsets_dev.data_ptr(); auto output_it = thrust::make_tabulate_output_iterator( - #ifdef GRAPHBOLT_USE_HIP - ::proclaim_return_type - #else ::cuda::proclaim_return_type - #endif ( [=, unique_ids_ptr = unique_ids.data_ptr(), part_ids_ptr = @@ -276,11 +264,7 @@ UniqueAndCompactBatchedHashMapBased( DeviceSelect::If, input_it, output_it, unique_ids_offsets_dev_ptr + num_batches, offsets_ptr[2 * num_batches], - #ifdef GRAPHBOLT_USE_HIP - ::proclaim_return_type - #else ::cuda::proclaim_return_type - #endif ([] __device__(const auto& t) { return ::cuda::std::get<3>(t); })); @@ -300,12 +284,7 @@ UniqueAndCompactBatchedHashMapBased( thrust::make_zip_iterator( unique_ids_offsets_dev2.data_ptr(), unique_ids_offsets.data_ptr()), - #ifdef GRAPHBOLT_USE_HIP - ::proclaim_return_type - #else - ::cuda::proclaim_return_type - #endif - < + ::cuda::proclaim_return_type< thrust::tuple>( [=] __device__(const auto x) { return thrust::make_tuple(x, x); diff --git a/graphbolt/src/cuda/isin.cu b/graphbolt/src/cuda/isin.cu index 2d25e763890f..98ff8963c335 100644 --- a/graphbolt/src/cuda/isin.cu +++ b/graphbolt/src/cuda/isin.cu @@ -47,7 +47,11 @@ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements) { } torch::Tensor Nonzero(torch::Tensor mask, bool logical_not) { +#ifdef __HIPCC__ + rocprim::counting_iterator iota(0); +#else thrust::counting_iterator iota(0); +#endif auto result = torch::empty_like(mask, torch::kInt64); auto mask_ptr = mask.data_ptr(); auto result_ptr = result.data_ptr(); diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h index 5bf10e3c3dcf..a4f8d42e4877 100644 --- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h +++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h @@ -11,6 +11,8 @@ #ifdef __AMDGCN_WAVEFRONT_SIZE #undef WAVEFRONT_SIZE #define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE +#elif defined(__AMDGCN__) +#define WAVEFRONT_SIZE 64 #endif /* this header file provides _*_sync functions, which is a hack only, diff --git a/python/dgl/_ffi/_cython/function.pxi b/python/dgl/_ffi/_cython/function.pxi index dfcace512c21..c1b25f386c85 100644 --- a/python/dgl/_ffi/_cython/function.pxi +++ b/python/dgl/_ffi/_cython/function.pxi @@ -92,7 +92,7 @@ cdef inline int make_arg(object arg, ptr = arg._dgl_handle value[0].v_handle = (ptr) tcode[0] = arg.__class__._dgl_tcode - elif isinstance(arg, (int, long)): + elif isinstance(arg, int): value[0].v_int64 = arg tcode[0] = kObjectInt elif isinstance(arg, float): From 3b2a3b27bb9484085e5108a3c5095aa69f395e06 Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Tue, 17 Mar 2026 11:19:01 -0500 Subject: [PATCH 02/10] Minor change --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e97565f5da40..796c226a1be8 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ docker run \ # setup your environment to make ROCm and PyTorch visible python3 -m ven dgl_venv source dgl_venv/bin/activate -pip3 install Cython==3.0.12 scipy pandas tqdm pytest pydantic +pip3 install cython scipy pandas tqdm pytest pydantic export AMDGPU_TARGETS=gfx942 // change this as needed export PYTORCH_ROCM_ARCH=$AMDGPU_TARGETS export ROCM_PATH=/opt/rocm // change this as needed From a38a18ac5e1a9579c34f04cce0211485d48670b1 Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Tue, 17 Mar 2026 11:26:47 -0500 Subject: [PATCH 03/10] Fixed typo in readme --- README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/README.md b/README.md index 796c226a1be8..cfa46a36b77a 100644 --- a/README.md +++ b/README.md @@ -108,14 +108,6 @@ deactivate [Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ) -[![Twitter](https://img.shields.io/twitter/follow/DGLGraph?style=social)](https://twitter.com/GraphDeep) - -[Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ) - -[![Twitter](https://img.shields.io/twitter/follow/DGLGraph?style=social)](https://twitter.com/GraphDeep) - -[Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ) - DGL is an easy-to-use, high performance and scalable Python package for deep learning on graphs. DGL is framework agnostic, meaning if a deep graph model is a component of an end-to-end application, the rest of the logics can be implemented in any major frameworks, such as PyTorch, Apache MXNet or TensorFlow.

From c61cc21bc30a983485ab343c8e4f5450b451d0a6 Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Tue, 17 Mar 2026 14:45:02 -0500 Subject: [PATCH 04/10] Attempt at fixing the linting errors --- graphbolt/src/cnumpy.h | 1 + graphbolt/src/cuda/cooperative_minibatching_utils.cu | 3 +-- graphbolt/src/cuda/extension/gpu_graph_cache.cu | 10 ++++------ graphbolt/src/cuda/extension/unique_and_compact_map.cu | 10 ++++------ 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index fbd1daeb14dd..c54d6e588dd6 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -67,6 +67,7 @@ class counting_semaphore_impl { count_ += n; for (int i = 0; i < n; ++i) cv_.notify_one(); } + private: std::mutex mutex_; std::condition_variable cv_; diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu index 6bd0f1005875..00ad8341900c 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -54,8 +54,7 @@ torch::Tensor RankAssignment( THRUST_CALL( transform, nodes_ptr, nodes_ptr + nodes.numel(), part_ids_ptr, - ::cuda::proclaim_return_type - ( + ::cuda::proclaim_return_type( [rank = static_cast(rank), world_size = static_cast( world_size)] __device__(index_t id) -> part_t { diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu index 3519f01842c2..3990ac498713 100644 --- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu @@ -506,12 +506,10 @@ std::tuple> GpuGraphCache::Replace( } if (edge_id_offsets) { // Append the edge ids as the last element of the output. - output_edge_tensors.push_back( - ops::IndptrEdgeIdsImpl( - output_indptr, output_indptr.scalar_type(), - *edge_id_offsets, - static_cast( - static_cast(output_size)))); + output_edge_tensors.push_back(ops::IndptrEdgeIdsImpl( + output_indptr, output_indptr.scalar_type(), + *edge_id_offsets, + static_cast(static_cast(output_size)))); } { diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index 7c72be8c0b79..6a5625634065 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -205,8 +205,8 @@ UniqueAndCompactBatchedHashMapBased( cub::ArgIndexInputIterator index_it(indexes.data_ptr()); auto input_it = thrust::make_transform_iterator( index_it, - ::cuda::proclaim_return_type - <::cuda::std::tuple>( + ::cuda::proclaim_return_type< + ::cuda::std::tuple>( [=, map = map.ref(cuco::find)] __device__(auto it) -> ::cuda::std::tuple { const auto i = it.key; @@ -239,8 +239,7 @@ UniqueAndCompactBatchedHashMapBased( auto unique_ids_offsets_dev_ptr = unique_ids_offsets_dev.data_ptr(); auto output_it = thrust::make_tabulate_output_iterator( - ::cuda::proclaim_return_type - ( + ::cuda::proclaim_return_type( [=, unique_ids_ptr = unique_ids.data_ptr(), part_ids_ptr = part_ids ? part_ids->data_ptr() : nullptr, @@ -264,8 +263,7 @@ UniqueAndCompactBatchedHashMapBased( DeviceSelect::If, input_it, output_it, unique_ids_offsets_dev_ptr + num_batches, offsets_ptr[2 * num_batches], - ::cuda::proclaim_return_type - ([] __device__(const auto& t) { + ::cuda::proclaim_return_type([] __device__(const auto& t) { return ::cuda::std::get<3>(t); })); auto unique_ids_offsets = torch::empty( From 24fba8d496b15e21b69af8f51c7aa03616ab1d68 Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Tue, 17 Mar 2026 15:07:58 -0500 Subject: [PATCH 05/10] Fixing linting --- include/dgl/hip/hip_extensions/amd_warp_primitives.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h index a4f8d42e4877..d79cbf961055 100644 --- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h +++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h @@ -85,7 +85,7 @@ __device__ inline void __sync_active_threads() { __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront"); } -#if HIP_VERSION_MAJOR < 7 +#if HIP_VERSION_MAJOR < 7 __device__ inline void __syncwarp() { /* sync/barrier all threads in a warp */ __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront"); From ed3f23a4d85982c2ad530eaa6147c99f59d7b5cf Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Fri, 20 Mar 2026 16:33:27 -0500 Subject: [PATCH 06/10] Addressing comments on PR --- README.md | 8 ++++++ graphbolt/CMakeLists.txt | 6 +---- graphbolt/src/cnumpy.h | 6 ++--- .../hip/hip_extensions/amd_warp_primitives.h | 25 +++++++------------ .../gpu_cache/include/nv_gpu_cache.hpp | 21 ++++++++++++++++ 5 files changed, 42 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 4980db4a37c2..881c6f7ec455 100644 --- a/README.md +++ b/README.md @@ -426,3 +426,11 @@ If you use DGL in a scientific publication, we would appreciate citations to the journal={arXiv preprint arXiv:1909.01315} } ``` + +## The Team + +DGL is developed and maintained by [NYU, NYU Shanghai, AWS Shanghai AI Lab, and AWS MXNet Science Team](https://www.dgl.ai/pages/about.html). + +## License + +DGL uses Apache License 2.0. diff --git a/graphbolt/CMakeLists.txt b/graphbolt/CMakeLists.txt index 4b99fed8b43a..9463b50a18c2 100644 --- a/graphbolt/CMakeLists.txt +++ b/graphbolt/CMakeLists.txt @@ -206,11 +206,7 @@ if(USE_CUDA OR USE_HIP) endif() # Enables libcudacxx/libhipcxx for gpu_cache. - # On HIP, cuda::binary_semaphore is unsupported, so we skip the libcudacxx - # code paths and use the fallback (plain int mutex, raw pointers). - if(NOT USE_HIP) - target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION) - endif() + target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION) include_directories(AFTER "../third_party/HugeCTR/gpu_cache/include") message(STATUS "Build graphbolt extension with HugeCTR GPU embedding cache.") diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index c54d6e588dd6..eb39aee7135d 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -17,7 +17,7 @@ #include #include #include -#ifdef __HIP_PLATFORM_AMD__ +#ifdef __HIPCC__ #include #else #include @@ -51,7 +51,7 @@ struct io_uring_queue_destroyer { * @brief Disk Numpy Fetecher class. */ -#ifdef __HIP_PLATFORM_AMD__ +#ifdef __HIPCC__ template class counting_semaphore_impl { public: @@ -76,7 +76,7 @@ class counting_semaphore_impl { #endif class OnDiskNpyArray : public torch::CustomClassHolder { -#ifdef __HIP_PLATFORM_AMD__ +#ifdef __HIPCC__ using counting_semaphore_t = counting_semaphore_impl<1024>; #else using counting_semaphore_t = ::cuda::std::counting_semaphore<1024>; diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h index d79cbf961055..e3af4ddaf8c0 100644 --- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h +++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h @@ -12,7 +12,7 @@ #undef WAVEFRONT_SIZE #define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE #elif defined(__AMDGCN__) -#define WAVEFRONT_SIZE 64 +#define WAVEFRONT_SIZE __builtin_amdgcn_wavefrontsize() #endif /* this header file provides _*_sync functions, which is a hack only, @@ -60,11 +60,10 @@ __device__ inline int __thread_rank(lane_mask mask) { } __device__ inline unsigned int __mask_size(lane_mask mask) { -#if WAVEFRONT_SIZE == 64 - return __popcll(mask); -#else - return __popc(mask); -#endif + if (WAVEFRONT_SIZE == 64) + return __popcll(mask); + else + return __popc(mask); } __device__ inline int __thread_rank_to_lane_id(lane_mask mask, int i) { @@ -296,11 +295,8 @@ __device__ inline lane_mask __match_any_sync(lane_mask mask, T value) { bmask = __branchmask(); while (1) { -#if WAVEFRONT_SIZE == 64 - int i = __ffsll(bmask) - 1; -#else - int i = __ffs((unsigned int)bmask) - 1; -#endif + int i = (WAVEFRONT_SIZE == 64) ? __ffsll(bmask) - 1 + : __ffs((unsigned int)bmask) - 1; if (i < 0) break; @@ -327,11 +323,8 @@ __device__ inline lane_mask __match_any_sync(lane_mask mask, T value) { #endif while (1) { -#if WAVEFRONT_SIZE == 64 - int i = __ffsll(bmask) - 1; -#else - int i = __ffs((unsigned int)bmask) - 1; -#endif + int i = (WAVEFRONT_SIZE == 64) ? __ffsll(bmask) - 1 + : __ffs((unsigned int)bmask) - 1; if (i < 0) break; diff --git a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp index 21b0c812be5f..b060f67aac5d 100644 --- a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp +++ b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp @@ -24,8 +24,10 @@ #include "gpu_cache_api.hpp" #ifdef LIBCUDACXX_VERSION #include +#ifndef DGL_USE_HIP #include #endif +#endif #define SET_ASSOCIATIVITY 2 // TODO: Properly for portable HIP code, this should be determined at runtime, @@ -94,7 +96,26 @@ class gpu_cache : public gpu_cache_api { using slabset = slab_set; #ifdef LIBCUDACXX_VERSION using atomic_ref_counter_type = cuda::atomic; +#ifdef DGL_USE_HIP + // cuda::binary_semaphore is not supported on HIP. Provide a device-side + // spinlock with the same acquire()/release() interface so that the + // LIBCUDACXX_VERSION kernel code compiles unchanged. + struct mutex { + int flag_; + __host__ __device__ explicit mutex(int initial = 1) : flag_(initial) {} + __device__ void acquire() { + while (atomicCAS(&flag_, 1, 0) == 0) + ; + __threadfence(); + } + __device__ void release() { + __threadfence(); + atomicExch(&flag_, 1); + } + }; +#else using mutex = cuda::binary_semaphore; +#endif #endif private: From 9729b7b0c45713ebaee5135a9eca581da65b79d7 Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Tue, 24 Mar 2026 10:36:37 -0500 Subject: [PATCH 07/10] Addressing latest set of comments --- CMakePresets.json | 1 + README.md | 65 ++++------------ .../hip/hip_extensions/amd_warp_primitives.h | 2 +- script/install_graphbolt_deps.sh | 78 +++++++++++++++---- 4 files changed, 81 insertions(+), 65 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 00926fbcc23b..7eab668b06bd 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -18,6 +18,7 @@ "GPU_TARGETS": "gfx90a,gfx942", "CMAKE_HIP_ARCHITECTURES": "gfx90a,gfx942", "CMAKE_INSTALL_PREFIX": "/opt/rocm", + "USE_LIBXSMM": "OFF", "BUILD_CPP_TEST": "ON", "CMAKE_C_FLAGS": "-fdiagnostics-color=always", "CMAKE_CXX_FLAGS": "-fdiagnostics-color=always", diff --git a/README.md b/README.md index 881c6f7ec455..9330525b4dc7 100644 --- a/README.md +++ b/README.md @@ -46,64 +46,27 @@ docker run \ - PyTorch >= 2.6.0 ```bash -# setup your environment to make ROCm and PyTorch visible -python3 -m ven dgl_venv -source dgl_venv/bin/activate -pip3 install cython scipy pandas tqdm pytest pydantic -export AMDGPU_TARGETS=gfx942 // change this as needed -export PYTORCH_ROCM_ARCH=$AMDGPU_TARGETS -export ROCM_PATH=/opt/rocm // change this as needed -export PATH=$ROCM_PATH/bin:$PATH - -# note, the above might be set with: -module load rocm pytorch -# on a system where modules are used. - -# install dependencies -export LIBHIPCXX_BRANCH=therock-7.11 // change as needed -git clone -b $LIBHIPCXX_BRANCH https://github.com/ROCm/libhipcxx.git - -cd libhipcxx && mkdir build && cd build - -export LIBHIPCXX_PATH=$HOME/libhipcxx_install // set to user desired path - -cmake -DCMAKE_PREFIX_PATH=$LIBHIPCXX_PATH -DCMAKE_CXX_COMPILER=`which amdclang++` -DCMAKE_C_COMPILER=`which amdclang` -DCMAKE_HIP_ARCHITECTURES=$AMDGPU_TARGETS -DGPU_TARGETS=$AMDGPU_TARGETS -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=OFF -DUSE_LIBXSMM=OFF .. - -cd ../.. - -export libhipcxx_DIR=$LIBHIPCXX_PATH - -export HIPCOLL_BRANCH=release/rocmds-25.10 // change as needed -git clone -b $HIPCOLL_BRANCH https://github.com/ROCm/hipCollections.git - -export HIPCOLL_PATH=$HOME/hipcoll_install // set to user defined path - -cd hipColections && mkdir build && cd build - -cmake -DCMAKE_INSTALL_PREFIX=$HIPCOLL_PATH -DINSTALL_CUCO=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_EXAMPLES=OFF -DCMAKE_CXX_COMPILER=`which amdclang++` .. - -export hipCCL_DIR=$HIPCOLL_PATH - -cd ../.. - +# get the code git clone --recurse-submodules https://github.com/ROCm/dgl +cd dgl -export DGL_PATH=$HOME/dgl_install - -cd dgl && mkdir build && cd build - -cmake -DCMAKE_INSTALL_PREFIX=$DGL_PATH -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=$AMDGPU_TARGETS -DGPU_TARGETS=gfx942 -DCMAKE_CXX_COMPILER=`which amdclang++` -DCMAKE_C_COMPILER=`which amdclang` -DCMAKE_PREFIX_PATH=${hipCCL_DIR}/lib/cmake/cuco .. - -cd ../python +# install graphbolt dependencies +mkdir -p deps +cd deps +bash ../script/install_graphbolt_deps.sh +cd .. -pip3 install . --target=$DGL_PATH --no-deps --no-build-isolation -export PYTHONPATH=$DGL_PATH:$PYTHONPATH +# build dgl +cmake --preset rocm +cmake --build build -cd ../.. +# install the python bindings (should be done inside a virtual env) +cd python +python -m pip install -e . +python -m build --wheel # Check installation with python tests bash tests/scripts/task_unit_test_rocm.sh pytorch gpu -deactivate ```

diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h index e3af4ddaf8c0..62b9e52f87f5 100644 --- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h +++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h @@ -12,7 +12,7 @@ #undef WAVEFRONT_SIZE #define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE #elif defined(__AMDGCN__) -#define WAVEFRONT_SIZE __builtin_amdgcn_wavefrontsize() +#define WAVEFRONT_SIZE 64 #endif /* this header file provides _*_sync functions, which is a hack only, diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh index 15662accaf11..6778ae11b6ad 100644 --- a/script/install_graphbolt_deps.sh +++ b/script/install_graphbolt_deps.sh @@ -1,22 +1,74 @@ #!/usr/bin/env bash -ROCM_ROOT=/opt/rocm +usage() { + cat < Date: Tue, 24 Mar 2026 10:50:46 -0500 Subject: [PATCH 08/10] Forgot to include one reverted change to CMakeLists.txt --- CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a7121087869..d10d9cd75d33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,11 +206,6 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") set(USE_LIBXSMM OFF) endif() -if(USE_HIP AND USE_LIBXSMM) - message(STATUS "Disabling LIBXSMM for HIP builds (incompatible with lld linker).") - set(USE_LIBXSMM OFF) -endif() - # Source file lists file(GLOB DGL_SRC src/*.cc From cc9e5505b6e071702fb849c04366274e478820a8 Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Tue, 24 Mar 2026 12:00:28 -0500 Subject: [PATCH 09/10] Making install_graphbolt_deps.sh executable --- script/install_graphbolt_deps.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 script/install_graphbolt_deps.sh diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh old mode 100644 new mode 100755 From 324eaf1934faa8905630b7f2c1124af81cca12f6 Mon Sep 17 00:00:00 2001 From: gcapodagAMD Date: Tue, 24 Mar 2026 12:09:42 -0500 Subject: [PATCH 10/10] Addressing James' comments --- .../hip/hip_extensions/amd_warp_primitives.h | 23 ++++++++++++------- script/install_graphbolt_deps.sh | 8 ------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h index 62b9e52f87f5..d79cbf961055 100644 --- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h +++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h @@ -60,10 +60,11 @@ __device__ inline int __thread_rank(lane_mask mask) { } __device__ inline unsigned int __mask_size(lane_mask mask) { - if (WAVEFRONT_SIZE == 64) - return __popcll(mask); - else - return __popc(mask); +#if WAVEFRONT_SIZE == 64 + return __popcll(mask); +#else + return __popc(mask); +#endif } __device__ inline int __thread_rank_to_lane_id(lane_mask mask, int i) { @@ -295,8 +296,11 @@ __device__ inline lane_mask __match_any_sync(lane_mask mask, T value) { bmask = __branchmask(); while (1) { - int i = (WAVEFRONT_SIZE == 64) ? __ffsll(bmask) - 1 - : __ffs((unsigned int)bmask) - 1; +#if WAVEFRONT_SIZE == 64 + int i = __ffsll(bmask) - 1; +#else + int i = __ffs((unsigned int)bmask) - 1; +#endif if (i < 0) break; @@ -323,8 +327,11 @@ __device__ inline lane_mask __match_any_sync(lane_mask mask, T value) { #endif while (1) { - int i = (WAVEFRONT_SIZE == 64) ? __ffsll(bmask) - 1 - : __ffs((unsigned int)bmask) - 1; +#if WAVEFRONT_SIZE == 64 + int i = __ffsll(bmask) - 1; +#else + int i = __ffs((unsigned int)bmask) - 1; +#endif if (i < 0) break; diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh index 6778ae11b6ad..4aa3e145eb32 100755 --- a/script/install_graphbolt_deps.sh +++ b/script/install_graphbolt_deps.sh @@ -76,14 +76,6 @@ cd ${DEPS_DIR} # This was implicitly not allowed in previous releases we were using, # but with v2.7.0 they are explicitly not allowed. -# We only use semaphores for a counter of IO operations in graphbolt, -# that only runs on the host (not on the device) so we should be "safe" -# to use this for now. -run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore -run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore -run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore -run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore - # TODO (#22) remove this once the patches are merged # the patches for this were merged in https://github.com/ROCm/rocm-libraries/pull/1883 # but may take more time to be released.