From 61b841eba82b60fe469848593a3ed92a62c91cc0 Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 17 Mar 2026 11:17:10 -0500
Subject: [PATCH 01/10] PR ready

---
 CMakeLists.txt                                |  5 ++
 README.md                                     | 81 ++++++++++++++-----
 graphbolt/CMakeLists.txt                      |  8 +-
 graphbolt/src/cnumpy.h                        | 33 +++++++-
 .../cuda/cooperative_minibatching_utils.cu    |  4 -
 .../src/cuda/extension/gpu_graph_cache.cu     |  4 -
 .../cuda/extension/unique_and_compact_map.cu  | 23 +-----
 graphbolt/src/cuda/isin.cu                    |  4 +
 .../hip/hip_extensions/amd_warp_primitives.h  |  2 +
 python/dgl/_ffi/_cython/function.pxi          |  2 +-
 10 files changed, 110 insertions(+), 56 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a4eed08b58f..2243293cae13 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -206,6 +206,11 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
   set(USE_LIBXSMM OFF)
 endif()
 
+if(USE_HIP AND USE_LIBXSMM)
+  message(STATUS "Disabling LIBXSMM for HIP builds (incompatible with lld linker).")
+  set(USE_LIBXSMM OFF)
+endif()
+
 # Source file lists
 file(GLOB DGL_SRC
   src/*.cc
diff --git a/README.md b/README.md
index a2dc005c30ac..e97565f5da40 100644
--- a/README.md
+++ b/README.md
@@ -35,27 +35,64 @@ docker run \
 - PyTorch >= 2.6.0
 
 ```bash
-# get the code
+# setup your environment to make ROCm and PyTorch visible
+python3 -m ven dgl_venv
+source dgl_venv/bin/activate
+pip3 install Cython==3.0.12 scipy pandas tqdm pytest pydantic
+export AMDGPU_TARGETS=gfx942 // change this as needed 
+export PYTORCH_ROCM_ARCH=$AMDGPU_TARGETS
+export ROCM_PATH=/opt/rocm // change this as needed 
+export PATH=$ROCM_PATH/bin:$PATH
+
+# note, the above might be set with:
+module load rocm pytorch
+# on a system where modules are used.
+
+# install dependencies
+export LIBHIPCXX_BRANCH=therock-7.11 // change as needed 
+git clone -b $LIBHIPCXX_BRANCH https://github.com/ROCm/libhipcxx.git
+
+cd libhipcxx && mkdir build && cd build
+
+export LIBHIPCXX_PATH=$HOME/libhipcxx_install // set to user desired path
+
+cmake -DCMAKE_PREFIX_PATH=$LIBHIPCXX_PATH -DCMAKE_CXX_COMPILER=`which amdclang++` -DCMAKE_C_COMPILER=`which amdclang` -DCMAKE_HIP_ARCHITECTURES=$AMDGPU_TARGETS -DGPU_TARGETS=$AMDGPU_TARGETS -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=OFF  -DUSE_LIBXSMM=OFF ..
+
+cd ../..
+
+export libhipcxx_DIR=$LIBHIPCXX_PATH
+
+export HIPCOLL_BRANCH=release/rocmds-25.10 // change as needed
+git clone -b $HIPCOLL_BRANCH https://github.com/ROCm/hipCollections.git
+
+export HIPCOLL_PATH=$HOME/hipcoll_install // set to user defined path
+
+cd hipColections && mkdir build && cd build
+
+cmake  -DCMAKE_INSTALL_PREFIX=$HIPCOLL_PATH -DINSTALL_CUCO=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_EXAMPLES=OFF  -DCMAKE_CXX_COMPILER=`which amdclang++` ..
+
+export hipCCL_DIR=$HIPCOLL_PATH
+
+cd ../..
+
 git clone --recurse-submodules https://github.com/ROCm/dgl
-cd dgl
 
-# install graphbolt dependencies
-mkdir -p deps
-cd deps
-bash ../script/install_graphbolt_deps.sh
-cd ..
+export DGL_PATH=$HOME/dgl_install
+
+cd dgl && mkdir build && cd build
 
-# build dgl
-cmake --preset rocm <OPTIONAL_CMAKE_ARGS>
-cmake --build build
+cmake  -DCMAKE_INSTALL_PREFIX=$DGL_PATH -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=$AMDGPU_TARGETS -DGPU_TARGETS=gfx942   -DCMAKE_CXX_COMPILER=`which amdclang++` -DCMAKE_C_COMPILER=`which amdclang` -DCMAKE_PREFIX_PATH=${hipCCL_DIR}/lib/cmake/cuco ..
 
-# install the python bindings (should be done inside a virtual env)
-cd python
-python -m pip install -e .
-python -m build --wheel
+cd ../python
+
+pip3 install . --target=$DGL_PATH --no-deps --no-build-isolation
+export PYTHONPATH=$DGL_PATH:$PYTHONPATH
+
+cd ../..
 
 # Check installation with python tests
 bash tests/scripts/task_unit_test_rocm.sh pytorch gpu
+deactivate
 ```
 
 <p align="center">
@@ -71,6 +108,14 @@ bash tests/scripts/task_unit_test_rocm.sh pytorch gpu
 
 [Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ)
 
+[![Twitter](https://img.shields.io/twitter/follow/DGLGraph?style=social)](https://twitter.com/GraphDeep)
+
+[Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ)
+
+[![Twitter](https://img.shields.io/twitter/follow/DGLGraph?style=social)](https://twitter.com/GraphDeep)
+
+[Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ)
+
 DGL is an easy-to-use, high performance and scalable Python package for deep learning on graphs. DGL is framework agnostic, meaning if a deep graph model is a component of an end-to-end application, the rest of the logics can be implemented in any major frameworks, such as PyTorch, Apache MXNet or TensorFlow.
 
 <p align="center">
@@ -378,11 +423,3 @@ If you use DGL in a scientific publication, we would appreciate citations to the
     journal={arXiv preprint arXiv:1909.01315}
 }
 ```
-
-## The Team
-
-DGL is developed and maintained by [NYU, NYU Shanghai, AWS Shanghai AI Lab, and AWS MXNet Science Team](https://www.dgl.ai/pages/about.html).
-
-## License
-
-DGL uses Apache License 2.0.
diff --git a/graphbolt/CMakeLists.txt b/graphbolt/CMakeLists.txt
index 689774244285..4b99fed8b43a 100644
--- a/graphbolt/CMakeLists.txt
+++ b/graphbolt/CMakeLists.txt
@@ -205,8 +205,12 @@ if(USE_CUDA OR USE_HIP)
   set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES LINKER_LANGUAGE HIP)
   endif()
 
-  # Enables libcudacxx/libhipcxx for gpu_cache. 
-  target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION)
+  # Enables libcudacxx/libhipcxx for gpu_cache.
+  # On HIP, cuda::binary_semaphore is unsupported, so we skip the libcudacxx
+  # code paths and use the fallback (plain int mutex, raw pointers).
+  if(NOT USE_HIP)
+    target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION)
+  endif()
   include_directories(AFTER "../third_party/HugeCTR/gpu_cache/include")
   message(STATUS "Build graphbolt extension with HugeCTR GPU embedding cache.")
 
diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h
index 793116580963..fbd1daeb14dd 100644
--- a/graphbolt/src/cnumpy.h
+++ b/graphbolt/src/cnumpy.h
@@ -17,7 +17,11 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#ifdef __HIP_PLATFORM_AMD__
+#include <condition_variable>
+#else
 #include <cuda/std/semaphore>
+#endif
 #include <memory>
 #include <mutex>
 #include <string>
@@ -46,9 +50,36 @@ struct io_uring_queue_destroyer {
 /**
  * @brief Disk Numpy Fetecher class.
  */
+
+#ifdef __HIP_PLATFORM_AMD__
+template <int MaxCount>
+class counting_semaphore_impl {
+ public:
+  explicit counting_semaphore_impl(int initial) : count_(initial) {}
+  static constexpr int max() { return MaxCount; }
+  void acquire() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cv_.wait(lock, [this] { return count_ > 0; });
+    --count_;
+  }
+  void release(int n = 1) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    count_ += n;
+    for (int i = 0; i < n; ++i) cv_.notify_one();
+  }
+ private:
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  int count_;
+};
+#endif
+
 class OnDiskNpyArray : public torch::CustomClassHolder {
-  // No user will need more than 1024 io_uring queues.
+#ifdef __HIP_PLATFORM_AMD__
+  using counting_semaphore_t = counting_semaphore_impl<1024>;
+#else
   using counting_semaphore_t = ::cuda::std::counting_semaphore<1024>;
+#endif
 
  public:
   static constexpr int kGroupSize = 256;
diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
index 969296597475..6bd0f1005875 100644
--- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu
+++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
@@ -54,11 +54,7 @@ torch::Tensor RankAssignment(
         THRUST_CALL(
             transform, nodes_ptr, nodes_ptr + nodes.numel(), part_ids_ptr,
 
-#ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-#else
             ::cuda::proclaim_return_type
-#endif
             <part_t>(
                 [rank = static_cast<uint32_t>(rank),
                  world_size = static_cast<uint32_t>(
diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu
index 46732e0aa8f4..3519f01842c2 100644
--- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu
+++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu
@@ -23,11 +23,7 @@
 
 #include <cstddef>
 #ifdef GRAPHBOLT_USE_HIP
-#include <cuco/cuda_stream_ref.hpp>
 #include <hipcub/hipcub.hpp>
-namespace cuda {
-using stream_ref = cuco::cuda_stream_ref;
-}
 #define C10_CUDA_KERNEL_LAUNCH_CHECK C10_HIP_KERNEL_LAUNCH_CHECK
 #else
 #include <cub/cub.cuh>
diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
index b305247824d1..7c72be8c0b79 100644
--- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu
+++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
@@ -25,10 +25,6 @@
 
 #ifdef GRAPHBOLT_USE_HIP
 #include <hipcub/hipcub.hpp>
-#include <cuco/cuda_stream_ref.hpp>
-namespace cuda{
-    using stream_ref = cuco::cuda_stream_ref;
-}
 #define C10_CUDA_KERNEL_LAUNCH_CHECK C10_HIP_KERNEL_LAUNCH_CHECK
 #else
 #include <cub/cub.cuh>
@@ -209,11 +205,7 @@ UniqueAndCompactBatchedHashMapBased(
         cub::ArgIndexInputIterator index_it(indexes.data_ptr<int32_t>());
         auto input_it = thrust::make_transform_iterator(
             index_it,
-            #ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-            #else
             ::cuda::proclaim_return_type
-            #endif
             <::cuda::std::tuple<int64_t*, index_t, int32_t, bool>>(
                 [=, map = map.ref(cuco::find)] __device__(auto it)
                     -> ::cuda::std::tuple<int64_t*, index_t, int32_t, bool> {
@@ -247,11 +239,7 @@ UniqueAndCompactBatchedHashMapBased(
         auto unique_ids_offsets_dev_ptr =
             unique_ids_offsets_dev.data_ptr<int64_t>();
         auto output_it = thrust::make_tabulate_output_iterator(
-            #ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-            #else
             ::cuda::proclaim_return_type
-            #endif
             <void>(
                 [=, unique_ids_ptr = unique_ids.data_ptr<index_t>(),
                  part_ids_ptr =
@@ -276,11 +264,7 @@ UniqueAndCompactBatchedHashMapBased(
             DeviceSelect::If, input_it, output_it,
             unique_ids_offsets_dev_ptr + num_batches,
             offsets_ptr[2 * num_batches],
-            #ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-            #else
             ::cuda::proclaim_return_type
-            #endif
             <bool>([] __device__(const auto& t) {
               return ::cuda::std::get<3>(t);
             }));
@@ -300,12 +284,7 @@ UniqueAndCompactBatchedHashMapBased(
                       thrust::make_zip_iterator(
                           unique_ids_offsets_dev2.data_ptr<int64_t>(),
                           unique_ids_offsets.data_ptr<int64_t>()),
-                      #ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-            #else
-            ::cuda::proclaim_return_type
-            #endif
-            <
+                      ::cuda::proclaim_return_type<
                           thrust::tuple<int64_t, int64_t>>(
                           [=] __device__(const auto x) {
                             return thrust::make_tuple(x, x);
diff --git a/graphbolt/src/cuda/isin.cu b/graphbolt/src/cuda/isin.cu
index 2d25e763890f..98ff8963c335 100644
--- a/graphbolt/src/cuda/isin.cu
+++ b/graphbolt/src/cuda/isin.cu
@@ -47,7 +47,11 @@ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements) {
 }
 
 torch::Tensor Nonzero(torch::Tensor mask, bool logical_not) {
+#ifdef __HIPCC__
+  rocprim::counting_iterator<int64_t> iota(0);
+#else
   thrust::counting_iterator<int64_t> iota(0);
+#endif
   auto result = torch::empty_like(mask, torch::kInt64);
   auto mask_ptr = mask.data_ptr<bool>();
   auto result_ptr = result.data_ptr<int64_t>();
diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
index 5bf10e3c3dcf..a4f8d42e4877 100644
--- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h
+++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
@@ -11,6 +11,8 @@
 #ifdef __AMDGCN_WAVEFRONT_SIZE
 #undef WAVEFRONT_SIZE
 #define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE
+#elif defined(__AMDGCN__)
+#define WAVEFRONT_SIZE 64
 #endif
 
 /* this header file provides _*_sync functions, which is a hack only,
diff --git a/python/dgl/_ffi/_cython/function.pxi b/python/dgl/_ffi/_cython/function.pxi
index dfcace512c21..c1b25f386c85 100644
--- a/python/dgl/_ffi/_cython/function.pxi
+++ b/python/dgl/_ffi/_cython/function.pxi
@@ -92,7 +92,7 @@ cdef inline int make_arg(object arg,
         ptr = arg._dgl_handle
         value[0].v_handle = (<void*>ptr)
         tcode[0] = arg.__class__._dgl_tcode
-    elif isinstance(arg, (int, long)):
+    elif isinstance(arg, int):
         value[0].v_int64 = arg
         tcode[0] = kObjectInt
     elif isinstance(arg, float):

From 3b2a3b27bb9484085e5108a3c5095aa69f395e06 Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 17 Mar 2026 11:19:01 -0500
Subject: [PATCH 02/10] Minor change

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e97565f5da40..796c226a1be8 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ docker run \
 # setup your environment to make ROCm and PyTorch visible
 python3 -m ven dgl_venv
 source dgl_venv/bin/activate
-pip3 install Cython==3.0.12 scipy pandas tqdm pytest pydantic
+pip3 install cython scipy pandas tqdm pytest pydantic
 export AMDGPU_TARGETS=gfx942 // change this as needed 
 export PYTORCH_ROCM_ARCH=$AMDGPU_TARGETS
 export ROCM_PATH=/opt/rocm // change this as needed 

From a38a18ac5e1a9579c34f04cce0211485d48670b1 Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 17 Mar 2026 11:26:47 -0500
Subject: [PATCH 03/10] Fixed typo in readme

---
 README.md | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/README.md b/README.md
index 796c226a1be8..cfa46a36b77a 100644
--- a/README.md
+++ b/README.md
@@ -108,14 +108,6 @@ deactivate
 
 [Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ)
 
-[![Twitter](https://img.shields.io/twitter/follow/DGLGraph?style=social)](https://twitter.com/GraphDeep)
-
-[Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ)
-
-[![Twitter](https://img.shields.io/twitter/follow/DGLGraph?style=social)](https://twitter.com/GraphDeep)
-
-[Website](https://www.dgl.ai) | [A Blitz Introduction to DGL](https://docs.dgl.ai/tutorials/blitz/index.html) | Documentation ([Latest](https://www.dgl.ai/dgl_docs/) | [Official Examples](examples/README.md) | [Discussion Forum](https://discuss.dgl.ai) | [Slack Channel](https://join.slack.com/t/deep-graph-library/shared_invite/zt-eb4ict1g-xcg3PhZAFAB8p6dtKuP6xQ)
-
 DGL is an easy-to-use, high performance and scalable Python package for deep learning on graphs. DGL is framework agnostic, meaning if a deep graph model is a component of an end-to-end application, the rest of the logics can be implemented in any major frameworks, such as PyTorch, Apache MXNet or TensorFlow.
 
 <p align="center">

From c61cc21bc30a983485ab343c8e4f5450b451d0a6 Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 17 Mar 2026 14:45:02 -0500
Subject: [PATCH 04/10] Attempt at fixing the linting errors

---
 graphbolt/src/cnumpy.h                                 |  1 +
 graphbolt/src/cuda/cooperative_minibatching_utils.cu   |  3 +--
 graphbolt/src/cuda/extension/gpu_graph_cache.cu        | 10 ++++------
 graphbolt/src/cuda/extension/unique_and_compact_map.cu | 10 ++++------
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h
index fbd1daeb14dd..c54d6e588dd6 100644
--- a/graphbolt/src/cnumpy.h
+++ b/graphbolt/src/cnumpy.h
@@ -67,6 +67,7 @@ class counting_semaphore_impl {
     count_ += n;
     for (int i = 0; i < n; ++i) cv_.notify_one();
   }
+
  private:
   std::mutex mutex_;
   std::condition_variable cv_;
diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
index 6bd0f1005875..00ad8341900c 100644
--- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu
+++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
@@ -54,8 +54,7 @@ torch::Tensor RankAssignment(
         THRUST_CALL(
             transform, nodes_ptr, nodes_ptr + nodes.numel(), part_ids_ptr,
 
-            ::cuda::proclaim_return_type
-            <part_t>(
+            ::cuda::proclaim_return_type<part_t>(
                 [rank = static_cast<uint32_t>(rank),
                  world_size = static_cast<uint32_t>(
                      world_size)] __device__(index_t id) -> part_t {
diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu
index 3519f01842c2..3990ac498713 100644
--- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu
+++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu
@@ -506,12 +506,10 @@ std::tuple<torch::Tensor, std::vector<torch::Tensor>> GpuGraphCache::Replace(
               }
               if (edge_id_offsets) {
                 // Append the edge ids as the last element of the output.
-                output_edge_tensors.push_back(
-                    ops::IndptrEdgeIdsImpl(
-                        output_indptr, output_indptr.scalar_type(),
-                        *edge_id_offsets,
-                        static_cast<int64_t>(
-                            static_cast<indptr_t>(output_size))));
+                output_edge_tensors.push_back(ops::IndptrEdgeIdsImpl(
+                    output_indptr, output_indptr.scalar_type(),
+                    *edge_id_offsets,
+                    static_cast<int64_t>(static_cast<indptr_t>(output_size))));
               }
 
               {
diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
index 7c72be8c0b79..6a5625634065 100644
--- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu
+++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
@@ -205,8 +205,8 @@ UniqueAndCompactBatchedHashMapBased(
         cub::ArgIndexInputIterator index_it(indexes.data_ptr<int32_t>());
         auto input_it = thrust::make_transform_iterator(
             index_it,
-            ::cuda::proclaim_return_type
-            <::cuda::std::tuple<int64_t*, index_t, int32_t, bool>>(
+            ::cuda::proclaim_return_type<
+                ::cuda::std::tuple<int64_t*, index_t, int32_t, bool>>(
                 [=, map = map.ref(cuco::find)] __device__(auto it)
                     -> ::cuda::std::tuple<int64_t*, index_t, int32_t, bool> {
                   const auto i = it.key;
@@ -239,8 +239,7 @@ UniqueAndCompactBatchedHashMapBased(
         auto unique_ids_offsets_dev_ptr =
             unique_ids_offsets_dev.data_ptr<int64_t>();
         auto output_it = thrust::make_tabulate_output_iterator(
-            ::cuda::proclaim_return_type
-            <void>(
+            ::cuda::proclaim_return_type<void>(
                 [=, unique_ids_ptr = unique_ids.data_ptr<index_t>(),
                  part_ids_ptr =
                      part_ids ? part_ids->data_ptr<cuda::part_t>() : nullptr,
@@ -264,8 +263,7 @@ UniqueAndCompactBatchedHashMapBased(
             DeviceSelect::If, input_it, output_it,
             unique_ids_offsets_dev_ptr + num_batches,
             offsets_ptr[2 * num_batches],
-            ::cuda::proclaim_return_type
-            <bool>([] __device__(const auto& t) {
+            ::cuda::proclaim_return_type<bool>([] __device__(const auto& t) {
               return ::cuda::std::get<3>(t);
             }));
         auto unique_ids_offsets = torch::empty(

From 24fba8d496b15e21b69af8f51c7aa03616ab1d68 Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 17 Mar 2026 15:07:58 -0500
Subject: [PATCH 05/10] Fixing linting

---
 include/dgl/hip/hip_extensions/amd_warp_primitives.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
index a4f8d42e4877..d79cbf961055 100644
--- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h
+++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
@@ -85,7 +85,7 @@ __device__ inline void __sync_active_threads() {
   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
 }
 
-#if HIP_VERSION_MAJOR < 7 
+#if HIP_VERSION_MAJOR < 7
 __device__ inline void __syncwarp() {
   /* sync/barrier all threads in a warp */
   __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");

From ed3f23a4d85982c2ad530eaa6147c99f59d7b5cf Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Fri, 20 Mar 2026 16:33:27 -0500
Subject: [PATCH 06/10] Addressing comments on PR

---
 README.md                                     |  8 ++++++
 graphbolt/CMakeLists.txt                      |  6 +----
 graphbolt/src/cnumpy.h                        |  6 ++---
 .../hip/hip_extensions/amd_warp_primitives.h  | 25 +++++++------------
 .../gpu_cache/include/nv_gpu_cache.hpp        | 21 ++++++++++++++++
 5 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 4980db4a37c2..881c6f7ec455 100644
--- a/README.md
+++ b/README.md
@@ -426,3 +426,11 @@ If you use DGL in a scientific publication, we would appreciate citations to the
     journal={arXiv preprint arXiv:1909.01315}
 }
 ```
+
+## The Team
+
+DGL is developed and maintained by [NYU, NYU Shanghai, AWS Shanghai AI Lab, and AWS MXNet Science Team](https://www.dgl.ai/pages/about.html).
+
+## License
+
+DGL uses Apache License 2.0.
diff --git a/graphbolt/CMakeLists.txt b/graphbolt/CMakeLists.txt
index 4b99fed8b43a..9463b50a18c2 100644
--- a/graphbolt/CMakeLists.txt
+++ b/graphbolt/CMakeLists.txt
@@ -206,11 +206,7 @@ if(USE_CUDA OR USE_HIP)
   endif()
 
   # Enables libcudacxx/libhipcxx for gpu_cache.
-  # On HIP, cuda::binary_semaphore is unsupported, so we skip the libcudacxx
-  # code paths and use the fallback (plain int mutex, raw pointers).
-  if(NOT USE_HIP)
-    target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION)
-  endif()
+  target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION)
   include_directories(AFTER "../third_party/HugeCTR/gpu_cache/include")
   message(STATUS "Build graphbolt extension with HugeCTR GPU embedding cache.")
 
diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h
index c54d6e588dd6..eb39aee7135d 100644
--- a/graphbolt/src/cnumpy.h
+++ b/graphbolt/src/cnumpy.h
@@ -17,7 +17,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-#ifdef __HIP_PLATFORM_AMD__
+#ifdef __HIPCC__
 #include <condition_variable>
 #else
 #include <cuda/std/semaphore>
@@ -51,7 +51,7 @@ struct io_uring_queue_destroyer {
  * @brief Disk Numpy Fetecher class.
  */
 
-#ifdef __HIP_PLATFORM_AMD__
+#ifdef __HIPCC__
 template <int MaxCount>
 class counting_semaphore_impl {
  public:
@@ -76,7 +76,7 @@ class counting_semaphore_impl {
 #endif
 
 class OnDiskNpyArray : public torch::CustomClassHolder {
-#ifdef __HIP_PLATFORM_AMD__
+#ifdef __HIPCC__
   using counting_semaphore_t = counting_semaphore_impl<1024>;
 #else
   using counting_semaphore_t = ::cuda::std::counting_semaphore<1024>;
diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
index d79cbf961055..e3af4ddaf8c0 100644
--- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h
+++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
@@ -12,7 +12,7 @@
 #undef WAVEFRONT_SIZE
 #define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE
 #elif defined(__AMDGCN__)
-#define WAVEFRONT_SIZE 64
+#define WAVEFRONT_SIZE __builtin_amdgcn_wavefrontsize()
 #endif
 
 /* this header file provides _*_sync functions, which is a hack only,
@@ -60,11 +60,10 @@ __device__ inline int __thread_rank(lane_mask mask) {
 }
 
 __device__ inline unsigned int __mask_size(lane_mask mask) {
-#if WAVEFRONT_SIZE == 64
-  return __popcll(mask);
-#else
-  return __popc(mask);
-#endif
+  if (WAVEFRONT_SIZE == 64)
+    return __popcll(mask);
+  else
+    return __popc(mask);
 }
 
 __device__ inline int __thread_rank_to_lane_id(lane_mask mask, int i) {
@@ -296,11 +295,8 @@ __device__ inline lane_mask __match_any_sync(lane_mask mask, T value) {
   bmask = __branchmask();
 
   while (1) {
-#if WAVEFRONT_SIZE == 64
-    int i = __ffsll(bmask) - 1;
-#else
-    int i = __ffs((unsigned int)bmask) - 1;
-#endif
+    int i = (WAVEFRONT_SIZE == 64) ? __ffsll(bmask) - 1
+                                   : __ffs((unsigned int)bmask) - 1;
 
     if (i < 0) break;
 
@@ -327,11 +323,8 @@ __device__ inline lane_mask __match_any_sync(lane_mask mask, T value) {
 #endif
 
   while (1) {
-#if WAVEFRONT_SIZE == 64
-    int i = __ffsll(bmask) - 1;
-#else
-    int i = __ffs((unsigned int)bmask) - 1;
-#endif
+    int i = (WAVEFRONT_SIZE == 64) ? __ffsll(bmask) - 1
+                                   : __ffs((unsigned int)bmask) - 1;
 
     if (i < 0) break;
 
diff --git a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
index 21b0c812be5f..b060f67aac5d 100644
--- a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
+++ b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
@@ -24,8 +24,10 @@
 #include "gpu_cache_api.hpp"
 #ifdef LIBCUDACXX_VERSION
 #include <cuda/atomic>
+#ifndef DGL_USE_HIP
 #include <cuda/semaphore>
 #endif
+#endif
 
 #define SET_ASSOCIATIVITY 2
 // TODO: Properly for portable HIP code, this should be determined at runtime,
@@ -94,7 +96,26 @@ class gpu_cache : public gpu_cache_api<key_type> {
   using slabset = slab_set<set_associativity, key_type, warp_size>;
 #ifdef LIBCUDACXX_VERSION
   using atomic_ref_counter_type = cuda::atomic<ref_counter_type, cuda::thread_scope_device>;
+#ifdef DGL_USE_HIP
+  // cuda::binary_semaphore is not supported on HIP. Provide a device-side
+  // spinlock with the same acquire()/release() interface so that the
+  // LIBCUDACXX_VERSION kernel code compiles unchanged.
+  struct mutex {
+    int flag_;
+    __host__ __device__ explicit mutex(int initial = 1) : flag_(initial) {}
+    __device__ void acquire() {
+      while (atomicCAS(&flag_, 1, 0) == 0)
+        ;
+      __threadfence();
+    }
+    __device__ void release() {
+      __threadfence();
+      atomicExch(&flag_, 1);
+    }
+  };
+#else
   using mutex = cuda::binary_semaphore<cuda::thread_scope_device>;
+#endif
 #endif
 
  private:

From 9729b7b0c45713ebaee5135a9eca581da65b79d7 Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 24 Mar 2026 10:36:37 -0500
Subject: [PATCH 07/10] Addressing latest set of comments

---
 CMakePresets.json                             |  1 +
 README.md                                     | 65 ++++------------
 .../hip/hip_extensions/amd_warp_primitives.h  |  2 +-
 script/install_graphbolt_deps.sh              | 78 +++++++++++++++----
 4 files changed, 81 insertions(+), 65 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index 00926fbcc23b..7eab668b06bd 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -18,6 +18,7 @@
         "GPU_TARGETS": "gfx90a,gfx942",
         "CMAKE_HIP_ARCHITECTURES": "gfx90a,gfx942",
         "CMAKE_INSTALL_PREFIX": "/opt/rocm",
+        "USE_LIBXSMM": "OFF",
         "BUILD_CPP_TEST": "ON",
         "CMAKE_C_FLAGS": "-fdiagnostics-color=always",
         "CMAKE_CXX_FLAGS": "-fdiagnostics-color=always",
diff --git a/README.md b/README.md
index 881c6f7ec455..9330525b4dc7 100644
--- a/README.md
+++ b/README.md
@@ -46,64 +46,27 @@ docker run \
 - PyTorch >= 2.6.0
 
 ```bash
-# setup your environment to make ROCm and PyTorch visible
-python3 -m ven dgl_venv
-source dgl_venv/bin/activate
-pip3 install cython scipy pandas tqdm pytest pydantic
-export AMDGPU_TARGETS=gfx942 // change this as needed 
-export PYTORCH_ROCM_ARCH=$AMDGPU_TARGETS
-export ROCM_PATH=/opt/rocm // change this as needed 
-export PATH=$ROCM_PATH/bin:$PATH
-
-# note, the above might be set with:
-module load rocm pytorch
-# on a system where modules are used.
-
-# install dependencies
-export LIBHIPCXX_BRANCH=therock-7.11 // change as needed 
-git clone -b $LIBHIPCXX_BRANCH https://github.com/ROCm/libhipcxx.git
-
-cd libhipcxx && mkdir build && cd build
-
-export LIBHIPCXX_PATH=$HOME/libhipcxx_install // set to user desired path
-
-cmake -DCMAKE_PREFIX_PATH=$LIBHIPCXX_PATH -DCMAKE_CXX_COMPILER=`which amdclang++` -DCMAKE_C_COMPILER=`which amdclang` -DCMAKE_HIP_ARCHITECTURES=$AMDGPU_TARGETS -DGPU_TARGETS=$AMDGPU_TARGETS -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=OFF  -DUSE_LIBXSMM=OFF ..
-
-cd ../..
-
-export libhipcxx_DIR=$LIBHIPCXX_PATH
-
-export HIPCOLL_BRANCH=release/rocmds-25.10 // change as needed
-git clone -b $HIPCOLL_BRANCH https://github.com/ROCm/hipCollections.git
-
-export HIPCOLL_PATH=$HOME/hipcoll_install // set to user defined path
-
-cd hipColections && mkdir build && cd build
-
-cmake  -DCMAKE_INSTALL_PREFIX=$HIPCOLL_PATH -DINSTALL_CUCO=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_EXAMPLES=OFF  -DCMAKE_CXX_COMPILER=`which amdclang++` ..
-
-export hipCCL_DIR=$HIPCOLL_PATH
-
-cd ../..
-
+# get the code
 git clone --recurse-submodules https://github.com/ROCm/dgl
+cd dgl
 
-export DGL_PATH=$HOME/dgl_install
-
-cd dgl && mkdir build && cd build
-
-cmake  -DCMAKE_INSTALL_PREFIX=$DGL_PATH -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=$AMDGPU_TARGETS -DGPU_TARGETS=gfx942   -DCMAKE_CXX_COMPILER=`which amdclang++` -DCMAKE_C_COMPILER=`which amdclang` -DCMAKE_PREFIX_PATH=${hipCCL_DIR}/lib/cmake/cuco ..
-
-cd ../python
+# install graphbolt dependencies
+mkdir -p deps
+cd deps
+bash ../script/install_graphbolt_deps.sh
+cd ..
 
-pip3 install . --target=$DGL_PATH --no-deps --no-build-isolation
-export PYTHONPATH=$DGL_PATH:$PYTHONPATH
+# build dgl
+cmake --preset rocm <OPTIONAL_CMAKE_ARGS>
+cmake --build build
 
-cd ../..
+# install the python bindings (should be done inside a virtual env)
+cd python
+python -m pip install -e .
+python -m build --wheel
 
 # Check installation with python tests
 bash tests/scripts/task_unit_test_rocm.sh pytorch gpu
-deactivate
 ```
 
 <p align="center">
diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
index e3af4ddaf8c0..62b9e52f87f5 100644
--- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h
+++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
@@ -12,7 +12,7 @@
 #undef WAVEFRONT_SIZE
 #define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE
 #elif defined(__AMDGCN__)
-#define WAVEFRONT_SIZE __builtin_amdgcn_wavefrontsize()
+#define WAVEFRONT_SIZE 64
 #endif
 
 /* this header file provides _*_sync functions, which is a hack only,
diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh
index 15662accaf11..6778ae11b6ad 100644
--- a/script/install_graphbolt_deps.sh
+++ b/script/install_graphbolt_deps.sh
@@ -1,22 +1,74 @@
 #!/usr/bin/env bash
 
-ROCM_ROOT=/opt/rocm
+usage() {
+  cat <<EOF
+Usage: $(basename "$0") [OPTIONS]
+
+Install graphbolt dependencies (hipCollections, rocPRIM patches).
+
+Options:
+  --rocm-root DIR           ROCm installation root. Takes priority over \$ROCM_ROOT and \$ROCM_PATH
+                            env variables. Falls back to \$ROCM_ROOT, then \$ROCM_PATH, then /opt/rocm.
+  --install-prefix DIR      Installation prefix for dependencies (default: same as rocm-root)
+  --hipcollections-branch B Branch to clone hipCollections from (default: release/rocmds-25.10)
+  --dry-run                 Print the commands that would be executed without running them
+  -h, --help                Show this help message
+EOF
+  exit 0
+}
+
+_ROCM_ENV_SET=false
+if [[ -n "${ROCM_ROOT}" || -n "${ROCM_PATH}" ]]; then
+  _ROCM_ENV_SET=true
+fi
+
+ROCM_ROOT="${ROCM_ROOT:-${ROCM_PATH:-/opt/rocm}}"
+INSTALL_PREFIX=""
+HIPCOLLECTIONS_BRANCH="release/rocmds-25.10"
+DRY_RUN=false
+_ROCM_FLAG_SET=false
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --rocm-root)         ROCM_ROOT="$2"; _ROCM_FLAG_SET=true; shift 2 ;;
+    --install-prefix)    INSTALL_PREFIX="$2"; shift 2 ;;
+    --hipcollections-branch) HIPCOLLECTIONS_BRANCH="$2"; shift 2 ;;
+    --dry-run)           DRY_RUN=true; shift ;;
+    -h|--help)           usage ;;
+    *) echo "Unknown option: $1"; usage ;;
+  esac
+done
+
+INSTALL_PREFIX="${INSTALL_PREFIX:-${ROCM_ROOT}}"
+
+if ! $_ROCM_ENV_SET && ! $_ROCM_FLAG_SET; then
+  echo "Neither ROCM_ROOT nor ROCM_PATH is set, defaulting to /opt/rocm."
+fi
+echo "ROCM_ROOT:      ${ROCM_ROOT}"
+echo "INSTALL_PREFIX:  ${INSTALL_PREFIX}"
+
+run() {
+  if $DRY_RUN; then
+    echo "[dry-run] $*"
+  else
+    "$@"
+  fi
+}
 
 export CC=${ROCM_ROOT}/llvm/bin/clang
 export CXX=${ROCM_ROOT}/llvm/bin/clang++
 
 set -x
-INSTALL_PREFIX=${ROCM_ROOT}
 FILE_SOURCE_DIR=$(dirname $(realpath $0))
 DEPS_DIR=$(pwd)
-export CMAKE_PREFIX_PATH="/opt/rocm/hip/lib/cmake;/opt/rocm/lib/cmake"  
+export CMAKE_PREFIX_PATH="${ROCM_ROOT}/hip/lib/cmake;${ROCM_ROOT}/lib/cmake"
 
-git clone https://github.com/ROCm/hipCollections.git -b release/rocmds-25.10
-export RAPIDS_CMAKE_SCRIPT_BRANCH=release/rocmds-25.10
-cd hipCollections
-cmake -B build \
+run git clone https://github.com/ROCm/hipCollections.git -b "${HIPCOLLECTIONS_BRANCH}"
+export RAPIDS_CMAKE_SCRIPT_BRANCH="${HIPCOLLECTIONS_BRANCH}"
+run cd hipCollections
+run cmake -B build \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DINSTALL_CUCO=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_EXAMPLES=OFF
-cmake --build build --target install
+run cmake --build build --target install
 cd ${DEPS_DIR}
 
 # TODO (#21) this is an unacceptable way to do this,
@@ -27,10 +79,10 @@ cd ${DEPS_DIR}
 # We only use semaphores for a counter of IO operations in graphbolt, 
 # that only runs on the host (not on the device) so we should be "safe"
 # to use this for now.
-sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore
-sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore
-sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore
-sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore
+run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore
+run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore
+run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore
+run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore
 
 # TODO (#22) remove this once the patches are merged
 # the patches for this were merged in https://github.com/ROCm/rocm-libraries/pull/1883
@@ -38,4 +90,4 @@ sed -i '/#error semaphore is not supported on AMD hardware and should not be inc
 
 # Right now we need to patch the rocPRIM headers to fix the build because these
 # config headers are missing gfx942 (I've added them manually)
-cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.
+run cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.

From 9a82d43ddded5dee598af5908151175be81e67fb Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 24 Mar 2026 10:50:46 -0500
Subject: [PATCH 08/10] Forgot to include one reverted change to CMakeLists.txt

---
 CMakeLists.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a7121087869..d10d9cd75d33 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -206,11 +206,6 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
   set(USE_LIBXSMM OFF)
 endif()
 
-if(USE_HIP AND USE_LIBXSMM)
-  message(STATUS "Disabling LIBXSMM for HIP builds (incompatible with lld linker).")
-  set(USE_LIBXSMM OFF)
-endif()
-
 # Source file lists
 file(GLOB DGL_SRC
   src/*.cc

From cc9e5505b6e071702fb849c04366274e478820a8 Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 24 Mar 2026 12:00:28 -0500
Subject: [PATCH 09/10] Making install_graphbolt_deps.sh executable

---
 script/install_graphbolt_deps.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 script/install_graphbolt_deps.sh

diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh
old mode 100644
new mode 100755

From 324eaf1934faa8905630b7f2c1124af81cca12f6 Mon Sep 17 00:00:00 2001
From: gcapodagAMD <Giacomo.Capodaglio@amd.com>
Date: Tue, 24 Mar 2026 12:09:42 -0500
Subject: [PATCH 10/10] Addressing James' comments

---
 .../hip/hip_extensions/amd_warp_primitives.h  | 23 ++++++++++++-------
 script/install_graphbolt_deps.sh              |  8 -------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
index 62b9e52f87f5..d79cbf961055 100644
--- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h
+++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
@@ -60,10 +60,11 @@ __device__ inline int __thread_rank(lane_mask mask) {
 }
 
 __device__ inline unsigned int __mask_size(lane_mask mask) {
-  if (WAVEFRONT_SIZE == 64)
-    return __popcll(mask);
-  else
-    return __popc(mask);
+#if WAVEFRONT_SIZE == 64
+  return __popcll(mask);
+#else
+  return __popc(mask);
+#endif
 }
 
 __device__ inline int __thread_rank_to_lane_id(lane_mask mask, int i) {
@@ -295,8 +296,11 @@ __device__ inline lane_mask __match_any_sync(lane_mask mask, T value) {
   bmask = __branchmask();
 
   while (1) {
-    int i = (WAVEFRONT_SIZE == 64) ? __ffsll(bmask) - 1
-                                   : __ffs((unsigned int)bmask) - 1;
+#if WAVEFRONT_SIZE == 64
+    int i = __ffsll(bmask) - 1;
+#else
+    int i = __ffs((unsigned int)bmask) - 1;
+#endif
 
     if (i < 0) break;
 
@@ -323,8 +327,11 @@ __device__ inline lane_mask __match_any_sync(lane_mask mask, T value) {
 #endif
 
   while (1) {
-    int i = (WAVEFRONT_SIZE == 64) ? __ffsll(bmask) - 1
-                                   : __ffs((unsigned int)bmask) - 1;
+#if WAVEFRONT_SIZE == 64
+    int i = __ffsll(bmask) - 1;
+#else
+    int i = __ffs((unsigned int)bmask) - 1;
+#endif
 
     if (i < 0) break;
 
diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh
index 6778ae11b6ad..4aa3e145eb32 100755
--- a/script/install_graphbolt_deps.sh
+++ b/script/install_graphbolt_deps.sh
@@ -76,14 +76,6 @@ cd ${DEPS_DIR}
 # This was implicitly not allowed in previous releases we were using, 
 # but with v2.7.0 they are explicitly not allowed.
 
-# We only use semaphores for a counter of IO operations in graphbolt, 
-# that only runs on the host (not on the device) so we should be "safe"
-# to use this for now.
-run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore
-run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore
-run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore
-run sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore
-
 # TODO (#22) remove this once the patches are merged
 # the patches for this were merged in https://github.com/ROCm/rocm-libraries/pull/1883
 # but may take more time to be released.