diff --git a/CMakePresets.json b/CMakePresets.json
index 00926fbcc23b..7eab668b06bd 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -18,6 +18,7 @@
         "GPU_TARGETS": "gfx90a,gfx942",
         "CMAKE_HIP_ARCHITECTURES": "gfx90a,gfx942",
         "CMAKE_INSTALL_PREFIX": "/opt/rocm",
+        "USE_LIBXSMM": "OFF",
         "BUILD_CPP_TEST": "ON",
         "CMAKE_C_FLAGS": "-fdiagnostics-color=always",
         "CMAKE_CXX_FLAGS": "-fdiagnostics-color=always",
diff --git a/graphbolt/CMakeLists.txt b/graphbolt/CMakeLists.txt
index 689774244285..9463b50a18c2 100644
--- a/graphbolt/CMakeLists.txt
+++ b/graphbolt/CMakeLists.txt
@@ -205,7 +205,7 @@ if(USE_CUDA OR USE_HIP)
   set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES LINKER_LANGUAGE HIP)
   endif()
 
-  # Enables libcudacxx/libhipcxx for gpu_cache. 
+  # Enables libcudacxx/libhipcxx for gpu_cache.
   target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION)
   include_directories(AFTER "../third_party/HugeCTR/gpu_cache/include")
   message(STATUS "Build graphbolt extension with HugeCTR GPU embedding cache.")
diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h
index 793116580963..eb39aee7135d 100644
--- a/graphbolt/src/cnumpy.h
+++ b/graphbolt/src/cnumpy.h
@@ -17,7 +17,11 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#ifdef __HIPCC__
+#include <condition_variable>
+#else
 #include <cuda/std/semaphore>
+#endif
 #include <memory>
 #include <mutex>
 #include <string>
@@ -46,9 +50,37 @@ struct io_uring_queue_destroyer {
 /**
  * @brief Disk Numpy Fetecher class.
  */
+
+#ifdef __HIPCC__
+template <int MaxCount>
+class counting_semaphore_impl {
+ public:
+  explicit counting_semaphore_impl(int initial) : count_(initial) {}
+  static constexpr int max() { return MaxCount; }
+  void acquire() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cv_.wait(lock, [this] { return count_ > 0; });
+    --count_;
+  }
+  void release(int n = 1) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    count_ += n;
+    for (int i = 0; i < n; ++i) cv_.notify_one();
+  }
+
+ private:
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  int count_;
+};
+#endif
+
 class OnDiskNpyArray : public torch::CustomClassHolder {
-  // No user will need more than 1024 io_uring queues.
+#ifdef __HIPCC__
+  using counting_semaphore_t = counting_semaphore_impl<1024>;
+#else
   using counting_semaphore_t = ::cuda::std::counting_semaphore<1024>;
+#endif
 
  public:
   static constexpr int kGroupSize = 256;
diff --git a/graphbolt/src/cuda/isin.cu b/graphbolt/src/cuda/isin.cu
index 2d25e763890f..98ff8963c335 100644
--- a/graphbolt/src/cuda/isin.cu
+++ b/graphbolt/src/cuda/isin.cu
@@ -47,7 +47,11 @@ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements) {
 }
 
 torch::Tensor Nonzero(torch::Tensor mask, bool logical_not) {
+#ifdef __HIPCC__
+  rocprim::counting_iterator<int64_t> iota(0);
+#else
   thrust::counting_iterator<int64_t> iota(0);
+#endif
   auto result = torch::empty_like(mask, torch::kInt64);
   auto mask_ptr = mask.data_ptr<bool>();
   auto result_ptr = result.data_ptr<int64_t>();
diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
index 5bf10e3c3dcf..d79cbf961055 100644
--- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h
+++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h
@@ -11,6 +11,8 @@
 #ifdef __AMDGCN_WAVEFRONT_SIZE
 #undef WAVEFRONT_SIZE
 #define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE
+#elif defined(__AMDGCN__)
+#define WAVEFRONT_SIZE 64
 #endif
 
 /* this header file provides _*_sync functions, which is a hack only,
@@ -83,7 +85,7 @@ __device__ inline void __sync_active_threads() {
   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
 }
 
-#if HIP_VERSION_MAJOR < 7 
+#if HIP_VERSION_MAJOR < 7
 __device__ inline void __syncwarp() {
   /* sync/barrier all threads in a warp */
   __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
diff --git a/python/dgl/_ffi/_cython/function.pxi b/python/dgl/_ffi/_cython/function.pxi
index dfcace512c21..c1b25f386c85 100644
--- a/python/dgl/_ffi/_cython/function.pxi
+++ b/python/dgl/_ffi/_cython/function.pxi
@@ -92,7 +92,7 @@ cdef inline int make_arg(object arg,
         ptr = arg._dgl_handle
         value[0].v_handle = (<void*>ptr)
         tcode[0] = arg.__class__._dgl_tcode
-    elif isinstance(arg, (int, long)):
+    elif isinstance(arg, int):
         value[0].v_int64 = arg
         tcode[0] = kObjectInt
     elif isinstance(arg, float):
diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh
old mode 100644
new mode 100755
index 15662accaf11..4aa3e145eb32
--- a/script/install_graphbolt_deps.sh
+++ b/script/install_graphbolt_deps.sh
@@ -1,22 +1,74 @@
 #!/usr/bin/env bash
 
-ROCM_ROOT=/opt/rocm
+usage() {
+  cat <<EOF
+Usage: $(basename "$0") [OPTIONS]
+
+Install graphbolt dependencies (hipCollections, rocPRIM patches).
+
+Options:
+  --rocm-root DIR           ROCm installation root. Takes priority over \$ROCM_ROOT and \$ROCM_PATH
+                            env variables. Falls back to \$ROCM_ROOT, then \$ROCM_PATH, then /opt/rocm.
+  --install-prefix DIR      Installation prefix for dependencies (default: same as rocm-root)
+  --hipcollections-branch B Branch to clone hipCollections from (default: release/rocmds-25.10)
+  --dry-run                 Print the commands that would be executed without running them
+  -h, --help                Show this help message
+EOF
+  exit 0
+}
+
+_ROCM_ENV_SET=false
+if [[ -n "${ROCM_ROOT}" || -n "${ROCM_PATH}" ]]; then
+  _ROCM_ENV_SET=true
+fi
+
+ROCM_ROOT="${ROCM_ROOT:-${ROCM_PATH:-/opt/rocm}}"
+INSTALL_PREFIX=""
+HIPCOLLECTIONS_BRANCH="release/rocmds-25.10"
+DRY_RUN=false
+_ROCM_FLAG_SET=false
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --rocm-root)         ROCM_ROOT="$2"; _ROCM_FLAG_SET=true; shift 2 ;;
+    --install-prefix)    INSTALL_PREFIX="$2"; shift 2 ;;
+    --hipcollections-branch) HIPCOLLECTIONS_BRANCH="$2"; shift 2 ;;
+    --dry-run)           DRY_RUN=true; shift ;;
+    -h|--help)           usage ;;
+    *) echo "Unknown option: $1"; usage ;;
+  esac
+done
+
+INSTALL_PREFIX="${INSTALL_PREFIX:-${ROCM_ROOT}}"
+
+if ! $_ROCM_ENV_SET && ! $_ROCM_FLAG_SET; then
+  echo "Neither ROCM_ROOT nor ROCM_PATH is set, defaulting to /opt/rocm."
+fi
+echo "ROCM_ROOT:      ${ROCM_ROOT}"
+echo "INSTALL_PREFIX:  ${INSTALL_PREFIX}"
+
+run() {
+  if $DRY_RUN; then
+    echo "[dry-run] $*"
+  else
+    "$@"
+  fi
+}
 
 export CC=${ROCM_ROOT}/llvm/bin/clang
 export CXX=${ROCM_ROOT}/llvm/bin/clang++
 
 set -x
-INSTALL_PREFIX=${ROCM_ROOT}
 FILE_SOURCE_DIR=$(dirname $(realpath $0))
 DEPS_DIR=$(pwd)
-export CMAKE_PREFIX_PATH="/opt/rocm/hip/lib/cmake;/opt/rocm/lib/cmake"  
+export CMAKE_PREFIX_PATH="${ROCM_ROOT}/hip/lib/cmake;${ROCM_ROOT}/lib/cmake"
 
-git clone https://github.com/ROCm/hipCollections.git -b release/rocmds-25.10
-export RAPIDS_CMAKE_SCRIPT_BRANCH=release/rocmds-25.10
-cd hipCollections
-cmake -B build \
+run git clone https://github.com/ROCm/hipCollections.git -b "${HIPCOLLECTIONS_BRANCH}"
+export RAPIDS_CMAKE_SCRIPT_BRANCH="${HIPCOLLECTIONS_BRANCH}"
+run cd hipCollections
+run cmake -B build \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DINSTALL_CUCO=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_EXAMPLES=OFF
-cmake --build build --target install
+run cmake --build build --target install
 cd ${DEPS_DIR}
 
 # TODO (#21) this is an unacceptable way to do this,
@@ -24,18 +76,10 @@ cd ${DEPS_DIR}
 # This was implicitly not allowed in previous releases we were using, 
 # but with v2.7.0 they are explicitly not allowed.
 
-# We only use semaphores for a counter of IO operations in graphbolt, 
-# that only runs on the host (not on the device) so we should be "safe"
-# to use this for now.
-sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore
-sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore
-sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore
-sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore
-
 # TODO (#22) remove this once the patches are merged
 # the patches for this were merged in https://github.com/ROCm/rocm-libraries/pull/1883
 # but may take more time to be released.
 
 # Right now we need to patch the rocPRIM headers to fix the build because these
 # config headers are missing gfx942 (I've added them manually)
-cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.
+run cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.
diff --git a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
index 21b0c812be5f..b060f67aac5d 100644
--- a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
+++ b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
@@ -24,8 +24,10 @@
 #include "gpu_cache_api.hpp"
 #ifdef LIBCUDACXX_VERSION
 #include <cuda/atomic>
+#ifndef DGL_USE_HIP
 #include <cuda/semaphore>
 #endif
+#endif
 
 #define SET_ASSOCIATIVITY 2
 // TODO: Properly for portable HIP code, this should be determined at runtime,
@@ -94,7 +96,26 @@ class gpu_cache : public gpu_cache_api<key_type> {
   using slabset = slab_set<set_associativity, key_type, warp_size>;
 #ifdef LIBCUDACXX_VERSION
   using atomic_ref_counter_type = cuda::atomic<ref_counter_type, cuda::thread_scope_device>;
+#ifdef DGL_USE_HIP
+  // cuda::binary_semaphore is not supported on HIP. Provide a device-side
+  // spinlock with the same acquire()/release() interface so that the
+  // LIBCUDACXX_VERSION kernel code compiles unchanged.
+  struct mutex {
+    int flag_;
+    __host__ __device__ explicit mutex(int initial = 1) : flag_(initial) {}
+    __device__ void acquire() {
+      while (atomicCAS(&flag_, 1, 0) == 0)
+        ;
+      __threadfence();
+    }
+    __device__ void release() {
+      __threadfence();
+      atomicExch(&flag_, 1);
+    }
+  };
+#else
   using mutex = cuda::binary_semaphore<cuda::thread_scope_device>;
+#endif
 #endif
 
  private: