Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"GPU_TARGETS": "gfx90a,gfx942",
"CMAKE_HIP_ARCHITECTURES": "gfx90a,gfx942",
"CMAKE_INSTALL_PREFIX": "/opt/rocm",
"USE_LIBXSMM": "OFF",
"BUILD_CPP_TEST": "ON",
"CMAKE_C_FLAGS": "-fdiagnostics-color=always",
"CMAKE_CXX_FLAGS": "-fdiagnostics-color=always",
Expand Down
2 changes: 1 addition & 1 deletion graphbolt/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ if(USE_CUDA OR USE_HIP)
set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES LINKER_LANGUAGE HIP)
endif()

# Enables libcudacxx/libhipcxx for gpu_cache.
# Enables libcudacxx/libhipcxx for gpu_cache.
target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION)
include_directories(AFTER "../third_party/HugeCTR/gpu_cache/include")
message(STATUS "Build graphbolt extension with HugeCTR GPU embedding cache.")
Expand Down
34 changes: 33 additions & 1 deletion graphbolt/src/cnumpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#ifdef __HIPCC__
#include <condition_variable>
#else
#include <cuda/std/semaphore>
#endif
#include <memory>
#include <mutex>
#include <string>
Expand Down Expand Up @@ -46,9 +50,37 @@ struct io_uring_queue_destroyer {
/**
* @brief Disk Numpy Fetcher class.
*/

#ifdef __HIPCC__
template <int MaxCount>
class counting_semaphore_impl {
 public:
  /// @brief Host-only stand-in for cuda::std::counting_semaphore on HIP
  /// builds, where libhipcxx does not ship <cuda/std/semaphore>.
  /// Implemented with a mutex + condition variable.
  /// @param initial Initial number of available permits.
  explicit counting_semaphore_impl(int initial) : count_(initial) {}

  /// Mirrors std::counting_semaphore<MaxCount>::max(). The bound is not
  /// enforced at runtime (matching typical semaphore implementations).
  static constexpr int max() { return MaxCount; }

  /// Blocks until at least one permit is available, then consumes it.
  void acquire() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return count_ > 0; });
    --count_;
  }

  /// Returns n permits and wakes waiting threads.
  /// Notification happens after the lock is dropped so woken threads do not
  /// immediately block on the still-held mutex; a batch release uses a
  /// single notify_all instead of n notify_one calls.
  void release(int n = 1) {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      count_ += n;
    }
    if (n == 1) {
      cv_.notify_one();
    } else {
      cv_.notify_all();
    }
  }

 private:
  std::mutex mutex_;            // Guards count_.
  std::condition_variable cv_;  // Signaled on release().
  int count_;                   // Currently available permits.
};
#endif

class OnDiskNpyArray : public torch::CustomClassHolder {
// No user will need more than 1024 io_uring queues.
#ifdef __HIPCC__
using counting_semaphore_t = counting_semaphore_impl<1024>;
#else
using counting_semaphore_t = ::cuda::std::counting_semaphore<1024>;
#endif

public:
static constexpr int kGroupSize = 256;
Expand Down
4 changes: 4 additions & 0 deletions graphbolt/src/cuda/isin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements) {
}

torch::Tensor Nonzero(torch::Tensor mask, bool logical_not) {
#ifdef __HIPCC__
rocprim::counting_iterator<int64_t> iota(0);
#else
thrust::counting_iterator<int64_t> iota(0);
#endif
auto result = torch::empty_like(mask, torch::kInt64);
auto mask_ptr = mask.data_ptr<bool>();
auto result_ptr = result.data_ptr<int64_t>();
Expand Down
4 changes: 3 additions & 1 deletion include/dgl/hip/hip_extensions/amd_warp_primitives.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#ifdef __AMDGCN_WAVEFRONT_SIZE
#undef WAVEFRONT_SIZE
#define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE
#elif defined(__AMDGCN__)
#define WAVEFRONT_SIZE 64
#endif

/* this header file provides _*_sync functions, which is a hack only,
Expand Down Expand Up @@ -83,7 +85,7 @@ __device__ inline void __sync_active_threads() {
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
}

#if HIP_VERSION_MAJOR < 7
#if HIP_VERSION_MAJOR < 7
__device__ inline void __syncwarp() {
/* sync/barrier all threads in a warp */
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
Expand Down
2 changes: 1 addition & 1 deletion python/dgl/_ffi/_cython/function.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ cdef inline int make_arg(object arg,
ptr = arg._dgl_handle
value[0].v_handle = (<void*>ptr)
tcode[0] = arg.__class__._dgl_tcode
elif isinstance(arg, (int, long)):
elif isinstance(arg, int):
Comment thread
jamesETsmith marked this conversation as resolved.
value[0].v_int64 = arg
tcode[0] = kObjectInt
elif isinstance(arg, float):
Expand Down
78 changes: 61 additions & 17 deletions script/install_graphbolt_deps.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,41 +1,85 @@
#!/usr/bin/env bash

ROCM_ROOT=/opt/rocm
usage() {
cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Install graphbolt dependencies (hipCollections, rocPRIM patches).

Options:
--rocm-root DIR ROCm installation root. Takes priority over \$ROCM_ROOT and \$ROCM_PATH
env variables. Falls back to \$ROCM_ROOT, then \$ROCM_PATH, then /opt/rocm.
--install-prefix DIR Installation prefix for dependencies (default: same as rocm-root)
--hipcollections-branch B Branch to clone hipCollections from (default: release/rocmds-25.10)
--dry-run Print the commands that would be executed without running them
-h, --help Show this help message
EOF
exit 0
}

_ROCM_ENV_SET=false
if [[ -n "${ROCM_ROOT}" || -n "${ROCM_PATH}" ]]; then
_ROCM_ENV_SET=true
fi

ROCM_ROOT="${ROCM_ROOT:-${ROCM_PATH:-/opt/rocm}}"
INSTALL_PREFIX=""
HIPCOLLECTIONS_BRANCH="release/rocmds-25.10"
DRY_RUN=false
_ROCM_FLAG_SET=false

while [[ $# -gt 0 ]]; do
case "$1" in
--rocm-root) ROCM_ROOT="$2"; _ROCM_FLAG_SET=true; shift 2 ;;
--install-prefix) INSTALL_PREFIX="$2"; shift 2 ;;
--hipcollections-branch) HIPCOLLECTIONS_BRANCH="$2"; shift 2 ;;
--dry-run) DRY_RUN=true; shift ;;
-h|--help) usage ;;
*) echo "Unknown option: $1"; usage ;;
esac
done

INSTALL_PREFIX="${INSTALL_PREFIX:-${ROCM_ROOT}}"

if ! $_ROCM_ENV_SET && ! $_ROCM_FLAG_SET; then
echo "Neither ROCM_ROOT nor ROCM_PATH is set, defaulting to /opt/rocm."
fi
echo "ROCM_ROOT: ${ROCM_ROOT}"
echo "INSTALL_PREFIX: ${INSTALL_PREFIX}"

run() {
if $DRY_RUN; then
echo "[dry-run] $*"
else
"$@"
fi
}

export CC=${ROCM_ROOT}/llvm/bin/clang
export CXX=${ROCM_ROOT}/llvm/bin/clang++

set -x
INSTALL_PREFIX=${ROCM_ROOT}
FILE_SOURCE_DIR=$(dirname $(realpath $0))
DEPS_DIR=$(pwd)
export CMAKE_PREFIX_PATH="/opt/rocm/hip/lib/cmake;/opt/rocm/lib/cmake"
export CMAKE_PREFIX_PATH="${ROCM_ROOT}/hip/lib/cmake;${ROCM_ROOT}/lib/cmake"

git clone https://github.com/ROCm/hipCollections.git -b release/rocmds-25.10
export RAPIDS_CMAKE_SCRIPT_BRANCH=release/rocmds-25.10
cd hipCollections
cmake -B build \
run git clone https://github.com/ROCm/hipCollections.git -b "${HIPCOLLECTIONS_BRANCH}"
Comment thread
gcapodagAMD marked this conversation as resolved.
export RAPIDS_CMAKE_SCRIPT_BRANCH="${HIPCOLLECTIONS_BRANCH}"
run cd hipCollections
run cmake -B build \
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DINSTALL_CUCO=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_EXAMPLES=OFF
cmake --build build --target install
run cmake --build build --target install
cd ${DEPS_DIR}

# TODO (#21) this is an unacceptable way to do this,
# see https://github.com/ROCm/libhipcxx/issues/10 for more details
# This was implicitly not allowed in previous releases we were using,
# but with v2.7.0 they are explicitly not allowed.

# We only use semaphores for a counter of IO operations in graphbolt,
# that only runs on the host (not on the device) so we should be "safe"
# to use this for now.
sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore
sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore
sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore
sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore

# TODO (#22) remove this once the patches are merged
# the patches for this were merged in https://github.com/ROCm/rocm-libraries/pull/1883
# but may take more time to be released.

# Right now we need to patch the rocPRIM headers to fix the build because these
# config headers are missing gfx942 (I've added them manually)
cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.
run cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.
21 changes: 21 additions & 0 deletions third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@
#include "gpu_cache_api.hpp"
#ifdef LIBCUDACXX_VERSION
#include <cuda/atomic>
#ifndef DGL_USE_HIP
#include <cuda/semaphore>
#endif
#endif

#define SET_ASSOCIATIVITY 2
// TODO: Properly for portable HIP code, this should be determined at runtime,
Expand Down Expand Up @@ -94,7 +96,26 @@ class gpu_cache : public gpu_cache_api<key_type> {
using slabset = slab_set<set_associativity, key_type, warp_size>;
#ifdef LIBCUDACXX_VERSION
using atomic_ref_counter_type = cuda::atomic<ref_counter_type, cuda::thread_scope_device>;
#ifdef DGL_USE_HIP
// cuda::binary_semaphore is not supported on HIP. Provide a device-side
// spinlock with the same acquire()/release() interface so that the
// LIBCUDACXX_VERSION kernel code compiles unchanged.
// Device-side spinlock that stands in for cuda::binary_semaphore on HIP,
// exposing the same acquire()/release() interface.
// flag_ == 1 means "unlocked", flag_ == 0 means "held".
struct mutex {
int flag_;  // Lock state; touched only through atomics in device code.
// Starts unlocked by default (initial == 1 -> one permit available).
__host__ __device__ explicit mutex(int initial = 1) : flag_(initial) {}
// Spins until the CAS 1 -> 0 succeeds, i.e. this thread wins the lock.
// The trailing __threadfence() orders subsequent memory accesses after
// the acquisition (acquire-style fence).
// NOTE(review): a bare CAS spin can live-lock when multiple lanes of the
// same wavefront contend without independent forward progress —
// presumably only one lane per warp calls acquire(); verify in callers.
__device__ void acquire() {
while (atomicCAS(&flag_, 1, 0) == 0)
;
__threadfence();
}
// Publishes all prior writes (release-style fence), then sets the flag
// back to 1 so another thread's CAS can succeed.
__device__ void release() {
__threadfence();
atomicExch(&flag_, 1);
}
};
#else
using mutex = cuda::binary_semaphore<cuda::thread_scope_device>;
#endif
#endif

private:
Expand Down
Loading