diff --git a/CMakePresets.json b/CMakePresets.json index 00926fbcc23b..7eab668b06bd 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -18,6 +18,7 @@ "GPU_TARGETS": "gfx90a,gfx942", "CMAKE_HIP_ARCHITECTURES": "gfx90a,gfx942", "CMAKE_INSTALL_PREFIX": "/opt/rocm", + "USE_LIBXSMM": "OFF", "BUILD_CPP_TEST": "ON", "CMAKE_C_FLAGS": "-fdiagnostics-color=always", "CMAKE_CXX_FLAGS": "-fdiagnostics-color=always", diff --git a/graphbolt/CMakeLists.txt b/graphbolt/CMakeLists.txt index 689774244285..9463b50a18c2 100644 --- a/graphbolt/CMakeLists.txt +++ b/graphbolt/CMakeLists.txt @@ -205,7 +205,7 @@ if(USE_CUDA OR USE_HIP) set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES LINKER_LANGUAGE HIP) endif() - # Enables libcudacxx/libhipcxx for gpu_cache. + # Enables libcudacxx/libhipcxx for gpu_cache. target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION) include_directories(AFTER "../third_party/HugeCTR/gpu_cache/include") message(STATUS "Build graphbolt extension with HugeCTR GPU embedding cache.") diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index 793116580963..eb39aee7135d 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -17,7 +17,11 @@ #include #include #include +#ifdef __HIPCC__ +#include +#else #include +#endif #include #include #include @@ -46,9 +50,37 @@ struct io_uring_queue_destroyer { /** * @brief Disk Numpy Fetecher class. */ + +#ifdef __HIPCC__ +template +class counting_semaphore_impl { + public: + explicit counting_semaphore_impl(int initial) : count_(initial) {} + static constexpr int max() { return MaxCount; } + void acquire() { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { return count_ > 0; }); + --count_; + } + void release(int n = 1) { + std::lock_guard lock(mutex_); + count_ += n; + for (int i = 0; i < n; ++i) cv_.notify_one(); + } + + private: + std::mutex mutex_; + std::condition_variable cv_; + int count_; +}; +#endif + class OnDiskNpyArray : public torch::CustomClassHolder { - // No user will need more than 1024 io_uring queues. +#ifdef __HIPCC__ + using counting_semaphore_t = counting_semaphore_impl<1024>; +#else using counting_semaphore_t = ::cuda::std::counting_semaphore<1024>; +#endif public: static constexpr int kGroupSize = 256; diff --git a/graphbolt/src/cuda/isin.cu b/graphbolt/src/cuda/isin.cu index 2d25e763890f..98ff8963c335 100644 --- a/graphbolt/src/cuda/isin.cu +++ b/graphbolt/src/cuda/isin.cu @@ -47,7 +47,11 @@ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements) { } torch::Tensor Nonzero(torch::Tensor mask, bool logical_not) { +#ifdef __HIPCC__ + rocprim::counting_iterator iota(0); +#else thrust::counting_iterator iota(0); +#endif auto result = torch::empty_like(mask, torch::kInt64); auto mask_ptr = mask.data_ptr(); auto result_ptr = result.data_ptr(); diff --git a/include/dgl/hip/hip_extensions/amd_warp_primitives.h b/include/dgl/hip/hip_extensions/amd_warp_primitives.h index 5bf10e3c3dcf..d79cbf961055 100644 --- a/include/dgl/hip/hip_extensions/amd_warp_primitives.h +++ b/include/dgl/hip/hip_extensions/amd_warp_primitives.h @@ -11,6 +11,8 @@ #ifdef __AMDGCN_WAVEFRONT_SIZE #undef WAVEFRONT_SIZE #define WAVEFRONT_SIZE __AMDGCN_WAVEFRONT_SIZE +#elif defined(__AMDGCN__) +#define WAVEFRONT_SIZE 64 #endif /* this header file provides _*_sync functions, which is a hack only, @@ -83,7 +85,7 @@ __device__ inline void __sync_active_threads() { __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront"); } -#if HIP_VERSION_MAJOR < 7 +#if HIP_VERSION_MAJOR < 7 __device__ inline void __syncwarp() { /* sync/barrier all threads in a warp */ __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront"); diff --git a/python/dgl/_ffi/_cython/function.pxi b/python/dgl/_ffi/_cython/function.pxi index dfcace512c21..c1b25f386c85 100644 --- a/python/dgl/_ffi/_cython/function.pxi +++ b/python/dgl/_ffi/_cython/function.pxi @@ -92,7 +92,7 @@ cdef inline int make_arg(object arg, ptr = arg._dgl_handle value[0].v_handle = (ptr) tcode[0] = arg.__class__._dgl_tcode - elif isinstance(arg, (int, long)): + elif isinstance(arg, int): value[0].v_int64 = arg tcode[0] = kObjectInt elif isinstance(arg, float): diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh old mode 100644 new mode 100755 index 15662accaf11..4aa3e145eb32 --- a/script/install_graphbolt_deps.sh +++ b/script/install_graphbolt_deps.sh @@ -1,22 +1,74 @@ #!/usr/bin/env bash -ROCM_ROOT=/opt/rocm +usage() { + cat < +#ifndef DGL_USE_HIP #include #endif +#endif #define SET_ASSOCIATIVITY 2 // TODO: Properly for portable HIP code, this should be determined at runtime, @@ -94,7 +96,26 @@ class gpu_cache : public gpu_cache_api { using slabset = slab_set; #ifdef LIBCUDACXX_VERSION using atomic_ref_counter_type = cuda::atomic; +#ifdef DGL_USE_HIP + // cuda::binary_semaphore is not supported on HIP. Provide a device-side + // spinlock with the same acquire()/release() interface so that the + // LIBCUDACXX_VERSION kernel code compiles unchanged. + struct mutex { + int flag_; + __host__ __device__ explicit mutex(int initial = 1) : flag_(initial) {} + __device__ void acquire() { + while (atomicCAS(&flag_, 1, 0) == 0) + ; + __threadfence(); + } + __device__ void release() { + __threadfence(); + atomicExch(&flag_, 1); + } + }; +#else using mutex = cuda::binary_semaphore; +#endif #endif private: