diff --git a/inc/mkn/gpu/any/inc/alloc.ipp b/inc/mkn/gpu/any/inc/alloc.ipp index 77ff745..9eec4b8 100644 --- a/inc/mkn/gpu/any/inc/alloc.ipp +++ b/inc/mkn/gpu/any/inc/alloc.ipp @@ -110,13 +110,16 @@ template void copy(T0* dst, T1* src, Size const size) { assert(dst and src); - Pointer src_p{src}; - Pointer dst_p{dst}; + auto const is_dev = [](auto const& ptr) { return ptr.is_device_ptr() || ptr.is_managed_ptr(); }; + auto const is_hst = [](auto const& ptr) { return ptr.is_host_ptr() || ptr.is_managed_ptr(); }; - auto to_send = [&]() { return dst_p.is_device_ptr() && src_p.is_host_ptr(); }; - auto to_take = [&]() { return dst_p.is_host_ptr() && src_p.is_device_ptr(); }; - auto on_host = [&]() { return dst_p.is_host_ptr() && src_p.is_host_ptr(); }; - auto on_device = [&]() { return dst_p.is_device_ptr() && src_p.is_device_ptr(); }; + Pointer const src_p{src}; + Pointer const dst_p{dst}; + + auto const to_send = [&]() { return is_dev(dst_p) && is_hst(src_p); }; + auto const to_take = [&]() { return is_hst(dst_p) && is_dev(src_p); }; + auto const on_host = [&]() { return is_hst(dst_p) && is_hst(src_p); }; + auto const on_device = [&]() { return is_dev(dst_p) && is_dev(src_p); }; if (on_host()) std::copy(src, src + size, dst); @@ -130,6 +133,12 @@ void copy(T0* dst, T1* src, Size const size) { throw std::runtime_error("Unsupported operation (PR welcome)"); } +void copy(auto& dst, auto const& src) { + if (dst.size() > src.size()) throw std::runtime_error("mkn::gpu::copy src too small!"); + + copy(dst.data(), src.data(), dst.size()); +} + template auto& reserve(std::vector>& v, std::size_t const& s, bool mem_copy = true) { diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 9613a1a..e6ddff2 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -35,238 +35,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "mkn/kul/log.hpp" #include "mkn/kul/span.hpp" #include "mkn/kul/tuple.hpp" -#include "mkn/kul/assert.hpp" -#include "mkn/gpu/def.hpp" +#include "mkn/gpu/cpu/def.hpp" +#include "mkn/gpu/cpu/api.hpp" +#include "mkn/gpu/cpu/cls.hpp" #include #include #include -#define MKN_GPU_ASSERT(x) (KASSERT((x))) - -#if defined(__device__) -#pragma message("__device__ already defined") -#error // check your compiler -#endif - -#if defined(__host__) -#pragma message("__host__ already defined") -#error // check your compiler -#endif - -#if defined(__global__) -#pragma message("__global__ already defined") -#error // check your compiler -#endif - -// we need to exclude these for CPU only operations -#define __shared__ -#define __device__ -#define __host__ -#define __global__ -#define __syncthreads(...) - -#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS -#define MKN_GPU_NS mkn::gpu::cpu -#else -#define MKN_GPU_NS mkn::gpu -#endif // MKN_GPU_FN_PER_NS - -#if !defined(MKN_CPU_DO_NOT_DEFINE_DIM3) -#define MKN_CPU_DO_NOT_DEFINE_DIM3 0 -#endif - -#if !defined(dim3) and !MKN_CPU_DO_NOT_DEFINE_DIM3 -struct dim3 { - dim3() {} - dim3(std::size_t x) : x{x} {} - dim3(std::size_t x, std::size_t y) : x{x}, y{y} {} - dim3(std::size_t x, std::size_t y, std::size_t z) : x{x}, y{y}, z{z} {} - - std::size_t x = 1, y = 1, z = 1; -}; - -dim3 static inline threadIdx, blockIdx; -static constexpr int warpSize = 1; - -#endif // MKN_CPU_DO_NOT_DEFINE_DIM3 - -// - namespace MKN_GPU_NS { -static constexpr int warp_size = warpSize; - -void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/ - -void inline setDevice(std::size_t const& /*dev*/) {} /*noop*/ - -struct Stream { - Stream() {} - ~Stream() {} - - auto& operator()() { return stream; }; - void sync() {} - - std::size_t stream = 0; -}; - -struct StreamEvent { - StreamEvent(Stream&) {} - ~StreamEvent() {} - - auto& operator()(std::function fn = {}) { - fn(); - return *this; - } - - bool finished() const { return fin; } - - 
Stream stream; - bool fin = 1; - std::function _fn; -}; - -template -struct Pointer { - Pointer(T* _t) : t{_t} {} - - bool is_unregistered_ptr() const { return t == nullptr; } - bool is_host_ptr() const { return true; } - bool is_device_ptr() const { return false; } - bool is_managed_ptr() const { return false; } - - T* t; -}; - -template -void alloc(void*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size; - MKN_GPU_ASSERT(p = std::malloc(size)); -} - -template -void alloc(T*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); -} - -template -void alloc_host(T*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); -} - -template -void alloc_managed(T*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); -} - -void inline destroy(void* p) { - KLOG(TRC); - std::free(p); -} - -template -void destroy(T*& p) { - KLOG(TRC); - std::free(p); -} - -template -void destroy_host(T*& p) { - KLOG(TRC); - std::free(p); -} - -template -void copy_on_device(T* dst, T const* src, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(dst, src, size * sizeof(T))); -} - -template -void send(void* p, void* t, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(p, t, size)); -} - -template -void send(T* p, T const* t, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(p, t, size * sizeof(T))); -} -template -void send_async(T* p, T const* t, auto& /*stream*/, Size size = 1) { - KLOG(TRC); - send(p, t, size); -} - -template -void take(T const* p, T* t, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(t, p, size * sizeof(T))); -} - -template -void take_async(T const* p, T* t, auto& /*stream*/, Size size = 1) { - KLOG(TRC); - take(p, t, size); -} - -void inline sync() {} - -#include 
"mkn/gpu/any/inc/alloc.ipp" -#include "mkn/gpu/any/inc/device.ipp" - -namespace detail { -static thread_local std::size_t idx = 0; -} - -template -void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) { - std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); - KLOG(TRC) << N; - std::apply( - [&](auto&&... params) { - for (std::size_t i = 0; i < N; ++i) { - f(params...); - detail::idx++; - } - }, - devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence())); - - detail::idx = 0; -} - -struct Launcher { - Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {} - Launcher(std::size_t w, std::size_t h, std::size_t tpx, std::size_t tpy) - : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {} - Launcher(std::size_t x, std::size_t y, std::size_t z, std::size_t tpx, std::size_t tpy, - std::size_t tpz) - : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {} - - template - void operator()(F&& f, Args&&... args) { - launch(f, g, b, ds, s, args...); - } - - std::size_t ds = 0 /*dynamicShared*/; - dim3 g /*gridDim*/, b /*blockDim*/; - std::size_t s = 0; -}; - -struct GLauncher : public Launcher { - GLauncher(std::size_t s, [[maybe_unused]] size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} { - b.x = 1024; - g.x = s / b.x; - if ((s % b.x) > 0) ++g.x; - } - - std::size_t count; -}; template void fill(Container& c, size_t const size, T const val) { @@ -283,7 +61,7 @@ void fill_warp_size(T* const t, std::size_t const size, T const val) { std::fill(t, t + size, val); } -void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; } +void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Pseudo GPU in use"; } } // namespace MKN_GPU_NS diff --git a/inc/mkn/gpu/cpu/api.hpp b/inc/mkn/gpu/cpu/api.hpp index 3323ed7..f2b6811 100644 --- a/inc/mkn/gpu/cpu/api.hpp +++ b/inc/mkn/gpu/cpu/api.hpp @@ -1,4 +1,6 @@ #ifndef _MKN_GPU_CPU_API_HPP_ #define _MKN_GPU_CPU_API_HPP_ +#include "mkn/gpu/cpu/def.hpp" + #endif 
/*_MKN_GPU_CPU_API_HPP_*/ diff --git a/inc/mkn/gpu/cpu/cls.hpp b/inc/mkn/gpu/cpu/cls.hpp index a52bdf6..0ef8066 100644 --- a/inc/mkn/gpu/cpu/cls.hpp +++ b/inc/mkn/gpu/cpu/cls.hpp @@ -1,4 +1,106 @@ #ifndef _MKN_GPU_CPU_CLS_HPP_ #define _MKN_GPU_CPU_CLS_HPP_ +#include "mkn/gpu/cpu/def.hpp" + +#include +#include +#include +#include +#include + +namespace MKN_GPU_NS { + +struct Stream { + Stream() {} + ~Stream() {} + + auto& operator()() { return stream; }; + void sync() {} + + std::size_t stream = 0; +}; + +struct StreamEvent { + StreamEvent(Stream&) {} + ~StreamEvent() {} + + auto& operator()(std::function fn = {}) { + fn(); + return *this; + } + + bool finished() const { return fin; } + + Stream stream; + bool fin = 1; + std::function _fn; +}; + +template +struct Pointer { + Pointer(T* _t) : t{_t} {} + + bool is_unregistered_ptr() const { return t == nullptr; } + bool is_host_ptr() const { return true; } + bool is_device_ptr() const { return false; } + bool is_managed_ptr() const { return false; } + + T* t; +}; + +#include "mkn/gpu/any/inc/alloc.ipp" +#include "mkn/gpu/any/inc/device.ipp" + +namespace detail { +static thread_local std::size_t idx = 0; +} + +template +void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) { + std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); + KLOG(TRC) << N; + std::apply( + [&](auto&&... 
params) { + for (std::size_t i = 0; i < N; ++i) { + f(params...); + ++blockIdx.x; + ++detail::idx; + } + }, + devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence())); + + detail::idx = 0; + blockIdx.x = 0; +} + +struct Launcher { + Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {} + Launcher(std::size_t w, std::size_t h, std::size_t tpx, std::size_t tpy) + : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {} + Launcher(std::size_t x, std::size_t y, std::size_t z, std::size_t tpx, std::size_t tpy, + std::size_t tpz) + : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {} + + template + void operator()(F&& f, Args&&... args) { + launch(f, g, b, ds, s, args...); + } + + std::size_t ds = 0 /*dynamicShared*/; + dim3 g /*gridDim*/, b /*blockDim*/; + std::size_t s = 0; +}; + +struct GLauncher : public Launcher { + GLauncher(std::size_t const s, std::size_t const /*dev*/ = 0) + : Launcher{dim3{}, dim3{}}, count{s} { + b.x = s; + } + + std::size_t count; +}; + +} // namespace MKN_GPU_NS + #endif /*_MKN_GPU_CPU_CLS_HPP_*/ diff --git a/inc/mkn/gpu/cpu/def.hpp b/inc/mkn/gpu/cpu/def.hpp index 7fdf630..33e8bfb 100644 --- a/inc/mkn/gpu/cpu/def.hpp +++ b/inc/mkn/gpu/cpu/def.hpp @@ -1,4 +1,169 @@ #ifndef _MKN_GPU_CPU_DEF_HPP_ #define _MKN_GPU_CPU_DEF_HPP_ +#include "mkn/kul/log.hpp" +#include "mkn/kul/assert.hpp" + +#include +#include +#include +#include +#include + +#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS +#define MKN_GPU_NS mkn::gpu::cpu +#else +#define MKN_GPU_NS mkn::gpu +#endif // MKN_GPU_FN_PER_NS + +#if defined(__device__) +#pragma message("__device__ already defined") +#error // check your compiler +#endif + +#if defined(__host__) +#pragma message("__host__ already defined") +#error // check your compiler +#endif + +#if defined(__global__) +#pragma message("__global__ already defined") +#error // check your compiler +#endif + +// we need to exclude these for CPU only operations +#define __shared__ +#define __device__ +#define 
__host__ +#define __global__ +#define __syncthreads(...) + +#if !defined(MKN_CPU_DO_NOT_DEFINE_DIM3) +#define MKN_CPU_DO_NOT_DEFINE_DIM3 0 +#endif + +#if !defined(dim3) and !MKN_CPU_DO_NOT_DEFINE_DIM3 +struct dim3 { + dim3() {} + dim3(std::size_t x) : x{x} {} + dim3(std::size_t x, std::size_t y) : x{x}, y{y} {} + dim3(std::size_t x, std::size_t y, std::size_t z) : x{x}, y{y}, z{z} {} + + std::size_t x = 1, y = 1, z = 1; +}; +dim3 static inline threadIdx, blockIdx; +#endif // MKN_CPU_DO_NOT_DEFINE_DIM3 + +namespace MKN_GPU_NS { + +#define MKN_GPU_ASSERT(x) (KASSERT((x))) + +struct DeviceProperties { + std::string major = "major"; + std::string minor = "minor"; + std::string name = "name"; + std::size_t multiProcessorCount = 1; + std::size_t maxThreadsPerMultiProcessor = 0; + std::size_t totalGlobalMem = 0; + std::size_t sharedMemPerBlock = 0; + std::size_t warpSize = 1; + std::size_t maxThreadsPerBlock = 0; +}; + +auto inline getDeviceProperties(std::size_t /*dev*/ = 0) { return DeviceProperties{}; } + +std::uint32_t inline getWarpSize(size_t /*dev */ = 0) { return 1; } + +static std::uint32_t inline const warp_size = getWarpSize(); + +auto inline getLimitMallocHeapSize() { + std::size_t bytes = 0; + + return bytes; +} + +void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} + +void inline setDevice(std::size_t const& /*dev*/) {} /*noop*/ + +template +void alloc(void*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size; + MKN_GPU_ASSERT(p = std::malloc(size)); +} + +template +void alloc(T*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); +} + +template +void alloc_host(T*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); +} + +template +void alloc_managed(T*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(p = 
reinterpret_cast(std::malloc(size * sizeof(T)))); +} + +void inline destroy(void* p) { + KLOG(TRC); + std::free(p); +} + +template +void destroy(T* p) { + KLOG(TRC); + std::free(p); +} + +template +void destroy_host(T* p) { + KLOG(TRC); + std::free(p); +} + +template +void copy_on_device(T* dst, T const* src, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(dst, src, size * sizeof(T))); +} + +template +void send(void* p, void* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(p, t, size)); +} + +template +void send(T* p, T const* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(p, t, size * sizeof(T))); +} +template +void send_async(T* p, T const* t, auto& /*stream*/, Size size = 1) { + KLOG(TRC); + send(p, t, size); +} + +template +void take(T const* p, T* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(t, p, size * sizeof(T))); +} + +template +void take_async(T const* p, T* t, auto& /*stream*/, Size size = 1) { + KLOG(TRC); + take(p, t, size); +} + +void inline sync() {} + +} // namespace MKN_GPU_NS + #endif /*_MKN_GPU_CPU_DEF_HPP_*/ diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index aab6e00..1cefb6e 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -65,14 +65,15 @@ void fill(Container& c, T val) { void inline prinfo(size_t dev = 0) { cudaDeviceProp devProp; MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev)); - KOUT(NON) << " System version " << devProp.major << "." << devProp.minor; - KOUT(NON) << " agent name " << devProp.name; - KOUT(NON) << " cores " << devProp.multiProcessorCount; - KOUT(NON) << " threadsPCore " << devProp.maxThreadsPerMultiProcessor; - KOUT(NON) << " TotalMem " << (devProp.totalGlobalMem / 1000000) << " MB"; - KOUT(NON) << " BlockMem " << (devProp.sharedMemPerBlock / 1000) << " KB"; - KOUT(NON) << " warpSize " << devProp.warpSize; - KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; + KOUT(NON) << " System version " << devProp.major << "." 
<< devProp.minor; + KOUT(NON) << " agent name " << devProp.name; + KOUT(NON) << " cores " << devProp.multiProcessorCount; + KOUT(NON) << " threadsPCore " << devProp.maxThreadsPerMultiProcessor; + KOUT(NON) << " TotalMem " << (devProp.totalGlobalMem / 1000000) << " MB"; + KOUT(NON) << " BlockMem " << (devProp.sharedMemPerBlock / 1000) << " KB"; + KOUT(NON) << " device warpSize " << devProp.warpSize; + KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; + KOUT(NON) << " version: " << CUDA_VERSION; } void inline print_gpu_mem_used() { diff --git a/inc/mkn/gpu/cuda/cls.hpp b/inc/mkn/gpu/cuda/cls.hpp index 133573e..3946411 100644 --- a/inc/mkn/gpu/cuda/cls.hpp +++ b/inc/mkn/gpu/cuda/cls.hpp @@ -64,14 +64,16 @@ struct StreamEvent { template struct Pointer { - Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); } - - bool is_unregistered_ptr() const { return attributes.type == 0; } + Pointer(T* _t) : t{_t} { + if (!t) throw std::runtime_error("invalid nullptr"); + MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); + } bool is_host_ptr() const { - return attributes.type == 1 || (is_unregistered_ptr() && t != nullptr); + return type() == cudaMemoryTypeUnregistered or type() == cudaMemoryTypeHost; } - bool is_device_ptr() const { return is_managed_ptr() || attributes.type == 2; } - bool is_managed_ptr() const { return attributes.type == 3; } + bool is_device_ptr() const { return type() == cudaMemoryTypeDevice; } + bool is_managed_ptr() const { return type() == cudaMemoryTypeManaged; } + auto type() const { return attributes.type; } T* t; cudaPointerAttributes attributes; @@ -83,7 +85,7 @@ struct Pointer { template void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) { std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); - KLOG(TRC) << N; + KLOG(TRC) << "N=" << N << " ds=" << ds; std::apply( [&](auto&&... 
params) { f<<>>(params...); diff --git a/inc/mkn/gpu/cuda/def.hpp b/inc/mkn/gpu/cuda/def.hpp index 0d08974..451e44f 100644 --- a/inc/mkn/gpu/cuda/def.hpp +++ b/inc/mkn/gpu/cuda/def.hpp @@ -1,14 +1,14 @@ #ifndef _MKN_GPU_CUDA_DEF_HPP_ #define _MKN_GPU_CUDA_DEF_HPP_ +#include "mkn/kul/log.hpp" +#include "mkn/gpu/def.hpp" + #include #include #include -#include "mkn/kul/log.hpp" -#include "mkn/gpu/def.hpp" - #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS #define MKN_GPU_NS mkn::gpu::cuda #else @@ -32,18 +32,30 @@ inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = std::string getErrorString(auto const code) { return cudaGetErrorString(code); } +auto inline getDeviceProperties(size_t dev = 0) { + cudaDeviceProp devProp; + MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev)); + return devProp; +} + std::uint32_t inline getWarpSize(size_t dev = 0) { #ifdef _MKN_GPU_WARP_SIZE_ return _MKN_GPU_WARP_SIZE_; + #else - cudaDeviceProp devProp; - [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev); - return devProp.warpSize; + return getDeviceProperties(dev).warpSize; + #endif /*_MKN_GPU_WARP_SIZE_ */ } static std::uint32_t inline const warp_size = getWarpSize(); +auto inline getLimitMallocHeapSize() { + std::size_t bytes = 0; + MKN_GPU_ASSERT(cudaDeviceGetLimit(&bytes, cudaLimitMallocHeapSize)); + return bytes; +} + void inline setLimitMallocHeapSize(std::size_t const& bytes) { MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); } diff --git a/inc/mkn/gpu/def.hpp b/inc/mkn/gpu/def.hpp index 83d5ec4..4730057 100644 --- a/inc/mkn/gpu/def.hpp +++ b/inc/mkn/gpu/def.hpp @@ -3,6 +3,29 @@ #include +#if MKN_GPU_ROCM and MKN_GPU_CUDA +#error // not possible +#endif + +#if MKN_GPU_ROCM +#define MKN_GPU_CUDA 0 +#endif + +#if MKN_GPU_CUDA +#define MKN_GPU_ROCM 0 +#endif + +#if MKN_GPU_CPU +#define MKN_GPU_ROCM 0 +#define MKN_GPU_CUDA 0 +#endif + +#if !defined(MKN_GPU_ROCM) and !defined(MKN_GPU_CUDA) +#if __has_include() and 
__has_include("hip/hip_runtime.h") +#error // ACTIVATE ONE! +#endif // __has_include(...) +#endif // !defined(...) + #if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h") #define MKN_GPU_ROCM 1 #endif diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp index 5046c14..6fc5fba 100644 --- a/inc/mkn/gpu/rocm.hpp +++ b/inc/mkn/gpu/rocm.hpp @@ -79,6 +79,8 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " SEE mkn.gpu README for -D_MKN_GPU_WARP_SIZE_=###"; } #endif + KOUT(NON) << " version: " << HIP_VERSION_MAJOR << "." << HIP_VERSION_MINOR << "." + << HIP_VERSION_PATCH; } void inline print_gpu_mem_used() { diff --git a/inc/mkn/gpu/rocm/cls.hpp b/inc/mkn/gpu/rocm/cls.hpp index 6a02f35..183310a 100644 --- a/inc/mkn/gpu/rocm/cls.hpp +++ b/inc/mkn/gpu/rocm/cls.hpp @@ -56,33 +56,24 @@ struct StreamEvent { bool fin = 0; }; -// - // https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3 template struct Pointer { Pointer(T* _t) : t{_t} { - assert(t); + if (!t) throw std::runtime_error("invalid nullptr"); MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t)); - type = attributes.type; - } - - bool is_unregistered_ptr() const { - return attributes.type == hipMemoryType::hipMemoryTypeUnregistered; - } - bool is_host_ptr() const { - return is_unregistered_ptr() || type == hipMemoryType::hipMemoryTypeHost; } + bool is_host_ptr() const { return type() == hipMemoryTypeHost; } bool is_device_ptr() const { - return type == hipMemoryType::hipMemoryTypeDevice || attributes.isManaged; + return type() == hipMemoryTypeDevice || type() == hipMemoryTypeArray; } bool is_managed_ptr() const { - return attributes.isManaged || type == hipMemoryType::hipMemoryTypeUnified; + return type() == hipMemoryTypeManaged || type() == hipMemoryTypeUnified; } + auto type() const { return attributes.type; } T* t; hipPointerAttribute_t attributes; - hipMemoryType type = hipMemoryType::hipMemoryTypeUnregistered; }; #include 
"mkn/gpu/any/inc/alloc.ipp" diff --git a/inc/mkn/gpu/rocm/def.hpp b/inc/mkn/gpu/rocm/def.hpp index ee8f6a3..8b9da35 100644 --- a/inc/mkn/gpu/rocm/def.hpp +++ b/inc/mkn/gpu/rocm/def.hpp @@ -4,9 +4,10 @@ #include "mkn/kul/log.hpp" #include "mkn/gpu/def.hpp" -#include "hip/hip_runtime.h" - #include +#include + +#include "hip/hip_runtime.h" #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS #define MKN_GPU_NS mkn::gpu::hip @@ -31,18 +32,32 @@ inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = std::string getErrorString(auto const code) { return hipGetErrorString(code); } +auto inline getDeviceProperties(size_t dev = 0) { + hipDeviceProp_t devProp; + MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); + return devProp; +} + std::uint32_t inline getWarpSize(size_t dev = 0) { #ifdef _MKN_GPU_WARP_SIZE_ return _MKN_GPU_WARP_SIZE_; + #else hipDeviceProp_t devProp; - [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev); + MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); return devProp.warpSize; + #endif /*_MKN_GPU_WARP_SIZE_ */ } static std::uint32_t inline const warp_size = getWarpSize(); +auto inline getLimitMallocHeapSize() { + std::size_t bytes = 0; + MKN_GPU_ASSERT(hipDeviceGetLimit(&bytes, hipLimitMallocHeapSize)); + return bytes; +} + void inline setLimitMallocHeapSize(std::size_t const& bytes) { MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes)); } diff --git a/mkn.yaml b/mkn.yaml index b6057b4..fc54392 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -11,21 +11,28 @@ profile: - name: rocm parent: headers arg: -DMKN_GPU_ROCM=1 - test: test/any/(\w).cpp - test/hip/(\w).cpp + # main: tst/any/managed.cpp + test: tst/any/(\w).cpp + tst/hip/(\w).cpp - name: cuda parent: headers arg: -DMKN_GPU_CUDA=1 - test: test/any/(\w).cpp - test/cuda/(\w).cpp + # main: tst/any/managed.cpp + test: tst/any/(\w).cpp + tst/cuda/(\w).cpp # if you have no GPU but want to test your code - name: cpu parent: headers + # main: 
tst/any/class1.cpp arg: -DMKN_GPU_CPU=1 - test: test/any/(\w).cpp - test/cpu/(\w).cpp + test: tst/any/(\w).cpp + tst/cpu/(\w).cpp + +- name: mem_test + parent: headers + main: tst/mem/test_alloc.cpp - name: format mod: | diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml index ab6d517..1426b2d 100644 --- a/res/mkn/clang_cuda.yaml +++ b/res/mkn/clang_cuda.yaml @@ -1,6 +1,5 @@ -## Recommended settings commented out. # -# clang 17 recommended +# clang 17 or newer recommended # ## The following compile flags will likely be required with their specific values for your current hardware # --cuda-gpu-arch="sm_61" # sm_61 = nvidia 1080 ( @@ -14,8 +13,9 @@ super: settings property: + sm.arch: 120 cxx_flags: --std=c++20 -fPIC -fsized-deallocation -Wno-unknown-cuda-version - cxx_cuda: -x cuda --cuda-gpu-arch="sm_120" -Xclang -fcuda-allow-variadic-functions + cxx_cuda: -x cuda --cuda-gpu-arch="sm_${sm.arch}" -Xclang -fcuda-allow-variadic-functions env: | PATH=/usr/local/cuda/bin:${PATH} diff --git a/res/mkn/hipcc.yaml b/res/mkn/hipcc.yaml index ccb7686..3d9bf7d 100644 --- a/res/mkn/hipcc.yaml +++ b/res/mkn/hipcc.yaml @@ -1,4 +1,3 @@ -## Recommended settings commented out. 
super: settings @@ -6,6 +5,7 @@ compiler: mask: g++: hipcc +path: /opt/rocm/lib env: PATH=/opt/rocm/bin:${PATH} diff --git a/test/any/__share__.hpp b/tst/any/__share__.hpp similarity index 100% rename from test/any/__share__.hpp rename to tst/any/__share__.hpp diff --git a/test/any/add.cpp b/tst/any/add.cpp similarity index 87% rename from test/any/add.cpp rename to tst/any/add.cpp index 9a21b08..38c8fb6 100644 --- a/test/any/add.cpp +++ b/tst/any/add.cpp @@ -1,6 +1,8 @@ #include "mkn/gpu.hpp" +#include "mkn/kul/assert.hpp" + static constexpr uint32_t WIDTH = 1024, HEIGHT = 1024; static constexpr uint32_t NUM = WIDTH * HEIGHT; static constexpr uint32_t TPB_X = 16, TPB_Y = 16; @@ -31,19 +33,17 @@ template uint32_t test_add1() { std::vector b(NUM); - assert(mkn::gpu::Pointer{b.data()}.is_host_ptr()); + mkn::kul::abort_if_not(mkn::gpu::Pointer{b.data()}.is_host_ptr() && "not host pointer"); for (uint32_t i = 0; i < NUM; i++) b[i] = i; mkn::gpu::DeviceMem devA(NUM), devB(b); - if constexpr (!mkn::gpu::CompileFlags::withCPU) { - assert(mkn::gpu::Pointer{devA.p}.is_device_ptr()); - } + if constexpr (!mkn::gpu::CompileFlags::withCPU) + mkn::kul::abort_if_not(mkn::gpu::Pointer{devA.p}.is_device_ptr() && "not device pointer"); mkn::gpu::Launcher{WIDTH, HEIGHT, TPB_X, TPB_Y}(vectoradd1, devA, devB); auto a = devA(); - // assert(mkn::gpu::Pointer{a.data()}.is_device_ptr()); for (uint32_t i = 0; i < NUM; i++) if (a[i] != b[i] + 1) return 1; return 0; diff --git a/test/any/array.cpp b/tst/any/array.cpp similarity index 100% rename from test/any/array.cpp rename to tst/any/array.cpp diff --git a/test/any/asio.cpp b/tst/any/asio.cpp similarity index 100% rename from test/any/asio.cpp rename to tst/any/asio.cpp diff --git a/test/any/async_streaming.cpp b/tst/any/async_streaming.cpp similarity index 100% rename from test/any/async_streaming.cpp rename to tst/any/async_streaming.cpp diff --git a/test/any/class1.cpp b/tst/any/class1.cpp similarity index 100% rename from 
test/any/class1.cpp rename to tst/any/class1.cpp diff --git a/test/any/class2.cpp b/tst/any/class2.cpp similarity index 100% rename from test/any/class2.cpp rename to tst/any/class2.cpp diff --git a/test/any/construct.cpp b/tst/any/construct.cpp similarity index 100% rename from test/any/construct.cpp rename to tst/any/construct.cpp diff --git a/test/any/info.cpp b/tst/any/info.cpp similarity index 100% rename from test/any/info.cpp rename to tst/any/info.cpp diff --git a/test/any/managed.cpp b/tst/any/managed.cpp similarity index 75% rename from test/any/managed.cpp rename to tst/any/managed.cpp index ffe6a6d..6dd4828 100644 --- a/test/any/managed.cpp +++ b/tst/any/managed.cpp @@ -1,6 +1,8 @@ #include "mkn/gpu.hpp" +#include "mkn/kul/assert.hpp" + static constexpr uint32_t WIDTH = 1024, HEIGHT = 1024; static constexpr uint32_t NUM = WIDTH * HEIGHT; static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16; @@ -21,9 +23,8 @@ __global__ void kernel(S* structs) { template std::uint32_t _test(L&& launcher) { ManagedVector mem{NUM}; - if constexpr (!mkn::gpu::CompileFlags::withCPU) { - assert(mkn::gpu::Pointer{mem.data()}.is_managed_ptr()); - } + if constexpr (!mkn::gpu::CompileFlags::withCPU) + mkn::kul::abort_if_not(mkn::gpu::Pointer{mem.data()}.is_managed_ptr() && "not managed pointer"); for (std::uint32_t i = 0; i < NUM; ++i) mem[i].d0 = i; @@ -102,10 +103,25 @@ std::uint32_t test_zero() { return 0; } +uint32_t test_copy() { + std::vector hst0(NUM, 1), hst1(NUM, 2); + ManagedVector dev0(NUM), dev1(NUM); + + // container overload: copy(dst, src) + mkn::gpu::copy(dev0, hst0); + if (dev0.back() != 1) return 1; + mkn::gpu::copy(hst1, hst0); + if (hst1.back() != 1) return 1; + mkn::gpu::copy(dev1, hst1); + + return dev1[NUM - 1] != 1; +} + int main() { KOUT(NON) << __FILE__; - return test() + test_zero() + // - test_guess() + // - test_lambda_copy_capture_views() + // - test_lambda_ref_copy_capture_views(); + return test() + test_zero() + // + 
test_guess() + // + test_lambda_copy_capture_views() + // + test_lambda_ref_copy_capture_views() + // + test_copy(); } diff --git a/test/any/struct.cpp b/tst/any/struct.cpp similarity index 100% rename from test/any/struct.cpp rename to tst/any/struct.cpp diff --git a/test/cpu/namespace.cpp b/tst/cpu/namespace.cpp similarity index 100% rename from test/cpu/namespace.cpp rename to tst/cpu/namespace.cpp diff --git a/test/cuda/add.cpp b/tst/cuda/add.cpp similarity index 100% rename from test/cuda/add.cpp rename to tst/cuda/add.cpp diff --git a/test/cuda/async.cpp b/tst/cuda/async.cpp similarity index 100% rename from test/cuda/async.cpp rename to tst/cuda/async.cpp diff --git a/test/cuda/atomic.cpp b/tst/cuda/atomic.cpp similarity index 100% rename from test/cuda/atomic.cpp rename to tst/cuda/atomic.cpp diff --git a/test/hip/add.cpp b/tst/hip/add.cpp similarity index 100% rename from test/hip/add.cpp rename to tst/hip/add.cpp diff --git a/test/hip/async.cpp b/tst/hip/async.cpp similarity index 100% rename from test/hip/async.cpp rename to tst/hip/async.cpp diff --git a/test/hip/async.ppc b/tst/hip/async.ppc similarity index 100% rename from test/hip/async.ppc rename to tst/hip/async.ppc diff --git a/tst/mem/test_alloc.cpp b/tst/mem/test_alloc.cpp new file mode 100644 index 0000000..78bbdad --- /dev/null +++ b/tst/mem/test_alloc.cpp @@ -0,0 +1,58 @@ + +#include "mkn/kul/log.hpp" + +#include "mkn/gpu.hpp" +#include "mkn/gpu/def.hpp" + +#include + +template +using ManagedVector = std::vector>; + +bool constexpr set_heap_limit = true; // see if you need it by setting false after reboot + +bool test_allocate_percentage(double const percent) { + auto const devProp = mkn::gpu::getDeviceProperties(); + + auto const mem = devProp.totalGlobalMem; + KLOG(DBG) << mem; + + auto const limit = mem * percent; + if (set_heap_limit) mkn::gpu::setLimitMallocHeapSize(limit); + + auto const heapLim = mkn::gpu::getLimitMallocHeapSize(); + KLOG(DBG) << heapLim; + if (set_heap_limit and 
limit > heapLim) { + KOUT(NON) << "Cannot set heap limit! " << limit << " " << heapLim; + return 1; + } + + if (limit < sizeof(double)) throw std::runtime_error("limit < sizeof(double)"); + + auto const size = limit / sizeof(double); + + if (size == 0) throw std::runtime_error("size == 0"); + + { + ManagedVector vec(size, 2); + if (vec.data()[vec.size() - 1] != 2) return 1; + } + + KOUT(NON) << "Can allocate " << std::size_t(percent * 100) + << "% of total mem: " << std::size_t(mem / 1e6) << "mb"; + + return 0; +} + +int main() { + KOUT(NON) << __FILE__; + mkn::gpu::prinfo(); + if constexpr (mkn::gpu::CompileFlags::withCPU) + return 0; // NOT RELEVANT + else + return test_allocate_percentage(.1) + // + test_allocate_percentage(.2) + // + test_allocate_percentage(.3) + // + test_allocate_percentage(.4) + // + test_allocate_percentage(.5); +}