diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index eada364..f23582d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,6 +18,6 @@ jobs: - name: test run: | - curl -Lo mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix + curl -fL --retry 3 --retry-delay 2 -o mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix chmod +x mkn KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++20" test -W 9 diff --git a/.gitignore b/.gitignore index a699e6d..f96c360 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ bin .mkn *.cui -.clangd +*.clangd diff --git a/inc/mkn/gpu.hpp b/inc/mkn/gpu.hpp index 7a7f978..7389d11 100644 --- a/inc/mkn/gpu.hpp +++ b/inc/mkn/gpu.hpp @@ -31,7 +31,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_HPP_ #define _MKN_GPU_HPP_ -#include "mkn/gpu/defines.hpp" +#include "mkn/gpu/def.hpp" + +#if MKN_GPU_ROCM +#include "mkn/gpu/rocm.hpp" +#endif + +#if MKN_GPU_CUDA +#include "mkn/gpu/cuda.hpp" +#endif + +#if MKN_GPU_CPU +#include "mkn/gpu/cpu.hpp" +#endif namespace mkn::gpu { diff --git a/inc/mkn/gpu/any/cls.hpp b/inc/mkn/gpu/any/cls.hpp new file mode 100644 index 0000000..82abf4f --- /dev/null +++ b/inc/mkn/gpu/any/cls.hpp @@ -0,0 +1,24 @@ +#ifndef _MKN_GPU_ANY_CLS_HPP_ +#define _MKN_GPU_ANY_CLS_HPP_ + +#include "mkn/kul/env.hpp" +#include "mkn/kul/string.hpp" + +namespace mkn::gpu { + +template +struct Cli { + constexpr static inline char const* MKN_GPU_BX_THREADS = "MKN_GPU_BX_THREADS"; + + auto bx_threads() const { + if (kul::env::EXISTS(MKN_GPU_BX_THREADS)) + return kul::String::INT32(kul::env::GET(MKN_GPU_BX_THREADS)); + return dev.maxThreadsPerBlock; + } + + Device const& dev; +}; + +} /* namespace mkn::gpu */ + +#endif /*_MKN_GPU_ANY_CLS_HPP_*/ diff --git a/inc/mkn/gpu/any/def.hpp b/inc/mkn/gpu/any/def.hpp new file mode 100644 index 0000000..da9f065 --- /dev/null +++ b/inc/mkn/gpu/any/def.hpp @@ -0,0 +1,4 @@ +#ifndef _MKN_GPU_ANY_DEF_HPP_ +#define _MKN_GPU_ANY_DEF_HPP_ + +#endif /*_MKN_GPU_ANY_DEF_HPP_*/ diff --git a/inc/mkn/gpu/alloc.hpp b/inc/mkn/gpu/any/inc/alloc.ipp similarity index 100% rename from inc/mkn/gpu/alloc.hpp rename to inc/mkn/gpu/any/inc/alloc.ipp diff --git a/inc/mkn/gpu/any/inc/devfunc.ipp b/inc/mkn/gpu/any/inc/devfunc.ipp new file mode 100644 index 0000000..1ca92a0 --- /dev/null +++ b/inc/mkn/gpu/any/inc/devfunc.ipp @@ -0,0 +1,9 @@ + +template +void __device__ fill_warp_size(T* const t, std::size_t const size, T const val) { + std::size_t chunk = 0; + auto const each = size / warpSize; + for (; chunk < each; ++chunk) t[chunk * warpSize + threadIdx.x] = val; + if (threadIdx.x < size - (warpSize * each)) t[chunk * warpSize + threadIdx.x] = val; + if constexpr (sync) __syncthreads(); +} diff --git a/inc/mkn/gpu/device.hpp b/inc/mkn/gpu/any/inc/device.ipp similarity index 98% rename from inc/mkn/gpu/device.hpp rename to inc/mkn/gpu/any/inc/device.ipp index 04cc9e8..c1f12f3 100644 --- a/inc/mkn/gpu/device.hpp +++ b/inc/mkn/gpu/any/inc/device.ipp @@ -59,12 +59,12 @@ struct DeviceMem { } void send(T const* t, std::size_t _size = 1, std::size_t start = 0) { - MKN_GPU_NS::send(p, t, _size, start); + MKN_GPU_NS::send(p + start, t, _size); } template , bool> = 0> void send(C const& c, std::size_t start = 0) { - send(c.data(), c.size(), start); + send(c.data() + start, c.size()); } void fill_n(T t, std::size_t _size, std::size_t start = 0) { @@ -128,14 +128,14 @@ struct AsioDeviceMem { void send(Stream& stream, T* t, std::size_t _size = 1, std::size_t start = 0) { assert(p != nullptr); - MKN_GPU_NS::send_async(p, t, stream, _size, start); + MKN_GPU_NS::send_async(p + start, t + start, stream(), _size); } template void take(Stream& stream, Span& span, std::size_t start) { assert(p != nullptr); assert(span.size() + start <= s); - MKN_GPU_NS::take_async(p, span, stream, start); + MKN_GPU_NS::take_async(p + start, span.data(), stream(), span.size()); } auto& size() const { return s; } diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/any/inc/launchers.ipp similarity index 100% rename from inc/mkn/gpu/launchers.hpp rename to inc/mkn/gpu/any/inc/launchers.ipp diff --git a/inc/mkn/gpu/any/inc/traits.ipp b/inc/mkn/gpu/any/inc/traits.ipp new file mode 100644 index 0000000..e69de29 diff --git a/inc/mkn/gpu/cli.hpp b/inc/mkn/gpu/cli.hpp deleted file mode 100644 index 6a40948..0000000 --- a/inc/mkn/gpu/cli.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/** -Copyright (c) 2024, Philip Deegan. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Philip Deegan nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ -// IWYU pragma: private, include "mkn/gpu.hpp" -#ifndef _MKN_GPU_CLI_HPP_ -#define _MKN_GPU_CLI_HPP_ - -#include "mkn/kul/env.hpp" -#include "mkn/kul/string.hpp" - -namespace mkn::gpu { - -template -struct Cli { - constexpr static inline char const* MKN_GPU_BX_THREADS = "MKN_GPU_BX_THREADS"; - - auto bx_threads() const { - if (kul::env::EXISTS(MKN_GPU_BX_THREADS)) - return kul::String::INT32(kul::env::GET(MKN_GPU_BX_THREADS)); - return dev.maxThreadsPerBlock; - } - - Device const& dev; -}; - -} /* namespace mkn::gpu */ - -#endif /*_MKN_GPU_CLI_HPP_*/ diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index e77b740..9613a1a 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -193,34 +193,32 @@ void send(void* p, void* t, Size size = 1) { } template -void send(T* p, T const* t, Size size = 1, Size start = 0) { +void send(T* p, T const* t, Size size = 1) { KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(p + start, t, size * sizeof(T))); + MKN_GPU_ASSERT(std::memcpy(p, t, size * sizeof(T))); } - template -void take(T* p, T* t, Size size = 1, Size start = 0) { +void send_async(T* p, T const* t, auto& /*stream*/, Size size = 1) { KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(t, p + start, size * sizeof(T))); + send(p, t, size); } template -void send_async(T* p, T const* t, Stream& /*stream*/, Size size = 1, Size start = 0) { +void take(T const* p, T* t, Size size = 1) { KLOG(TRC); - send(p, t, size, start); + MKN_GPU_ASSERT(std::memcpy(t, p, size * sizeof(T))); } -template -void take_async(T* p, Span& span, Stream& /*stream*/, std::size_t start) { - static_assert(mkn::kul::is_span_like_v); +template +void take_async(T const* p, T* t, auto& /*stream*/, Size size = 1) { KLOG(TRC); - take(p, span.data(), span.size(), start); + take(p, t, size); } void inline sync() {} -#include "mkn/gpu/alloc.hpp" -#include "mkn/gpu/device.hpp" +#include "mkn/gpu/any/inc/alloc.ipp" +#include "mkn/gpu/any/inc/device.ipp" namespace detail { static thread_local std::size_t idx = 0; @@ -281,8 +279,8 @@ void fill(Container& c, T const val) { } template -void zero(T* const t, std::size_t const size) { - std::fill(t, t + size, 0); +void fill_warp_size(T* const t, std::size_t const size, T const val) { + std::fill(t, t + size, val); } void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; } @@ -310,7 +308,7 @@ static void global_d_kernel(F& f, Args... args) { f(args...); } -#include "launchers.hpp" +#include "mkn/gpu/any/inc/launchers.ipp" } /* namespace MKN_GPU_NS */ diff --git a/inc/mkn/gpu/cpu/api.hpp b/inc/mkn/gpu/cpu/api.hpp new file mode 100644 index 0000000..3323ed7 --- /dev/null +++ b/inc/mkn/gpu/cpu/api.hpp @@ -0,0 +1,4 @@ +#ifndef _MKN_GPU_CPU_API_HPP_ +#define _MKN_GPU_CPU_API_HPP_ + +#endif /*_MKN_GPU_CPU_API_HPP_*/ diff --git a/inc/mkn/gpu/cpu/cls.hpp b/inc/mkn/gpu/cpu/cls.hpp new file mode 100644 index 0000000..a52bdf6 --- /dev/null +++ b/inc/mkn/gpu/cpu/cls.hpp @@ -0,0 +1,4 @@ +#ifndef _MKN_GPU_CPU_CLS_HPP_ +#define _MKN_GPU_CPU_CLS_HPP_ + +#endif /*_MKN_GPU_CPU_CLS_HPP_*/ diff --git a/inc/mkn/gpu/cpu/def.hpp b/inc/mkn/gpu/cpu/def.hpp new file mode 100644 index 0000000..7fdf630 --- /dev/null +++ b/inc/mkn/gpu/cpu/def.hpp @@ -0,0 +1,4 @@ +#ifndef _MKN_GPU_CPU_DEF_HPP_ +#define _MKN_GPU_CPU_DEF_HPP_ + +#endif /*_MKN_GPU_CPU_DEF_HPP_*/ diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index aa02875..aab6e00 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -33,284 +33,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define _MKN_GPU_CUDA_HPP_ #include "mkn/kul/log.hpp" -#include "mkn/kul/span.hpp" -#include "mkn/kul/tuple.hpp" - -#include "mkn/gpu/def.hpp" #include -// - -#define MKN_GPU_ASSERT(ans) \ - { \ - gpuAssert((ans), __FILE__, __LINE__); \ - } -inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = true) { - if (code != cudaSuccess) { - fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) std::abort(); - } -} - -namespace mkn::gpu::cuda { - -template -__device__ SIZE idx() { - SIZE width = gridDim.x * blockDim.x; - SIZE height = gridDim.y * blockDim.y; - SIZE x = blockDim.x * blockIdx.x + threadIdx.x; - SIZE y = blockDim.y * blockIdx.y + threadIdx.y; - SIZE z = blockDim.z * blockIdx.z + threadIdx.z; - return x + (y * width) + (z * width * height); -} - -template -__device__ SIZE block_idx_x() { - return blockIdx.x; -} - -} // namespace mkn::gpu::cuda - -#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS -#define MKN_GPU_NS mkn::gpu::cuda -#else -#define MKN_GPU_NS mkn::gpu -#endif // MKN_GPU_FN_PER_NS - -namespace MKN_GPU_NS { - -#ifdef _MKN_GPU_WARP_SIZE_ -static constexpr int warp_size = _MKN_GPU_WARP_SIZE_; -#else -static constexpr int warp_size = warpSize; -#endif /*_MKN_GPU_WARP_SIZE_ */ - -void inline setLimitMallocHeapSize(std::size_t const& bytes) { - MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); -} - -void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); } - -struct Stream { - Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); } - ~Stream() { MKN_GPU_ASSERT(result = cudaStreamDestroy(stream)); } - - auto& operator()() { return stream; }; - - void sync() { result = cudaStreamSynchronize(stream); } - - cudaError_t result; - cudaStream_t stream; -}; - -// - -struct StreamEvent { - // - StreamEvent(Stream& stream_) : stream{stream_} {} - StreamEvent(StreamEvent&& that) = default; - StreamEvent(StreamEvent const&) = delete; - StreamEvent& operator=(StreamEvent const&) = delete; - - auto& operator()(std::function fn = {}) { - fin = 0; - _fn = fn; - MKN_GPU_ASSERT(cudaStreamAddCallback(stream(), StreamEvent::Callback, this, 0)); - return *this; - } - - static void Callback(cudaStream_t /*stream*/, cudaError_t /*status*/, void* ptr) { - auto& self = *reinterpret_cast(ptr); - self._fn(); - self._fn = [] {}; - self.fin = 1; - } - - bool finished() const { return fin; } - - Stream& stream; - cudaError_t result; - std::function _fn; - bool fin = 0; -}; +#include "mkn/gpu/cuda/def.hpp" +#include "mkn/gpu/cuda/api.hpp" +#include "mkn/gpu/cuda/cls.hpp" // -template -struct Pointer { - Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); } - - bool is_unregistered_ptr() const { return attributes.type == 0; } - bool is_host_ptr() const { - return attributes.type == 1 || (is_unregistered_ptr() && t != nullptr); - } - bool is_device_ptr() const { return is_managed_ptr() || attributes.type == 2; } - bool is_managed_ptr() const { return attributes.type == 3; } - - T* t; - cudaPointerAttributes attributes; -}; - -template -void alloc(void*& p, Size size) { - MKN_GPU_ASSERT(cudaMalloc((void**)&p, size)); -} - -template -void alloc(T*& p, Size size) { - KLOG(TRC) << "GPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(cudaMalloc((void**)&p, size * sizeof(T))); -} - -template -void alloc_host(T*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(cudaMallocHost((void**)&p, size * sizeof(T))); -} - -template -void alloc_managed(T*& p, Size size) { - KLOG(TRC) << "GPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(cudaMallocManaged((void**)&p, size * sizeof(T))); -} - -void inline destroy(void* p) { - KLOG(TRC); - MKN_GPU_ASSERT(cudaFree(p)); -} - -template -void destroy(T*& ptr) { - KLOG(TRC); - MKN_GPU_ASSERT(cudaFree(ptr)); -} - -template -void destroy_host(T*& ptr) { - KLOG(TRC); - MKN_GPU_ASSERT(cudaFreeHost(ptr)); -} - -template -void copy_on_device(T* dst, T const* src, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice)); -} - -template -void send(void* p, void* t, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(cudaMemcpy(p, t, size, cudaMemcpyHostToDevice)); -} - -template -void send(T* p, T const* t, Size size = 1, Size start = 0) { - KLOG(TRC); - MKN_GPU_ASSERT(cudaMemcpy(p + start, t, size * sizeof(T), cudaMemcpyHostToDevice)); -} - -template -void take(T const* p, T* t, Size size = 1, Size start = 0) { - KLOG(TRC); - MKN_GPU_ASSERT(cudaMemcpy(t, p + start, size * sizeof(T), cudaMemcpyDeviceToHost)); -} - -template -void send_async(T* p, T const* t, Stream& stream, Size size = 1, Size start = 0) { - KLOG(TRC); - MKN_GPU_ASSERT(cudaMemcpyAsync(p + start, // - t + start, // - size * sizeof(T), // - cudaMemcpyHostToDevice, // - stream())); -} - -template -void take_async(T* p, Span& span, Stream& stream, std::size_t start) { - KLOG(TRC); - static_assert(mkn::kul::is_span_like_v); - MKN_GPU_ASSERT(cudaMemcpyAsync(span.data(), // - p + start, // - span.size() * sizeof(T), // - cudaMemcpyDeviceToHost, // - stream())); -} - -void inline sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); } -void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(stream)); } - -#include "mkn/gpu/alloc.hpp" -#include "mkn/gpu/device.hpp" - -template -void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) { - std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); - KLOG(TRC) << N; - std::apply( - [&](auto&&... params) { - f<<>>(params...); - MKN_GPU_ASSERT(cudaGetLastError()); - }, - devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence())); - if constexpr (_sync) { - if (s) - sync(s); - else - sync(); - } -} +namespace MKN_GPU_NS { // -struct Launcher { - Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {} - Launcher(size_t w, size_t h, size_t tpx, size_t tpy) - : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {} - Launcher(size_t x, size_t y, size_t z, size_t tpx, size_t tpy, size_t tpz) - : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {} - - template - void operator()(F&& f, Args&&... args) { - launch(std::forward(f), g, b, ds, s, args...); - } - - size_t ds = 0 /*dynamicShared*/; - dim3 g /*gridDim*/, b /*blockDim*/; - cudaStream_t s = 0; -}; - -struct GLauncher : public Launcher { - GLauncher(std::size_t const& s, std::size_t const& _dev = 0) - : Launcher{dim3{}, dim3{}}, dev{_dev}, count{s} { - MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev)); - - resize(s); - } - - void resize(std::size_t const& s, std::size_t const& bx = 0) { - b.x = bx > 0 ? bx : cli.bx_threads(); - g.x = s / b.x; - if ((s % b.x) > 0) ++g.x; - } - - std::size_t dev = 0; - std::size_t count = 0; - cudaDeviceProp devProp; - mkn::gpu::Cli cli{devProp}; -}; - -template -__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) { - if (auto i = mkn::gpu::cuda::idx(); i < s) f(args...); -} - -template -__global__ static void global_d_kernel(F f, Args... args) { - f(args...); -} - -#include "launchers.hpp" -#include "devfunc.hpp" template __global__ void _vector_fill(T* a, V t, std::size_t s) { @@ -330,7 +64,7 @@ void fill(Container& c, T val) { // void inline prinfo(size_t dev = 0) { cudaDeviceProp devProp; - [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev); + MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev)); KOUT(NON) << " System version " << devProp.major << "." << devProp.minor; KOUT(NON) << " agent name " << devProp.name; KOUT(NON) << " cores " << devProp.multiProcessorCount; @@ -344,7 +78,7 @@ void inline prinfo(size_t dev = 0) { void inline print_gpu_mem_used() { float free_m = 0, total_m = 0, used_m = 0; std::size_t free_t = 0, total_t = 0; - cudaMemGetInfo(&free_t, &total_t); + MKN_GPU_ASSERT(cudaMemGetInfo(&free_t, &total_t)); free_m = free_t / 1048576.0; total_m = total_t / 1048576.0; used_m = total_m - free_m; @@ -352,6 +86,10 @@ void inline print_gpu_mem_used() { total_t, total_m, used_m); } +// #include "mkn/gpu/any/inc/device.ipp" +#include "mkn/gpu/any/inc/launchers.ipp" +#include "mkn/gpu/any/inc/devfunc.ipp" + } // namespace MKN_GPU_NS #undef MKN_GPU_ASSERT diff --git a/inc/mkn/gpu/cuda/api.hpp b/inc/mkn/gpu/cuda/api.hpp new file mode 100644 index 0000000..1d0c293 --- /dev/null +++ b/inc/mkn/gpu/cuda/api.hpp @@ -0,0 +1,47 @@ +#ifndef _MKN_GPU_CUDA_API_HPP_ +#define _MKN_GPU_CUDA_API_HPP_ + +#include + +#include + +#include "mkn/gpu/cuda/def.hpp" + +namespace mkn::gpu::cuda { + +template +__device__ SIZE idx() { + SIZE width = gridDim.x * blockDim.x; + SIZE height = gridDim.y * blockDim.y; + SIZE x = blockDim.x * blockIdx.x + threadIdx.x; + SIZE y = blockDim.y * blockIdx.y + threadIdx.y; + SIZE z = blockDim.z * blockIdx.z + threadIdx.z; + return x + (y * width) + (z * width * height); +} + +template +__device__ SIZE block_idx_x() { + return blockIdx.x; +} + +} // namespace mkn::gpu::cuda + +namespace MKN_GPU_NS { + +template +__global__ static void global_d_kernel(F f, Args... args) { + f(args...); +} + +// + +template +__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) { + if (auto i = mkn::gpu::cuda::idx(); i < s) f(args...); +} + +} // namespace MKN_GPU_NS + +// + +#endif /*_MKN_GPU_CUDA_API_HPP_*/ diff --git a/inc/mkn/gpu/cuda/cls.hpp b/inc/mkn/gpu/cuda/cls.hpp new file mode 100644 index 0000000..133573e --- /dev/null +++ b/inc/mkn/gpu/cuda/cls.hpp @@ -0,0 +1,140 @@ +#ifndef _MKN_GPU_CUDA_CLS_HPP_ +#define _MKN_GPU_CUDA_CLS_HPP_ + +#include +#include + +#include + +#include "mkn/kul/log.hpp" +#include "mkn/kul/span.hpp" +#include "mkn/kul/tuple.hpp" + +#include "mkn/gpu/any/def.hpp" +#include "mkn/gpu/any/cls.hpp" + +#include "mkn/gpu/cuda/def.hpp" + +namespace MKN_GPU_NS { + +struct Stream { + Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); } + ~Stream() { MKN_GPU_ASSERT(result = cudaStreamDestroy(stream)); } + + auto& operator()() { return stream; }; + + void sync() { MKN_GPU_ASSERT(result = cudaStreamSynchronize(stream)); } + + cudaError_t result; + cudaStream_t stream; +}; + +// + +struct StreamEvent { + // + StreamEvent(Stream& stream_) : stream{stream_} {} + StreamEvent(StreamEvent&& that) = default; + StreamEvent(StreamEvent const&) = delete; + StreamEvent& operator=(StreamEvent const&) = delete; + + auto& operator()(std::function fn = {}) { + fin = 0; + _fn = fn; + MKN_GPU_ASSERT(cudaStreamAddCallback(stream(), StreamEvent::Callback, this, 0)); + return *this; + } + + static void Callback(cudaStream_t /*stream*/, cudaError_t /*status*/, void* ptr) { + auto& self = *reinterpret_cast(ptr); + self._fn(); + self._fn = [] {}; + self.fin = 1; + } + + bool finished() const { return fin; } + + Stream& stream; + cudaError_t result; + std::function _fn; + bool fin = 0; +}; + +// + +template +struct Pointer { + Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); } + + bool is_unregistered_ptr() const { return attributes.type == 0; } + bool is_host_ptr() const { + return attributes.type == 1 || (is_unregistered_ptr() && t != nullptr); + } + bool is_device_ptr() const { return is_managed_ptr() || attributes.type == 2; } + bool is_managed_ptr() const { return attributes.type == 3; } + + T* t; + cudaPointerAttributes attributes; +}; + +#include "mkn/gpu/any/inc/alloc.ipp" +#include "mkn/gpu/any/inc/device.ipp" + +template +void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) { + std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); + KLOG(TRC) << N; + std::apply( + [&](auto&&... params) { + f<<>>(params...); + MKN_GPU_ASSERT(cudaGetLastError()); + }, + devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence())); + if constexpr (_sync) { + if (s) + sync(s); + else + sync(); + } +} + +struct Launcher { + Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {} + Launcher(size_t w, size_t h, size_t tpx, size_t tpy) + : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {} + Launcher(size_t x, size_t y, size_t z, size_t tpx, size_t tpy, size_t tpz) + : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {} + + template + void operator()(F&& f, Args&&... args) { + launch(std::forward(f), g, b, ds, s, args...); + } + + size_t ds = 0 /*dynamicShared*/; + dim3 g /*gridDim*/, b /*blockDim*/; + cudaStream_t s = 0; +}; + +struct GLauncher : public Launcher { + GLauncher(std::size_t const s, std::size_t const _dev = 0) + : Launcher{dim3{}, dim3{}}, dev{_dev}, count{s} { + MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev)); + + resize(s); + } + + void resize(std::size_t const s, std::size_t const bx = 0) { + b.x = bx > 0 ? bx : cli.bx_threads(); + g.x = s / b.x; + if ((s % b.x) > 0) ++g.x; + } + + std::size_t dev = 0; + std::size_t count = 0; + cudaDeviceProp devProp; + mkn::gpu::Cli cli{devProp}; +}; + +} // namespace MKN_GPU_NS + +#endif /*_MKN_GPU_CUDA_CLS_HPP_*/ diff --git a/inc/mkn/gpu/cuda/def.hpp b/inc/mkn/gpu/cuda/def.hpp new file mode 100644 index 0000000..0d08974 --- /dev/null +++ b/inc/mkn/gpu/cuda/def.hpp @@ -0,0 +1,133 @@ +#ifndef _MKN_GPU_CUDA_DEF_HPP_ +#define _MKN_GPU_CUDA_DEF_HPP_ + +#include +#include + +#include + +#include "mkn/kul/log.hpp" +#include "mkn/gpu/def.hpp" + +#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS +#define MKN_GPU_NS mkn::gpu::cuda +#else +#define MKN_GPU_NS mkn::gpu +#endif // MKN_GPU_FN_PER_NS + +namespace MKN_GPU_NS { + +static_assert(CompileFlags::withCUDA); + +#define MKN_GPU_ASSERT(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) std::abort(); + } +} + +std::string getErrorString(auto const code) { return cudaGetErrorString(code); } + +std::uint32_t inline getWarpSize(size_t dev = 0) { +#ifdef _MKN_GPU_WARP_SIZE_ + return _MKN_GPU_WARP_SIZE_; +#else + cudaDeviceProp devProp; + [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev); + return devProp.warpSize; +#endif /*_MKN_GPU_WARP_SIZE_ */ +} + +static std::uint32_t inline const warp_size = getWarpSize(); + +void inline setLimitMallocHeapSize(std::size_t const& bytes) { + MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); +} + +void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); } + +void inline sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); } +void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(stream)); } + +template +void alloc(void*& p, Size size) { + MKN_GPU_ASSERT(cudaMalloc((void**)&p, size)); +} + +template +void alloc(T*& p, Size size) { + KLOG(TRC) << "GPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(cudaMalloc((void**)&p, size * sizeof(T))); +} + +template +void alloc_host(T*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(cudaMallocHost((void**)&p, size * sizeof(T))); +} + +template +void alloc_managed(T*& p, Size size) { + KLOG(TRC) << "GPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(cudaMallocManaged((void**)&p, size * sizeof(T))); +} + +void inline destroy(void* p) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaFree(p)); +} + +template +void destroy(T* ptr) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaFree(ptr)); +} + +template +void destroy_host(T* ptr) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaFreeHost(ptr)); +} + +template +void copy_on_device(T* dst, T const* src, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice)); +} + +template +void send(void* p, void* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaMemcpy(p, t, size, cudaMemcpyHostToDevice)); +} + +template +void send(T* p, T const* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaMemcpy(p, t, size * sizeof(T), cudaMemcpyHostToDevice)); +} +template +void send_async(T* p, T const* t, cudaStream_t& stream, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaMemcpyAsync(p, t, size * sizeof(T), cudaMemcpyHostToDevice, stream)); +} + +template +void take(T const* p, T* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaMemcpy(t, p, size * sizeof(T), cudaMemcpyDeviceToHost)); +} + +template +void take_async(T const* p, T* t, cudaStream_t& stream, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(cudaMemcpyAsync(t, p, size * sizeof(T), cudaMemcpyDeviceToHost, stream)); +} + +} // namespace MKN_GPU_NS + +#endif /*_MKN_GPU_CUDA_DEF_HPP_*/ diff --git a/inc/mkn/gpu/def.hpp b/inc/mkn/gpu/def.hpp index df8db63..83d5ec4 100644 --- a/inc/mkn/gpu/def.hpp +++ b/inc/mkn/gpu/def.hpp @@ -3,9 +3,46 @@ #include +#if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h") +#define MKN_GPU_ROCM 1 +#endif +#if !defined(MKN_GPU_ROCM) +#define MKN_GPU_ROCM 0 +#endif + +#if !defined(MKN_GPU_CUDA) and __has_include() +#define MKN_GPU_CUDA 1 +#endif +#if !defined(MKN_GPU_CUDA) +#define MKN_GPU_CUDA 0 +#endif + +#if MKN_GPU_CUDA && MKN_GPU_ROCM && !defined(MKN_GPU_FN_PER_NS) +#define MKN_GPU_FN_PER_NS 1 +#endif + +#if !defined(MKN_GPU_FN_PER_NS) +#define MKN_GPU_FN_PER_NS 0 +#endif + +#if MKN_GPU_CUDA == 0 && MKN_GPU_ROCM == 0 && !defined(MKN_GPU_CPU) +#define MKN_GPU_CPU 1 +#endif + +#if !defined(MKN_GPU_CPU) +#define MKN_GPU_CPU 0 +#endif + namespace mkn::gpu { -#if defined(MKN_GPU_CPU) +struct CompileFlags { + bool constexpr static withCUDA = MKN_GPU_CUDA; + bool constexpr static withROCM = MKN_GPU_ROCM; + bool constexpr static withCPU = MKN_GPU_CPU; + bool constexpr static perNamespace = MKN_GPU_FN_PER_NS; +}; + +#if MKN_GPU_CPU template static constexpr bool is_floating_point_v = std::is_floating_point_v; diff --git a/inc/mkn/gpu/defines.hpp b/inc/mkn/gpu/defines.hpp deleted file mode 100644 index 3a45d43..0000000 --- a/inc/mkn/gpu/defines.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef _MKN_GPU_DEFINES_HPP_ -#define _MKN_GPU_DEFINES_HPP_ - -#if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h") -#define MKN_GPU_ROCM 1 -#endif -#if !defined(MKN_GPU_ROCM) -#define MKN_GPU_ROCM 0 -#endif - -#if !defined(MKN_GPU_CUDA) and __has_include() -#define MKN_GPU_CUDA 1 -#endif -#if !defined(MKN_GPU_CUDA) -#define MKN_GPU_CUDA 0 -#endif - -#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && !defined(MKN_GPU_FN_PER_NS) -#define MKN_GPU_FN_PER_NS 1 -#endif - -#if !defined(MKN_GPU_FN_PER_NS) -#define MKN_GPU_FN_PER_NS 0 -#endif - -#if MKN_GPU_ROCM == 1 -#include "mkn/gpu/rocm.hpp" -#endif - -#if MKN_GPU_CUDA == 1 -#include "mkn/gpu/cuda.hpp" -#endif - -#if MKN_GPU_CUDA == 0 && MKN_GPU_ROCM == 0 && !defined(MKN_GPU_CPU) -#define MKN_GPU_CPU 1 -#endif - -#if MKN_GPU_FN_PER_NS == 1 || MKN_GPU_CPU == 1 -#include "mkn/gpu/cpu.hpp" -#endif - -namespace mkn::gpu { - -struct CompileFlags { - bool constexpr static withCUDA = MKN_GPU_CUDA; - bool constexpr static withROCM = MKN_GPU_ROCM; - bool constexpr static withCPU = !MKN_GPU_ROCM and !MKN_GPU_CUDA; - bool constexpr static perNamespace = MKN_GPU_FN_PER_NS; -}; - -} /* namespace mkn::gpu */ - -#endif /*_MKN_GPU_DEFINES_HPP_*/ diff --git a/inc/mkn/gpu/devfunc.hpp b/inc/mkn/gpu/devfunc.hpp deleted file mode 100644 index 5708308..0000000 --- a/inc/mkn/gpu/devfunc.hpp +++ /dev/null @@ -1,44 +0,0 @@ -// IWYU pragma: private, include "mkn/gpu.hpp" -/** -Copyright (c) 2025, Philip Deegan. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Philip Deegan nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ -#ifndef _MKN_GPU_DEVFUNC_HPP_ -#define _MKN_GPU_DEVFUNC_HPP_ - -template -void __device__ zero(T* const t, std::size_t const size) { - std::size_t chunk = 0; - auto const each = size / warpSize; - for (; chunk < each; ++chunk) t[chunk * warpSize + threadIdx.x] = 0; - if (threadIdx.x < size - (warpSize * each)) t[chunk * warpSize + threadIdx.x] = 0; - if constexpr (sync) __syncthreads(); -} - -#endif /* _MKN_GPU_DEVFUNC_HPP_ */ diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp index 67721a1..5046c14 100644 --- a/inc/mkn/gpu/rocm.hpp +++ b/inc/mkn/gpu/rocm.hpp @@ -33,302 +33,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define _MKN_GPU_ROCM_HPP_ #include "mkn/kul/log.hpp" -#include "mkn/kul/span.hpp" -#include "mkn/kul/tuple.hpp" -#include "mkn/kul/assert.hpp" - -#include "mkn/gpu/cli.hpp" -#include "mkn/gpu/def.hpp" #include "hip/hip_runtime.h" -#define MKN_GPU_ASSERT(ans) \ - { \ - gpuAssert((ans), __FILE__, __LINE__); \ - } -inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = true) { - if (code != hipSuccess) { - fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line); - if (abort) std::abort(); - } -} - -namespace mkn::gpu::hip { - -template -__device__ SIZE idx() { - SIZE width = hipGridDim_x * hipBlockDim_x; - SIZE height = hipGridDim_y * hipBlockDim_y; - - SIZE x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - SIZE y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - SIZE z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z; - return x + (y * width) + (z * width * height); // max 4294967296 -} - -} // namespace mkn::gpu::hip - -#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS -#define MKN_GPU_NS mkn::gpu::hip -#else -#define MKN_GPU_NS mkn::gpu -#endif // MKN_GPU_FN_PER_NS +#include "mkn/gpu/rocm/def.hpp" +#include "mkn/gpu/rocm/api.hpp" +#include "mkn/gpu/rocm/cls.hpp" namespace MKN_GPU_NS { -#ifdef _MKN_GPU_WARP_SIZE_ -static constexpr int warp_size = _MKN_GPU_WARP_SIZE_; -#else -static constexpr int warp_size = warpSize; -#endif /*_MKN_GPU_WARP_SIZE_ */ - -void inline setLimitMallocHeapSize(std::size_t const& bytes) { - MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes)); -} - -void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(hipSetDevice(dev)); } - -struct Stream { - Stream() { MKN_GPU_ASSERT(result = hipStreamCreate(&stream)); } - ~Stream() { MKN_GPU_ASSERT(result = hipStreamDestroy(stream)); } - - auto& operator()() { return stream; }; - - void sync() { result = hipStreamSynchronize(stream); } - - hipError_t result; - hipStream_t stream; -}; - -// - -struct StreamEvent { - // - StreamEvent(Stream& stream_) : stream{stream_} {} - StreamEvent(StreamEvent&& that) = default; - StreamEvent(StreamEvent const&) = delete; - StreamEvent& operator=(StreamEvent const&) = delete; - - auto& operator()(std::function fn = {}) { - fin = 0; - _fn = fn; - MKN_GPU_ASSERT(hipStreamAddCallback(stream(), StreamEvent::Callback, this, 0)); - return *this; - } - - static void Callback(hipStream_t /*stream*/, hipError_t /*status*/, void* ptr) { - auto& self = *reinterpret_cast(ptr); - self._fn(); - self._fn = [] {}; - self.fin = 1; - } - - bool finished() const { return fin; } - - Stream& stream; - hipError_t result; - std::function _fn; - bool fin = 0; -}; - -// - -// https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3 -template -struct Pointer { - Pointer(T* _t) : t{_t} { - assert(t); - MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t)); - type = attributes.type; - } - - bool is_unregistered_ptr() const { - return attributes.type == hipMemoryType::hipMemoryTypeUnregistered; - } - bool is_host_ptr() const { - return is_unregistered_ptr() || type == hipMemoryType::hipMemoryTypeHost; - } - bool is_device_ptr() const { - return type == hipMemoryType::hipMemoryTypeDevice || attributes.isManaged; - } - bool is_managed_ptr() const { - return attributes.isManaged || type == hipMemoryType::hipMemoryTypeUnified; - } - - T* t; - hipPointerAttribute_t attributes; - hipMemoryType type = hipMemoryType::hipMemoryTypeUnregistered; -}; - -template -void alloc(void*& p, Size size) { - KLOG(TRC); - MKN_GPU_ASSERT(hipMalloc((void**)&p, size)); -} - -template -void alloc(T*& p, Size size) { - KLOG(TRC) << "GPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(hipMalloc((void**)&p, size * sizeof(T))); -} - -template -void alloc_host(T*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(hipHostMalloc((void**)&p, size * sizeof(T))); -} - -template -void alloc_managed(T*& p, Size size) { - auto const bytes = size * sizeof(T); - KLOG(TRC) << "GPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(hipMallocManaged((void**)&p, bytes)); -} - -void inline destroy(void* p) { - KLOG(TRC); - MKN_GPU_ASSERT(hipFree(p)); -} - -template -void destroy(T* ptr) { - KLOG(TRC); - MKN_GPU_ASSERT(hipFree(ptr)); -} - -template -void destroy_host(T* ptr) { - KLOG(TRC); - MKN_GPU_ASSERT(hipHostFree(ptr)); -} - -template -void copy_on_device(T* dst, T const* src, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(hipMemcpy(dst, src, size * sizeof(T), hipMemcpyDeviceToDevice)); -} - -template -void send(void* p, void* t, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(hipMemcpy(p, t, size, hipMemcpyHostToDevice)); -} - -template -void send(T* p, T const* t, Size size = 1, Size start = 0) { - KLOG(TRC); - MKN_GPU_ASSERT(hipMemcpy(p + start, t, size * sizeof(T), hipMemcpyHostToDevice)); -} - -template -void take(T const* p, T* t, Size size = 1, Size start = 0) { - KLOG(TRC); - MKN_GPU_ASSERT(hipMemcpy(t, p + start, size * sizeof(T), hipMemcpyDeviceToHost)); -} - -template -void send_async(T* p, T const* t, Stream& stream, Size size = 1, Size start = 0) { - KLOG(TRC); - MKN_GPU_ASSERT(hipMemcpyAsync(p + start, // - t + start, // - size * sizeof(T), // - hipMemcpyHostToDevice, // - stream())); -} - -template -void take_async(T* p, Span& span, Stream& stream, std::size_t start) { - KLOG(TRC); - static_assert(mkn::kul::is_span_like_v); - MKN_GPU_ASSERT(hipMemcpyAsync(span.data(), // - p + start, // - span.size() * sizeof(T), // - hipMemcpyDeviceToHost, // - stream())); -} - -void inline sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); } -void inline sync(hipStream_t stream) { MKN_GPU_ASSERT(hipStreamSynchronize(stream)); } - -#include "mkn/gpu/alloc.hpp" -#include "mkn/gpu/device.hpp" - -template -void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) { - std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); - KLOG(TRC) << N; - std::apply( - [&](auto&&... params) { - hipLaunchKernelGGL(f, g, b, ds, s, params...); - MKN_GPU_ASSERT(hipGetLastError()); - }, - devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence())); - if constexpr (_sync) { - if (s) - sync(s); - else - sync(); - } -} - -// https://rocm-documentation.readthedocs.io/en/latest/Programming_Guides/HIP-GUIDE.html#calling-global-functions -struct Launcher { - Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {} - Launcher(size_t w, size_t h, size_t tpx, size_t tpy) - : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {} - Launcher(size_t x, size_t y, size_t z, size_t tpx, size_t tpy, size_t tpz) - : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {} - - template - void operator()(F&& f, Args&&... args) { - launch(std::forward(f), g, b, ds, s, args...); - } - - size_t ds = 0 /*dynamicShared*/; - dim3 g /*gridDim*/, b /*blockDim*/; - hipStream_t s = 0; -}; - -struct GLauncher : public Launcher { - GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} { - [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev); - - b.x = cli.bx_threads(); - g.x = s / b.x; - if ((s % b.x) > 0) ++g.x; - } - - std::size_t count = 0; - hipDeviceProp_t devProp; - mkn::gpu::Cli cli{devProp}; -}; - -template -__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) { - if (auto i = mkn::gpu::hip::idx(); i < s) f(args...); -} - -template -__global__ static void global_d_kernel(F f, Args... args) { - f(args...); -} - -#include "launchers.hpp" -#include "devfunc.hpp" - template __global__ void _vector_fill(T* a, V t, std::size_t s) { if (auto i = mkn::gpu::hip::idx(); i < s) a[i] = t; } template -void fill(Container& c, size_t size, T val) { +void fill(Container& c, std::size_t const size, T const val) { GLauncher{c.size()}(_vector_fill, c.data(), val, size); } template -void fill(Container& c, T val) { - GLauncher{c.size()}(_vector_fill, c.data(), val, c.size()); +void fill(Container& c, T const val) { + fill(c, c.size(), val); } // https://rocm-developer-tools.github.io/HIP/group__Device.html @@ -348,7 +74,7 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " warpSize used " << _MKN_GPU_WARP_SIZE_; #else KOUT(NON) << " warpSize used " << warp_size; - if (warp_size != devProp.warpSize) { + if (warp_size != static_cast(devProp.warpSize)) { KOUT(NON) << " warpSize MISMATCH!!! " << warp_size << " vs " << devProp.warpSize; KOUT(NON) << " SEE mkn.gpu README for -D_MKN_GPU_WARP_SIZE_=###"; } @@ -366,6 +92,9 @@ void inline print_gpu_mem_used() { total_t, total_m, used_m); } +#include "mkn/gpu/any/inc/launchers.ipp" +#include "mkn/gpu/any/inc/devfunc.ipp" + } // namespace MKN_GPU_NS #undef MKN_GPU_ASSERT diff --git a/inc/mkn/gpu/rocm/api.hpp b/inc/mkn/gpu/rocm/api.hpp new file mode 100644 index 0000000..b197bbb --- /dev/null +++ b/inc/mkn/gpu/rocm/api.hpp @@ -0,0 +1,45 @@ +#ifndef _MKN_GPU_ROCM_API_HPP_ +#define _MKN_GPU_ROCM_API_HPP_ + +#include + +#include "hip/hip_runtime.h" + +#include "mkn/gpu/rocm/def.hpp" + +namespace mkn::gpu::hip { + +template +__device__ SIZE idx() { + SIZE width = gridDim.x * blockDim.x; + SIZE height = gridDim.y * blockDim.y; + SIZE x = blockDim.x * blockIdx.x + threadIdx.x; + SIZE y = blockDim.y * blockIdx.y + threadIdx.y; + SIZE z = blockDim.z * blockIdx.z + threadIdx.z; + return x + (y * width) + (z * width * height); +} + +template +__device__ SIZE block_idx_x() { + return blockIdx.x; +} + +} // namespace mkn::gpu::hip + +namespace MKN_GPU_NS { + +template +__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) { + if (auto i = mkn::gpu::hip::idx(); i < s) f(args...); +} + +template +__global__ static void global_d_kernel(F f, Args... args) { + f(args...); +} + +} // namespace MKN_GPU_NS + +// + +#endif /*_MKN_GPU_ROCM_API_HPP_*/ diff --git a/inc/mkn/gpu/rocm/cls.hpp b/inc/mkn/gpu/rocm/cls.hpp new file mode 100644 index 0000000..6a02f35 --- /dev/null +++ b/inc/mkn/gpu/rocm/cls.hpp @@ -0,0 +1,149 @@ +#ifndef _MKN_GPU_ROCM_CLS_HPP_ +#define _MKN_GPU_ROCM_CLS_HPP_ + +#include "mkn/kul/span.hpp" +#include "mkn/kul/tuple.hpp" + +#include "mkn/gpu/any/def.hpp" +#include "mkn/gpu/any/cls.hpp" + +#include "hip/hip_runtime.h" + +#include "mkn/gpu/rocm/def.hpp" + +namespace MKN_GPU_NS { + +struct Stream { + Stream() { MKN_GPU_ASSERT(result = hipStreamCreate(&stream)); } + ~Stream() { MKN_GPU_ASSERT(result = hipStreamDestroy(stream)); } + + auto& operator()() { return stream; }; + + void sync() { MKN_GPU_ASSERT(result = hipStreamSynchronize(stream)); } + + hipError_t result; + hipStream_t stream; +}; + +// + +struct StreamEvent { + // + StreamEvent(Stream& stream_) : stream{stream_} {} + StreamEvent(StreamEvent&& that) = default; + StreamEvent(StreamEvent const&) = delete; + StreamEvent& operator=(StreamEvent const&) = delete; + + auto& operator()(std::function fn = {}) { + fin = 0; + _fn = fn; + MKN_GPU_ASSERT(hipStreamAddCallback(stream(), StreamEvent::Callback, this, 0)); + return *this; + } + + static void Callback(hipStream_t /*stream*/, hipError_t /*status*/, void* ptr) { + auto& self = *reinterpret_cast(ptr); + self._fn(); + self._fn = [] {}; + self.fin = 1; + } + + bool finished() const { return fin; } + + Stream& stream; + hipError_t result; + std::function _fn; + bool fin = 0; +}; + +// + +// https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3 +template +struct Pointer { + Pointer(T* _t) : t{_t} { + assert(t); + MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t)); + type = attributes.type; + } + + bool is_unregistered_ptr() const { + return attributes.type == hipMemoryType::hipMemoryTypeUnregistered; + } + bool is_host_ptr() const { + return is_unregistered_ptr() || type == hipMemoryType::hipMemoryTypeHost; + } + bool is_device_ptr() const { + return type == hipMemoryType::hipMemoryTypeDevice || attributes.isManaged; + } + bool is_managed_ptr() const { + return attributes.isManaged || type == hipMemoryType::hipMemoryTypeUnified; + } + + T* t; + hipPointerAttribute_t attributes; + hipMemoryType type = hipMemoryType::hipMemoryTypeUnregistered; +}; + +#include "mkn/gpu/any/inc/alloc.ipp" +#include "mkn/gpu/any/inc/device.ipp" + +template +void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) { + std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); + KLOG(TRC) << N; + std::apply( + [&](auto&&... params) { + hipLaunchKernelGGL(f, g, b, ds, s, params...); + MKN_GPU_ASSERT(hipGetLastError()); + }, + devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence())); + if constexpr (_sync) { + if (s) + sync(s); + else + sync(); + } +} + +// https://rocm-documentation.readthedocs.io/en/latest/Programming_Guides/HIP-GUIDE.html#calling-global-functions +struct Launcher { + Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {} + Launcher(size_t w, size_t h, size_t tpx, size_t tpy) + : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {} + Launcher(size_t x, size_t y, size_t z, size_t tpx, size_t tpy, size_t tpz) + : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {} + + template + void operator()(F&& f, Args&&... args) { + launch(std::forward(f), g, b, ds, s, args...); + } + + size_t ds = 0 /*dynamicShared*/; + dim3 g /*gridDim*/, b /*blockDim*/; + hipStream_t s = 0; +}; + +struct GLauncher : public Launcher { + GLauncher(std::size_t const s, std::size_t const _dev = 0) + : Launcher{dim3{}, dim3{}}, dev{_dev}, count{s} { + MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); + + resize(s); + } + + void resize(std::size_t const s, std::size_t const bx = 0) { + b.x = bx > 0 ? bx : cli.bx_threads(); + g.x = s / b.x; + if ((s % b.x) > 0) ++g.x; + } + + std::size_t dev = 0; + std::size_t count = 0; + hipDeviceProp_t devProp; + mkn::gpu::Cli cli{devProp}; +}; + +} // namespace MKN_GPU_NS + +#endif /*_MKN_GPU_ROCM_CLS_HPP_*/ diff --git a/inc/mkn/gpu/rocm/def.hpp b/inc/mkn/gpu/rocm/def.hpp new file mode 100644 index 0000000..ee8f6a3 --- /dev/null +++ b/inc/mkn/gpu/rocm/def.hpp @@ -0,0 +1,135 @@ +#ifndef _MKN_GPU_ROCM_DEF_HPP_ +#define _MKN_GPU_ROCM_DEF_HPP_ + +#include "mkn/kul/log.hpp" +#include "mkn/gpu/def.hpp" + +#include "hip/hip_runtime.h" + +#include + +#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS +#define MKN_GPU_NS mkn::gpu::hip +#else +#define MKN_GPU_NS mkn::gpu +#endif // MKN_GPU_FN_PER_NS + +namespace MKN_GPU_NS { + +static_assert(CompileFlags::withROCM); + +#define MKN_GPU_ASSERT(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = true) { + if (code != hipSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line); + if (abort) std::abort(); + } +} + +std::string getErrorString(auto const code) { return hipGetErrorString(code); } + +std::uint32_t inline getWarpSize(size_t dev = 0) { +#ifdef _MKN_GPU_WARP_SIZE_ + return _MKN_GPU_WARP_SIZE_; +#else + hipDeviceProp_t devProp; + [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev); + return devProp.warpSize; +#endif /*_MKN_GPU_WARP_SIZE_ */ +} + +static std::uint32_t inline const warp_size = getWarpSize(); + +void inline setLimitMallocHeapSize(std::size_t const& bytes) { + MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes)); +} + +void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(hipSetDevice(dev)); } + +void inline sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); } +void inline sync(hipStream_t stream) { MKN_GPU_ASSERT(hipStreamSynchronize(stream)); } + +template +void alloc(void*& p, Size size) { + KLOG(TRC); + MKN_GPU_ASSERT(hipMalloc((void**)&p, size)); +} + +template +void alloc(T*& p, Size size) { + KLOG(TRC) << "GPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(hipMalloc((void**)&p, size * sizeof(T))); +} + +template +void alloc_host(T*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(hipHostMalloc((void**)&p, size * sizeof(T))); +} + +template +void alloc_managed(T*& p, Size size) { + auto const bytes = size * sizeof(T); + KLOG(TRC) << "GPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(hipMallocManaged((void**)&p, bytes)); +} + +void inline destroy(void* p) { + KLOG(TRC); + MKN_GPU_ASSERT(hipFree(p)); +} + +template +void destroy(T* ptr) { + KLOG(TRC); + MKN_GPU_ASSERT(hipFree(ptr)); +} + +template +void destroy_host(T* ptr) { + KLOG(TRC); + MKN_GPU_ASSERT(hipHostFree(ptr)); +} + +template +void copy_on_device(T* dst, T const* src, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(hipMemcpy(dst, src, size * sizeof(T), hipMemcpyDeviceToDevice)); +} + +template +void send(void* p, void* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(hipMemcpy(p, t, size, hipMemcpyHostToDevice)); +} + +template +void send(T* p, T const* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(hipMemcpy(p, t, size * sizeof(T), hipMemcpyHostToDevice)); +} + +template +void send_async(T* p, T const* t, hipStream_t& stream, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(hipMemcpyAsync(p, t, size * sizeof(T), hipMemcpyHostToDevice, stream)); +} + +template +void take(T const* p, T* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(hipMemcpy(t, p, size * sizeof(T), hipMemcpyDeviceToHost)); +} + +template +void take_async(T const* p, T* t, hipStream_t& stream, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(hipMemcpyAsync(t, p, size * sizeof(T), hipMemcpyDeviceToHost, stream)); +} + +} // namespace MKN_GPU_NS + +#endif /*_MKN_GPU_ROCM_DEF_HPP_*/ diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/stream.hpp similarity index 100% rename from inc/mkn/gpu/multi_launch.hpp rename to inc/mkn/gpu/stream.hpp diff --git a/inc/mkn/gpu/tuple.hpp b/inc/mkn/gpu/tuple.hpp index 2f363f6..3dac8e1 100644 --- a/inc/mkn/gpu/tuple.hpp +++ b/inc/mkn/gpu/tuple.hpp @@ -31,7 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_TUPLE_HPP_ #define _MKN_GPU_TUPLE_HPP_ -// #include "mkn/gpu.hpp" #include "mkn/kul/tuple.hpp" namespace mkn::gpu { diff --git a/mkn.yaml b/mkn.yaml index 78253cc..b6057b4 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -23,7 +23,7 @@ profile: # if you have no GPU but want to test your code - name: cpu parent: headers - # arg: -DMKN_GPU_CPU=1 + arg: -DMKN_GPU_CPU=1 test: test/any/(\w).cpp test/cpu/(\w).cpp diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml index 9b9bd90..ab6d517 100644 --- a/res/mkn/clang_cuda.yaml +++ b/res/mkn/clang_cuda.yaml @@ -15,7 +15,7 @@ super: settings property: cxx_flags: --std=c++20 -fPIC -fsized-deallocation -Wno-unknown-cuda-version - cxx_cuda: -x cuda --cuda-gpu-arch="sm_61" -Xclang -fcuda-allow-variadic-functions + cxx_cuda: -x cuda --cuda-gpu-arch="sm_120" -Xclang -fcuda-allow-variadic-functions env: | PATH=/usr/local/cuda/bin:${PATH} @@ -23,13 +23,9 @@ env: | inc: /usr/local/cuda/targets/x86_64-linux/include path: /usr/local/cuda/targets/x86_64-linux/lib -compiler: - mask: - clang++: - clang++-17 file: - type: cpp:cxx:cc archiver: ar -cr - compiler: clang++-17 ${cxx_flags} ${cxx_cuda} - linker: clang++-17 -Wl,-z,defs -fuse-ld=gold -lcudart + compiler: clang++ ${cxx_flags} ${cxx_cuda} + linker: clang++ -Wl,-z,defs -lcudart diff --git a/res/mkn/nvcc.yaml b/res/mkn/nvcc.yaml new file mode 100644 index 0000000..efdf76e --- /dev/null +++ b/res/mkn/nvcc.yaml @@ -0,0 +1,21 @@ +# + +super: settings + +property: + cxx_flags: --std=c++20 -Xcompiler -fPIC # -fPIC -fsized-deallocation -Wno-unknown-cuda-version + cxx_cuda: -x cu --extended-lambda #-x cuda --cuda-gpu-arch="sm_80" -Xclang -fcuda-allow-variadic-functions + +env: | + PATH=/usr/local/cuda/bin:${PATH} + +# inc: /usr/local/cuda/targets/x86_64-linux/include +# path: /usr/local/cuda/targets/x86_64-linux/lib + +file: + - type: cpp:cxx:cc + archiver: ar -cr + compiler: nvcc ${cxx_flags} ${cxx_cuda} + linker: nvcc -lcudart + + diff --git a/res/test_all.sh b/res/test_all.sh new file mode 100755 index 0000000..44c4078 --- /dev/null +++ b/res/test_all.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -e + +CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +( + cd "$CWD/.." + + mkn build -p cpu run test -W 9 + mkn build -x res/mkn/hipcc test -p rocm -W 9 + mkn build -x res/mkn/clang_cuda test -p cuda -W 9 + +) diff --git a/test/any/asio.cpp b/test/any/asio.cpp index 69d8701..6b30616 100644 --- a/test/any/asio.cpp +++ b/test/any/asio.cpp @@ -6,7 +6,7 @@ #include "mkn/gpu/asio.hpp" #include "__share__.hpp" -#if defined(MKN_GPU_CPU) +#if MKN_GPU_CPU static constexpr std::uint32_t BATCHES = 1; #else static constexpr std::uint32_t BATCHES = 2; diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp index bd843cb..c4f4979 100644 --- a/test/any/async_streaming.cpp +++ b/test/any/async_streaming.cpp @@ -1,11 +1,11 @@ -#include -#include #include "mkn/gpu.hpp" #include "mkn/kul/dbg.hpp" #include "mkn/kul/time.hpp" -#include "mkn/gpu/multi_launch.hpp" +#include "mkn/gpu/stream.hpp" + +#include using namespace mkn::gpu; using namespace std::chrono_literals; @@ -21,7 +21,7 @@ struct A { }; std::uint32_t test() { - KUL_DBG_FUNC_ENTER; + MKN_KUL_DBG_FUNC_ENTER; using T = double; std::vector> vecs(C, ManagedVector(NUM, 0)); @@ -54,7 +54,7 @@ std::uint32_t test() { } std::uint32_t test_threaded(std::size_t const& nthreads = 2) { - KUL_DBG_FUNC_ENTER; + MKN_KUL_DBG_FUNC_ENTER; using T = double; std::vector> vecs(C, ManagedVector(NUM, 0)); @@ -87,7 +87,7 @@ std::uint32_t test_threaded(std::size_t const& nthreads = 2) { std::uint32_t test_threaded_group_barrier(std::size_t const& nthreads = 2) { using T = double; - KUL_DBG_FUNC_ENTER; + MKN_KUL_DBG_FUNC_ENTER; std::vector> vecs(C + 1, ManagedVector(NUM, 0)); for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); @@ -121,7 +121,7 @@ std::uint32_t test_threaded_group_barrier(std::size_t const& nthreads = 2) { std::uint32_t test_threaded_host_group_mutex(std::size_t const& nthreads = 2) { using T = double; - KUL_DBG_FUNC_ENTER; + MKN_KUL_DBG_FUNC_ENTER; std::size_t constexpr group_size = 3; std::vector vals((C + 1) / group_size); // 2 values; @@ -156,7 +156,7 @@ std::uint32_t test_threaded_host_group_mutex(std::size_t const& nthreads = 2) { std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) { using T = double; - KUL_DBG_FUNC_ENTER; + MKN_KUL_DBG_FUNC_ENTER; std::size_t constexpr group_size = 3; @@ -196,7 +196,7 @@ std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) { std::uint32_t test_threaded_detached_stream_fns(std::size_t const& nthreads = 2) { using T = double; - KUL_DBG_FUNC_ENTER; + MKN_KUL_DBG_FUNC_ENTER; std::vector> vecs(C, ManagedVector(NUM, 0)); for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); diff --git a/test/any/managed.cpp b/test/any/managed.cpp index ee64e07..ffe6a6d 100644 --- a/test/any/managed.cpp +++ b/test/any/managed.cpp @@ -92,8 +92,8 @@ std::uint32_t test_zero() { auto* view1 = mem1.data(); mkn::gpu::DLauncher()([=] __device__() { - mkn::gpu::zero(view0, size); - mkn::gpu::zero(view1, size); + mkn::gpu::fill_warp_size(view0, size, 0.0f); + mkn::gpu::fill_warp_size(view1, size, 0.0f); }); for (std::uint32_t i = 0; i < size; ++i) diff --git a/test/hip/async.cpp b/test/hip/async.cpp index 5aed657..68f2f02 100644 --- a/test/hip/async.cpp +++ b/test/hip/async.cpp @@ -91,7 +91,7 @@ int main(int argc, char** argv) { for (int i = 0; i < nStreams; ++i) checkHip(hipStreamCreate(&stream[i])); // baseline case - sequential transfer and execute - memset(a, 0, bytes); + checkHip(hipMemset(a, 0, bytes)); checkHip(hipEventRecord(startEvent, 0)); checkHip(hipMemcpy(d_a, a, bytes, hipMemcpyHostToDevice)); kernel<<>>(d_a, 0); @@ -103,7 +103,7 @@ int main(int argc, char** argv) { printf(" max error: %e\n", maxError(a, n)); // asynchronous version 1: loop over {copy, kernel, copy} - memset(a, 0, bytes); + checkHip(hipMemset(a, 0, bytes)); checkHip(hipEventRecord(startEvent, 0)); for (int i = 0; i < nStreams; ++i) { int offset = i * streamSize; @@ -121,7 +121,7 @@ int main(int argc, char** argv) { // asynchronous version 2: // loop over copy, loop over kernel, loop over copy - memset(a, 0, bytes); + checkHip(hipMemset(a, 0, bytes)); checkHip(hipEventRecord(startEvent, 0)); for (int i = 0; i < nStreams; ++i) { int offset = i * streamSize; diff --git a/test/hip/async.ppc b/test/hip/async.ppc new file mode 100644 index 0000000..5aed657 --- /dev/null +++ b/test/hip/async.ppc @@ -0,0 +1,155 @@ +// https://raw.githubusercontent.com/NVIDIA-developer-blog/code-samples/master/series/hip-cpp/overlap-data-transfers/async.cu + +/* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "hip/hip_runtime.h" + +#include + +// Convenience function for checking HIP runtime API results +// can be wrapped around any runtime API call. No-op in release builds. +inline void checkHip([[maybe_unused]] hipError_t result) { +#if defined(DEBUG) || defined(_DEBUG) + if (result != hipSuccess) { + fprintf(stderr, "HIP Runtime Error: %s\n", hipGetErrorString(result)); + assert(result == hipSuccess); + } +#endif + // return result; +} + +__global__ void kernel(float* a, int offset) { + int i = offset + threadIdx.x + blockIdx.x * blockDim.x; + float x = (float)i; + float s = sinf(x); + float c = cosf(x); + a[i] = a[i] + sqrtf(s * s + c * c); +} + +float maxError(float* a, int n) { + float maxE = 0; + for (int i = 0; i < n; i++) { + float error = fabs(a[i] - 1.0f); + if (error > maxE) maxE = error; + } + return maxE; +} + +int main(int argc, char** argv) { + int const blockSize = 256, nStreams = 4; + int const n = 4 * 1024 * blockSize * nStreams; + int const streamSize = n / nStreams; + int const streamBytes = streamSize * sizeof(float); + int const bytes = n * sizeof(float); + + int devId = 0; + if (argc > 1) devId = atoi(argv[1]); + + hipDeviceProp_t prop; + checkHip(hipGetDeviceProperties(&prop, devId)); + printf("Device : %s\n", prop.name); + checkHip(hipSetDevice(devId)); + + // allocate pinned host memory and device memory + float *a, *d_a; + checkHip(hipHostMalloc((void**)&a, bytes)); // host pinned + checkHip(hipMalloc((void**)&d_a, bytes)); // device + + float ms; // elapsed time in milliseconds + + // create events and streams + hipEvent_t startEvent, stopEvent, dummyEvent; + hipStream_t stream[nStreams]; + checkHip(hipEventCreate(&startEvent)); + checkHip(hipEventCreate(&stopEvent)); + checkHip(hipEventCreate(&dummyEvent)); + for (int i = 0; i < nStreams; ++i) checkHip(hipStreamCreate(&stream[i])); + + // baseline case - sequential transfer and execute + memset(a, 0, bytes); + checkHip(hipEventRecord(startEvent, 0)); + checkHip(hipMemcpy(d_a, a, bytes, hipMemcpyHostToDevice)); + kernel<<>>(d_a, 0); + checkHip(hipMemcpy(a, d_a, bytes, hipMemcpyDeviceToHost)); + checkHip(hipEventRecord(stopEvent, 0)); + checkHip(hipEventSynchronize(stopEvent)); + checkHip(hipEventElapsedTime(&ms, startEvent, stopEvent)); + printf("Time for sequential transfer and execute (ms): %f\n", ms); + printf(" max error: %e\n", maxError(a, n)); + + // asynchronous version 1: loop over {copy, kernel, copy} + memset(a, 0, bytes); + checkHip(hipEventRecord(startEvent, 0)); + for (int i = 0; i < nStreams; ++i) { + int offset = i * streamSize; + checkHip( + hipMemcpyAsync(&d_a[offset], &a[offset], streamBytes, hipMemcpyHostToDevice, stream[i])); + kernel<<>>(d_a, offset); + checkHip( + hipMemcpyAsync(&a[offset], &d_a[offset], streamBytes, hipMemcpyDeviceToHost, stream[i])); + } + checkHip(hipEventRecord(stopEvent, 0)); + checkHip(hipEventSynchronize(stopEvent)); + checkHip(hipEventElapsedTime(&ms, startEvent, stopEvent)); + printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms); + printf(" max error: %e\n", maxError(a, n)); + + // asynchronous version 2: + // loop over copy, loop over kernel, loop over copy + memset(a, 0, bytes); + checkHip(hipEventRecord(startEvent, 0)); + for (int i = 0; i < nStreams; ++i) { + int offset = i * streamSize; + checkHip( + hipMemcpyAsync(&d_a[offset], &a[offset], streamBytes, hipMemcpyHostToDevice, stream[i])); + } + for (int i = 0; i < nStreams; ++i) { + int offset = i * streamSize; + kernel<<>>(d_a, offset); + } + for (int i = 0; i < nStreams; ++i) { + int offset = i * streamSize; + checkHip( + hipMemcpyAsync(&a[offset], &d_a[offset], streamBytes, hipMemcpyDeviceToHost, stream[i])); + } + checkHip(hipEventRecord(stopEvent, 0)); + checkHip(hipEventSynchronize(stopEvent)); + checkHip(hipEventElapsedTime(&ms, startEvent, stopEvent)); + printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms); + printf(" max error: %e\n", maxError(a, n)); + + // cleanup + checkHip(hipEventDestroy(startEvent)); + checkHip(hipEventDestroy(stopEvent)); + checkHip(hipEventDestroy(dummyEvent)); + for (int i = 0; i < nStreams; ++i) checkHip(hipStreamDestroy(stream[i])); + checkHip(hipFree(d_a)); + checkHip(hipHostFree(a)); + + return 0; +}