diff --git a/README.md b/README.md index 9362abf..1640110 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +# mkn.gpu -# mkn.gpu \ No newline at end of file +CUDA/HIP C++20 convenience wrappers + +[README](https://raw.githubusercontent.com/mkn/mkn.gpu/master/README.noformat) diff --git a/README.noformat b/README.noformat index 0f2cc85..127f58c 100644 --- a/README.noformat +++ b/README.noformat @@ -1,6 +1,6 @@ mkn.gpu -CUDA/HIP C++17 convenience wrappers +CUDA/HIP C++20 convenience wrappers ====== @@ -27,6 +27,11 @@ Description expose functions explicitly via mkn::gpu::hip::* mkn::gpu::cuda::* +Key _MKN_GPU_WARP_SIZE_ +Type uint +Default use manufacturer provided (eg warpSize), usually 32 +Description override use if defined + Key _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ Type uint Default 1 diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 814a6ed..e77b740 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -36,14 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "mkn/kul/span.hpp" #include "mkn/kul/tuple.hpp" #include "mkn/kul/assert.hpp" -#include "mkn/kul/threads.hpp" -#include "mkn/gpu/cli.hpp" #include "mkn/gpu/def.hpp" -#include #include #include +#include #define MKN_GPU_ASSERT(x) (KASSERT((x))) @@ -90,12 +88,14 @@ struct dim3 { }; dim3 static inline threadIdx, blockIdx; +static constexpr int warpSize = 1; #endif // MKN_CPU_DO_NOT_DEFINE_DIM3 // namespace MKN_GPU_NS { +static constexpr int warp_size = warpSize; void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/ @@ -280,6 +280,11 @@ void fill(Container& c, T const val) { fill(c, c.size(), val); } +template +void zero(T* const t, std::size_t const size) { + std::fill(t, t + size, 0); +} + void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; } } // namespace MKN_GPU_NS diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index cbe2f2b..aa02875 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -32,15 +32,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_CUDA_HPP_ #define _MKN_GPU_CUDA_HPP_ -#include - #include "mkn/kul/log.hpp" #include "mkn/kul/span.hpp" #include "mkn/kul/tuple.hpp" -#include "mkn/kul/assert.hpp" #include "mkn/gpu/def.hpp" -#include "mkn/gpu/cli.hpp" #include @@ -76,8 +72,6 @@ __device__ SIZE block_idx_x() { } // namespace mkn::gpu::cuda -// - #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS #define MKN_GPU_NS mkn::gpu::cuda #else @@ -86,6 +80,12 @@ __device__ SIZE block_idx_x() { namespace MKN_GPU_NS { +#ifdef _MKN_GPU_WARP_SIZE_ +static constexpr int warp_size = _MKN_GPU_WARP_SIZE_; +#else +static constexpr int warp_size = warpSize; +#endif /*_MKN_GPU_WARP_SIZE_ */ + void inline setLimitMallocHeapSize(std::size_t const& bytes) { MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); } @@ -310,6 +310,7 @@ __global__ static void global_d_kernel(F f, Args... args) { } #include "launchers.hpp" +#include "devfunc.hpp" template __global__ void _vector_fill(T* a, V t, std::size_t s) { diff --git a/inc/mkn/gpu/devfunc.hpp b/inc/mkn/gpu/devfunc.hpp new file mode 100644 index 0000000..5708308 --- /dev/null +++ b/inc/mkn/gpu/devfunc.hpp @@ -0,0 +1,44 @@ +// IWYU pragma: private, include "mkn/gpu.hpp" +/** +Copyright (c) 2025, Philip Deegan. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Philip Deegan nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _MKN_GPU_DEVFUNC_HPP_ +#define _MKN_GPU_DEVFUNC_HPP_ + +template +void __device__ zero(T* const t, std::size_t const size) { + std::size_t chunk = 0; + auto const each = size / warpSize; + for (; chunk < each; ++chunk) t[chunk * warpSize + threadIdx.x] = 0; + if (threadIdx.x < size - (warpSize * each)) t[chunk * warpSize + threadIdx.x] = 0; + if constexpr (sync) __syncthreads(); +} + +#endif /* _MKN_GPU_DEVFUNC_HPP_ */ diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp index 877d044..3add67b 100644 --- a/inc/mkn/gpu/launchers.hpp +++ b/inc/mkn/gpu/launchers.hpp @@ -69,7 +69,13 @@ struct GDLauncher : public GLauncher { template struct DLauncher : public Launcher { - DLauncher(size_t const /*dev*/ = 0) : Launcher{{}, {}} {} + DLauncher() : Launcher{dim3{1}, dim3{warp_size}} {} + DLauncher(size_t const /*dev*/) : Launcher{{}, {}} {} + + template + DLauncher(Args&&... args) + requires(sizeof...(Args) > 0) + : Launcher{args...} {} template auto operator()(F&& f, Args&&... args) { diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index 5556cde..f9ebd01 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -31,6 +31,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_MULTI_LAUNCH_HPP_ #define _MKN_GPU_MULTI_LAUNCH_HPP_ +#include "mkn/gpu.hpp" +#include "mkn/kul/log.hpp" +#include "mkn/kul/time.hpp" +#include "mkn/kul/except.hpp" + #include #include #include @@ -41,10 +46,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include "mkn/gpu.hpp" -#include "mkn/kul/log.hpp" -#include "mkn/kul/time.hpp" - namespace mkn::gpu::detail { template auto& deref(Type&& type) { @@ -57,6 +58,13 @@ auto& deref(Type&& type) { namespace mkn::gpu { +class StreamLauncherException : public kul::Exception { + public: + StreamLauncherException(char const* f, std::uint16_t const& l, std::string const& s) + : Exception{f, l, s} {} + StreamLauncherException(StreamLauncherException const& e) : Exception{e} {} +}; + enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT, BARRIER }; enum class StreamFunctionStatus { HOST_BUSY = 0, DEVICE_BUSY }; @@ -390,10 +398,10 @@ struct ThreadedStreamLauncher : public StreamLauncher __global__ void _vector_fill(T* a, V t, std::size_t s) { @@ -330,14 +335,24 @@ void fill(Container& c, T val) { void inline prinfo(size_t dev = 0) { hipDeviceProp_t devProp; MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); - KOUT(NON) << " System version " << devProp.major << "." << devProp.minor; - KOUT(NON) << " agent name " << devProp.name; - KOUT(NON) << " cores " << devProp.multiProcessorCount; - KOUT(NON) << " threadsPCore " << devProp.maxThreadsPerMultiProcessor; - KOUT(NON) << " TotalMem " << (devProp.totalGlobalMem / 1000000) << " MB"; - KOUT(NON) << " BlockMem " << (devProp.sharedMemPerBlock / 1000) << " KB"; - KOUT(NON) << " warpSize " << devProp.warpSize; - KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; + KOUT(NON) << " System version " << devProp.major << "." << devProp.minor; + KOUT(NON) << " agent name " << devProp.name; + KOUT(NON) << " cores " << devProp.multiProcessorCount; + KOUT(NON) << " threadsPCore " << devProp.maxThreadsPerMultiProcessor; + KOUT(NON) << " TotalMem " << (devProp.totalGlobalMem / 1000000) << " MB"; + KOUT(NON) << " BlockMem " << (devProp.sharedMemPerBlock / 1000) << " KB"; + KOUT(NON) << " device warpSize " << devProp.warpSize; + KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; + +#ifdef _MKN_GPU_WARP_SIZE_ + KOUT(NON) << " warpSize used " << _MKN_GPU_WARP_SIZE_; +#else + KOUT(NON) << " warpSize used " << warp_size; + if (warp_size != devProp.warpSize) { + KOUT(NON) << " warpSize MISMATCH!!! " << warp_size << " vs " << devProp.warpSize; + KOUT(NON) << " SEE mkn.gpu README for -D_MKN_GPU_WARP_SIZE_=###"; + } +#endif } void inline print_gpu_mem_used() { diff --git a/mkn.yaml b/mkn.yaml index 237c600..b6057b4 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -10,18 +10,20 @@ profile: - name: rocm parent: headers + arg: -DMKN_GPU_ROCM=1 test: test/any/(\w).cpp test/hip/(\w).cpp - name: cuda parent: headers + arg: -DMKN_GPU_CUDA=1 test: test/any/(\w).cpp test/cuda/(\w).cpp # if you have no GPU but want to test your code - name: cpu parent: headers - arg: -DMKN_GPU_CPU + arg: -DMKN_GPU_CPU=1 test: test/any/(\w).cpp test/cpu/(\w).cpp diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp index 7a3406d..bd843cb 100644 --- a/test/any/async_streaming.cpp +++ b/test/any/async_streaming.cpp @@ -232,10 +232,10 @@ std::uint32_t test_threaded_detached_stream_fns(std::size_t const& nthreads = 2) int main() { KOUT(NON) << __FILE__; - return test() // - + test_threaded() // - + test_threaded(6) // - + test_threaded_group_barrier() // - + test_threaded_host_group_mutex() // + return test() // + + test_threaded(0) + test_threaded() // + + test_threaded(6) // + + test_threaded_group_barrier() // + + test_threaded_host_group_mutex() // + test_threaded_host_group_idx() + test_threaded_detached_stream_fns(); } diff --git a/test/any/managed.cpp b/test/any/managed.cpp index 0c9634b..ee64e07 100644 --- a/test/any/managed.cpp +++ b/test/any/managed.cpp @@ -83,9 +83,28 @@ std::uint32_t test_lambda_ref_copy_capture_views() { return 0; } +std::uint32_t test_zero() { + auto const size = 1000; // not warp size divisible! + ManagedVector mem0(size, 1); + ManagedVector mem1(size, 2); + + auto* view0 = mem0.data(); + auto* view1 = mem1.data(); + + mkn::gpu::DLauncher()([=] __device__() { + mkn::gpu::zero(view0, size); + mkn::gpu::zero(view1, size); + }); + + for (std::uint32_t i = 0; i < size; ++i) + if (mem0[i] + mem1[i] != 0) return 1; + + return 0; +} + int main() { KOUT(NON) << __FILE__; - return test() + // + return test() + test_zero() + // test_guess() + // test_lambda_copy_capture_views() + // test_lambda_ref_copy_capture_views();