Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions inc/mkn/gpu/any/inc/alloc.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,16 @@ template <typename T0, typename T1, typename Size>
void copy(T0* dst, T1* src, Size const size) {
assert(dst and src);

Pointer src_p{src};
Pointer dst_p{dst};
auto const is_dev = [](auto const& ptr) { return ptr.is_device_ptr() || ptr.is_managed_ptr(); };
auto const is_hst = [](auto const& ptr) { return ptr.is_host_ptr() || ptr.is_managed_ptr(); };

auto to_send = [&]() { return dst_p.is_device_ptr() && src_p.is_host_ptr(); };
auto to_take = [&]() { return dst_p.is_host_ptr() && src_p.is_device_ptr(); };
auto on_host = [&]() { return dst_p.is_host_ptr() && src_p.is_host_ptr(); };
auto on_device = [&]() { return dst_p.is_device_ptr() && src_p.is_device_ptr(); };
Pointer const src_p{src};
Pointer const dst_p{dst};

auto const to_send = [&]() { return is_dev(dst_p) && is_hst(src_p); };
auto const to_take = [&]() { return is_hst(dst_p) && is_dev(src_p); };
auto const on_host = [&]() { return is_hst(dst_p) && is_hst(src_p); };
auto const on_device = [&]() { return is_dev(dst_p) && is_dev(src_p); };

if (on_host())
std::copy(src, src + size, dst);
Expand All @@ -130,6 +133,12 @@ void copy(T0* dst, T1* src, Size const size) {
throw std::runtime_error("Unsupported operation (PR welcome)");
}

// Container-level copy: fills dst element-wise from src.
// src must provide at least dst.size() elements.
void copy(auto& dst, auto const& src) {
  auto const n_out = dst.size();
  if (src.size() < n_out) throw std::runtime_error("mkn::gpu::copy src too small!");
  copy(dst.data(), src.data(), n_out);
}
Comment thread
PhilipDeegan marked this conversation as resolved.

template <typename T, std::int32_t align>
auto& reserve(std::vector<T, NoConstructAllocator<T, align>>& v, std::size_t const& s,
bool mem_copy = true) {
Expand Down
230 changes: 4 additions & 226 deletions inc/mkn/gpu/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,238 +35,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "mkn/kul/log.hpp"
#include "mkn/kul/span.hpp"
#include "mkn/kul/tuple.hpp"
#include "mkn/kul/assert.hpp"

#include "mkn/gpu/def.hpp"
#include "mkn/gpu/cpu/def.hpp"
#include "mkn/gpu/cpu/api.hpp"
#include "mkn/gpu/cpu/cls.hpp"

#include <algorithm>
#include <cassert>
#include <cstring>
#include <utility>

#define MKN_GPU_ASSERT(x) (KASSERT((x)))

#if defined(__device__)
#pragma message("__device__ already defined")
#error // check your compiler
#endif

#if defined(__host__)
#pragma message("__host__ already defined")
#error // check your compiler
#endif

#if defined(__global__)
#pragma message("__global__ already defined")
#error // check your compiler
#endif

// we need to exclude these for CPU only operations
#define __shared__
#define __device__
#define __host__
#define __global__
#define __syncthreads(...)

#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
#define MKN_GPU_NS mkn::gpu::cpu
#else
#define MKN_GPU_NS mkn::gpu
#endif // MKN_GPU_FN_PER_NS

#if !defined(MKN_CPU_DO_NOT_DEFINE_DIM3)
#define MKN_CPU_DO_NOT_DEFINE_DIM3 0
#endif

#if !defined(dim3) and !MKN_CPU_DO_NOT_DEFINE_DIM3
// Minimal stand-in for CUDA's dim3 when building the CPU backend:
// up to three extents, each defaulting to 1.
struct dim3 {
  dim3() = default;
  dim3(std::size_t _x) : x{_x} {}
  dim3(std::size_t _x, std::size_t _y) : x{_x}, y{_y} {}
  dim3(std::size_t _x, std::size_t _y, std::size_t _z) : x{_x}, y{_y}, z{_z} {}

  std::size_t x = 1, y = 1, z = 1;
};

// CUDA-builtin lookalikes. Nothing in the visible CPU path ever writes them,
// so they presumably stay at {1,1,1}; launch() below tracks position through
// detail::idx instead — confirm against device.ipp consumers.
dim3 static inline threadIdx, blockIdx;
static constexpr int warpSize = 1;

#endif // MKN_CPU_DO_NOT_DEFINE_DIM3

//

namespace MKN_GPU_NS {
static constexpr int warp_size = warpSize;

inline void setLimitMallocHeapSize(std::size_t const& /*bytes*/) { /* no-op on CPU */ }

inline void setDevice(std::size_t const& /*dev*/) { /* no-op: single CPU "device" */ }

// No-op stand-in for a GPU stream; carries only an integer id.
struct Stream {
  Stream() = default;
  ~Stream() = default;

  auto& operator()() { return stream; }
  void sync() {}  // nothing asynchronous to wait for on CPU

  std::size_t stream = 0;
};

// CPU stand-in for a stream event: attached work runs synchronously,
// so the event is always "finished".
struct StreamEvent {
  StreamEvent(Stream&) {}
  ~StreamEvent() {}

  // Runs fn (if any) immediately and returns *this for chaining.
  // BUG FIX: the default argument is an empty std::function, and invoking an
  // empty std::function throws std::bad_function_call — guard before calling.
  auto& operator()(std::function<void()> fn = {}) {
    if (fn) fn();
    return *this;
  }

  bool finished() const { return fin; }

  Stream stream;
  bool fin = true;  // work completes inline, so always done
  std::function<void()> _fn;
};

// CPU backend pointer classifier: any non-null pointer counts as host
// memory — there is no separate device or managed address space here.
template <typename T>
struct Pointer {
  Pointer(T* ptr) : t{ptr} {}

  bool is_unregistered_ptr() const { return !t; }
  bool is_host_ptr() const { return true; }
  bool is_device_ptr() const { return false; }
  bool is_managed_ptr() const { return false; }

  T* t;
};

template <typename Size>
void alloc(void*& p, Size size) {
KLOG(TRC) << "CPU alloced: " << size;
MKN_GPU_ASSERT(p = std::malloc(size));
}

// Allocates size * sizeof(T) bytes for an array of T (no construction).
// Allocation is hoisted out of the assert macro so it survives builds
// where assertions compile away.
template <typename T, typename Size>
void alloc(T*& p, Size size) {
  KLOG(TRC) << "CPU alloced: " << size * sizeof(T);
  p = reinterpret_cast<T*>(std::malloc(size * sizeof(T)));
  MKN_GPU_ASSERT(p);
}

// Host allocation — identical to alloc() on the CPU backend.
// Allocation hoisted out of the assert macro so it survives NDEBUG-style builds.
template <typename T, typename Size>
void alloc_host(T*& p, Size size) {
  KLOG(TRC) << "CPU alloced: " << size * sizeof(T);
  p = reinterpret_cast<T*>(std::malloc(size * sizeof(T)));
  MKN_GPU_ASSERT(p);
}

// "Managed" allocation — plain host malloc on the CPU backend.
// Allocation hoisted out of the assert macro so it survives NDEBUG-style builds.
template <typename T, typename Size>
void alloc_managed(T*& p, Size size) {
  KLOG(TRC) << "CPU alloced: " << size * sizeof(T);
  p = reinterpret_cast<T*>(std::malloc(size * sizeof(T)));
  MKN_GPU_ASSERT(p);
}

// Frees memory obtained from the alloc* functions above.
void inline destroy(void* p) {
  KLOG(TRC);
  std::free(p);
}

// Frees p. NOTE(review): p is taken by reference but is not nulled after the
// free — presumably to match the GPU backends' signatures; confirm.
template <typename T>
void destroy(T*& p) {
  KLOG(TRC);
  std::free(p);
}

// Frees host memory — identical to destroy() on the CPU backend.
template <typename T>
void destroy_host(T*& p) {
  KLOG(TRC);
  std::free(p);
}

// "Device-to-device" copy: plain memcpy on the CPU backend.
// The memcpy is hoisted out of the assert macro so the copy still happens if
// assertions are compiled out; memcpy returns dst, so the assert checks non-null.
template <typename T, typename Size>
void copy_on_device(T* dst, T const* src, Size size = 1) {
  KLOG(TRC);
  auto* const ret = std::memcpy(dst, src, size * sizeof(T));
  MKN_GPU_ASSERT(ret);
}

// Byte-wise host-to-"device" send: copies `size` bytes from t into p.
// memcpy hoisted out of the assert macro so the copy survives assert-stripped builds.
template <typename Size>
void send(void* p, void* t, Size size = 1) {
  KLOG(TRC);
  auto* const ret = std::memcpy(p, t, size);
  MKN_GPU_ASSERT(ret);
}

// Typed host-to-"device" send: copies `size` T's from t into p.
// memcpy hoisted out of the assert macro so the copy survives assert-stripped builds.
template <typename T, typename Size>
void send(T* p, T const* t, Size size = 1) {
  KLOG(TRC);
  auto* const ret = std::memcpy(p, t, size * sizeof(T));
  MKN_GPU_ASSERT(ret);
}
// "Async" send degrades to a synchronous copy on CPU; the stream is unused.
template <typename T, typename Size>
void send_async(T* dst, T const* src, auto& /*stream*/, Size size = 1) {
  KLOG(TRC);
  send(dst, src, size);
}

// "Device"-to-host take: copies `size` T's from p back into t.
// memcpy hoisted out of the assert macro so the copy survives assert-stripped builds.
template <typename T, typename Size>
void take(T const* p, T* t, Size size = 1) {
  KLOG(TRC);
  auto* const ret = std::memcpy(t, p, size * sizeof(T));
  MKN_GPU_ASSERT(ret);
}

// "Async" take degrades to a synchronous copy on CPU; the stream is unused.
template <typename T, typename Size>
void take_async(T const* src, T* dst, auto& /*stream*/, Size size = 1) {
  KLOG(TRC);
  take(src, dst, size);
}

inline void sync() { /* no-op: CPU work is already synchronous */ }

#include "mkn/gpu/any/inc/alloc.ipp"
#include "mkn/gpu/any/inc/device.ipp"

namespace detail {
// Flat per-thread "kernel index", advanced by launch() between invocations
// and reset to 0 when a launch completes.
static thread_local std::size_t idx = 0;
}

// Emulates a kernel launch on the CPU: invokes f sequentially once per
// emulated GPU thread, i.e. (grid volume) * (block volume) times in total.
// detail::idx is stepped between invocations (consumed elsewhere — presumably
// as the flat thread index) and reset to 0 once the "launch" completes.
// _sync is unused here: every CPU launch is synchronous.
template <bool _sync = true, typename F, typename... Args>
void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) {
  std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);  // total emulated threads
  KLOG(TRC) << N;
  std::apply(
      [&](auto&&... params) {
        for (std::size_t i = 0; i < N; ++i) {
          f(params...);
          detail::idx++;
        }
      },
      devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));

  detail::idx = 0;
}

// Carries grid/block dimensions plus launch bookkeeping and forwards to launch().
struct Launcher {
  Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {}
  Launcher(std::size_t w, std::size_t h, std::size_t tpx, std::size_t tpy)
      : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {}
  Launcher(std::size_t x, std::size_t y, std::size_t z, std::size_t tpx, std::size_t tpy,
           std::size_t tpz)
      : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {}

  // FIX: f and args were declared as forwarding references but passed on as
  // lvalues, silently copying rvalue / move-only arguments — forward them.
  template <typename F, typename... Args>
  void operator()(F&& f, Args&&... args) {
    launch(std::forward<F>(f), g, b, ds, s, std::forward<Args>(args)...);
  }

  std::size_t ds = 0 /*dynamicShared*/;
  dim3 g /*gridDim*/, b /*blockDim*/;
  std::size_t s = 0;
};

// One-dimensional launcher sized so 1024-wide blocks cover `s` elements.
struct GLauncher : public Launcher {
  GLauncher(std::size_t s, [[maybe_unused]] size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
    b.x = 1024;
    g.x = s / b.x + (s % b.x == 0 ? 0 : 1);  // round the grid up to cover every element
  }

  std::size_t count;
};

template <typename Container, typename T>
void fill(Container& c, size_t const size, T const val) {
Expand All @@ -283,7 +61,7 @@ void fill_warp_size(T* const t, std::size_t const size, T const val) {
std::fill(t, t + size, val);
}

// Prints backend info; there is no real device to query on CPU.
inline void prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Pseudo GPU in use"; }

} // namespace MKN_GPU_NS

Expand Down
2 changes: 2 additions & 0 deletions inc/mkn/gpu/cpu/api.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#ifndef _MKN_GPU_CPU_API_HPP_
#define _MKN_GPU_CPU_API_HPP_

#include "mkn/gpu/cpu/def.hpp"

#endif /*_MKN_GPU_CPU_API_HPP_*/
Loading
Loading