diff --git a/inc/mkn/gpu/any/inc/alloc.ipp b/inc/mkn/gpu/any/inc/alloc.ipp index 77ff745..9eec4b8 100644 --- a/inc/mkn/gpu/any/inc/alloc.ipp +++ b/inc/mkn/gpu/any/inc/alloc.ipp @@ -110,13 +110,16 @@ template void copy(T0* dst, T1* src, Size const size) { assert(dst and src); - Pointer src_p{src}; - Pointer dst_p{dst}; + auto const is_dev = [](auto const& ptr) { return ptr.is_device_ptr() || ptr.is_managed_ptr(); }; + auto const is_hst = [](auto const& ptr) { return ptr.is_host_ptr() || ptr.is_managed_ptr(); }; - auto to_send = [&]() { return dst_p.is_device_ptr() && src_p.is_host_ptr(); }; - auto to_take = [&]() { return dst_p.is_host_ptr() && src_p.is_device_ptr(); }; - auto on_host = [&]() { return dst_p.is_host_ptr() && src_p.is_host_ptr(); }; - auto on_device = [&]() { return dst_p.is_device_ptr() && src_p.is_device_ptr(); }; + Pointer const src_p{src}; + Pointer const dst_p{dst}; + + auto const to_send = [&]() { return is_dev(dst_p) && is_hst(src_p); }; + auto const to_take = [&]() { return is_hst(dst_p) && is_dev(src_p); }; + auto const on_host = [&]() { return is_hst(dst_p) && is_hst(src_p); }; + auto const on_device = [&]() { return is_dev(dst_p) && is_dev(src_p); }; if (on_host()) std::copy(src, src + size, dst); @@ -130,6 +133,12 @@ void copy(T0* dst, T1* src, Size const size) { throw std::runtime_error("Unsupported operation (PR welcome)"); } +void copy(auto& dst, auto const& src) { + if (dst.size() > src.size()) throw std::runtime_error("mkn::gpu::copy src too small!"); + + copy(dst.data(), src.data(), dst.size()); +} + template auto& reserve(std::vector>& v, std::size_t const& s, bool mem_copy = true) { diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 9613a1a..e6ddff2 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -35,238 +35,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "mkn/kul/log.hpp" #include "mkn/kul/span.hpp" #include "mkn/kul/tuple.hpp" -#include "mkn/kul/assert.hpp" -#include "mkn/gpu/def.hpp" +#include "mkn/gpu/cpu/def.hpp" +#include "mkn/gpu/cpu/api.hpp" +#include "mkn/gpu/cpu/cls.hpp" #include #include #include -#define MKN_GPU_ASSERT(x) (KASSERT((x))) - -#if defined(__device__) -#pragma message("__device__ already defined") -#error // check your compiler -#endif - -#if defined(__host__) -#pragma message("__host__ already defined") -#error // check your compiler -#endif - -#if defined(__global__) -#pragma message("__global__ already defined") -#error // check your compiler -#endif - -// we need to exclude these for CPU only operations -#define __shared__ -#define __device__ -#define __host__ -#define __global__ -#define __syncthreads(...) - -#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS -#define MKN_GPU_NS mkn::gpu::cpu -#else -#define MKN_GPU_NS mkn::gpu -#endif // MKN_GPU_FN_PER_NS - -#if !defined(MKN_CPU_DO_NOT_DEFINE_DIM3) -#define MKN_CPU_DO_NOT_DEFINE_DIM3 0 -#endif - -#if !defined(dim3) and !MKN_CPU_DO_NOT_DEFINE_DIM3 -struct dim3 { - dim3() {} - dim3(std::size_t x) : x{x} {} - dim3(std::size_t x, std::size_t y) : x{x}, y{y} {} - dim3(std::size_t x, std::size_t y, std::size_t z) : x{x}, y{y}, z{z} {} - - std::size_t x = 1, y = 1, z = 1; -}; - -dim3 static inline threadIdx, blockIdx; -static constexpr int warpSize = 1; - -#endif // MKN_CPU_DO_NOT_DEFINE_DIM3 - -// - namespace MKN_GPU_NS { -static constexpr int warp_size = warpSize; - -void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/ - -void inline setDevice(std::size_t const& /*dev*/) {} /*noop*/ - -struct Stream { - Stream() {} - ~Stream() {} - - auto& operator()() { return stream; }; - void sync() {} - - std::size_t stream = 0; -}; - -struct StreamEvent { - StreamEvent(Stream&) {} - ~StreamEvent() {} - - auto& operator()(std::function fn = {}) { - fn(); - return *this; - } - - bool finished() const { return fin; } - - 
Stream stream; - bool fin = 1; - std::function _fn; -}; - -template -struct Pointer { - Pointer(T* _t) : t{_t} {} - - bool is_unregistered_ptr() const { return t == nullptr; } - bool is_host_ptr() const { return true; } - bool is_device_ptr() const { return false; } - bool is_managed_ptr() const { return false; } - - T* t; -}; - -template -void alloc(void*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size; - MKN_GPU_ASSERT(p = std::malloc(size)); -} - -template -void alloc(T*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); -} - -template -void alloc_host(T*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); -} - -template -void alloc_managed(T*& p, Size size) { - KLOG(TRC) << "CPU alloced: " << size * sizeof(T); - MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); -} - -void inline destroy(void* p) { - KLOG(TRC); - std::free(p); -} - -template -void destroy(T*& p) { - KLOG(TRC); - std::free(p); -} - -template -void destroy_host(T*& p) { - KLOG(TRC); - std::free(p); -} - -template -void copy_on_device(T* dst, T const* src, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(dst, src, size * sizeof(T))); -} - -template -void send(void* p, void* t, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(p, t, size)); -} - -template -void send(T* p, T const* t, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(p, t, size * sizeof(T))); -} -template -void send_async(T* p, T const* t, auto& /*stream*/, Size size = 1) { - KLOG(TRC); - send(p, t, size); -} - -template -void take(T const* p, T* t, Size size = 1) { - KLOG(TRC); - MKN_GPU_ASSERT(std::memcpy(t, p, size * sizeof(T))); -} - -template -void take_async(T const* p, T* t, auto& /*stream*/, Size size = 1) { - KLOG(TRC); - take(p, t, size); -} - -void inline sync() {} - -#include 
"mkn/gpu/any/inc/alloc.ipp" -#include "mkn/gpu/any/inc/device.ipp" - -namespace detail { -static thread_local std::size_t idx = 0; -} - -template -void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) { - std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); - KLOG(TRC) << N; - std::apply( - [&](auto&&... params) { - for (std::size_t i = 0; i < N; ++i) { - f(params...); - detail::idx++; - } - }, - devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence())); - - detail::idx = 0; -} - -struct Launcher { - Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {} - Launcher(std::size_t w, std::size_t h, std::size_t tpx, std::size_t tpy) - : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {} - Launcher(std::size_t x, std::size_t y, std::size_t z, std::size_t tpx, std::size_t tpy, - std::size_t tpz) - : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {} - - template - void operator()(F&& f, Args&&... args) { - launch(f, g, b, ds, s, args...); - } - - std::size_t ds = 0 /*dynamicShared*/; - dim3 g /*gridDim*/, b /*blockDim*/; - std::size_t s = 0; -}; - -struct GLauncher : public Launcher { - GLauncher(std::size_t s, [[maybe_unused]] size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} { - b.x = 1024; - g.x = s / b.x; - if ((s % b.x) > 0) ++g.x; - } - - std::size_t count; -}; template void fill(Container& c, size_t const size, T const val) { @@ -283,7 +61,7 @@ void fill_warp_size(T* const t, std::size_t const size, T const val) { std::fill(t, t + size, val); } -void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; } +void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Pseudo GPU in use"; } } // namespace MKN_GPU_NS diff --git a/inc/mkn/gpu/cpu/api.hpp b/inc/mkn/gpu/cpu/api.hpp index 3323ed7..f2b6811 100644 --- a/inc/mkn/gpu/cpu/api.hpp +++ b/inc/mkn/gpu/cpu/api.hpp @@ -1,4 +1,6 @@ #ifndef _MKN_GPU_CPU_API_HPP_ #define _MKN_GPU_CPU_API_HPP_ +#include "mkn/gpu/cpu/def.hpp" + #endif 
/*_MKN_GPU_CPU_API_HPP_*/ diff --git a/inc/mkn/gpu/cpu/cls.hpp b/inc/mkn/gpu/cpu/cls.hpp index a52bdf6..0ef8066 100644 --- a/inc/mkn/gpu/cpu/cls.hpp +++ b/inc/mkn/gpu/cpu/cls.hpp @@ -1,4 +1,106 @@ #ifndef _MKN_GPU_CPU_CLS_HPP_ #define _MKN_GPU_CPU_CLS_HPP_ +#include "mkn/gpu/cpu/def.hpp" + +#include +#include +#include +#include +#include + +namespace MKN_GPU_NS { + +struct Stream { + Stream() {} + ~Stream() {} + + auto& operator()() { return stream; }; + void sync() {} + + std::size_t stream = 0; +}; + +struct StreamEvent { + StreamEvent(Stream&) {} + ~StreamEvent() {} + + auto& operator()(std::function fn = {}) { + fn(); + return *this; + } + + bool finished() const { return fin; } + + Stream stream; + bool fin = 1; + std::function _fn; +}; + +template +struct Pointer { + Pointer(T* _t) : t{_t} {} + + bool is_unregistered_ptr() const { return t == nullptr; } + bool is_host_ptr() const { return true; } + bool is_device_ptr() const { return false; } + bool is_managed_ptr() const { return false; } + + T* t; +}; + +#include "mkn/gpu/any/inc/alloc.ipp" +#include "mkn/gpu/any/inc/device.ipp" + +namespace detail { +static thread_local std::size_t idx = 0; +} + +template +void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... args) { + std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); + KLOG(TRC) << N; + std::apply( + [&](auto&&... 
params) { + for (std::size_t i = 0; i < N; ++i) { + f(params...); + ++blockIdx.x; + ++detail::idx; + } + }, + devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence())); + + detail::idx = 0; + blockIdx.x = 0; +} + +struct Launcher { + Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {} + Launcher(std::size_t w, std::size_t h, std::size_t tpx, std::size_t tpy) + : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {} + Launcher(std::size_t x, std::size_t y, std::size_t z, std::size_t tpx, std::size_t tpy, + std::size_t tpz) + : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {} + + template + void operator()(F&& f, Args&&... args) { + launch(f, g, b, ds, s, args...); + } + + std::size_t ds = 0 /*dynamicShared*/; + dim3 g /*gridDim*/, b /*blockDim*/; + std::size_t s = 0; +}; + +struct GLauncher : public Launcher { + GLauncher(std::size_t const s, std::size_t const /*dev*/ = 0) + : Launcher{dim3{}, dim3{}}, count{s} { + b.x = s; + } + + std::size_t count; +}; + +} // namespace MKN_GPU_NS + #endif /*_MKN_GPU_CPU_CLS_HPP_*/ diff --git a/inc/mkn/gpu/cpu/def.hpp b/inc/mkn/gpu/cpu/def.hpp index 7fdf630..33e8bfb 100644 --- a/inc/mkn/gpu/cpu/def.hpp +++ b/inc/mkn/gpu/cpu/def.hpp @@ -1,4 +1,169 @@ #ifndef _MKN_GPU_CPU_DEF_HPP_ #define _MKN_GPU_CPU_DEF_HPP_ +#include "mkn/kul/log.hpp" +#include "mkn/kul/assert.hpp" + +#include +#include +#include +#include +#include + +#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS +#define MKN_GPU_NS mkn::gpu::cpu +#else +#define MKN_GPU_NS mkn::gpu +#endif // MKN_GPU_FN_PER_NS + +#if defined(__device__) +#pragma message("__device__ already defined") +#error // check your compiler +#endif + +#if defined(__host__) +#pragma message("__host__ already defined") +#error // check your compiler +#endif + +#if defined(__global__) +#pragma message("__global__ already defined") +#error // check your compiler +#endif + +// we need to exclude these for CPU only operations +#define __shared__ +#define __device__ +#define 
__host__ +#define __global__ +#define __syncthreads(...) + +#if !defined(MKN_CPU_DO_NOT_DEFINE_DIM3) +#define MKN_CPU_DO_NOT_DEFINE_DIM3 0 +#endif + +#if !defined(dim3) and !MKN_CPU_DO_NOT_DEFINE_DIM3 +struct dim3 { + dim3() {} + dim3(std::size_t x) : x{x} {} + dim3(std::size_t x, std::size_t y) : x{x}, y{y} {} + dim3(std::size_t x, std::size_t y, std::size_t z) : x{x}, y{y}, z{z} {} + + std::size_t x = 1, y = 1, z = 1; +}; +dim3 static inline threadIdx, blockIdx; +#endif // MKN_CPU_DO_NOT_DEFINE_DIM3 + +namespace MKN_GPU_NS { + +#define MKN_GPU_ASSERT(x) (KASSERT((x))) + +struct DeviceProperties { + std::string major = "major"; + std::string minor = "minor"; + std::string name = "name"; + std::size_t multiProcessorCount = 1; + std::size_t maxThreadsPerMultiProcessor = 0; + std::size_t totalGlobalMem = 0; + std::size_t sharedMemPerBlock = 0; + std::size_t warpSize = 1; + std::size_t maxThreadsPerBlock = 0; +}; + +auto inline getDeviceProperties(std::size_t /*dev*/ = 0) { return DeviceProperties{}; } + +std::uint32_t inline getWarpSize(size_t /*dev */ = 0) { return 1; } + +static std::uint32_t inline const warp_size = getWarpSize(); + +auto inline getLimitMallocHeapSize() { + std::size_t bytes = 0; + + return bytes; +} + +void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} + +void inline setDevice(std::size_t const& /*dev*/) {} /*noop*/ + +template +void alloc(void*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size; + MKN_GPU_ASSERT(p = std::malloc(size)); +} + +template +void alloc(T*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); +} + +template +void alloc_host(T*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); +} + +template +void alloc_managed(T*& p, Size size) { + KLOG(TRC) << "CPU alloced: " << size * sizeof(T); + MKN_GPU_ASSERT(p = 
reinterpret_cast(std::malloc(size * sizeof(T)))); +} + +void inline destroy(void* p) { + KLOG(TRC); + std::free(p); +} + +template +void destroy(T* p) { + KLOG(TRC); + std::free(p); +} + +template +void destroy_host(T* p) { + KLOG(TRC); + std::free(p); +} + +template +void copy_on_device(T* dst, T const* src, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(dst, src, size * sizeof(T))); +} + +template +void send(void* p, void* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(p, t, size)); +} + +template +void send(T* p, T const* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(p, t, size * sizeof(T))); +} +template +void send_async(T* p, T const* t, auto& /*stream*/, Size size = 1) { + KLOG(TRC); + send(p, t, size); +} + +template +void take(T const* p, T* t, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(t, p, size * sizeof(T))); +} + +template +void take_async(T const* p, T* t, auto& /*stream*/, Size size = 1) { + KLOG(TRC); + take(p, t, size); +} + +void inline sync() {} + +} // namespace MKN_GPU_NS + #endif /*_MKN_GPU_CPU_DEF_HPP_*/ diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index aab6e00..1cefb6e 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -65,14 +65,15 @@ void fill(Container& c, T val) { void inline prinfo(size_t dev = 0) { cudaDeviceProp devProp; MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev)); - KOUT(NON) << " System version " << devProp.major << "." << devProp.minor; - KOUT(NON) << " agent name " << devProp.name; - KOUT(NON) << " cores " << devProp.multiProcessorCount; - KOUT(NON) << " threadsPCore " << devProp.maxThreadsPerMultiProcessor; - KOUT(NON) << " TotalMem " << (devProp.totalGlobalMem / 1000000) << " MB"; - KOUT(NON) << " BlockMem " << (devProp.sharedMemPerBlock / 1000) << " KB"; - KOUT(NON) << " warpSize " << devProp.warpSize; - KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; + KOUT(NON) << " System version " << devProp.major << "." 
<< devProp.minor; + KOUT(NON) << " agent name " << devProp.name; + KOUT(NON) << " cores " << devProp.multiProcessorCount; + KOUT(NON) << " threadsPCore " << devProp.maxThreadsPerMultiProcessor; + KOUT(NON) << " TotalMem " << (devProp.totalGlobalMem / 1000000) << " MB"; + KOUT(NON) << " BlockMem " << (devProp.sharedMemPerBlock / 1000) << " KB"; + KOUT(NON) << " device warpSize " << devProp.warpSize; + KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; + KOUT(NON) << " version: " << CUDA_VERSION; } void inline print_gpu_mem_used() { diff --git a/inc/mkn/gpu/cuda/cls.hpp b/inc/mkn/gpu/cuda/cls.hpp index 133573e..3946411 100644 --- a/inc/mkn/gpu/cuda/cls.hpp +++ b/inc/mkn/gpu/cuda/cls.hpp @@ -64,14 +64,16 @@ struct StreamEvent { template struct Pointer { - Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); } - - bool is_unregistered_ptr() const { return attributes.type == 0; } + Pointer(T* _t) : t{_t} { + if (!t) throw std::runtime_error("invalid nullptr"); + MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); + } bool is_host_ptr() const { - return attributes.type == 1 || (is_unregistered_ptr() && t != nullptr); + return type() == cudaMemoryTypeUnregistered or type() == cudaMemoryTypeHost; } - bool is_device_ptr() const { return is_managed_ptr() || attributes.type == 2; } - bool is_managed_ptr() const { return attributes.type == 3; } + bool is_device_ptr() const { return type() == cudaMemoryTypeDevice; } + bool is_managed_ptr() const { return type() == cudaMemoryTypeManaged; } + auto type() const { return attributes.type; } T* t; cudaPointerAttributes attributes; @@ -83,7 +85,7 @@ struct Pointer { template void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) { std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); - KLOG(TRC) << N; + KLOG(TRC) << "N=" << N << " ds=" << ds; std::apply( [&](auto&&... 
params) { f<<>>(params...); diff --git a/inc/mkn/gpu/cuda/def.hpp b/inc/mkn/gpu/cuda/def.hpp index 0d08974..451e44f 100644 --- a/inc/mkn/gpu/cuda/def.hpp +++ b/inc/mkn/gpu/cuda/def.hpp @@ -1,14 +1,14 @@ #ifndef _MKN_GPU_CUDA_DEF_HPP_ #define _MKN_GPU_CUDA_DEF_HPP_ +#include "mkn/kul/log.hpp" +#include "mkn/gpu/def.hpp" + #include #include #include -#include "mkn/kul/log.hpp" -#include "mkn/gpu/def.hpp" - #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS #define MKN_GPU_NS mkn::gpu::cuda #else @@ -32,18 +32,30 @@ inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = std::string getErrorString(auto const code) { return cudaGetErrorString(code); } +auto inline getDeviceProperties(size_t dev = 0) { + cudaDeviceProp devProp; + MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev)); + return devProp; +} + std::uint32_t inline getWarpSize(size_t dev = 0) { #ifdef _MKN_GPU_WARP_SIZE_ return _MKN_GPU_WARP_SIZE_; + #else - cudaDeviceProp devProp; - [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev); - return devProp.warpSize; + return getDeviceProperties(dev).warpSize; + #endif /*_MKN_GPU_WARP_SIZE_ */ } static std::uint32_t inline const warp_size = getWarpSize(); +auto inline getLimitMallocHeapSize() { + std::size_t bytes = 0; + MKN_GPU_ASSERT(cudaDeviceGetLimit(&bytes, cudaLimitMallocHeapSize)); + return bytes; +} + void inline setLimitMallocHeapSize(std::size_t const& bytes) { MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); } diff --git a/inc/mkn/gpu/def.hpp b/inc/mkn/gpu/def.hpp index 83d5ec4..4730057 100644 --- a/inc/mkn/gpu/def.hpp +++ b/inc/mkn/gpu/def.hpp @@ -3,6 +3,29 @@ #include +#if MKN_GPU_ROCM and MKN_GPU_CUDA +#error // not possible +#endif + +#if MKN_GPU_ROCM +#define MKN_GPU_CUDA 0 +#endif + +#if MKN_GPU_CUDA +#define MKN_GPU_ROCM 0 +#endif + +#if MKN_GPU_CPU +#define MKN_GPU_ROCM 0 +#define MKN_GPU_CUDA 0 +#endif + +#if !defined(MKN_GPU_ROCM) and !defined(MKN_GPU_CUDA) +#if __has_include() and 
__has_include("hip/hip_runtime.h") +#error // ACTIVATE ONE! +#endif // __has_include(...) +#endif // !defined(...) + #if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h") #define MKN_GPU_ROCM 1 #endif diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp index 5046c14..6fc5fba 100644 --- a/inc/mkn/gpu/rocm.hpp +++ b/inc/mkn/gpu/rocm.hpp @@ -79,6 +79,8 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " SEE mkn.gpu README for -D_MKN_GPU_WARP_SIZE_=###"; } #endif + KOUT(NON) << " version: " << HIP_VERSION_MAJOR << "." << HIP_VERSION_MINOR << "." + << HIP_VERSION_PATCH; } void inline print_gpu_mem_used() { diff --git a/inc/mkn/gpu/rocm/cls.hpp b/inc/mkn/gpu/rocm/cls.hpp index 6a02f35..183310a 100644 --- a/inc/mkn/gpu/rocm/cls.hpp +++ b/inc/mkn/gpu/rocm/cls.hpp @@ -56,33 +56,24 @@ struct StreamEvent { bool fin = 0; }; -// - // https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3 template struct Pointer { Pointer(T* _t) : t{_t} { - assert(t); + if (!t) throw std::runtime_error("invalid nullptr"); MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t)); - type = attributes.type; - } - - bool is_unregistered_ptr() const { - return attributes.type == hipMemoryType::hipMemoryTypeUnregistered; - } - bool is_host_ptr() const { - return is_unregistered_ptr() || type == hipMemoryType::hipMemoryTypeHost; } + bool is_host_ptr() const { return type() == hipMemoryTypeHost; } bool is_device_ptr() const { - return type == hipMemoryType::hipMemoryTypeDevice || attributes.isManaged; + return type() == hipMemoryTypeDevice || type() == hipMemoryTypeArray; } bool is_managed_ptr() const { - return attributes.isManaged || type == hipMemoryType::hipMemoryTypeUnified; + return type() == hipMemoryTypeManaged || type() == hipMemoryTypeUnified; } + auto type() const { return attributes.type; } T* t; hipPointerAttribute_t attributes; - hipMemoryType type = hipMemoryType::hipMemoryTypeUnregistered; }; #include 
"mkn/gpu/any/inc/alloc.ipp" diff --git a/inc/mkn/gpu/rocm/def.hpp b/inc/mkn/gpu/rocm/def.hpp index ee8f6a3..8b9da35 100644 --- a/inc/mkn/gpu/rocm/def.hpp +++ b/inc/mkn/gpu/rocm/def.hpp @@ -4,9 +4,10 @@ #include "mkn/kul/log.hpp" #include "mkn/gpu/def.hpp" -#include "hip/hip_runtime.h" - #include +#include + +#include "hip/hip_runtime.h" #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS #define MKN_GPU_NS mkn::gpu::hip @@ -31,18 +32,32 @@ inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = std::string getErrorString(auto const code) { return hipGetErrorString(code); } +auto inline getDeviceProperties(size_t dev = 0) { + hipDeviceProp_t devProp; + MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); + return devProp; +} + std::uint32_t inline getWarpSize(size_t dev = 0) { #ifdef _MKN_GPU_WARP_SIZE_ return _MKN_GPU_WARP_SIZE_; + #else hipDeviceProp_t devProp; - [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev); + MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); return devProp.warpSize; + #endif /*_MKN_GPU_WARP_SIZE_ */ } static std::uint32_t inline const warp_size = getWarpSize(); +auto inline getLimitMallocHeapSize() { + std::size_t bytes = 0; + MKN_GPU_ASSERT(hipDeviceGetLimit(&bytes, hipLimitMallocHeapSize)); + return bytes; +} + void inline setLimitMallocHeapSize(std::size_t const& bytes) { MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes)); } diff --git a/mkn.yaml b/mkn.yaml index b6057b4..fc54392 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -11,21 +11,28 @@ profile: - name: rocm parent: headers arg: -DMKN_GPU_ROCM=1 - test: test/any/(\w).cpp - test/hip/(\w).cpp + # main: tst/any/managed.cpp + test: tst/any/(\w).cpp + tst/hip/(\w).cpp - name: cuda parent: headers arg: -DMKN_GPU_CUDA=1 - test: test/any/(\w).cpp - test/cuda/(\w).cpp + # main: tst/any/managed.cpp + test: tst/any/(\w).cpp + tst/cuda/(\w).cpp # if you have no GPU but want to test your code - name: cpu parent: headers + # main: 
tst/any/class1.cpp arg: -DMKN_GPU_CPU=1 - test: test/any/(\w).cpp - test/cpu/(\w).cpp + test: tst/any/(\w).cpp + tst/cpu/(\w).cpp + +- name: mem_test + parent: headers + main: tst/mem/test_alloc.cpp - name: format mod: | diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml index ab6d517..1426b2d 100644 --- a/res/mkn/clang_cuda.yaml +++ b/res/mkn/clang_cuda.yaml @@ -1,6 +1,5 @@ -## Recommended settings commented out. # -# clang 17 recommended +# clang 17 or newer recommended # ## The following compile flags will likely be required with their specific values for your current hardware # --cuda-gpu-arch="sm_61" # sm_61 = nvidia 1080 ( @@ -14,8 +13,9 @@ super: settings property: + sm.arch: 120 cxx_flags: --std=c++20 -fPIC -fsized-deallocation -Wno-unknown-cuda-version - cxx_cuda: -x cuda --cuda-gpu-arch="sm_120" -Xclang -fcuda-allow-variadic-functions + cxx_cuda: -x cuda --cuda-gpu-arch="sm_${sm.arch}" -Xclang -fcuda-allow-variadic-functions env: | PATH=/usr/local/cuda/bin:${PATH} diff --git a/res/mkn/hipcc.yaml b/res/mkn/hipcc.yaml index ccb7686..3d9bf7d 100644 --- a/res/mkn/hipcc.yaml +++ b/res/mkn/hipcc.yaml @@ -1,4 +1,3 @@ -## Recommended settings commented out. 
super: settings @@ -6,6 +5,7 @@ compiler: mask: g++: hipcc +path: /opt/rocm/lib env: PATH=/opt/rocm/bin:${PATH} diff --git a/test/any/__share__.hpp b/tst/any/__share__.hpp similarity index 100% rename from test/any/__share__.hpp rename to tst/any/__share__.hpp diff --git a/test/any/add.cpp b/tst/any/add.cpp similarity index 87% rename from test/any/add.cpp rename to tst/any/add.cpp index 9a21b08..38c8fb6 100644 --- a/test/any/add.cpp +++ b/tst/any/add.cpp @@ -1,6 +1,8 @@ #include "mkn/gpu.hpp" +#include "mkn/kul/assert.hpp" + static constexpr uint32_t WIDTH = 1024, HEIGHT = 1024; static constexpr uint32_t NUM = WIDTH * HEIGHT; static constexpr uint32_t TPB_X = 16, TPB_Y = 16; @@ -31,19 +33,17 @@ template uint32_t test_add1() { std::vector b(NUM); - assert(mkn::gpu::Pointer{b.data()}.is_host_ptr()); + mkn::kul::abort_if_not(mkn::gpu::Pointer{b.data()}.is_host_ptr() && "not host pointer"); for (uint32_t i = 0; i < NUM; i++) b[i] = i; mkn::gpu::DeviceMem devA(NUM), devB(b); - if constexpr (!mkn::gpu::CompileFlags::withCPU) { - assert(mkn::gpu::Pointer{devA.p}.is_device_ptr()); - } + if constexpr (!mkn::gpu::CompileFlags::withCPU) + mkn::kul::abort_if_not(mkn::gpu::Pointer{devA.p}.is_device_ptr() && "not device pointer"); mkn::gpu::Launcher{WIDTH, HEIGHT, TPB_X, TPB_Y}(vectoradd1, devA, devB); auto a = devA(); - // assert(mkn::gpu::Pointer{a.data()}.is_device_ptr()); for (uint32_t i = 0; i < NUM; i++) if (a[i] != b[i] + 1) return 1; return 0; diff --git a/test/any/array.cpp b/tst/any/array.cpp similarity index 100% rename from test/any/array.cpp rename to tst/any/array.cpp diff --git a/test/any/asio.cpp b/tst/any/asio.cpp similarity index 100% rename from test/any/asio.cpp rename to tst/any/asio.cpp diff --git a/test/any/async_streaming.cpp b/tst/any/async_streaming.cpp similarity index 100% rename from test/any/async_streaming.cpp rename to tst/any/async_streaming.cpp diff --git a/test/any/class1.cpp b/tst/any/class1.cpp similarity index 100% rename from 
test/any/class1.cpp rename to tst/any/class1.cpp diff --git a/test/any/class2.cpp b/tst/any/class2.cpp similarity index 100% rename from test/any/class2.cpp rename to tst/any/class2.cpp diff --git a/test/any/construct.cpp b/tst/any/construct.cpp similarity index 100% rename from test/any/construct.cpp rename to tst/any/construct.cpp diff --git a/test/any/info.cpp b/tst/any/info.cpp similarity index 100% rename from test/any/info.cpp rename to tst/any/info.cpp diff --git a/test/any/managed.cpp b/tst/any/managed.cpp similarity index 75% rename from test/any/managed.cpp rename to tst/any/managed.cpp index ffe6a6d..6dd4828 100644 --- a/test/any/managed.cpp +++ b/tst/any/managed.cpp @@ -1,6 +1,8 @@ #include "mkn/gpu.hpp" +#include "mkn/kul/assert.hpp" + static constexpr uint32_t WIDTH = 1024, HEIGHT = 1024; static constexpr uint32_t NUM = WIDTH * HEIGHT; static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16; @@ -21,9 +23,8 @@ __global__ void kernel(S* structs) { template std::uint32_t _test(L&& launcher) { ManagedVector mem{NUM}; - if constexpr (!mkn::gpu::CompileFlags::withCPU) { - assert(mkn::gpu::Pointer{mem.data()}.is_managed_ptr()); - } + if constexpr (!mkn::gpu::CompileFlags::withCPU) + mkn::kul::abort_if_not(mkn::gpu::Pointer{mem.data()}.is_managed_ptr() && "not managed pointer"); for (std::uint32_t i = 0; i < NUM; ++i) mem[i].d0 = i; @@ -102,10 +103,25 @@ std::uint32_t test_zero() { return 0; } +uint32_t test_copy() { + std::vector hst0(NUM, 1), hst1(NUM, 2); + ManagedVector dev0(NUM), dev1(NUM); + + // container overload: copy(dst, src) + mkn::gpu::copy(dev0, hst0); + if (dev0.back() != 1) return 1; + mkn::gpu::copy(hst1, hst0); + if (hst1.back() != 1) return 1; + mkn::gpu::copy(dev1, hst1); + + return dev1[NUM - 1] != 1; +} + int main() { KOUT(NON) << __FILE__; - return test() + test_zero() + // - test_guess() + // - test_lambda_copy_capture_views() + // - test_lambda_ref_copy_capture_views(); + return test() + test_zero() + // + 
test_guess() + // + test_lambda_copy_capture_views() + // + test_lambda_ref_copy_capture_views() + // + test_copy(); } diff --git a/test/any/struct.cpp b/tst/any/struct.cpp similarity index 100% rename from test/any/struct.cpp rename to tst/any/struct.cpp diff --git a/test/cpu/namespace.cpp b/tst/cpu/namespace.cpp similarity index 100% rename from test/cpu/namespace.cpp rename to tst/cpu/namespace.cpp diff --git a/test/cuda/add.cpp b/tst/cuda/add.cpp similarity index 100% rename from test/cuda/add.cpp rename to tst/cuda/add.cpp diff --git a/test/cuda/async.cpp b/tst/cuda/async.cpp similarity index 100% rename from test/cuda/async.cpp rename to tst/cuda/async.cpp diff --git a/test/cuda/atomic.cpp b/tst/cuda/atomic.cpp similarity index 100% rename from test/cuda/atomic.cpp rename to tst/cuda/atomic.cpp diff --git a/test/hip/add.cpp b/tst/hip/add.cpp similarity index 100% rename from test/hip/add.cpp rename to tst/hip/add.cpp diff --git a/test/hip/async.cpp b/tst/hip/async.cpp similarity index 100% rename from test/hip/async.cpp rename to tst/hip/async.cpp diff --git a/test/hip/async.ppc b/tst/hip/async.ppc similarity index 100% rename from test/hip/async.ppc rename to tst/hip/async.ppc diff --git a/tst/mem/test_alloc.cpp b/tst/mem/test_alloc.cpp new file mode 100644 index 0000000..78bbdad --- /dev/null +++ b/tst/mem/test_alloc.cpp @@ -0,0 +1,58 @@ + +#include "mkn/kul/log.hpp" + +#include "mkn/gpu.hpp" +#include "mkn/gpu/def.hpp" + +#include + +template +using ManagedVector = std::vector>; + +bool constexpr set_heap_limit = true; // see if you need it by setting false after reboot + +bool test_allocate_percentage(double const percent) { + auto const devProp = mkn::gpu::getDeviceProperties(); + + auto const mem = devProp.totalGlobalMem; + KLOG(DBG) << mem; + + auto const limit = mem * percent; + if (set_heap_limit) mkn::gpu::setLimitMallocHeapSize(limit); + + auto const heapLim = mkn::gpu::getLimitMallocHeapSize(); + KLOG(DBG) << heapLim; + if (set_heap_limit and 
limit > heapLim) { + KOUT(NON) << "Cannot set heap limit! " << limit << " " << heapLim; + return 1; + } + + if (limit < sizeof(double)) throw std::runtime_error("limit < sizeof(double)"); + + auto const size = limit / sizeof(double); + + if (size == 0) throw std::runtime_error("size == 0"); + + { + ManagedVector vec(size, 2); + if (vec.data()[vec.size() - 1] != 2) return 1; + } + + KOUT(NON) << "Can allocate " << std::size_t(percent * 100) + << "% of total mem: " << std::size_t(mem / 1e6) << "mb"; + + return 0; +} + +int main() { + KOUT(NON) << __FILE__; + mkn::gpu::prinfo(); + if constexpr (mkn::gpu::CompileFlags::withCPU) + return 0; // NOT RELEVANT + else + return test_allocate_percentage(.1) + // + test_allocate_percentage(.2) + // + test_allocate_percentage(.3) + // + test_allocate_percentage(.4) + // + test_allocate_percentage(.5); +}