diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index eada364..f23582d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -18,6 +18,6 @@ jobs:
 
     - name: test
       run: |
-        curl -Lo mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix
+        curl -fL --retry 3 --retry-delay 2 -o mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix
         chmod +x mkn
         KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++20" test -W 9
diff --git a/.gitignore b/.gitignore
index a699e6d..f96c360 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 bin
 .mkn
 *.cui
-.clangd
+*.clangd
diff --git a/inc/mkn/gpu.hpp b/inc/mkn/gpu.hpp
index 7a7f978..7389d11 100644
--- a/inc/mkn/gpu.hpp
+++ b/inc/mkn/gpu.hpp
@@ -31,7 +31,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_HPP_
 #define _MKN_GPU_HPP_
 
-#include "mkn/gpu/defines.hpp"
+#include "mkn/gpu/def.hpp"
+
+#if MKN_GPU_ROCM
+#include "mkn/gpu/rocm.hpp"
+#endif
+
+#if MKN_GPU_CUDA
+#include "mkn/gpu/cuda.hpp"
+#endif
+
+#if MKN_GPU_CPU
+#include "mkn/gpu/cpu.hpp"
+#endif
 
 namespace mkn::gpu {
 
diff --git a/inc/mkn/gpu/any/cls.hpp b/inc/mkn/gpu/any/cls.hpp
new file mode 100644
index 0000000..82abf4f
--- /dev/null
+++ b/inc/mkn/gpu/any/cls.hpp
@@ -0,0 +1,24 @@
+#ifndef _MKN_GPU_ANY_CLS_HPP_
+#define _MKN_GPU_ANY_CLS_HPP_
+
+#include "mkn/kul/env.hpp"
+#include "mkn/kul/string.hpp"
+
+namespace mkn::gpu {
+
+template <typename Device>
+struct Cli {
+  constexpr static inline char const* MKN_GPU_BX_THREADS = "MKN_GPU_BX_THREADS";
+
+  auto bx_threads() const {
+    if (kul::env::EXISTS(MKN_GPU_BX_THREADS))
+      return kul::String::INT32(kul::env::GET(MKN_GPU_BX_THREADS));
+    return dev.maxThreadsPerBlock;
+  }
+
+  Device const& dev;
+};
+
+} /* namespace mkn::gpu */
+
+#endif /*_MKN_GPU_ANY_CLS_HPP_*/
diff --git a/inc/mkn/gpu/any/def.hpp b/inc/mkn/gpu/any/def.hpp
new file mode 100644
index 0000000..da9f065
--- /dev/null
+++ b/inc/mkn/gpu/any/def.hpp
@@ -0,0 +1,4 @@
+#ifndef _MKN_GPU_ANY_DEF_HPP_
+#define _MKN_GPU_ANY_DEF_HPP_
+
+#endif /*_MKN_GPU_ANY_DEF_HPP_*/
diff --git a/inc/mkn/gpu/alloc.hpp b/inc/mkn/gpu/any/inc/alloc.ipp
similarity index 100%
rename from inc/mkn/gpu/alloc.hpp
rename to inc/mkn/gpu/any/inc/alloc.ipp
diff --git a/inc/mkn/gpu/any/inc/devfunc.ipp b/inc/mkn/gpu/any/inc/devfunc.ipp
new file mode 100644
index 0000000..1ca92a0
--- /dev/null
+++ b/inc/mkn/gpu/any/inc/devfunc.ipp
@@ -0,0 +1,9 @@
+
+template <bool sync = true, typename T>
+void __device__ fill_warp_size(T* const t, std::size_t const size, T const val) {
+  std::size_t chunk = 0;
+  auto const each = size / warpSize;
+  for (; chunk < each; ++chunk) t[chunk * warpSize + threadIdx.x] = val;
+  if (threadIdx.x < size - (warpSize * each)) t[chunk * warpSize + threadIdx.x] = val;
+  if constexpr (sync) __syncthreads();
+}
diff --git a/inc/mkn/gpu/device.hpp b/inc/mkn/gpu/any/inc/device.ipp
similarity index 98%
rename from inc/mkn/gpu/device.hpp
rename to inc/mkn/gpu/any/inc/device.ipp
index 04cc9e8..c1f12f3 100644
--- a/inc/mkn/gpu/device.hpp
+++ b/inc/mkn/gpu/any/inc/device.ipp
@@ -59,12 +59,12 @@ struct DeviceMem {
   }
 
   void send(T const* t, std::size_t _size = 1, std::size_t start = 0) {
-    MKN_GPU_NS::send(p, t, _size, start);
+    MKN_GPU_NS::send(p + start, t, _size);
   }
 
   template <typename C, std::enable_if_t<mkn::kul::is_span_like_v<C>, bool> = 0>
   void send(C const& c, std::size_t start = 0) {
-    send(c.data(), c.size(), start);
+    send(c.data() + start, c.size());
   }
 
   void fill_n(T t, std::size_t _size, std::size_t start = 0) {
@@ -128,14 +128,14 @@ struct AsioDeviceMem {
 
   void send(Stream& stream, T* t, std::size_t _size = 1, std::size_t start = 0) {
     assert(p != nullptr);
-    MKN_GPU_NS::send_async(p, t, stream, _size, start);
+    MKN_GPU_NS::send_async(p + start, t + start, stream(), _size);
   }
 
   template <typename Span>
   void take(Stream& stream, Span& span, std::size_t start) {
     assert(p != nullptr);
     assert(span.size() + start <= s);
-    MKN_GPU_NS::take_async(p, span, stream, start);
+    MKN_GPU_NS::take_async(p + start, span.data(), stream(), span.size());
   }
 
   auto& size() const { return s; }
diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/any/inc/launchers.ipp
similarity index 100%
rename from inc/mkn/gpu/launchers.hpp
rename to inc/mkn/gpu/any/inc/launchers.ipp
diff --git a/inc/mkn/gpu/any/inc/traits.ipp b/inc/mkn/gpu/any/inc/traits.ipp
new file mode 100644
index 0000000..e69de29
diff --git a/inc/mkn/gpu/cli.hpp b/inc/mkn/gpu/cli.hpp
deleted file mode 100644
index 6a40948..0000000
--- a/inc/mkn/gpu/cli.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/**
-Copyright (c) 2024, Philip Deegan.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-    * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-    * Neither the name of Philip Deegan nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-// IWYU pragma: private, include "mkn/gpu.hpp"
-#ifndef _MKN_GPU_CLI_HPP_
-#define _MKN_GPU_CLI_HPP_
-
-#include "mkn/kul/env.hpp"
-#include "mkn/kul/string.hpp"
-
-namespace mkn::gpu {
-
-template <typename Device>
-struct Cli {
-  constexpr static inline char const* MKN_GPU_BX_THREADS = "MKN_GPU_BX_THREADS";
-
-  auto bx_threads() const {
-    if (kul::env::EXISTS(MKN_GPU_BX_THREADS))
-      return kul::String::INT32(kul::env::GET(MKN_GPU_BX_THREADS));
-    return dev.maxThreadsPerBlock;
-  }
-
-  Device const& dev;
-};
-
-} /* namespace mkn::gpu */
-
-#endif /*_MKN_GPU_CLI_HPP_*/
diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp
index e77b740..9613a1a 100644
--- a/inc/mkn/gpu/cpu.hpp
+++ b/inc/mkn/gpu/cpu.hpp
@@ -193,34 +193,32 @@ void send(void* p, void* t, Size size = 1) {
 }
 
 template <typename T, typename Size>
-void send(T* p, T const* t, Size size = 1, Size start = 0) {
+void send(T* p, T const* t, Size size = 1) {
   KLOG(TRC);
-  MKN_GPU_ASSERT(std::memcpy(p + start, t, size * sizeof(T)));
+  MKN_GPU_ASSERT(std::memcpy(p, t, size * sizeof(T)));
 }
-
 template <typename T, typename Size>
-void take(T* p, T* t, Size size = 1, Size start = 0) {
+void send_async(T* p, T const* t, auto& /*stream*/, Size size = 1) {
   KLOG(TRC);
-  MKN_GPU_ASSERT(std::memcpy(t, p + start, size * sizeof(T)));
+  send(p, t, size);
 }
 
 template <typename T, typename Size>
-void send_async(T* p, T const* t, Stream& /*stream*/, Size size = 1, Size start = 0) {
+void take(T const* p, T* t, Size size = 1) {
   KLOG(TRC);
-  send(p, t, size, start);
+  MKN_GPU_ASSERT(std::memcpy(t, p, size * sizeof(T)));
 }
 
-template <typename T, typename Span>
-void take_async(T* p, Span& span, Stream& /*stream*/, std::size_t start) {
-  static_assert(mkn::kul::is_span_like_v<Span>);
+template <typename T, typename Size>
+void take_async(T const* p, T* t, auto& /*stream*/, Size size = 1) {
   KLOG(TRC);
-  take(p, span.data(), span.size(), start);
+  take(p, t, size);
 }
 
 void inline sync() {}
 
-#include "mkn/gpu/alloc.hpp"
-#include "mkn/gpu/device.hpp"
+#include "mkn/gpu/any/inc/alloc.ipp"
+#include "mkn/gpu/any/inc/device.ipp"
 
 namespace detail {
 static thread_local std::size_t idx = 0;
@@ -281,8 +279,8 @@ void fill(Container& c, T const val) {
 }
 
 template <typename T>
-void zero(T* const t, std::size_t const size) {
-  std::fill(t, t + size, 0);
+void fill_warp_size(T* const t, std::size_t const size, T const val) {
+  std::fill(t, t + size, val);
 }
 
 void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }
@@ -310,7 +308,7 @@ static void global_d_kernel(F& f, Args... args) {
   f(args...);
 }
 
-#include "launchers.hpp"
+#include "mkn/gpu/any/inc/launchers.ipp"
 
 } /* namespace MKN_GPU_NS */
 
diff --git a/inc/mkn/gpu/cpu/api.hpp b/inc/mkn/gpu/cpu/api.hpp
new file mode 100644
index 0000000..3323ed7
--- /dev/null
+++ b/inc/mkn/gpu/cpu/api.hpp
@@ -0,0 +1,4 @@
+#ifndef _MKN_GPU_CPU_API_HPP_
+#define _MKN_GPU_CPU_API_HPP_
+
+#endif /*_MKN_GPU_CPU_API_HPP_*/
diff --git a/inc/mkn/gpu/cpu/cls.hpp b/inc/mkn/gpu/cpu/cls.hpp
new file mode 100644
index 0000000..a52bdf6
--- /dev/null
+++ b/inc/mkn/gpu/cpu/cls.hpp
@@ -0,0 +1,4 @@
+#ifndef _MKN_GPU_CPU_CLS_HPP_
+#define _MKN_GPU_CPU_CLS_HPP_
+
+#endif /*_MKN_GPU_CPU_CLS_HPP_*/
diff --git a/inc/mkn/gpu/cpu/def.hpp b/inc/mkn/gpu/cpu/def.hpp
new file mode 100644
index 0000000..7fdf630
--- /dev/null
+++ b/inc/mkn/gpu/cpu/def.hpp
@@ -0,0 +1,4 @@
+#ifndef _MKN_GPU_CPU_DEF_HPP_
+#define _MKN_GPU_CPU_DEF_HPP_
+
+#endif /*_MKN_GPU_CPU_DEF_HPP_*/
diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp
index aa02875..aab6e00 100644
--- a/inc/mkn/gpu/cuda.hpp
+++ b/inc/mkn/gpu/cuda.hpp
@@ -33,284 +33,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define _MKN_GPU_CUDA_HPP_
 
 #include "mkn/kul/log.hpp"
-#include "mkn/kul/span.hpp"
-#include "mkn/kul/tuple.hpp"
-
-#include "mkn/gpu/def.hpp"
 
 #include <cuda_runtime.h>
 
-//
-
-#define MKN_GPU_ASSERT(ans)               \
-  {                                       \
-    gpuAssert((ans), __FILE__, __LINE__); \
-  }
-inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = true) {
-  if (code != cudaSuccess) {
-    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
-    if (abort) std::abort();
-  }
-}
-
-namespace mkn::gpu::cuda {
-
-template <typename SIZE = std::uint32_t /*max 4294967296*/>
-__device__ SIZE idx() {
-  SIZE width = gridDim.x * blockDim.x;
-  SIZE height = gridDim.y * blockDim.y;
-  SIZE x = blockDim.x * blockIdx.x + threadIdx.x;
-  SIZE y = blockDim.y * blockIdx.y + threadIdx.y;
-  SIZE z = blockDim.z * blockIdx.z + threadIdx.z;
-  return x + (y * width) + (z * width * height);
-}
-
-template <typename SIZE = std::uint32_t /*max 4294967296*/>
-__device__ SIZE block_idx_x() {
-  return blockIdx.x;
-}
-
-}  // namespace mkn::gpu::cuda
-
-#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
-#define MKN_GPU_NS mkn::gpu::cuda
-#else
-#define MKN_GPU_NS mkn::gpu
-#endif  // MKN_GPU_FN_PER_NS
-
-namespace MKN_GPU_NS {
-
-#ifdef _MKN_GPU_WARP_SIZE_
-static constexpr int warp_size = _MKN_GPU_WARP_SIZE_;
-#else
-static constexpr int warp_size = warpSize;
-#endif /*_MKN_GPU_WARP_SIZE_    */
-
-void inline setLimitMallocHeapSize(std::size_t const& bytes) {
-  MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes));
-}
-
-void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); }
-
-struct Stream {
-  Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); }
-  ~Stream() { MKN_GPU_ASSERT(result = cudaStreamDestroy(stream)); }
-
-  auto& operator()() { return stream; };
-
-  void sync() { result = cudaStreamSynchronize(stream); }
-
-  cudaError_t result;
-  cudaStream_t stream;
-};
-
-//
-
-struct StreamEvent {
-  //
-  StreamEvent(Stream& stream_) : stream{stream_} {}
-  StreamEvent(StreamEvent&& that) = default;
-  StreamEvent(StreamEvent const&) = delete;
-  StreamEvent& operator=(StreamEvent const&) = delete;
-
-  auto& operator()(std::function<void()> fn = {}) {
-    fin = 0;
-    _fn = fn;
-    MKN_GPU_ASSERT(cudaStreamAddCallback(stream(), StreamEvent::Callback, this, 0));
-    return *this;
-  }
-
-  static void Callback(cudaStream_t /*stream*/, cudaError_t /*status*/, void* ptr) {
-    auto& self = *reinterpret_cast<StreamEvent*>(ptr);
-    self._fn();
-    self._fn = [] {};
-    self.fin = 1;
-  }
-
-  bool finished() const { return fin; }
-
-  Stream& stream;
-  cudaError_t result;
-  std::function<void()> _fn;
-  bool fin = 0;
-};
+#include "mkn/gpu/cuda/def.hpp"
+#include "mkn/gpu/cuda/api.hpp"
+#include "mkn/gpu/cuda/cls.hpp"
 
 //
 
-template <typename T>
-struct Pointer {
-  Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); }
-
-  bool is_unregistered_ptr() const { return attributes.type == 0; }
-  bool is_host_ptr() const {
-    return attributes.type == 1 || (is_unregistered_ptr() && t != nullptr);
-  }
-  bool is_device_ptr() const { return is_managed_ptr() || attributes.type == 2; }
-  bool is_managed_ptr() const { return attributes.type == 3; }
-
-  T* t;
-  cudaPointerAttributes attributes;
-};
-
-template <typename Size>
-void alloc(void*& p, Size size) {
-  MKN_GPU_ASSERT(cudaMalloc((void**)&p, size));
-}
-
-template <typename T, typename Size>
-void alloc(T*& p, Size size) {
-  KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
-  MKN_GPU_ASSERT(cudaMalloc((void**)&p, size * sizeof(T)));
-}
-
-template <typename T, typename Size>
-void alloc_host(T*& p, Size size) {
-  KLOG(TRC) << "CPU alloced: " << size * sizeof(T);
-  MKN_GPU_ASSERT(cudaMallocHost((void**)&p, size * sizeof(T)));
-}
-
-template <typename T, typename Size>
-void alloc_managed(T*& p, Size size) {
-  KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
-  MKN_GPU_ASSERT(cudaMallocManaged((void**)&p, size * sizeof(T)));
-}
-
-void inline destroy(void* p) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(cudaFree(p));
-}
-
-template <typename T>
-void destroy(T*& ptr) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(cudaFree(ptr));
-}
-
-template <typename T>
-void destroy_host(T*& ptr) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(cudaFreeHost(ptr));
-}
-
-template <typename T, typename Size>
-void copy_on_device(T* dst, T const* src, Size size = 1) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice));
-}
-
-template <typename Size>
-void send(void* p, void* t, Size size = 1) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(cudaMemcpy(p, t, size, cudaMemcpyHostToDevice));
-}
-
-template <typename T, typename Size>
-void send(T* p, T const* t, Size size = 1, Size start = 0) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(cudaMemcpy(p + start, t, size * sizeof(T), cudaMemcpyHostToDevice));
-}
-
-template <typename T, typename Size>
-void take(T const* p, T* t, Size size = 1, Size start = 0) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(cudaMemcpy(t, p + start, size * sizeof(T), cudaMemcpyDeviceToHost));
-}
-
-template <typename T, typename Size>
-void send_async(T* p, T const* t, Stream& stream, Size size = 1, Size start = 0) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(cudaMemcpyAsync(p + start,               //
-                                 t + start,               //
-                                 size * sizeof(T),        //
-                                 cudaMemcpyHostToDevice,  //
-                                 stream()));
-}
-
-template <typename T, typename Span>
-void take_async(T* p, Span& span, Stream& stream, std::size_t start) {
-  KLOG(TRC);
-  static_assert(mkn::kul::is_span_like_v<Span>);
-  MKN_GPU_ASSERT(cudaMemcpyAsync(span.data(),              //
-                                 p + start,                //
-                                 span.size() * sizeof(T),  //
-                                 cudaMemcpyDeviceToHost,   //
-                                 stream()));
-}
-
-void inline sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); }
-void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(stream)); }
-
-#include "mkn/gpu/alloc.hpp"
-#include "mkn/gpu/device.hpp"
-
-template <bool _sync = true, typename F, typename... Args>
-void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) {
-  std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
-  KLOG(TRC) << N;
-  std::apply(
-      [&](auto&&... params) {
-        f<<<g, b, ds, s>>>(params...);
-        MKN_GPU_ASSERT(cudaGetLastError());
-      },
-      devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
-  if constexpr (_sync) {
-    if (s)
-      sync(s);
-    else
-      sync();
-  }
-}
+namespace MKN_GPU_NS {
 
 //
-struct Launcher {
-  Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {}
-  Launcher(size_t w, size_t h, size_t tpx, size_t tpy)
-      : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {}
-  Launcher(size_t x, size_t y, size_t z, size_t tpx, size_t tpy, size_t tpz)
-      : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {}
-
-  template <typename F, typename... Args>
-  void operator()(F&& f, Args&&... args) {
-    launch(std::forward<F>(f), g, b, ds, s, args...);
-  }
-
-  size_t ds = 0 /*dynamicShared*/;
-  dim3 g /*gridDim*/, b /*blockDim*/;
-  cudaStream_t s = 0;
-};
-
-struct GLauncher : public Launcher {
-  GLauncher(std::size_t const& s, std::size_t const& _dev = 0)
-      : Launcher{dim3{}, dim3{}}, dev{_dev}, count{s} {
-    MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev));
-
-    resize(s);
-  }
-
-  void resize(std::size_t const& s, std::size_t const& bx = 0) {
-    b.x = bx > 0 ? bx : cli.bx_threads();
-    g.x = s / b.x;
-    if ((s % b.x) > 0) ++g.x;
-  }
-
-  std::size_t dev = 0;
-  std::size_t count = 0;
-  cudaDeviceProp devProp;
-  mkn::gpu::Cli<cudaDeviceProp> cli{devProp};
-};
-
-template <typename F, typename... Args>
-__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
-  if (auto i = mkn::gpu::cuda::idx(); i < s) f(args...);
-}
-
-template <typename F, typename... Args>
-__global__ static void global_d_kernel(F f, Args... args) {
-  f(args...);
-}
-
-#include "launchers.hpp"
-#include "devfunc.hpp"
 
 template <typename T, typename V>
 __global__ void _vector_fill(T* a, V t, std::size_t s) {
@@ -330,7 +64,7 @@ void fill(Container& c, T val) {
 //
 void inline prinfo(size_t dev = 0) {
   cudaDeviceProp devProp;
-  [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev);
+  MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev));
   KOUT(NON) << " System version " << devProp.major << "." << devProp.minor;
   KOUT(NON) << " agent name     " << devProp.name;
   KOUT(NON) << " cores          " << devProp.multiProcessorCount;
@@ -344,7 +78,7 @@ void inline prinfo(size_t dev = 0) {
 void inline print_gpu_mem_used() {
   float free_m = 0, total_m = 0, used_m = 0;
   std::size_t free_t = 0, total_t = 0;
-  cudaMemGetInfo(&free_t, &total_t);
+  MKN_GPU_ASSERT(cudaMemGetInfo(&free_t, &total_t));
   free_m = free_t / 1048576.0;
   total_m = total_t / 1048576.0;
   used_m = total_m - free_m;
@@ -352,6 +86,10 @@ void inline print_gpu_mem_used() {
          total_t, total_m, used_m);
 }
 
+// #include "mkn/gpu/any/inc/device.ipp"
+#include "mkn/gpu/any/inc/launchers.ipp"
+#include "mkn/gpu/any/inc/devfunc.ipp"
+
 }  // namespace MKN_GPU_NS
 
 #undef MKN_GPU_ASSERT
diff --git a/inc/mkn/gpu/cuda/api.hpp b/inc/mkn/gpu/cuda/api.hpp
new file mode 100644
index 0000000..1d0c293
--- /dev/null
+++ b/inc/mkn/gpu/cuda/api.hpp
@@ -0,0 +1,47 @@
+#ifndef _MKN_GPU_CUDA_API_HPP_
+#define _MKN_GPU_CUDA_API_HPP_
+
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+#include "mkn/gpu/cuda/def.hpp"
+
+namespace mkn::gpu::cuda {
+
+template <typename SIZE = std::uint32_t /*max 4294967296*/>
+__device__ SIZE idx() {
+  SIZE width = gridDim.x * blockDim.x;
+  SIZE height = gridDim.y * blockDim.y;
+  SIZE x = blockDim.x * blockIdx.x + threadIdx.x;
+  SIZE y = blockDim.y * blockIdx.y + threadIdx.y;
+  SIZE z = blockDim.z * blockIdx.z + threadIdx.z;
+  return x + (y * width) + (z * width * height);
+}
+
+template <typename SIZE = std::uint32_t /*max 4294967296*/>
+__device__ SIZE block_idx_x() {
+  return blockIdx.x;
+}
+
+}  // namespace mkn::gpu::cuda
+
+namespace MKN_GPU_NS {
+
+template <typename F, typename... Args>
+__global__ static void global_d_kernel(F f, Args... args) {
+  f(args...);
+}
+
+//
+
+template <typename F, typename... Args>
+__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
+  if (auto i = mkn::gpu::cuda::idx(); i < s) f(args...);
+}
+
+}  // namespace MKN_GPU_NS
+
+//
+
+#endif /*_MKN_GPU_CUDA_API_HPP_*/
diff --git a/inc/mkn/gpu/cuda/cls.hpp b/inc/mkn/gpu/cuda/cls.hpp
new file mode 100644
index 0000000..133573e
--- /dev/null
+++ b/inc/mkn/gpu/cuda/cls.hpp
@@ -0,0 +1,140 @@
+#ifndef _MKN_GPU_CUDA_CLS_HPP_
+#define _MKN_GPU_CUDA_CLS_HPP_
+
+#include <cstddef>
+#include <functional>
+
+#include <cuda_runtime.h>
+
+#include "mkn/kul/log.hpp"
+#include "mkn/kul/span.hpp"
+#include "mkn/kul/tuple.hpp"
+
+#include "mkn/gpu/any/def.hpp"
+#include "mkn/gpu/any/cls.hpp"
+
+#include "mkn/gpu/cuda/def.hpp"
+
+namespace MKN_GPU_NS {
+
+struct Stream {
+  Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); }
+  ~Stream() { MKN_GPU_ASSERT(result = cudaStreamDestroy(stream)); }
+
+  auto& operator()() { return stream; };
+
+  void sync() { MKN_GPU_ASSERT(result = cudaStreamSynchronize(stream)); }
+
+  cudaError_t result;
+  cudaStream_t stream;
+};
+
+//
+
+struct StreamEvent {
+  //
+  StreamEvent(Stream& stream_) : stream{stream_} {}
+  StreamEvent(StreamEvent&& that) = default;
+  StreamEvent(StreamEvent const&) = delete;
+  StreamEvent& operator=(StreamEvent const&) = delete;
+
+  auto& operator()(std::function<void()> fn = {}) {
+    fin = 0;
+    _fn = fn;
+    MKN_GPU_ASSERT(cudaStreamAddCallback(stream(), StreamEvent::Callback, this, 0));
+    return *this;
+  }
+
+  static void Callback(cudaStream_t /*stream*/, cudaError_t /*status*/, void* ptr) {
+    auto& self = *reinterpret_cast<StreamEvent*>(ptr);
+    self._fn();
+    self._fn = [] {};
+    self.fin = 1;
+  }
+
+  bool finished() const { return fin; }
+
+  Stream& stream;
+  cudaError_t result;
+  std::function<void()> _fn;
+  bool fin = 0;
+};
+
+//
+
+template <typename T>
+struct Pointer {
+  Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); }
+
+  bool is_unregistered_ptr() const { return attributes.type == 0; }
+  bool is_host_ptr() const {
+    return attributes.type == 1 || (is_unregistered_ptr() && t != nullptr);
+  }
+  bool is_device_ptr() const { return is_managed_ptr() || attributes.type == 2; }
+  bool is_managed_ptr() const { return attributes.type == 3; }
+
+  T* t;
+  cudaPointerAttributes attributes;
+};
+
+#include "mkn/gpu/any/inc/alloc.ipp"
+#include "mkn/gpu/any/inc/device.ipp"
+
+template <bool _sync = true, typename F, typename... Args>
+void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) {
+  std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
+  KLOG(TRC) << N;
+  std::apply(
+      [&](auto&&... params) {
+        f<<<g, b, ds, s>>>(params...);
+        MKN_GPU_ASSERT(cudaGetLastError());
+      },
+      devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
+  if constexpr (_sync) {
+    if (s)
+      sync(s);
+    else
+      sync();
+  }
+}
+
+struct Launcher {
+  Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {}
+  Launcher(size_t w, size_t h, size_t tpx, size_t tpy)
+      : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {}
+  Launcher(size_t x, size_t y, size_t z, size_t tpx, size_t tpy, size_t tpz)
+      : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {}
+
+  template <typename F, typename... Args>
+  void operator()(F&& f, Args&&... args) {
+    launch(std::forward<F>(f), g, b, ds, s, args...);
+  }
+
+  size_t ds = 0 /*dynamicShared*/;
+  dim3 g /*gridDim*/, b /*blockDim*/;
+  cudaStream_t s = 0;
+};
+
+struct GLauncher : public Launcher {
+  GLauncher(std::size_t const s, std::size_t const _dev = 0)
+      : Launcher{dim3{}, dim3{}}, dev{_dev}, count{s} {
+    MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev));
+
+    resize(s);
+  }
+
+  void resize(std::size_t const s, std::size_t const bx = 0) {
+    b.x = bx > 0 ? bx : cli.bx_threads();
+    g.x = s / b.x;
+    if ((s % b.x) > 0) ++g.x;
+  }
+
+  std::size_t dev = 0;
+  std::size_t count = 0;
+  cudaDeviceProp devProp;
+  mkn::gpu::Cli<cudaDeviceProp> cli{devProp};
+};
+
+}  // namespace MKN_GPU_NS
+
+#endif /*_MKN_GPU_CUDA_CLS_HPP_*/
diff --git a/inc/mkn/gpu/cuda/def.hpp b/inc/mkn/gpu/cuda/def.hpp
new file mode 100644
index 0000000..0d08974
--- /dev/null
+++ b/inc/mkn/gpu/cuda/def.hpp
@@ -0,0 +1,133 @@
+#ifndef _MKN_GPU_CUDA_DEF_HPP_
+#define _MKN_GPU_CUDA_DEF_HPP_
+
+#include <string>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+#include "mkn/kul/log.hpp"
+#include "mkn/gpu/def.hpp"
+
+#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
+#define MKN_GPU_NS mkn::gpu::cuda
+#else
+#define MKN_GPU_NS mkn::gpu
+#endif  // MKN_GPU_FN_PER_NS
+
+namespace MKN_GPU_NS {
+
+static_assert(CompileFlags::withCUDA);
+
+#define MKN_GPU_ASSERT(ans)               \
+  {                                       \
+    gpuAssert((ans), __FILE__, __LINE__); \
+  }
+inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = true) {
+  if (code != cudaSuccess) {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+    if (abort) std::abort();
+  }
+}
+
+std::string getErrorString(auto const code) { return cudaGetErrorString(code); }
+
+std::uint32_t inline getWarpSize(size_t dev = 0) {
+#ifdef _MKN_GPU_WARP_SIZE_
+  return _MKN_GPU_WARP_SIZE_;
+#else
+  cudaDeviceProp devProp;
+  [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev);
+  return devProp.warpSize;
+#endif /*_MKN_GPU_WARP_SIZE_    */
+}
+
+static std::uint32_t inline const warp_size = getWarpSize();
+
+void inline setLimitMallocHeapSize(std::size_t const& bytes) {
+  MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes));
+}
+
+void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); }
+
+void inline sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); }
+void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(stream)); }
+
+template <typename Size>
+void alloc(void*& p, Size size) {
+  MKN_GPU_ASSERT(cudaMalloc((void**)&p, size));
+}
+
+template <typename T, typename Size>
+void alloc(T*& p, Size size) {
+  KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
+  MKN_GPU_ASSERT(cudaMalloc((void**)&p, size * sizeof(T)));
+}
+
+template <typename T, typename Size>
+void alloc_host(T*& p, Size size) {
+  KLOG(TRC) << "CPU alloced: " << size * sizeof(T);
+  MKN_GPU_ASSERT(cudaMallocHost((void**)&p, size * sizeof(T)));
+}
+
+template <typename T, typename Size>
+void alloc_managed(T*& p, Size size) {
+  KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
+  MKN_GPU_ASSERT(cudaMallocManaged((void**)&p, size * sizeof(T)));
+}
+
+void inline destroy(void* p) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaFree(p));
+}
+
+template <typename T>
+void destroy(T* ptr) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaFree(ptr));
+}
+
+template <typename T>
+void destroy_host(T* ptr) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaFreeHost(ptr));
+}
+
+template <typename T, typename Size>
+void copy_on_device(T* dst, T const* src, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice));
+}
+
+template <typename Size>
+void send(void* p, void* t, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaMemcpy(p, t, size, cudaMemcpyHostToDevice));
+}
+
+template <typename T, typename Size>
+void send(T* p, T const* t, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaMemcpy(p, t, size * sizeof(T), cudaMemcpyHostToDevice));
+}
+template <typename T, typename Size>
+void send_async(T* p, T const* t, cudaStream_t& stream, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaMemcpyAsync(p, t, size * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
+template <typename T, typename Size>
+void take(T const* p, T* t, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaMemcpy(t, p, size * sizeof(T), cudaMemcpyDeviceToHost));
+}
+
+template <typename T, typename Size>
+void take_async(T const* p, T* t, cudaStream_t& stream, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaMemcpyAsync(t, p, size * sizeof(T), cudaMemcpyDeviceToHost, stream));
+}
+
+}  // namespace MKN_GPU_NS
+
+#endif /*_MKN_GPU_CUDA_DEF_HPP_*/
diff --git a/inc/mkn/gpu/def.hpp b/inc/mkn/gpu/def.hpp
index df8db63..83d5ec4 100644
--- a/inc/mkn/gpu/def.hpp
+++ b/inc/mkn/gpu/def.hpp
@@ -3,9 +3,46 @@
 
 #include <type_traits>
 
+#if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h")
+#define MKN_GPU_ROCM 1
+#endif
+#if !defined(MKN_GPU_ROCM)
+#define MKN_GPU_ROCM 0
+#endif
+
+#if !defined(MKN_GPU_CUDA) and __has_include(<cuda_runtime.h>)
+#define MKN_GPU_CUDA 1
+#endif
+#if !defined(MKN_GPU_CUDA)
+#define MKN_GPU_CUDA 0
+#endif
+
+#if MKN_GPU_CUDA && MKN_GPU_ROCM && !defined(MKN_GPU_FN_PER_NS)
+#define MKN_GPU_FN_PER_NS 1
+#endif
+
+#if !defined(MKN_GPU_FN_PER_NS)
+#define MKN_GPU_FN_PER_NS 0
+#endif
+
+#if MKN_GPU_CUDA == 0 && MKN_GPU_ROCM == 0 && !defined(MKN_GPU_CPU)
+#define MKN_GPU_CPU 1
+#endif
+
+#if !defined(MKN_GPU_CPU)
+#define MKN_GPU_CPU 0
+#endif
+
 namespace mkn::gpu {
 
-#if defined(MKN_GPU_CPU)
+struct CompileFlags {
+  bool constexpr static withCUDA = MKN_GPU_CUDA;
+  bool constexpr static withROCM = MKN_GPU_ROCM;
+  bool constexpr static withCPU = MKN_GPU_CPU;
+  bool constexpr static perNamespace = MKN_GPU_FN_PER_NS;
+};
+
+#if MKN_GPU_CPU
 
 template <typename T>
 static constexpr bool is_floating_point_v = std::is_floating_point_v<T>;
diff --git a/inc/mkn/gpu/defines.hpp b/inc/mkn/gpu/defines.hpp
deleted file mode 100644
index 3a45d43..0000000
--- a/inc/mkn/gpu/defines.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef _MKN_GPU_DEFINES_HPP_
-#define _MKN_GPU_DEFINES_HPP_
-
-#if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h")
-#define MKN_GPU_ROCM 1
-#endif
-#if !defined(MKN_GPU_ROCM)
-#define MKN_GPU_ROCM 0
-#endif
-
-#if !defined(MKN_GPU_CUDA) and __has_include(<cuda_runtime.h>)
-#define MKN_GPU_CUDA 1
-#endif
-#if !defined(MKN_GPU_CUDA)
-#define MKN_GPU_CUDA 0
-#endif
-
-#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && !defined(MKN_GPU_FN_PER_NS)
-#define MKN_GPU_FN_PER_NS 1
-#endif
-
-#if !defined(MKN_GPU_FN_PER_NS)
-#define MKN_GPU_FN_PER_NS 0
-#endif
-
-#if MKN_GPU_ROCM == 1
-#include "mkn/gpu/rocm.hpp"
-#endif
-
-#if MKN_GPU_CUDA == 1
-#include "mkn/gpu/cuda.hpp"
-#endif
-
-#if MKN_GPU_CUDA == 0 && MKN_GPU_ROCM == 0 && !defined(MKN_GPU_CPU)
-#define MKN_GPU_CPU 1
-#endif
-
-#if MKN_GPU_FN_PER_NS == 1 || MKN_GPU_CPU == 1
-#include "mkn/gpu/cpu.hpp"
-#endif
-
-namespace mkn::gpu {
-
-struct CompileFlags {
-  bool constexpr static withCUDA = MKN_GPU_CUDA;
-  bool constexpr static withROCM = MKN_GPU_ROCM;
-  bool constexpr static withCPU = !MKN_GPU_ROCM and !MKN_GPU_CUDA;
-  bool constexpr static perNamespace = MKN_GPU_FN_PER_NS;
-};
-
-} /* namespace mkn::gpu */
-
-#endif /*_MKN_GPU_DEFINES_HPP_*/
diff --git a/inc/mkn/gpu/devfunc.hpp b/inc/mkn/gpu/devfunc.hpp
deleted file mode 100644
index 5708308..0000000
--- a/inc/mkn/gpu/devfunc.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-// IWYU pragma: private, include "mkn/gpu.hpp"
-/**
-Copyright (c) 2025, Philip Deegan.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-    * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-    * Neither the name of Philip Deegan nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-#ifndef _MKN_GPU_DEVFUNC_HPP_
-#define _MKN_GPU_DEVFUNC_HPP_
-
-template <bool sync = true, typename T>
-void __device__ zero(T* const t, std::size_t const size) {
-  std::size_t chunk = 0;
-  auto const each = size / warpSize;
-  for (; chunk < each; ++chunk) t[chunk * warpSize + threadIdx.x] = 0;
-  if (threadIdx.x < size - (warpSize * each)) t[chunk * warpSize + threadIdx.x] = 0;
-  if constexpr (sync) __syncthreads();
-}
-
-#endif /* _MKN_GPU_DEVFUNC_HPP_ */
diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp
index 67721a1..5046c14 100644
--- a/inc/mkn/gpu/rocm.hpp
+++ b/inc/mkn/gpu/rocm.hpp
@@ -33,302 +33,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define _MKN_GPU_ROCM_HPP_
 
 #include "mkn/kul/log.hpp"
-#include "mkn/kul/span.hpp"
-#include "mkn/kul/tuple.hpp"
-#include "mkn/kul/assert.hpp"
-
-#include "mkn/gpu/cli.hpp"
-#include "mkn/gpu/def.hpp"
 
 #include "hip/hip_runtime.h"
 
-#define MKN_GPU_ASSERT(ans)               \
-  {                                       \
-    gpuAssert((ans), __FILE__, __LINE__); \
-  }
-inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = true) {
-  if (code != hipSuccess) {
-    fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
-    if (abort) std::abort();
-  }
-}
-
-namespace mkn::gpu::hip {
-
-template <typename SIZE = uint32_t /*max 4294967296*/>
-__device__ SIZE idx() {
-  SIZE width = hipGridDim_x * hipBlockDim_x;
-  SIZE height = hipGridDim_y * hipBlockDim_y;
-
-  SIZE x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-  SIZE y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
-  SIZE z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;
-  return x + (y * width) + (z * width * height);  // max 4294967296
-}
-
-}  // namespace mkn::gpu::hip
-
-#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
-#define MKN_GPU_NS mkn::gpu::hip
-#else
-#define MKN_GPU_NS mkn::gpu
-#endif  // MKN_GPU_FN_PER_NS
+#include "mkn/gpu/rocm/def.hpp"
+#include "mkn/gpu/rocm/api.hpp"
+#include "mkn/gpu/rocm/cls.hpp"
 
 namespace MKN_GPU_NS {
 
-#ifdef _MKN_GPU_WARP_SIZE_
-static constexpr int warp_size = _MKN_GPU_WARP_SIZE_;
-#else
-static constexpr int warp_size = warpSize;
-#endif /*_MKN_GPU_WARP_SIZE_    */
-
-void inline setLimitMallocHeapSize(std::size_t const& bytes) {
-  MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes));
-}
-
-void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(hipSetDevice(dev)); }
-
-struct Stream {
-  Stream() { MKN_GPU_ASSERT(result = hipStreamCreate(&stream)); }
-  ~Stream() { MKN_GPU_ASSERT(result = hipStreamDestroy(stream)); }
-
-  auto& operator()() { return stream; };
-
-  void sync() { result = hipStreamSynchronize(stream); }
-
-  hipError_t result;
-  hipStream_t stream;
-};
-
-//
-
-struct StreamEvent {
-  //
-  StreamEvent(Stream& stream_) : stream{stream_} {}
-  StreamEvent(StreamEvent&& that) = default;
-  StreamEvent(StreamEvent const&) = delete;
-  StreamEvent& operator=(StreamEvent const&) = delete;
-
-  auto& operator()(std::function<void()> fn = {}) {
-    fin = 0;
-    _fn = fn;
-    MKN_GPU_ASSERT(hipStreamAddCallback(stream(), StreamEvent::Callback, this, 0));
-    return *this;
-  }
-
-  static void Callback(hipStream_t /*stream*/, hipError_t /*status*/, void* ptr) {
-    auto& self = *reinterpret_cast<StreamEvent*>(ptr);
-    self._fn();
-    self._fn = [] {};
-    self.fin = 1;
-  }
-
-  bool finished() const { return fin; }
-
-  Stream& stream;
-  hipError_t result;
-  std::function<void()> _fn;
-  bool fin = 0;
-};
-
-//
-
-// https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3
-template <typename T>
-struct Pointer {
-  Pointer(T* _t) : t{_t} {
-    assert(t);
-    MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t));
-    type = attributes.type;
-  }
-
-  bool is_unregistered_ptr() const {
-    return attributes.type == hipMemoryType::hipMemoryTypeUnregistered;
-  }
-  bool is_host_ptr() const {
-    return is_unregistered_ptr() || type == hipMemoryType::hipMemoryTypeHost;
-  }
-  bool is_device_ptr() const {
-    return type == hipMemoryType::hipMemoryTypeDevice || attributes.isManaged;
-  }
-  bool is_managed_ptr() const {
-    return attributes.isManaged || type == hipMemoryType::hipMemoryTypeUnified;
-  }
-
-  T* t;
-  hipPointerAttribute_t attributes;
-  hipMemoryType type = hipMemoryType::hipMemoryTypeUnregistered;
-};
-
-template <typename Size>
-void alloc(void*& p, Size size) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipMalloc((void**)&p, size));
-}
-
-template <typename T, typename Size>
-void alloc(T*& p, Size size) {
-  KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
-  MKN_GPU_ASSERT(hipMalloc((void**)&p, size * sizeof(T)));
-}
-
-template <typename T, typename Size>
-void alloc_host(T*& p, Size size) {
-  KLOG(TRC) << "CPU alloced: " << size * sizeof(T);
-  MKN_GPU_ASSERT(hipHostMalloc((void**)&p, size * sizeof(T)));
-}
-
-template <typename T, typename Size>
-void alloc_managed(T*& p, Size size) {
-  auto const bytes = size * sizeof(T);
-  KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
-  MKN_GPU_ASSERT(hipMallocManaged((void**)&p, bytes));
-}
-
-void inline destroy(void* p) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipFree(p));
-}
-
-template <typename T>
-void destroy(T* ptr) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipFree(ptr));
-}
-
-template <typename T>
-void destroy_host(T* ptr) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipHostFree(ptr));
-}
-
-template <typename T, typename Size>
-void copy_on_device(T* dst, T const* src, Size size = 1) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipMemcpy(dst, src, size * sizeof(T), hipMemcpyDeviceToDevice));
-}
-
-template <typename Size>
-void send(void* p, void* t, Size size = 1) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipMemcpy(p, t, size, hipMemcpyHostToDevice));
-}
-
-template <typename T, typename Size>
-void send(T* p, T const* t, Size size = 1, Size start = 0) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipMemcpy(p + start, t, size * sizeof(T), hipMemcpyHostToDevice));
-}
-
-template <typename T, typename Size>
-void take(T const* p, T* t, Size size = 1, Size start = 0) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipMemcpy(t, p + start, size * sizeof(T), hipMemcpyDeviceToHost));
-}
-
-template <typename T, typename Size>
-void send_async(T* p, T const* t, Stream& stream, Size size = 1, Size start = 0) {
-  KLOG(TRC);
-  MKN_GPU_ASSERT(hipMemcpyAsync(p + start,              //
-                                t + start,              //
-                                size * sizeof(T),       //
-                                hipMemcpyHostToDevice,  //
-                                stream()));
-}
-
-template <typename T, typename Span>
-void take_async(T* p, Span& span, Stream& stream, std::size_t start) {
-  KLOG(TRC);
-  static_assert(mkn::kul::is_span_like_v<Span>);
-  MKN_GPU_ASSERT(hipMemcpyAsync(span.data(),              //
-                                p + start,                //
-                                span.size() * sizeof(T),  //
-                                hipMemcpyDeviceToHost,    //
-                                stream()));
-}
-
-void inline sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); }
-void inline sync(hipStream_t stream) { MKN_GPU_ASSERT(hipStreamSynchronize(stream)); }
-
-#include "mkn/gpu/alloc.hpp"
-#include "mkn/gpu/device.hpp"
-
-template <bool _sync = true, typename F, typename... Args>
-void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) {
-  std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
-  KLOG(TRC) << N;
-  std::apply(
-      [&](auto&&... params) {
-        hipLaunchKernelGGL(f, g, b, ds, s, params...);
-        MKN_GPU_ASSERT(hipGetLastError());
-      },
-      devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
-  if constexpr (_sync) {
-    if (s)
-      sync(s);
-    else
-      sync();
-  }
-}
-
-// https://rocm-documentation.readthedocs.io/en/latest/Programming_Guides/HIP-GUIDE.html#calling-global-functions
-struct Launcher {
-  Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {}
-  Launcher(size_t w, size_t h, size_t tpx, size_t tpy)
-      : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {}
-  Launcher(size_t x, size_t y, size_t z, size_t tpx, size_t tpy, size_t tpz)
-      : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {}
-
-  template <typename F, typename... Args>
-  void operator()(F&& f, Args&&... args) {
-    launch(std::forward<F>(f), g, b, ds, s, args...);
-  }
-
-  size_t ds = 0 /*dynamicShared*/;
-  dim3 g /*gridDim*/, b /*blockDim*/;
-  hipStream_t s = 0;
-};
-
-struct GLauncher : public Launcher {
-  GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
-    [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev);
-
-    b.x = cli.bx_threads();
-    g.x = s / b.x;
-    if ((s % b.x) > 0) ++g.x;
-  }
-
-  std::size_t count = 0;
-  hipDeviceProp_t devProp;
-  mkn::gpu::Cli<hipDeviceProp_t> cli{devProp};
-};
-
-template <typename F, typename... Args>
-__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
-  if (auto i = mkn::gpu::hip::idx(); i < s) f(args...);
-}
-
-template <typename F, typename... Args>
-__global__ static void global_d_kernel(F f, Args... args) {
-  f(args...);
-}
-
-#include "launchers.hpp"
-#include "devfunc.hpp"
-
 template <typename T, typename V>
 __global__ void _vector_fill(T* a, V t, std::size_t s) {
   if (auto i = mkn::gpu::hip::idx(); i < s) a[i] = t;
 }
 
 template <typename Container, typename T>
-void fill(Container& c, size_t size, T val) {
+void fill(Container& c, std::size_t const size, T const val) {
   GLauncher{c.size()}(_vector_fill<typename Container::value_type, T>, c.data(), val, size);
 }
 
 template <typename Container, typename T>
-void fill(Container& c, T val) {
-  GLauncher{c.size()}(_vector_fill<typename Container::value_type, T>, c.data(), val, c.size());
+void fill(Container& c, T const val) {
+  fill(c, c.size(), val);
 }
 
 // https://rocm-developer-tools.github.io/HIP/group__Device.html
@@ -348,7 +74,7 @@ void inline prinfo(size_t dev = 0) {
   KOUT(NON) << " warpSize used   " << _MKN_GPU_WARP_SIZE_;
 #else
   KOUT(NON) << " warpSize used   " << warp_size;
-  if (warp_size != devProp.warpSize) {
+  if (warp_size != static_cast<std::uint32_t>(devProp.warpSize)) {
     KOUT(NON) << " warpSize MISMATCH!!!  " << warp_size << " vs " << devProp.warpSize;
     KOUT(NON) << " SEE mkn.gpu README for -D_MKN_GPU_WARP_SIZE_=###";
   }
@@ -366,6 +92,9 @@ void inline print_gpu_mem_used() {
          total_t, total_m, used_m);
 }
 
+#include "mkn/gpu/any/inc/launchers.ipp"
+#include "mkn/gpu/any/inc/devfunc.ipp"
+
 }  // namespace MKN_GPU_NS
 
 #undef MKN_GPU_ASSERT
diff --git a/inc/mkn/gpu/rocm/api.hpp b/inc/mkn/gpu/rocm/api.hpp
new file mode 100644
index 0000000..b197bbb
--- /dev/null
+++ b/inc/mkn/gpu/rocm/api.hpp
@@ -0,0 +1,45 @@
+#ifndef _MKN_GPU_ROCM_API_HPP_
+#define _MKN_GPU_ROCM_API_HPP_
+
+#include <cstdint>
+
+#include "hip/hip_runtime.h"
+
+#include "mkn/gpu/rocm/def.hpp"
+
+namespace mkn::gpu::hip {
+
+template <typename SIZE = std::uint32_t /*max 4294967296*/>
+__device__ SIZE idx() {
+  SIZE width = gridDim.x * blockDim.x;
+  SIZE height = gridDim.y * blockDim.y;
+  SIZE x = blockDim.x * blockIdx.x + threadIdx.x;
+  SIZE y = blockDim.y * blockIdx.y + threadIdx.y;
+  SIZE z = blockDim.z * blockIdx.z + threadIdx.z;
+  return x + (y * width) + (z * width * height);
+}
+
+template <typename SIZE = std::uint32_t /*max 4294967296*/>
+__device__ SIZE block_idx_x() {
+  return blockIdx.x;
+}
+
+}  // namespace mkn::gpu::hip
+
+namespace MKN_GPU_NS {
+
+template <typename F, typename... Args>
+__global__ static void global_gd_kernel(F f, std::size_t s, Args... args) {
+  if (auto i = mkn::gpu::hip::idx(); i < s) f(args...);
+}
+
+template <typename F, typename... Args>
+__global__ static void global_d_kernel(F f, Args... args) {
+  f(args...);
+}
+
+}  // namespace MKN_GPU_NS
+
+//
+
+#endif /*_MKN_GPU_ROCM_API_HPP_*/
diff --git a/inc/mkn/gpu/rocm/cls.hpp b/inc/mkn/gpu/rocm/cls.hpp
new file mode 100644
index 0000000..6a02f35
--- /dev/null
+++ b/inc/mkn/gpu/rocm/cls.hpp
@@ -0,0 +1,149 @@
+#ifndef _MKN_GPU_ROCM_CLS_HPP_
+#define _MKN_GPU_ROCM_CLS_HPP_
+
+#include "mkn/kul/span.hpp"
+#include "mkn/kul/tuple.hpp"
+
+#include "mkn/gpu/any/def.hpp"
+#include "mkn/gpu/any/cls.hpp"
+
+#include "hip/hip_runtime.h"
+
+#include "mkn/gpu/rocm/def.hpp"
+
+namespace MKN_GPU_NS {
+
+struct Stream {
+  Stream() { MKN_GPU_ASSERT(result = hipStreamCreate(&stream)); }
+  ~Stream() { MKN_GPU_ASSERT(result = hipStreamDestroy(stream)); }
+
+  auto& operator()() { return stream; };
+
+  void sync() { MKN_GPU_ASSERT(result = hipStreamSynchronize(stream)); }
+
+  hipError_t result;
+  hipStream_t stream;
+};
+
+//
+
+struct StreamEvent {
+  //
+  StreamEvent(Stream& stream_) : stream{stream_} {}
+  StreamEvent(StreamEvent&& that) = default;
+  StreamEvent(StreamEvent const&) = delete;
+  StreamEvent& operator=(StreamEvent const&) = delete;
+
+  auto& operator()(std::function<void()> fn = {}) {
+    fin = 0;
+    _fn = fn;
+    MKN_GPU_ASSERT(hipStreamAddCallback(stream(), StreamEvent::Callback, this, 0));
+    return *this;
+  }
+
+  static void Callback(hipStream_t /*stream*/, hipError_t /*status*/, void* ptr) {
+    auto& self = *reinterpret_cast<StreamEvent*>(ptr);
+    self._fn();
+    self._fn = [] {};
+    self.fin = 1;
+  }
+
+  bool finished() const { return fin; }
+
+  Stream& stream;
+  hipError_t result;
+  std::function<void()> _fn;
+  bool fin = 0;
+};
+
+//
+
+// https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3
+template <typename T>
+struct Pointer {
+  Pointer(T* _t) : t{_t} {
+    assert(t);
+    MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t));
+    type = attributes.type;
+  }
+
+  bool is_unregistered_ptr() const {
+    return attributes.type == hipMemoryType::hipMemoryTypeUnregistered;
+  }
+  bool is_host_ptr() const {
+    return is_unregistered_ptr() || type == hipMemoryType::hipMemoryTypeHost;
+  }
+  bool is_device_ptr() const {
+    return type == hipMemoryType::hipMemoryTypeDevice || attributes.isManaged;
+  }
+  bool is_managed_ptr() const {
+    return attributes.isManaged || type == hipMemoryType::hipMemoryTypeUnified;
+  }
+
+  T* t;
+  hipPointerAttribute_t attributes;
+  hipMemoryType type = hipMemoryType::hipMemoryTypeUnregistered;
+};
+
+#include "mkn/gpu/any/inc/alloc.ipp"
+#include "mkn/gpu/any/inc/device.ipp"
+
+template <bool _sync = true, typename F, typename... Args>
+void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) {
+  std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
+  KLOG(TRC) << N;
+  std::apply(
+      [&](auto&&... params) {
+        hipLaunchKernelGGL(f, g, b, ds, s, params...);
+        MKN_GPU_ASSERT(hipGetLastError());
+      },
+      devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
+  if constexpr (_sync) {
+    if (s)
+      sync(s);
+    else
+      sync();
+  }
+}
+
+// https://rocm-documentation.readthedocs.io/en/latest/Programming_Guides/HIP-GUIDE.html#calling-global-functions
+struct Launcher {
+  Launcher(dim3 _g, dim3 _b) : g{_g}, b{_b} {}
+  Launcher(size_t w, size_t h, size_t tpx, size_t tpy)
+      : Launcher{dim3(w / tpx, h / tpy), dim3(tpx, tpy)} {}
+  Launcher(size_t x, size_t y, size_t z, size_t tpx, size_t tpy, size_t tpz)
+      : Launcher{dim3(x / tpx, y / tpy, z / tpz), dim3(tpx, tpy, tpz)} {}
+
+  template <typename F, typename... Args>
+  void operator()(F&& f, Args&&... args) {
+    launch(std::forward<F>(f), g, b, ds, s, args...);
+  }
+
+  size_t ds = 0 /*dynamicShared*/;
+  dim3 g /*gridDim*/, b /*blockDim*/;
+  hipStream_t s = 0;
+};
+
+struct GLauncher : public Launcher {
+  GLauncher(std::size_t const s, std::size_t const _dev = 0)
+      : Launcher{dim3{}, dim3{}}, dev{_dev}, count{s} {
+    MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev));
+
+    resize(s);
+  }
+
+  void resize(std::size_t const s, std::size_t const bx = 0) {
+    b.x = bx > 0 ? bx : cli.bx_threads();
+    g.x = s / b.x;
+    if ((s % b.x) > 0) ++g.x;
+  }
+
+  std::size_t dev = 0;
+  std::size_t count = 0;
+  hipDeviceProp_t devProp;
+  mkn::gpu::Cli<hipDeviceProp_t> cli{devProp};
+};
+
+}  // namespace MKN_GPU_NS
+
+#endif /*_MKN_GPU_ROCM_CLS_HPP_*/
diff --git a/inc/mkn/gpu/rocm/def.hpp b/inc/mkn/gpu/rocm/def.hpp
new file mode 100644
index 0000000..ee8f6a3
--- /dev/null
+++ b/inc/mkn/gpu/rocm/def.hpp
@@ -0,0 +1,135 @@
+#ifndef _MKN_GPU_ROCM_DEF_HPP_
+#define _MKN_GPU_ROCM_DEF_HPP_
+
+#include "mkn/kul/log.hpp"
+#include "mkn/gpu/def.hpp"
+
+#include "hip/hip_runtime.h"
+
+#include <string>
+
+#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
+#define MKN_GPU_NS mkn::gpu::hip
+#else
+#define MKN_GPU_NS mkn::gpu
+#endif  // MKN_GPU_FN_PER_NS
+
+namespace MKN_GPU_NS {
+
+static_assert(CompileFlags::withROCM);
+
+#define MKN_GPU_ASSERT(ans)               \
+  {                                       \
+    gpuAssert((ans), __FILE__, __LINE__); \
+  }
+inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = true) {
+  if (code != hipSuccess) {
+    fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
+    if (abort) std::abort();
+  }
+}
+
+std::string getErrorString(auto const code) { return hipGetErrorString(code); }
+
+std::uint32_t inline getWarpSize(size_t dev = 0) {
+#ifdef _MKN_GPU_WARP_SIZE_
+  return _MKN_GPU_WARP_SIZE_;
+#else
+  hipDeviceProp_t devProp;
+  [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev);
+  return devProp.warpSize;
+#endif /*_MKN_GPU_WARP_SIZE_    */
+}
+
+static std::uint32_t inline const warp_size = getWarpSize();
+
+void inline setLimitMallocHeapSize(std::size_t const& bytes) {
+  MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes));
+}
+
+void inline setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(hipSetDevice(dev)); }
+
+void inline sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); }
+void inline sync(hipStream_t stream) { MKN_GPU_ASSERT(hipStreamSynchronize(stream)); }
+
+template <typename Size>
+void alloc(void*& p, Size size) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMalloc((void**)&p, size));
+}
+
+template <typename T, typename Size>
+void alloc(T*& p, Size size) {
+  KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
+  MKN_GPU_ASSERT(hipMalloc((void**)&p, size * sizeof(T)));
+}
+
+template <typename T, typename Size>
+void alloc_host(T*& p, Size size) {
+  KLOG(TRC) << "CPU alloced: " << size * sizeof(T);
+  MKN_GPU_ASSERT(hipHostMalloc((void**)&p, size * sizeof(T)));
+}
+
+template <typename T, typename Size>
+void alloc_managed(T*& p, Size size) {
+  auto const bytes = size * sizeof(T);
+  KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
+  MKN_GPU_ASSERT(hipMallocManaged((void**)&p, bytes));
+}
+
+void inline destroy(void* p) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipFree(p));
+}
+
+template <typename T>
+void destroy(T* ptr) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipFree(ptr));
+}
+
+template <typename T>
+void destroy_host(T* ptr) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipHostFree(ptr));
+}
+
+template <typename T, typename Size>
+void copy_on_device(T* dst, T const* src, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMemcpy(dst, src, size * sizeof(T), hipMemcpyDeviceToDevice));
+}
+
+template <typename Size>
+void send(void* p, void* t, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMemcpy(p, t, size, hipMemcpyHostToDevice));
+}
+
+template <typename T, typename Size>
+void send(T* p, T const* t, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMemcpy(p, t, size * sizeof(T), hipMemcpyHostToDevice));
+}
+
+template <typename T, typename Size>
+void send_async(T* p, T const* t, hipStream_t& stream, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMemcpyAsync(p, t, size * sizeof(T), hipMemcpyHostToDevice, stream));
+}
+
+template <typename T, typename Size>
+void take(T const* p, T* t, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMemcpy(t, p, size * sizeof(T), hipMemcpyDeviceToHost));
+}
+
+template <typename T, typename Size>
+void take_async(T const* p, T* t, hipStream_t& stream, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMemcpyAsync(t, p, size * sizeof(T), hipMemcpyDeviceToHost, stream));
+}
+
+}  // namespace MKN_GPU_NS
+
+#endif /*_MKN_GPU_ROCM_DEF_HPP_*/
diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/stream.hpp
similarity index 100%
rename from inc/mkn/gpu/multi_launch.hpp
rename to inc/mkn/gpu/stream.hpp
diff --git a/inc/mkn/gpu/tuple.hpp b/inc/mkn/gpu/tuple.hpp
index 2f363f6..3dac8e1 100644
--- a/inc/mkn/gpu/tuple.hpp
+++ b/inc/mkn/gpu/tuple.hpp
@@ -31,7 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_TUPLE_HPP_
 #define _MKN_GPU_TUPLE_HPP_
 
-// #include "mkn/gpu.hpp"
 #include "mkn/kul/tuple.hpp"
 
 namespace mkn::gpu {
diff --git a/mkn.yaml b/mkn.yaml
index 78253cc..b6057b4 100644
--- a/mkn.yaml
+++ b/mkn.yaml
@@ -23,7 +23,7 @@ profile:
 # if you have no GPU but want to test your code
 - name: cpu
   parent: headers
-  # arg: -DMKN_GPU_CPU=1
+  arg: -DMKN_GPU_CPU=1
   test: test/any/(\w).cpp
         test/cpu/(\w).cpp
 
diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml
index 9b9bd90..ab6d517 100644
--- a/res/mkn/clang_cuda.yaml
+++ b/res/mkn/clang_cuda.yaml
@@ -15,7 +15,7 @@ super: settings
 
 property:
   cxx_flags: --std=c++20 -fPIC -fsized-deallocation -Wno-unknown-cuda-version
-  cxx_cuda: -x cuda --cuda-gpu-arch="sm_61" -Xclang -fcuda-allow-variadic-functions
+  cxx_cuda: -x cuda --cuda-gpu-arch="sm_120" -Xclang -fcuda-allow-variadic-functions
 
 env: |
   PATH=/usr/local/cuda/bin:${PATH}
@@ -23,13 +23,9 @@ env: |
 inc: /usr/local/cuda/targets/x86_64-linux/include
 path: /usr/local/cuda/targets/x86_64-linux/lib
 
-compiler:
-  mask:
-    clang++:
-      clang++-17
 
 file:
   - type: cpp:cxx:cc
     archiver: ar -cr
-    compiler:  clang++-17 ${cxx_flags} ${cxx_cuda}
-    linker:  clang++-17 -Wl,-z,defs -fuse-ld=gold -lcudart
+    compiler:  clang++ ${cxx_flags} ${cxx_cuda}
+    linker:  clang++ -Wl,-z,defs -lcudart
diff --git a/res/mkn/nvcc.yaml b/res/mkn/nvcc.yaml
new file mode 100644
index 0000000..efdf76e
--- /dev/null
+++ b/res/mkn/nvcc.yaml
@@ -0,0 +1,21 @@
+#
+
+super: settings
+
+property:
+  cxx_flags: --std=c++20 -Xcompiler -fPIC # -fPIC -fsized-deallocation -Wno-unknown-cuda-version
+  cxx_cuda: -x cu --extended-lambda #-x cuda --cuda-gpu-arch="sm_80" -Xclang -fcuda-allow-variadic-functions
+
+env: |
+  PATH=/usr/local/cuda/bin:${PATH}
+
+# inc: /usr/local/cuda/targets/x86_64-linux/include
+# path: /usr/local/cuda/targets/x86_64-linux/lib
+
+file:
+  - type: cpp:cxx:cc
+    archiver: ar -cr
+    compiler:  nvcc ${cxx_flags} ${cxx_cuda}
+    linker:  nvcc -lcudart
+
+
diff --git a/res/test_all.sh b/res/test_all.sh
new file mode 100755
index 0000000..44c4078
--- /dev/null
+++ b/res/test_all.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+set -e
+
+CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+(
+    cd "$CWD/.."
+
+    mkn build -p cpu run test -W 9
+    mkn build -x res/mkn/hipcc test -p rocm -W 9
+    mkn build -x res/mkn/clang_cuda test -p cuda -W 9
+
+)
diff --git a/test/any/asio.cpp b/test/any/asio.cpp
index 69d8701..6b30616 100644
--- a/test/any/asio.cpp
+++ b/test/any/asio.cpp
@@ -6,7 +6,7 @@
 #include "mkn/gpu/asio.hpp"
 #include "__share__.hpp"
 
-#if defined(MKN_GPU_CPU)
+#if MKN_GPU_CPU
 static constexpr std::uint32_t BATCHES = 1;
 #else
 static constexpr std::uint32_t BATCHES = 2;
diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp
index bd843cb..c4f4979 100644
--- a/test/any/async_streaming.cpp
+++ b/test/any/async_streaming.cpp
@@ -1,11 +1,11 @@
 
-#include <thread>
-#include <algorithm>
 
 #include "mkn/gpu.hpp"
 #include "mkn/kul/dbg.hpp"
 #include "mkn/kul/time.hpp"
-#include "mkn/gpu/multi_launch.hpp"
+#include "mkn/gpu/stream.hpp"
+
+#include <algorithm>
 
 using namespace mkn::gpu;
 using namespace std::chrono_literals;
@@ -21,7 +21,7 @@ struct A {
 };
 
 std::uint32_t test() {
-  KUL_DBG_FUNC_ENTER;
+  MKN_KUL_DBG_FUNC_ENTER;
   using T = double;
 
   std::vector<ManagedVector<T>> vecs(C, ManagedVector<T>(NUM, 0));
@@ -54,7 +54,7 @@ std::uint32_t test() {
 }
 
 std::uint32_t test_threaded(std::size_t const& nthreads = 2) {
-  KUL_DBG_FUNC_ENTER;
+  MKN_KUL_DBG_FUNC_ENTER;
   using T = double;
 
   std::vector<ManagedVector<T>> vecs(C, ManagedVector<T>(NUM, 0));
@@ -87,7 +87,7 @@ std::uint32_t test_threaded(std::size_t const& nthreads = 2) {
 
 std::uint32_t test_threaded_group_barrier(std::size_t const& nthreads = 2) {
   using T = double;
-  KUL_DBG_FUNC_ENTER;
+  MKN_KUL_DBG_FUNC_ENTER;
 
   std::vector<ManagedVector<T>> vecs(C + 1, ManagedVector<T>(NUM, 0));
   for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
@@ -121,7 +121,7 @@ std::uint32_t test_threaded_group_barrier(std::size_t const& nthreads = 2) {
 
 std::uint32_t test_threaded_host_group_mutex(std::size_t const& nthreads = 2) {
   using T = double;
-  KUL_DBG_FUNC_ENTER;
+  MKN_KUL_DBG_FUNC_ENTER;
 
   std::size_t constexpr group_size = 3;
   std::vector<size_t> vals((C + 1) / group_size);  // 2 values;
@@ -156,7 +156,7 @@ std::uint32_t test_threaded_host_group_mutex(std::size_t const& nthreads = 2) {
 
 std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) {
   using T = double;
-  KUL_DBG_FUNC_ENTER;
+  MKN_KUL_DBG_FUNC_ENTER;
 
   std::size_t constexpr group_size = 3;
 
@@ -196,7 +196,7 @@ std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) {
 
 std::uint32_t test_threaded_detached_stream_fns(std::size_t const& nthreads = 2) {
   using T = double;
-  KUL_DBG_FUNC_ENTER;
+  MKN_KUL_DBG_FUNC_ENTER;
 
   std::vector<ManagedVector<T>> vecs(C, ManagedVector<T>(NUM, 0));
   for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
diff --git a/test/any/managed.cpp b/test/any/managed.cpp
index ee64e07..ffe6a6d 100644
--- a/test/any/managed.cpp
+++ b/test/any/managed.cpp
@@ -92,8 +92,8 @@ std::uint32_t test_zero() {
   auto* view1 = mem1.data();
 
   mkn::gpu::DLauncher()([=] __device__() {
-    mkn::gpu::zero(view0, size);
-    mkn::gpu::zero(view1, size);
+    mkn::gpu::fill_warp_size(view0, size, 0.0f);
+    mkn::gpu::fill_warp_size(view1, size, 0.0f);
   });
 
   for (std::uint32_t i = 0; i < size; ++i)
diff --git a/test/hip/async.cpp b/test/hip/async.cpp
index 5aed657..68f2f02 100644
--- a/test/hip/async.cpp
+++ b/test/hip/async.cpp
@@ -91,7 +91,7 @@ int main(int argc, char** argv) {
   for (int i = 0; i < nStreams; ++i) checkHip(hipStreamCreate(&stream[i]));
 
   // baseline case - sequential transfer and execute
-  memset(a, 0, bytes);
+  checkHip(hipMemset(a, 0, bytes));
   checkHip(hipEventRecord(startEvent, 0));
   checkHip(hipMemcpy(d_a, a, bytes, hipMemcpyHostToDevice));
   kernel<<<n / blockSize, blockSize>>>(d_a, 0);
@@ -103,7 +103,7 @@ int main(int argc, char** argv) {
   printf("  max error: %e\n", maxError(a, n));
 
   // asynchronous version 1: loop over {copy, kernel, copy}
-  memset(a, 0, bytes);
+  checkHip(hipMemset(a, 0, bytes));
   checkHip(hipEventRecord(startEvent, 0));
   for (int i = 0; i < nStreams; ++i) {
     int offset = i * streamSize;
@@ -121,7 +121,7 @@ int main(int argc, char** argv) {
 
   // asynchronous version 2:
   // loop over copy, loop over kernel, loop over copy
-  memset(a, 0, bytes);
+  checkHip(hipMemset(a, 0, bytes));
   checkHip(hipEventRecord(startEvent, 0));
   for (int i = 0; i < nStreams; ++i) {
     int offset = i * streamSize;
diff --git a/test/hip/async.ppc b/test/hip/async.ppc
new file mode 100644
index 0000000..5aed657
--- /dev/null
+++ b/test/hip/async.ppc
@@ -0,0 +1,155 @@
+// https://raw.githubusercontent.com/NVIDIA-developer-blog/code-samples/master/series/hip-cpp/overlap-data-transfers/async.cu
+
+/* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "hip/hip_runtime.h"
+
+#include <stdio.h>
+
+// Convenience function for checking HIP runtime API results
+// can be wrapped around any runtime API call. No-op in release builds.
+inline void checkHip([[maybe_unused]] hipError_t result) {
+#if defined(DEBUG) || defined(_DEBUG)
+  if (result != hipSuccess) {
+    fprintf(stderr, "HIP Runtime Error: %s\n", hipGetErrorString(result));
+    assert(result == hipSuccess);
+  }
+#endif
+  // return result;
+}
+
+__global__ void kernel(float* a, int offset) {
+  int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
+  float x = (float)i;
+  float s = sinf(x);
+  float c = cosf(x);
+  a[i] = a[i] + sqrtf(s * s + c * c);
+}
+
+float maxError(float* a, int n) {
+  float maxE = 0;
+  for (int i = 0; i < n; i++) {
+    float error = fabs(a[i] - 1.0f);
+    if (error > maxE) maxE = error;
+  }
+  return maxE;
+}
+
+int main(int argc, char** argv) {
+  int const blockSize = 256, nStreams = 4;
+  int const n = 4 * 1024 * blockSize * nStreams;
+  int const streamSize = n / nStreams;
+  int const streamBytes = streamSize * sizeof(float);
+  int const bytes = n * sizeof(float);
+
+  int devId = 0;
+  if (argc > 1) devId = atoi(argv[1]);
+
+  hipDeviceProp_t prop;
+  checkHip(hipGetDeviceProperties(&prop, devId));
+  printf("Device : %s\n", prop.name);
+  checkHip(hipSetDevice(devId));
+
+  // allocate pinned host memory and device memory
+  float *a, *d_a;
+  checkHip(hipHostMalloc((void**)&a, bytes));  // host pinned
+  checkHip(hipMalloc((void**)&d_a, bytes));    // device
+
+  float ms;  // elapsed time in milliseconds
+
+  // create events and streams
+  hipEvent_t startEvent, stopEvent, dummyEvent;
+  hipStream_t stream[nStreams];
+  checkHip(hipEventCreate(&startEvent));
+  checkHip(hipEventCreate(&stopEvent));
+  checkHip(hipEventCreate(&dummyEvent));
+  for (int i = 0; i < nStreams; ++i) checkHip(hipStreamCreate(&stream[i]));
+
+  // baseline case - sequential transfer and execute
+  memset(a, 0, bytes);
+  checkHip(hipEventRecord(startEvent, 0));
+  checkHip(hipMemcpy(d_a, a, bytes, hipMemcpyHostToDevice));
+  kernel<<<n / blockSize, blockSize>>>(d_a, 0);
+  checkHip(hipMemcpy(a, d_a, bytes, hipMemcpyDeviceToHost));
+  checkHip(hipEventRecord(stopEvent, 0));
+  checkHip(hipEventSynchronize(stopEvent));
+  checkHip(hipEventElapsedTime(&ms, startEvent, stopEvent));
+  printf("Time for sequential transfer and execute (ms): %f\n", ms);
+  printf("  max error: %e\n", maxError(a, n));
+
+  // asynchronous version 1: loop over {copy, kernel, copy}
+  memset(a, 0, bytes);
+  checkHip(hipEventRecord(startEvent, 0));
+  for (int i = 0; i < nStreams; ++i) {
+    int offset = i * streamSize;
+    checkHip(
+        hipMemcpyAsync(&d_a[offset], &a[offset], streamBytes, hipMemcpyHostToDevice, stream[i]));
+    kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
+    checkHip(
+        hipMemcpyAsync(&a[offset], &d_a[offset], streamBytes, hipMemcpyDeviceToHost, stream[i]));
+  }
+  checkHip(hipEventRecord(stopEvent, 0));
+  checkHip(hipEventSynchronize(stopEvent));
+  checkHip(hipEventElapsedTime(&ms, startEvent, stopEvent));
+  printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
+  printf("  max error: %e\n", maxError(a, n));
+
+  // asynchronous version 2:
+  // loop over copy, loop over kernel, loop over copy
+  memset(a, 0, bytes);
+  checkHip(hipEventRecord(startEvent, 0));
+  for (int i = 0; i < nStreams; ++i) {
+    int offset = i * streamSize;
+    checkHip(
+        hipMemcpyAsync(&d_a[offset], &a[offset], streamBytes, hipMemcpyHostToDevice, stream[i]));
+  }
+  for (int i = 0; i < nStreams; ++i) {
+    int offset = i * streamSize;
+    kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
+  }
+  for (int i = 0; i < nStreams; ++i) {
+    int offset = i * streamSize;
+    checkHip(
+        hipMemcpyAsync(&a[offset], &d_a[offset], streamBytes, hipMemcpyDeviceToHost, stream[i]));
+  }
+  checkHip(hipEventRecord(stopEvent, 0));
+  checkHip(hipEventSynchronize(stopEvent));
+  checkHip(hipEventElapsedTime(&ms, startEvent, stopEvent));
+  printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
+  printf("  max error: %e\n", maxError(a, n));
+
+  // cleanup
+  checkHip(hipEventDestroy(startEvent));
+  checkHip(hipEventDestroy(stopEvent));
+  checkHip(hipEventDestroy(dummyEvent));
+  for (int i = 0; i < nStreams; ++i) checkHip(hipStreamDestroy(stream[i]));
+  checkHip(hipFree(d_a));
+  checkHip(hipHostFree(a));
+
+  return 0;
+}