diff --git a/README.md b/README.md
index 9362abf..1640110 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,6 @@
 
+# mkn.gpu
 
-# mkn.gpu
\ No newline at end of file
+CUDA/HIP C++20 convenience wrappers
+
+[README](https://raw.githubusercontent.com/mkn/mkn.gpu/master/README.noformat)
diff --git a/README.noformat b/README.noformat
index 0f2cc85..127f58c 100644
--- a/README.noformat
+++ b/README.noformat
@@ -1,6 +1,6 @@
 mkn.gpu
 
-CUDA/HIP C++17 convenience wrappers
+CUDA/HIP C++20 convenience wrappers
 
 ======
 
@@ -27,6 +27,11 @@ Description     expose functions explicitly via
                    mkn::gpu::hip::*
                    mkn::gpu::cuda::*
 
+Key             _MKN_GPU_WARP_SIZE_
+Type            uint
+Default         use manufacturer provided (eg warpSize), usually 32
+Description     override use if defined
+
 Key             _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_
 Type            uint
 Default         1
diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp
index 814a6ed..e77b740 100644
--- a/inc/mkn/gpu/cpu.hpp
+++ b/inc/mkn/gpu/cpu.hpp
@@ -36,14 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "mkn/kul/span.hpp"
 #include "mkn/kul/tuple.hpp"
 #include "mkn/kul/assert.hpp"
-#include "mkn/kul/threads.hpp"
 
-#include "mkn/gpu/cli.hpp"
 #include "mkn/gpu/def.hpp"
 
-#include <algorithm>
 #include <cassert>
 #include <cstring>
+#include <algorithm>
 
 #define MKN_GPU_ASSERT(x) (KASSERT((x)))
 
@@ -90,12 +88,14 @@ struct dim3 {
 };
 
 dim3 static inline threadIdx, blockIdx;
+static constexpr int warpSize = 1;
 
 #endif  // MKN_CPU_DO_NOT_DEFINE_DIM3
 
 //
 
 namespace MKN_GPU_NS {
+static constexpr int warp_size = warpSize;
 
 void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/
 
@@ -280,6 +280,11 @@ void fill(Container& c, T const val) {
   fill(c, c.size(), val);
 }
 
+template <typename T>
+void zero(T* const t, std::size_t const size) {
+  std::fill(t, t + size, 0);
+}
+
 void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }
 
 }  // namespace MKN_GPU_NS
diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp
index cbe2f2b..aa02875 100644
--- a/inc/mkn/gpu/cuda.hpp
+++ b/inc/mkn/gpu/cuda.hpp
@@ -32,15 +32,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_CUDA_HPP_
 #define _MKN_GPU_CUDA_HPP_
 
-#include <vector>
-
 #include "mkn/kul/log.hpp"
 #include "mkn/kul/span.hpp"
 #include "mkn/kul/tuple.hpp"
-#include "mkn/kul/assert.hpp"
 
 #include "mkn/gpu/def.hpp"
-#include "mkn/gpu/cli.hpp"
 
 #include <cuda_runtime.h>
 
@@ -76,8 +72,6 @@ __device__ SIZE block_idx_x() {
 
 }  // namespace mkn::gpu::cuda
 
-//
-
 #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
 #define MKN_GPU_NS mkn::gpu::cuda
 #else
@@ -86,6 +80,12 @@ __device__ SIZE block_idx_x() {
 
 namespace MKN_GPU_NS {
 
+#ifdef _MKN_GPU_WARP_SIZE_
+static constexpr int warp_size = _MKN_GPU_WARP_SIZE_;
+#else
+static constexpr int warp_size = warpSize;
+#endif /*_MKN_GPU_WARP_SIZE_    */
+
 void inline setLimitMallocHeapSize(std::size_t const& bytes) {
   MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes));
 }
@@ -310,6 +310,7 @@ __global__ static void global_d_kernel(F f, Args... args) {
 }
 
 #include "launchers.hpp"
+#include "devfunc.hpp"
 
 template <typename T, typename V>
 __global__ void _vector_fill(T* a, V t, std::size_t s) {
diff --git a/inc/mkn/gpu/devfunc.hpp b/inc/mkn/gpu/devfunc.hpp
new file mode 100644
index 0000000..5708308
--- /dev/null
+++ b/inc/mkn/gpu/devfunc.hpp
@@ -0,0 +1,44 @@
+// IWYU pragma: private, include "mkn/gpu.hpp"
+/**
+Copyright (c) 2025, Philip Deegan.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Philip Deegan nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _MKN_GPU_DEVFUNC_HPP_
+#define _MKN_GPU_DEVFUNC_HPP_
+
+template <bool sync = true, typename T>
+void __device__ zero(T* const t, std::size_t const size) {
+  std::size_t chunk = 0;
+  auto const each = size / warpSize;
+  for (; chunk < each; ++chunk) t[chunk * warpSize + threadIdx.x] = 0;
+  if (threadIdx.x < size - (warpSize * each)) t[chunk * warpSize + threadIdx.x] = 0;
+  if constexpr (sync) __syncthreads();
+}
+
+#endif /* _MKN_GPU_DEVFUNC_HPP_ */
diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp
index 877d044..3add67b 100644
--- a/inc/mkn/gpu/launchers.hpp
+++ b/inc/mkn/gpu/launchers.hpp
@@ -69,7 +69,13 @@ struct GDLauncher : public GLauncher {
 
 template <bool _sync = true>
 struct DLauncher : public Launcher {
-  DLauncher(size_t const /*dev*/ = 0) : Launcher{{}, {}} {}
+  DLauncher() : Launcher{dim3{1}, dim3{warp_size}} {}
+  DLauncher(size_t const /*dev*/) : Launcher{{}, {}} {}
+
+  template <typename... Args>
+  DLauncher(Args&&... args)
+    requires(sizeof...(Args) > 0)
+      : Launcher{args...} {}
 
   template <typename F, typename... Args>
   auto operator()(F&& f, Args&&... args) {
diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp
index 5556cde..f9ebd01 100644
--- a/inc/mkn/gpu/multi_launch.hpp
+++ b/inc/mkn/gpu/multi_launch.hpp
@@ -31,6 +31,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_MULTI_LAUNCH_HPP_
 #define _MKN_GPU_MULTI_LAUNCH_HPP_
 
+#include "mkn/gpu.hpp"
+#include "mkn/kul/log.hpp"
+#include "mkn/kul/time.hpp"
+#include "mkn/kul/except.hpp"
+
 #include <mutex>
 #include <chrono>
 #include <thread>
@@ -41,10 +46,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <fstream>
 #include <stdexcept>
 
-#include "mkn/gpu.hpp"
-#include "mkn/kul/log.hpp"
-#include "mkn/kul/time.hpp"
-
 namespace mkn::gpu::detail {
 template <typename Type>
 auto& deref(Type&& type) {
@@ -57,6 +58,13 @@ auto& deref(Type&& type) {
 
 namespace mkn::gpu {
 
+class StreamLauncherException : public kul::Exception {
+ public:
+  StreamLauncherException(char const* f, std::uint16_t const& l, std::string const& s)
+      : Exception{f, l, s} {}
+  StreamLauncherException(StreamLauncherException const& e) : Exception{e} {}
+};
+
 enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT, BARRIER };
 enum class StreamFunctionStatus { HOST_BUSY = 0, DEVICE_BUSY };
 
@@ -390,10 +398,10 @@ struct ThreadedStreamLauncher : public StreamLauncher<Datas, ThreadedStreamLaunc
   constexpr static std::size_t wait_add_ms = _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_;
   constexpr static std::size_t wait_max_ms = _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_;
 
-  ThreadedStreamLauncher(Datas& datas, std::size_t const _n_threads = 1,
+  ThreadedStreamLauncher(Datas& datas, std::size_t const _n_threads = 0,
                          std::size_t const device = 0)
       : Super{datas}, n_threads{_n_threads}, device_id{device} {
-    thread_status.resize(n_threads, SFP::NEXT);
+    thread_status.resize(_n_threads + 1, SFP::NEXT);
     status.resize(datas.size(), SFS::FIRST);
   }
 
@@ -500,6 +508,9 @@ struct ThreadedStreamLauncher : public StreamLauncher<Datas, ThreadedStreamLaunc
   }
 
   This& join(bool const work = true, bool const clear = false) {
+    if (n_threads == 0 and !work) {
+      KEXCEPT(StreamLauncherException, "no available threads, join must work");
+    }
     if (!started) start();
     if (joined) return *this;
     joined = true;
diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp
index 8255d84..67721a1 100644
--- a/inc/mkn/gpu/rocm.hpp
+++ b/inc/mkn/gpu/rocm.hpp
@@ -37,8 +37,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "mkn/kul/tuple.hpp"
 #include "mkn/kul/assert.hpp"
 
-#include "mkn/gpu/def.hpp"
 #include "mkn/gpu/cli.hpp"
+#include "mkn/gpu/def.hpp"
 
 #include "hip/hip_runtime.h"
 
@@ -68,8 +68,6 @@ __device__ SIZE idx() {
 
 }  // namespace mkn::gpu::hip
 
-//
-
 #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
 #define MKN_GPU_NS mkn::gpu::hip
 #else
@@ -78,6 +76,12 @@ __device__ SIZE idx() {
 
 namespace MKN_GPU_NS {
 
+#ifdef _MKN_GPU_WARP_SIZE_
+static constexpr int warp_size = _MKN_GPU_WARP_SIZE_;
+#else
+static constexpr int warp_size = warpSize;
+#endif /*_MKN_GPU_WARP_SIZE_    */
+
 void inline setLimitMallocHeapSize(std::size_t const& bytes) {
   MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes));
 }
@@ -310,6 +314,7 @@ __global__ static void global_d_kernel(F f, Args... args) {
 }
 
 #include "launchers.hpp"
+#include "devfunc.hpp"
 
 template <typename T, typename V>
 __global__ void _vector_fill(T* a, V t, std::size_t s) {
@@ -330,14 +335,24 @@ void fill(Container& c, T val) {
 void inline prinfo(size_t dev = 0) {
   hipDeviceProp_t devProp;
   MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev));
-  KOUT(NON) << " System version " << devProp.major << "." << devProp.minor;
-  KOUT(NON) << " agent name     " << devProp.name;
-  KOUT(NON) << " cores          " << devProp.multiProcessorCount;
-  KOUT(NON) << " threadsPCore   " << devProp.maxThreadsPerMultiProcessor;
-  KOUT(NON) << " TotalMem       " << (devProp.totalGlobalMem / 1000000) << " MB";
-  KOUT(NON) << " BlockMem       " << (devProp.sharedMemPerBlock / 1000) << " KB";
-  KOUT(NON) << " warpSize       " << devProp.warpSize;
-  KOUT(NON) << " threadsPBlock  " << devProp.maxThreadsPerBlock;
+  KOUT(NON) << " System version  " << devProp.major << "." << devProp.minor;
+  KOUT(NON) << " agent name      " << devProp.name;
+  KOUT(NON) << " cores           " << devProp.multiProcessorCount;
+  KOUT(NON) << " threadsPCore    " << devProp.maxThreadsPerMultiProcessor;
+  KOUT(NON) << " TotalMem        " << (devProp.totalGlobalMem / 1000000) << " MB";
+  KOUT(NON) << " BlockMem        " << (devProp.sharedMemPerBlock / 1000) << " KB";
+  KOUT(NON) << " device warpSize " << devProp.warpSize;
+  KOUT(NON) << " threadsPBlock   " << devProp.maxThreadsPerBlock;
+
+#ifdef _MKN_GPU_WARP_SIZE_
+  KOUT(NON) << " warpSize used   " << _MKN_GPU_WARP_SIZE_;
+#else
+  KOUT(NON) << " warpSize used   " << warp_size;
+  if (warp_size != devProp.warpSize) {
+    KOUT(NON) << " warpSize MISMATCH!!!  " << warp_size << " vs " << devProp.warpSize;
+    KOUT(NON) << " SEE mkn.gpu README for -D_MKN_GPU_WARP_SIZE_=###";
+  }
+#endif
 }
 
 void inline print_gpu_mem_used() {
diff --git a/mkn.yaml b/mkn.yaml
index 237c600..b6057b4 100644
--- a/mkn.yaml
+++ b/mkn.yaml
@@ -10,18 +10,20 @@ profile:
 
 - name: rocm
   parent: headers
+  arg: -DMKN_GPU_ROCM=1
   test: test/any/(\w).cpp
         test/hip/(\w).cpp
 
 - name: cuda
   parent: headers
+  arg: -DMKN_GPU_CUDA=1
   test: test/any/(\w).cpp
         test/cuda/(\w).cpp
 
 # if you have no GPU but want to test your code
 - name: cpu
   parent: headers
-  arg: -DMKN_GPU_CPU
+  arg: -DMKN_GPU_CPU=1
   test: test/any/(\w).cpp
         test/cpu/(\w).cpp
 
diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp
index 7a3406d..bd843cb 100644
--- a/test/any/async_streaming.cpp
+++ b/test/any/async_streaming.cpp
@@ -232,10 +232,10 @@ std::uint32_t test_threaded_detached_stream_fns(std::size_t const& nthreads = 2)
 
 int main() {
   KOUT(NON) << __FILE__;
-  return test()                              //
-         + test_threaded()                   //
-         + test_threaded(6)                  //
-         + test_threaded_group_barrier()     //
-         + test_threaded_host_group_mutex()  //
+  return test()                                //
+         + test_threaded(0) + test_threaded()  //
+         + test_threaded(6)                    //
+         + test_threaded_group_barrier()       //
+         + test_threaded_host_group_mutex()    //
          + test_threaded_host_group_idx() + test_threaded_detached_stream_fns();
 }
diff --git a/test/any/managed.cpp b/test/any/managed.cpp
index 0c9634b..ee64e07 100644
--- a/test/any/managed.cpp
+++ b/test/any/managed.cpp
@@ -83,9 +83,28 @@ std::uint32_t test_lambda_ref_copy_capture_views() {
   return 0;
 }
 
+std::uint32_t test_zero() {
+  auto const size = 1000;  // not warp size divisible!
+  ManagedVector<float> mem0(size, 1);
+  ManagedVector<float> mem1(size, 2);
+
+  auto* view0 = mem0.data();
+  auto* view1 = mem1.data();
+
+  mkn::gpu::DLauncher()([=] __device__() {
+    mkn::gpu::zero(view0, size);
+    mkn::gpu::zero(view1, size);
+  });
+
+  for (std::uint32_t i = 0; i < size; ++i)
+    if (mem0[i] + mem1[i] != 0) return 1;
+
+  return 0;
+}
+
 int main() {
   KOUT(NON) << __FILE__;
-  return test() +                            //
+  return test() + test_zero() +              //
          test_guess() +                      //
          test_lambda_copy_capture_views() +  //
          test_lambda_ref_copy_capture_views();