python jit Poc #31
base: master
Changes from all commits
Editor-settings JSON hunk (filename not shown in this excerpt):

```diff
@@ -13,5 +13,8 @@
     "format_on_save": true,
     "style": "file"
   },
+  "python-black": {
+    "format_on_save": "on",
+  },
 }
 }
```
res/poc/jit/eg.py (new file, +34 lines):

```python
import numpy as np
from numba import config

config.CUDA_ENABLE_PYNVJITLINK = 1

import warnings
from numba import cuda
from numba.core.errors import NumbaPerformanceWarning

warnings.simplefilter("ignore", category=NumbaPerformanceWarning)

import poc_pyb

N = 32  # or warpsize


@cuda.jit
def vadd(i, a, b, c):
    c[i] = a[i] + b[i]


@cuda.jit
def vector_add_gpu(a, b, c):
    vadd(cuda.threadIdx.x, a, b, c)


s = poc_pyb.FunctionSupport()
s.print()
a, b, c = s.A(), s.B(), s.C()
print(c)
vector_add_gpu[1, N](a, b, c)
print(c)
a += 11
s.print()
```

Comment on lines +17 to +19:

`vadd` is declared with plain `@cuda.jit` but is only ever called from inside `vector_add_gpu`. The documented convention for a device-callable helper is `@cuda.jit(device=True)`: device functions are called like ordinary functions from within GPU code and may return values, whereas kernels cannot return values and are normally launched from the host with a `[blocks, threads]` configuration. Recent numba-cuda releases do compile a kernel-decorated function as a device function when it is called from device code, so this works, but the explicit `device=True` form states the intent.

Comment on lines +31 to +33:

Add explicit synchronization. Numba CUDA kernel launches are asynchronous — the launch enqueues work on the GPU and returns immediately to the host — so without synchronization, `print(c)` and `a += 11` may touch the managed buffers while the kernel is still running.

Minimal fix:

```diff
 vector_add_gpu[1, N](a, b, c)
+cuda.synchronize()
 print(c)
 a += 11
```
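The asynchronous-launch pitfall the comment above describes can be sketched without a GPU. The following is a CPU-side analogy (stdlib only, not numba): submitting work to a thread pool returns immediately, just as a kernel launch enqueues work and returns to the host, and reading the output buffer before waiting on the future mirrors printing `c` before `cuda.synchronize()`.

```python
from concurrent.futures import ThreadPoolExecutor
import time


def vadd(a, b, out):
    time.sleep(0.1)  # simulate device latency
    for i in range(len(out)):
        out[i] = a[i] + b[i]


c = [0.0] * 4
with ThreadPoolExecutor(max_workers=1) as pool:
    fut = pool.submit(vadd, [1, 2, 3, 4], [10, 20, 30, 40], c)
    snapshot = list(c)  # likely still the stale zeros: work not yet done
    fut.result()        # the "synchronize": block until the work completes
print(snapshot, c)      # snapshot is (almost certainly) stale; c is complete
```

The only ordering guarantee comes from `fut.result()`; dropping it leaves `print(c)` racing the worker, which is exactly the race the suggested `cuda.synchronize()` closes.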
mkn.yaml (new file, +15 lines):

```yaml
#! build -p pyb -qx ../../../res/mkn/nvcc.yaml -a "-g"

name: poc
dep: mkn.gpu

profile:
- name: pyb
  inc: .
  dep: lang.pybind11 mkn.gpu
  mod: lang.python3 python3.module
  src: pybind.cpp
  install: .
  env: |
    MKN_LIB_LINK_LIB=1
    MKN_PYTHON_LIB_EMBED=1
```
pybind.cpp (new file, +71 lines):

```cpp
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/functional.h>
#include <typeinfo>
#include <iostream>

#define PRINT(x) std::cout << __FILE__ << " " << __LINE__ << " " << x << std::endl;

#include "mkn/gpu.hpp"
#include "mkn/gpu/tuple.hpp"

void init(float* a, float* b) {
  mkn::gpu::DLauncher()([a = a, b = b] __device__() {
    a[threadIdx.x] = threadIdx.x + 1;
    b[threadIdx.x] = threadIdx.x + 2;
  });
}

namespace py = pybind11;

struct FunctionSupport {
  FunctionSupport() {
    mkn::gpu::alloc_managed(a, 32);
    mkn::gpu::alloc_managed(b, 32);
    mkn::gpu::alloc_managed(c, 32);
    print();
    init(a, b);
    print();
  }
  ~FunctionSupport() {
    mkn::gpu::destroy(a);
    mkn::gpu::destroy(b);
    mkn::gpu::destroy(c);
  }

  void print() {
    PRINT(a[0]);
    PRINT(b[0]);
    PRINT(c[0]);
  }

  py::array_t<float> make(auto p) {
    return {{32}, {sizeof(float)}, p, py::capsule(p, [](void* f) { /* noop */ })};
  }

  py::array_t<float> A() { return make(a); }
  py::array_t<float> B() { return make(b); }
  py::array_t<float> C() { return make(c); }

  float* a = nullptr;
  float* b = nullptr;
  float* c = nullptr;
};

PYBIND11_MODULE(poc_pyb, m) {
  py::class_<FunctionSupport, py::smart_holder>(m, "FunctionSupport")
      .def(py::init<>())
      .def("print", &FunctionSupport::print)
      .def_readwrite("a", &FunctionSupport::a)
      .def("A", &FunctionSupport::A)
      .def_readwrite("b", &FunctionSupport::b)
      .def("B", &FunctionSupport::B)
      .def_readwrite("a", &FunctionSupport::a)
      .def("C", &FunctionSupport::C);

  using Span_t = mkn::gpu::Span<float>;
  py::class_<Span_t, py::smart_holder>(m, "Span_s")
      .def("__getitem__", [](Span_t& self, unsigned index) { return self[index]; });
}
```

Comment on lines +24 to +30:

Don't log these buffers before this module has populated them. The first `print()` runs before `init(a, b)`, so `a[0]` and `b[0]` are read while still unwritten, and `c` is never initialized here at all — the values printed are whatever the managed allocation happens to contain.

Comment on lines +44 to +46:

Dangling-pointer risk: the noop capsule means Python arrays can outlive the backing memory. The `py::array_t` returned by `make` wraps the raw managed pointer, but its capsule does nothing on destruction, so if the `FunctionSupport` instance is destroyed first, the arrays point at freed memory. One approach: pass a reference to the parent object as the array's base so NumPy keeps it alive.

Sketch — tie array lifetime to the parent:

```diff
- py::array_t<float> make(auto p) {
-   return {{32}, {sizeof(float)}, p, py::capsule(p, [](void* f) { /* noop */ })};
- }
+ py::array_t<float> make(auto p, py::object self) {
+   // Use `self` as the base object so NumPy keeps FunctionSupport alive
+   return py::array_t<float>({32}, {sizeof(float)}, p, self);
+ }
```

Then pass the holder through the accessors:

```diff
- py::array_t<float> A() { return make(a); }
+ py::array_t<float> A(py::object self) { return make(a, self); }
```

Comment on the `py::smart_holder` bindings:

Ensure pybind11 >= 2.13 is explicitly specified or verified. The code uses `py::smart_holder`, which is only available in mainline pybind11 from 2.13 onward, and neither `mkn.yaml` nor any other configuration file in the repository pins a pybind11 version, so the build silently depends on whatever `lang.pybind11` resolves to.

Comment on lines +57 to +66:

Duplicate binding: the second `.def_readwrite("a", ...)` re-binds `a`, so `c` is never exposed as an attribute.

Proposed fix:

```diff
       .def_readwrite("b", &FunctionSupport::b)
       .def("B", &FunctionSupport::B)
-      .def_readwrite("a", &FunctionSupport::a)
+      .def_readwrite("c", &FunctionSupport::c)
       .def("C", &FunctionSupport::C);
```
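The lifetime fix suggested for `make` relies on NumPy's base-object mechanism: an array constructed with another Python object as its base holds a reference to that object for as long as the array is alive, which is exactly how pybind11's `py::array_t(..., base)` overload keeps the C++ holder from being destroyed. A minimal pure-NumPy sketch of that mechanism (no pybind11 required; `backing` stands in for the managed buffer):

```python
import sys

import numpy as np

backing = np.arange(32, dtype=np.float32)  # stands in for the managed buffer
view = backing[:8]                         # new array over the same memory
assert view.base is backing                # NumPy records `backing` as .base

before = sys.getrefcount(backing)
another = backing.view()                   # each dependent array adds a reference
after = sys.getrefcount(backing)
assert after == before + 1                 # `backing` cannot be freed while `another` lives
```

So long as any dependent array exists, the base object's refcount stays elevated and its memory is kept alive — the property the noop-capsule version of `make` lacks.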
New two-line Python requirements file:

```
numpy
numba-cuda
```
test/any/async_streaming.cpp:

```diff
@@ -7,6 +7,8 @@

 #include <algorithm>

+#include <algorithm>
+
 using namespace mkn::gpu;
 using namespace std::chrono_literals;
```

Comment on lines +10 to +11:

The added include duplicates the `#include <algorithm>` already present above it. Meanwhile the file uses `std::this_thread::sleep_for`, which requires an explicit `#include <thread>` — that is the header this hunk should add.

Proposed fix:

```diff
 #include <algorithm>
-#include <algorithm>
+#include <thread>
```
Comment on the editor-settings hunk (lines +16 to +18):

Trailing comma produces invalid JSON. Line 18 has a trailing comma after `"on"`, and the added block itself ends with `},` as the last member of its object. Some editors tolerate this, but strict JSON parsers reject it.

Proposed fix:

```diff
   "python-black": {
-    "format_on_save": "on",
-  },
+    "format_on_save": "on"
+  }
```